Diffstat (limited to 'net/netfilter')
103 files changed, 13864 insertions, 2564 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 6e839b6dff2..e9410d17619 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -413,6 +413,127 @@ config NETFILTER_SYNPROXY
 
 endif # NF_CONNTRACK
 
+config NF_TABLES
+	select NETFILTER_NETLINK
+	tristate "Netfilter nf_tables support"
+	help
+	  nftables is the new packet classification framework that intends to
+	  replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
+	  provides a pseudo-state machine with an extensible instruction-set
+	  (also known as expressions) that the userspace 'nft' utility
+	  (http://www.netfilter.org/projects/nftables) uses to build the
+	  rule-set. It also comes with the generic set infrastructure that
+	  allows you to construct mappings between matchings and actions
+	  for performance lookups.
+
+	  To compile it as a module, choose M here.
+
+config NF_TABLES_INET
+	depends on NF_TABLES && IPV6
+	select NF_TABLES_IPV4
+	select NF_TABLES_IPV6
+	tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support"
+	help
+	  This option enables support for a mixed IPv4/IPv6 "inet" table.
+
+config NFT_EXTHDR
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables IPv6 exthdr module"
+	help
+	  This option adds the "exthdr" expression that you can use to match
+	  IPv6 extension headers.
+
+config NFT_META
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables meta module"
+	help
+	  This option adds the "meta" expression that you can use to match and
+	  to set packet metainformation such as the packet mark.
+
+config NFT_CT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	tristate "Netfilter nf_tables conntrack module"
+	help
+	  This option adds the "ct" expression that you can use to match
+	  connection tracking information such as the flow state.
+
+config NFT_RBTREE
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables rbtree set module"
+	help
+	  This option adds the "rbtree" set type (Red Black tree) that is used
+	  to build interval-based sets.
+
+config NFT_HASH
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables hash set module"
+	help
+	  This option adds the "hash" set type that is used to build one-way
+	  mappings between matchings and actions.
+
+config NFT_COUNTER
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables counter module"
+	help
+	  This option adds the "counter" expression that you can use to
+	  include packet and byte counters in a rule.
+
+config NFT_LOG
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables log module"
+	help
+	  This option adds the "log" expression that you can use to log
+	  packets matching some criteria.
+
+config NFT_LIMIT
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables limit module"
+	help
+	  This option adds the "limit" expression that you can use to
+	  ratelimit rule matchings.
+
+config NFT_NAT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	depends on NF_NAT
+	tristate "Netfilter nf_tables nat module"
+	help
+	  This option adds the "nat" expression that you can use to perform
+	  typical Network Address Translation (NAT) packet transformations.
+
+config NFT_QUEUE
+	depends on NF_TABLES
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_NETLINK_QUEUE
+	tristate "Netfilter nf_tables queue module"
+	help
+	  This is required if you intend to use the userspace queueing
+	  infrastructure (also known as NFQUEUE) from nftables.
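The NF_TABLES help text above describes expressions as the extensible instruction set of nf_tables. For orientation, here is a minimal sketch of how an expression module plugs into the nft_expr_type/nft_expr_ops API that this series introduces; the "foo" expression, its predicate, the mark value 42 and the omitted init/dump callbacks are hypothetical, not code from this patch set.

	/*
	 * Editorial sketch, not part of the patch: a match-style
	 * nf_tables expression. On mismatch, NFT_BREAK stops the
	 * evaluation of the remaining expressions of the current rule.
	 */
	#include <linux/module.h>
	#include <linux/netfilter.h>
	#include <net/netfilter/nf_tables.h>

	static struct nft_expr_type nft_foo_type;

	static void nft_foo_eval(const struct nft_expr *expr,
				 struct nft_data data[NFT_REG_MAX + 1],
				 const struct nft_pktinfo *pkt)
	{
		/* Hypothetical predicate: match on the skb mark. */
		if (pkt->skb->mark != 42)
			data[NFT_REG_VERDICT].verdict = NFT_BREAK;
	}

	static const struct nft_expr_ops nft_foo_ops = {
		.type	= &nft_foo_type,
		.size	= NFT_EXPR_SIZE(0),	/* no private expression data */
		.eval	= nft_foo_eval,
	};

	static struct nft_expr_type nft_foo_type __read_mostly = {
		.name	= "foo",
		.ops	= &nft_foo_ops,
		.owner	= THIS_MODULE,
	};

	static int __init nft_foo_module_init(void)
	{
		return nft_register_expr(&nft_foo_type);
	}

	static void __exit nft_foo_module_exit(void)
	{
		nft_unregister_expr(&nft_foo_type);
	}

	module_init(nft_foo_module_init);
	module_exit(nft_foo_module_exit);
	MODULE_LICENSE("GPL");
	MODULE_ALIAS_NFT_EXPR("foo");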
+
+config NFT_REJECT
+	depends on NF_TABLES
+	default m if NETFILTER_ADVANCED=n
+	tristate "Netfilter nf_tables reject support"
+	help
+	  This option adds the "reject" expression that you can use to
+	  explicitly deny unallowed traffic and notify the sender via
+	  TCP reset or ICMP informational errors.
+
+config NFT_REJECT_INET
+	depends on NF_TABLES_INET
+	default NFT_REJECT
+	tristate
+
+config NFT_COMPAT
+	depends on NF_TABLES
+	depends on NETFILTER_XTABLES
+	tristate "Netfilter x_tables over nf_tables module"
+	help
+	  This is required if you intend to use any of the existing
+	  x_tables match/target extensions over the nf_tables
+	  framework.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
@@ -806,6 +927,16 @@ config NETFILTER_XT_MATCH_BPF
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_CGROUP
+	tristate '"control group" match support'
+	depends on NETFILTER_ADVANCED
+	depends on CGROUPS
+	select CGROUP_NET_CLASSID
+	---help---
+	Socket/process control group matching allows you to match locally
+	generated packets based on which net_cls control group processes
+	belong to.
+
 config NETFILTER_XT_MATCH_CLUSTER
 	tristate '"cluster" match support'
 	depends on NF_CONNTRACK
@@ -857,7 +988,7 @@ config NETFILTER_XT_MATCH_CONNLABEL
 	  connection simultaneously.
 
 config NETFILTER_XT_MATCH_CONNLIMIT
-	tristate '"connlimit" match support"'
+	tristate '"connlimit" match support'
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	---help---
@@ -983,6 +1114,15 @@ config NETFILTER_XT_MATCH_HL
 	in the IPv6 header, or the time-to-live field in the IPv4
 	header of the packet.
 
+config NETFILTER_XT_MATCH_IPCOMP
+	tristate '"ipcomp" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This match extension allows you to match a range of CPIs (16 bits)
+	  inside the IPComp header of IPsec packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_IPRANGE
 	tristate '"iprange" address range match support'
 	depends on NETFILTER_ADVANCED
@@ -1003,6 +1143,16 @@ config NETFILTER_XT_MATCH_IPVS
 
 	  If unsure, say N.
 
+config NETFILTER_XT_MATCH_L2TP
+	tristate '"l2tp" match support'
+	depends on NETFILTER_ADVANCED
+	default L2TP
+	---help---
+	This option adds an "L2TP" match, which allows you to match against
+	L2TP protocol header fields.
+
+	To compile it as a module, choose M here. If unsure, say N.
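The cgroup, ipcomp and l2tp options above each add a classic x_tables match; with NFT_COMPAT enabled, such matches can also be evaluated from nf_tables rules unmodified. As a rough illustration of the common shape of such a module (not the actual xt_cgroup/xt_ipcomp/xt_l2tp code), a skeleton against the standard xt_register_match() API might look like this; the "foo" match, the xt_foo_info layout and the predicate are placeholders.

	/*
	 * Editorial sketch, not part of the patch: a minimal x_tables
	 * match module. The userspace iptables extension fills in
	 * struct xt_foo_info, which the kernel sees via par->matchinfo.
	 */
	#include <linux/module.h>
	#include <linux/skbuff.h>
	#include <linux/netfilter/x_tables.h>

	struct xt_foo_info {
		__u32 mark;	/* value configured from userspace */
	};

	/* Per-packet predicate: hypothetical match on the skb mark. */
	static bool foo_mt(const struct sk_buff *skb, struct xt_action_param *par)
	{
		const struct xt_foo_info *info = par->matchinfo;

		return skb->mark == info->mark;
	}

	static struct xt_match foo_mt_reg __read_mostly = {
		.name		= "foo",
		.revision	= 0,
		.family		= NFPROTO_UNSPEC,
		.match		= foo_mt,
		.matchsize	= sizeof(struct xt_foo_info),
		.me		= THIS_MODULE,
	};

	static int __init foo_mt_init(void)
	{
		return xt_register_match(&foo_mt_reg);
	}

	static void __exit foo_mt_exit(void)
	{
		xt_unregister_match(&foo_mt_reg);
	}

	module_init(foo_mt_init);
	module_exit(foo_mt_exit);
	MODULE_LICENSE("GPL");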
+  config NETFILTER_XT_MATCH_LENGTH  	tristate '"length" match support'  	depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index c3a0a12907f..bffdad774da 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -64,6 +64,27 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o  # SYNPROXY  obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o +# nf_tables +nf_tables-objs += nf_tables_core.o nf_tables_api.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o + +obj-$(CONFIG_NF_TABLES)		+= nf_tables.o +obj-$(CONFIG_NF_TABLES_INET)	+= nf_tables_inet.o +obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o +obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o +obj-$(CONFIG_NFT_META)		+= nft_meta.o +obj-$(CONFIG_NFT_CT)		+= nft_ct.o +obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o +obj-$(CONFIG_NFT_NAT)		+= nft_nat.o +obj-$(CONFIG_NFT_QUEUE)		+= nft_queue.o +obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o +obj-$(CONFIG_NFT_RBTREE)	+= nft_rbtree.o +obj-$(CONFIG_NFT_HASH)		+= nft_hash.o +obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o +obj-$(CONFIG_NFT_LOG)		+= nft_log.o +  # generic X tables   obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o @@ -115,8 +136,10 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o  obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o  obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o  obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o  obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o  obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o  obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o  obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o  obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o @@ -124,6 +147,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o  obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o  obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o  obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o  obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o  obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o  obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 593b16ea45e..1fbab0cdd30 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -146,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head,  		/* Optimization: we don't need to hold module  		   reference here, since function can't sleep. --RR */  repeat: -		verdict = (*elemp)->hook(hook, skb, indev, outdev, okfn); +		verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn);  		if (verdict != NF_ACCEPT) {  #ifdef CONFIG_NETFILTER_DEBUG  			if (unlikely((verdict & NF_VERDICT_MASK) diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index ba36c283d83..2f7f5c32c6f 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -1,7 +1,7 @@  menuconfig IP_SET  	tristate "IP set support"  	depends on INET && NETFILTER -	depends on NETFILTER_NETLINK +	select NETFILTER_NETLINK  	help  	  This option adds IP set support to the kernel.  	  In order to define and use the sets, you need the userspace utility @@ -21,7 +21,7 @@ config IP_SET_MAX  	  You can define here default value of the maximum number   	  of IP sets for the kernel. 
-	  The value can be overriden by the 'max_sets' module +	  The value can be overridden by the 'max_sets' module  	  parameter of the 'ip_set' module.  config IP_SET_BITMAP_IP @@ -61,6 +61,15 @@ config IP_SET_HASH_IP  	  To compile it as a module, choose M here.  If unsure, say N. +config IP_SET_HASH_IPMARK +	tristate "hash:ip,mark set support" +	depends on IP_SET +	help +	  This option adds the hash:ip,mark set type support, by which one +	  can store IPv4/IPv6 address and mark pairs. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config IP_SET_HASH_IPPORT  	tristate "hash:ip,port set support"  	depends on IP_SET @@ -90,6 +99,15 @@ config IP_SET_HASH_IPPORTNET  	  To compile it as a module, choose M here.  If unsure, say N. +config IP_SET_HASH_NETPORTNET +	tristate "hash:net,port,net set support" +	depends on IP_SET +	help +	  This option adds the hash:net,port,net set type support, by which +	  one can store two IPv4/IPv6 subnets, and a protocol/port in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config IP_SET_HASH_NET  	tristate "hash:net set support"  	depends on IP_SET @@ -99,6 +117,15 @@ config IP_SET_HASH_NET  	  To compile it as a module, choose M here.  If unsure, say N. +config IP_SET_HASH_NETNET +	tristate "hash:net,net set support" +	depends on IP_SET +	help +	  This option adds the hash:net,net  set type support, by which +	  one can store IPv4/IPv6 network address/prefix pairs in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config IP_SET_HASH_NETPORT  	tristate "hash:net,port set support"  	depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 6e965ecd544..231f10196cb 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -14,12 +14,15 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o  # hash types  obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o  obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o  obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o  obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o  obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o  obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o  obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o +obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o +obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o  # list types  obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 25243379b88..f2c7d83dc23 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -8,38 +8,32 @@  #ifndef __IP_SET_BITMAP_IP_GEN_H  #define __IP_SET_BITMAP_IP_GEN_H -#define CONCAT(a, b)		a##b -#define TOKEN(a,b)		CONCAT(a, b) - -#define mtype_do_test		TOKEN(MTYPE, _do_test) -#define mtype_gc_test		TOKEN(MTYPE, _gc_test) -#define mtype_is_filled		TOKEN(MTYPE, _is_filled) -#define mtype_do_add		TOKEN(MTYPE, _do_add) -#define mtype_do_del		TOKEN(MTYPE, _do_del) -#define mtype_do_list		TOKEN(MTYPE, _do_list) -#define mtype_do_head		TOKEN(MTYPE, _do_head) -#define mtype_adt_elem		TOKEN(MTYPE, _adt_elem) -#define mtype_add_timeout	TOKEN(MTYPE, _add_timeout) -#define mtype_gc_init		TOKEN(MTYPE, _gc_init) -#define mtype_kadt		TOKEN(MTYPE, _kadt) -#define mtype_uadt		TOKEN(MTYPE, _uadt) -#define mtype_destroy		TOKEN(MTYPE, _destroy) 
-#define mtype_flush		TOKEN(MTYPE, _flush) -#define mtype_head		TOKEN(MTYPE, _head) -#define mtype_same_set		TOKEN(MTYPE, _same_set) -#define mtype_elem		TOKEN(MTYPE, _elem) -#define mtype_test		TOKEN(MTYPE, _test) -#define mtype_add		TOKEN(MTYPE, _add) -#define mtype_del		TOKEN(MTYPE, _del) -#define mtype_list		TOKEN(MTYPE, _list) -#define mtype_gc		TOKEN(MTYPE, _gc) +#define mtype_do_test		IPSET_TOKEN(MTYPE, _do_test) +#define mtype_gc_test		IPSET_TOKEN(MTYPE, _gc_test) +#define mtype_is_filled		IPSET_TOKEN(MTYPE, _is_filled) +#define mtype_do_add		IPSET_TOKEN(MTYPE, _do_add) +#define mtype_ext_cleanup	IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_do_del		IPSET_TOKEN(MTYPE, _do_del) +#define mtype_do_list		IPSET_TOKEN(MTYPE, _do_list) +#define mtype_do_head		IPSET_TOKEN(MTYPE, _do_head) +#define mtype_adt_elem		IPSET_TOKEN(MTYPE, _adt_elem) +#define mtype_add_timeout	IPSET_TOKEN(MTYPE, _add_timeout) +#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_kadt		IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt		IPSET_TOKEN(MTYPE, _uadt) +#define mtype_destroy		IPSET_TOKEN(MTYPE, _destroy) +#define mtype_flush		IPSET_TOKEN(MTYPE, _flush) +#define mtype_head		IPSET_TOKEN(MTYPE, _head) +#define mtype_same_set		IPSET_TOKEN(MTYPE, _same_set) +#define mtype_elem		IPSET_TOKEN(MTYPE, _elem) +#define mtype_test		IPSET_TOKEN(MTYPE, _test) +#define mtype_add		IPSET_TOKEN(MTYPE, _add) +#define mtype_del		IPSET_TOKEN(MTYPE, _del) +#define mtype_list		IPSET_TOKEN(MTYPE, _list) +#define mtype_gc		IPSET_TOKEN(MTYPE, _gc)  #define mtype			MTYPE -#define ext_timeout(e, m)	\ -	(unsigned long *)((e) + (m)->offset[IPSET_OFFSET_TIMEOUT]) -#define ext_counter(e, m)	\ -	(struct ip_set_counter *)((e) + (m)->offset[IPSET_OFFSET_COUNTER]) -#define get_ext(map, id)	((map)->extensions + (map)->dsize * (id)) +#define get_ext(set, map, id)	((map)->extensions + (set)->dsize * (id))  static void  mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) @@ -49,11 +43,22 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))  	init_timer(&map->gc);  	map->gc.data = (unsigned long) set;  	map->gc.function = gc; -	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;  	add_timer(&map->gc);  }  static void +mtype_ext_cleanup(struct ip_set *set) +{ +	struct mtype *map = set->data; +	u32 id; + +	for (id = 0; id < map->elements; id++) +		if (test_bit(id, map->members)) +			ip_set_ext_destroy(set, get_ext(set, map, id)); +} + +static void  mtype_destroy(struct ip_set *set)  {  	struct mtype *map = set->data; @@ -62,8 +67,11 @@ mtype_destroy(struct ip_set *set)  		del_timer_sync(&map->gc);  	ip_set_free(map->members); -	if (map->dsize) +	if (set->dsize) { +		if (set->extensions & IPSET_EXT_DESTROY) +			mtype_ext_cleanup(set);  		ip_set_free(map->extensions); +	}  	kfree(map);  	set->data = NULL; @@ -74,6 +82,8 @@ mtype_flush(struct ip_set *set)  {  	struct mtype *map = set->data; +	if (set->extensions & IPSET_EXT_DESTROY) +		mtype_ext_cleanup(set);  	memset(map->members, 0, map->memsize);  } @@ -91,12 +101,9 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)  	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE,  			  htonl(sizeof(*map) +  				map->memsize + -				map->dsize * map->elements)) || -	    (SET_WITH_TIMEOUT(set) && -	     nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) || -	    (SET_WITH_COUNTER(set) && -	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, -			   htonl(IPSET_FLAG_WITH_COUNTERS)))) +				
set->dsize * map->elements))) +		goto nla_put_failure; +	if (unlikely(ip_set_put_flags(skb, set)))  		goto nla_put_failure;  	ipset_nest_end(skb, nested); @@ -111,16 +118,16 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,  {  	struct mtype *map = set->data;  	const struct mtype_adt_elem *e = value; -	void *x = get_ext(map, e->id); -	int ret = mtype_do_test(e, map); +	void *x = get_ext(set, map, e->id); +	int ret = mtype_do_test(e, map, set->dsize);  	if (ret <= 0)  		return ret;  	if (SET_WITH_TIMEOUT(set) && -	    ip_set_timeout_expired(ext_timeout(x, map))) +	    ip_set_timeout_expired(ext_timeout(x, set)))  		return 0;  	if (SET_WITH_COUNTER(set)) -		ip_set_update_counter(ext_counter(x, map), ext, mext, flags); +		ip_set_update_counter(ext_counter(x, set), ext, mext, flags);  	return 1;  } @@ -130,26 +137,30 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,  {  	struct mtype *map = set->data;  	const struct mtype_adt_elem *e = value; -	void *x = get_ext(map, e->id); -	int ret = mtype_do_add(e, map, flags); +	void *x = get_ext(set, map, e->id); +	int ret = mtype_do_add(e, map, flags, set->dsize);  	if (ret == IPSET_ADD_FAILED) {  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(x, map))) +		    ip_set_timeout_expired(ext_timeout(x, set)))  			ret = 0;  		else if (!(flags & IPSET_FLAG_EXIST))  			return -IPSET_ERR_EXIST; +		/* Element is re-added, cleanup extensions */ +		ip_set_ext_destroy(set, x);  	}  	if (SET_WITH_TIMEOUT(set))  #ifdef IP_SET_BITMAP_STORED_TIMEOUT -		mtype_add_timeout(ext_timeout(x, map), e, ext, map, ret); +		mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret);  #else -		ip_set_timeout_set(ext_timeout(x, map), ext->timeout); +		ip_set_timeout_set(ext_timeout(x, set), ext->timeout);  #endif  	if (SET_WITH_COUNTER(set)) -		ip_set_init_counter(ext_counter(x, map), ext); +		ip_set_init_counter(ext_counter(x, set), ext); +	if (SET_WITH_COMMENT(set)) +		ip_set_init_comment(ext_comment(x, set), ext);  	return 0;  } @@ -159,16 +170,27 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,  {  	struct mtype *map = set->data;  	const struct mtype_adt_elem *e = value; -	const void *x = get_ext(map, e->id); +	void *x = get_ext(set, map, e->id); -	if (mtype_do_del(e, map) || -	    (SET_WITH_TIMEOUT(set) && -	     ip_set_timeout_expired(ext_timeout(x, map)))) +	if (mtype_do_del(e, map)) +		return -IPSET_ERR_EXIST; + +	ip_set_ext_destroy(set, x); +	if (SET_WITH_TIMEOUT(set) && +	    ip_set_timeout_expired(ext_timeout(x, set)))  		return -IPSET_ERR_EXIST;  	return 0;  } +#ifndef IP_SET_BITMAP_STORED_TIMEOUT +static inline bool +mtype_is_filled(const struct mtype_elem *x) +{ +	return true; +} +#endif +  static int  mtype_list(const struct ip_set *set,  	   struct sk_buff *skb, struct netlink_callback *cb) @@ -176,20 +198,21 @@ mtype_list(const struct ip_set *set,  	struct mtype *map = set->data;  	struct nlattr *adt, *nested;  	void *x; -	u32 id, first = cb->args[2]; +	u32 id, first = cb->args[IPSET_CB_ARG0];  	adt = ipset_nest_start(skb, IPSET_ATTR_ADT);  	if (!adt)  		return -EMSGSIZE; -	for (; cb->args[2] < map->elements; cb->args[2]++) { -		id = cb->args[2]; -		x = get_ext(map, id); +	for (; cb->args[IPSET_CB_ARG0] < map->elements; +	     cb->args[IPSET_CB_ARG0]++) { +		id = cb->args[IPSET_CB_ARG0]; +		x = get_ext(set, map, id);  		if (!test_bit(id, map->members) ||  		    (SET_WITH_TIMEOUT(set) &&  #ifdef IP_SET_BITMAP_STORED_TIMEOUT  		     mtype_is_filled((const struct 
mtype_elem *) x) &&  #endif -		     ip_set_timeout_expired(ext_timeout(x, map)))) +		     ip_set_timeout_expired(ext_timeout(x, set))))  			continue;  		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);  		if (!nested) { @@ -199,40 +222,27 @@ mtype_list(const struct ip_set *set,  			} else  				goto nla_put_failure;  		} -		if (mtype_do_list(skb, map, id)) +		if (mtype_do_list(skb, map, id, set->dsize))  			goto nla_put_failure; -		if (SET_WITH_TIMEOUT(set)) { -#ifdef IP_SET_BITMAP_STORED_TIMEOUT -			if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, -					  htonl(ip_set_timeout_stored(map, id, -							ext_timeout(x, map))))) -				goto nla_put_failure; -#else -			if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, -					  htonl(ip_set_timeout_get( -							ext_timeout(x, map))))) -				goto nla_put_failure; -#endif -		} -		if (SET_WITH_COUNTER(set) && -		    ip_set_put_counter(skb, ext_counter(x, map))) +		if (ip_set_put_extensions(skb, set, x, +		    mtype_is_filled((const struct mtype_elem *) x)))  			goto nla_put_failure;  		ipset_nest_end(skb, nested);  	}  	ipset_nest_end(skb, adt);  	/* Set listing finished */ -	cb->args[2] = 0; +	cb->args[IPSET_CB_ARG0] = 0;  	return 0;  nla_put_failure:  	nla_nest_cancel(skb, nested); -	ipset_nest_end(skb, adt);  	if (unlikely(id == first)) { -		cb->args[2] = 0; +		cb->args[IPSET_CB_ARG0] = 0;  		return -EMSGSIZE;  	} +	ipset_nest_end(skb, adt);  	return 0;  } @@ -241,21 +251,23 @@ mtype_gc(unsigned long ul_set)  {  	struct ip_set *set = (struct ip_set *) ul_set;  	struct mtype *map = set->data; -	const void *x; +	void *x;  	u32 id;  	/* We run parallel with other readers (test element)  	 * but adding/deleting new entries is locked out */  	read_lock_bh(&set->lock);  	for (id = 0; id < map->elements; id++) -		if (mtype_gc_test(id, map)) { -			x = get_ext(map, id); -			if (ip_set_timeout_expired(ext_timeout(x, map))) +		if (mtype_gc_test(id, map, set->dsize)) { +			x = get_ext(set, map, id); +			if (ip_set_timeout_expired(ext_timeout(x, set))) {  				clear_bit(id, map->members); +				ip_set_ext_destroy(set, x); +			}  		}  	read_unlock_bh(&set->lock); -	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;  	add_timer(&map->gc);  } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index f1a8128bef0..6f1f9f49480 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -25,12 +25,13 @@  #include <linux/netfilter/ipset/ip_set.h>  #include <linux/netfilter/ipset/ip_set_bitmap.h> -#define REVISION_MIN	0 -#define REVISION_MAX	1	/* Counter support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counter support added */ +#define IPSET_TYPE_REV_MAX	2	/* Comment support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("bitmap:ip", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_bitmap:ip");  #define MTYPE		bitmap_ip @@ -44,10 +45,7 @@ struct bitmap_ip {  	u32 elements;		/* number of max elements in the set */  	u32 hosts;		/* number of hosts in a subnet */  	size_t memsize;		/* members size */ -	size_t dsize;		/* extensions struct size */ -	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */  	u8 netmask;		/* subnet netmask */ -	u32 timeout;		/* timeout parameter */  	struct timer_list gc;	/* garbage collection */  }; @@ -65,20 +63,21 @@ ip_to_id(const struct 
bitmap_ip *m, u32 ip)  /* Common functions */  static inline int -bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) +bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, +		  struct bitmap_ip *map, size_t dsize)  {  	return !!test_bit(e->id, map->members);  }  static inline int -bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map) +bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize)  {  	return !!test_bit(id, map->members);  }  static inline int  bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, -		 u32 flags) +		 u32 flags, size_t dsize)  {  	return !!test_and_set_bit(e->id, map->members);  } @@ -90,7 +89,8 @@ bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)  }  static inline int -bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id) +bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, +		  size_t dsize)  {  	return nla_put_ipaddr4(skb, IPSET_ATTR_IP,  			htonl(map->first_ip + id * map->hosts)); @@ -113,7 +113,7 @@ bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,  	struct bitmap_ip *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct bitmap_ip_adt_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	u32 ip;  	ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); @@ -131,9 +131,9 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],  {  	struct bitmap_ip *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt]; -	u32 ip, ip_to; +	u32 ip = 0, ip_to = 0;  	struct bitmap_ip_adt_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(map); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	int ret = 0;  	if (unlikely(!tb[IPSET_ATTR_IP] || @@ -200,7 +200,7 @@ bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)  	return x->first_ip == y->first_ip &&  	       x->last_ip == y->last_ip &&  	       x->netmask == y->netmask && -	       x->timeout == y->timeout && +	       a->timeout == b->timeout &&  	       a->extensions == b->extensions;  } @@ -209,25 +209,6 @@ bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)  struct bitmap_ip_elem {  }; -/* Timeout variant */ - -struct bitmap_ipt_elem { -	unsigned long timeout; -}; - -/* Plain variant with counter */ - -struct bitmap_ipc_elem { -	struct ip_set_counter counter; -}; - -/* Timeout variant with counter */ - -struct bitmap_ipct_elem { -	unsigned long timeout; -	struct ip_set_counter counter; -}; -  #include "ip_set_bitmap_gen.h"  /* Create bitmap:ip type of sets */ @@ -240,8 +221,8 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,  	map->members = ip_set_alloc(map->memsize);  	if (!map->members)  		return false; -	if (map->dsize) { -		map->extensions = ip_set_alloc(map->dsize * elements); +	if (set->dsize) { +		map->extensions = ip_set_alloc(set->dsize * elements);  		if (!map->extensions) {  			kfree(map->members);  			return false; @@ -252,7 +233,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,  	map->elements = elements;  	map->hosts = hosts;  	map->netmask = netmask; -	map->timeout = IPSET_NO_TIMEOUT; +	set->timeout = IPSET_NO_TIMEOUT;  	set->data = map;  	set->family = NFPROTO_IPV4; @@ -261,10 +242,11 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,  }  static int -bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], +		 u32 
flags)  {  	struct bitmap_ip *map; -	u32 first_ip, last_ip, hosts, cadt_flags = 0; +	u32 first_ip = 0, last_ip = 0, hosts;  	u64 elements;  	u8 netmask = 32;  	int ret; @@ -336,61 +318,15 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)  	map->memsize = bitmap_bytes(0, elements - 1);  	set->variant = &bitmap_ip; -	if (tb[IPSET_ATTR_CADT_FLAGS]) -		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); -	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { -		set->extensions |= IPSET_EXT_COUNTER; -		if (tb[IPSET_ATTR_TIMEOUT]) { -			map->dsize = sizeof(struct bitmap_ipct_elem); -			map->offset[IPSET_OFFSET_TIMEOUT] = -				offsetof(struct bitmap_ipct_elem, timeout); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct bitmap_ipct_elem, counter); - -			if (!init_map_ip(set, map, first_ip, last_ip, -					 elements, hosts, netmask)) { -				kfree(map); -				return -ENOMEM; -			} - -			map->timeout = ip_set_timeout_uget( -				tb[IPSET_ATTR_TIMEOUT]); -			set->extensions |= IPSET_EXT_TIMEOUT; - -			bitmap_ip_gc_init(set, bitmap_ip_gc); -		} else { -			map->dsize = sizeof(struct bitmap_ipc_elem); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct bitmap_ipc_elem, counter); - -			if (!init_map_ip(set, map, first_ip, last_ip, -					 elements, hosts, netmask)) { -				kfree(map); -				return -ENOMEM; -			} -		} -	} else if (tb[IPSET_ATTR_TIMEOUT]) { -		map->dsize = sizeof(struct bitmap_ipt_elem); -		map->offset[IPSET_OFFSET_TIMEOUT] = -			offsetof(struct bitmap_ipt_elem, timeout); - -		if (!init_map_ip(set, map, first_ip, last_ip, -				 elements, hosts, netmask)) { -			kfree(map); -			return -ENOMEM; -		} - -		map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); -		set->extensions |= IPSET_EXT_TIMEOUT; - +	set->dsize = ip_set_elem_len(set, tb, 0); +	if (!init_map_ip(set, map, first_ip, last_ip, +			 elements, hosts, netmask)) { +		kfree(map); +		return -ENOMEM; +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);  		bitmap_ip_gc_init(set, bitmap_ip_gc); -	} else { -		map->dsize = 0; -		if (!init_map_ip(set, map, first_ip, last_ip, -				 elements, hosts, netmask)) { -			kfree(map); -			return -ENOMEM; -		}  	}  	return 0;  } @@ -401,8 +337,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {  	.features	= IPSET_TYPE_IP,  	.dimension	= IPSET_DIM_ONE,  	.family		= NFPROTO_IPV4, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= bitmap_ip_create,  	.create_policy	= {  		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, @@ -420,6 +356,7 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 3b30e0bef89..740eabededd 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -25,12 +25,13 @@  #include <linux/netfilter/ipset/ip_set.h>  #include <linux/netfilter/ipset/ip_set_bitmap.h> -#define REVISION_MIN	0 -#define REVISION_MAX	1	/* Counter support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counter support added */ +#define IPSET_TYPE_REV_MAX	2	/* Comment support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik 
<kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("bitmap:ip,mac", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_bitmap:ip,mac");  #define MTYPE		bitmap_ipmac @@ -48,11 +49,8 @@ struct bitmap_ipmac {  	u32 first_ip;		/* host byte order, included in range */  	u32 last_ip;		/* host byte order, included in range */  	u32 elements;		/* number of max elements in the set */ -	u32 timeout;		/* timeout value */ -	struct timer_list gc;	/* garbage collector */  	size_t memsize;		/* members size */ -	size_t dsize;		/* size of element */ -	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ +	struct timer_list gc;	/* garbage collector */  };  /* ADT structure for generic function args */ @@ -82,13 +80,13 @@ get_elem(void *extensions, u16 id, size_t dsize)  static inline int  bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, -		     const struct bitmap_ipmac *map) +		     const struct bitmap_ipmac *map, size_t dsize)  {  	const struct bitmap_ipmac_elem *elem;  	if (!test_bit(e->id, map->members))  		return 0; -	elem = get_elem(map->extensions, e->id, map->dsize); +	elem = get_elem(map->extensions, e->id, dsize);  	if (elem->filled == MAC_FILLED)  		return e->ether == NULL ||  		       ether_addr_equal(e->ether, elem->ether); @@ -97,13 +95,13 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,  }  static inline int -bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map) +bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)  {  	const struct bitmap_ipmac_elem *elem;  	if (!test_bit(id, map->members))  		return 0; -	elem = get_elem(map->extensions, id, map->dsize); +	elem = get_elem(map->extensions, id, dsize);  	/* Timer not started for the incomplete elements */  	return elem->filled == MAC_FILLED;  } @@ -117,13 +115,13 @@ bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)  static inline int  bitmap_ipmac_add_timeout(unsigned long *timeout,  			 const struct bitmap_ipmac_adt_elem *e, -			 const struct ip_set_ext *ext, +			 const struct ip_set_ext *ext, struct ip_set *set,  			 struct bitmap_ipmac *map, int mode)  {  	u32 t = ext->timeout;  	if (mode == IPSET_ADD_START_STORED_TIMEOUT) { -		if (t == map->timeout) +		if (t == set->timeout)  			/* Timeout was not specified, get stored one */  			t = *timeout;  		ip_set_timeout_set(timeout, t); @@ -142,11 +140,11 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,  static inline int  bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, -		    struct bitmap_ipmac *map, u32 flags) +		    struct bitmap_ipmac *map, u32 flags, size_t dsize)  {  	struct bitmap_ipmac_elem *elem; -	elem = get_elem(map->extensions, e->id, map->dsize); +	elem = get_elem(map->extensions, e->id, dsize);  	if (test_and_set_bit(e->id, map->members)) {  		if (elem->filled == MAC_FILLED) {  			if (e->ether && (flags & IPSET_FLAG_EXIST)) @@ -178,22 +176,12 @@ bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,  	return !test_and_clear_bit(e->id, map->members);  } -static inline unsigned long -ip_set_timeout_stored(struct bitmap_ipmac *map, u32 id, unsigned long *timeout) -{ -	const struct bitmap_ipmac_elem *elem = -		get_elem(map->extensions, id, map->dsize); - -	return elem->filled == MAC_FILLED ? 
ip_set_timeout_get(timeout) : -					    *timeout; -} -  static inline int  bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, -		     u32 id) +		     u32 id, size_t dsize)  {  	const struct bitmap_ipmac_elem *elem = -		get_elem(map->extensions, id, map->dsize); +		get_elem(map->extensions, id, dsize);  	return nla_put_ipaddr4(skb, IPSET_ATTR_IP,  			       htonl(map->first_ip + id)) || @@ -216,7 +204,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,  	struct bitmap_ipmac *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct bitmap_ipmac_adt_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	u32 ip;  	/* MAC can be src only */ @@ -245,8 +233,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct bitmap_ipmac *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct bitmap_ipmac_adt_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(map); -	u32 ip; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0;  	int ret = 0;  	if (unlikely(!tb[IPSET_ATTR_IP] || @@ -285,43 +273,12 @@ bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)  	return x->first_ip == y->first_ip &&  	       x->last_ip == y->last_ip && -	       x->timeout == y->timeout && +	       a->timeout == b->timeout &&  	       a->extensions == b->extensions;  }  /* Plain variant */ -/* Timeout variant */ - -struct bitmap_ipmact_elem { -	struct { -		unsigned char ether[ETH_ALEN]; -		unsigned char filled; -	} __attribute__ ((aligned)); -	unsigned long timeout; -}; - -/* Plain variant with counter */ - -struct bitmap_ipmacc_elem { -	struct { -		unsigned char ether[ETH_ALEN]; -		unsigned char filled; -	} __attribute__ ((aligned)); -	struct ip_set_counter counter; -}; - -/* Timeout variant with counter */ - -struct bitmap_ipmacct_elem { -	struct { -		unsigned char ether[ETH_ALEN]; -		unsigned char filled; -	} __attribute__ ((aligned)); -	unsigned long timeout; -	struct ip_set_counter counter; -}; -  #include "ip_set_bitmap_gen.h"  /* Create bitmap:ip,mac type of sets */ @@ -330,11 +287,11 @@ static bool  init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,  	       u32 first_ip, u32 last_ip, u32 elements)  { -	map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize); +	map->members = ip_set_alloc(map->memsize);  	if (!map->members)  		return false; -	if (map->dsize) { -		map->extensions = ip_set_alloc(map->dsize * elements); +	if (set->dsize) { +		map->extensions = ip_set_alloc(set->dsize * elements);  		if (!map->extensions) {  			kfree(map->members);  			return false; @@ -343,7 +300,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,  	map->first_ip = first_ip;  	map->last_ip = last_ip;  	map->elements = elements; -	map->timeout = IPSET_NO_TIMEOUT; +	set->timeout = IPSET_NO_TIMEOUT;  	set->data = map;  	set->family = NFPROTO_IPV4; @@ -352,10 +309,10 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,  }  static int -bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], +bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],  		    u32 flags)  { -	u32 first_ip, last_ip, cadt_flags = 0; +	u32 first_ip = 0, last_ip = 0;  	u64 elements;  	struct bitmap_ipmac *map;  	int ret; @@ -399,57 +356,15 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],  	map->memsize = bitmap_bytes(0, elements - 1);  	set->variant = &bitmap_ipmac; -	if 
(tb[IPSET_ATTR_CADT_FLAGS]) -		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); -	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { -		set->extensions |= IPSET_EXT_COUNTER; -		if (tb[IPSET_ATTR_TIMEOUT]) { -			map->dsize = sizeof(struct bitmap_ipmacct_elem); -			map->offset[IPSET_OFFSET_TIMEOUT] = -				offsetof(struct bitmap_ipmacct_elem, timeout); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct bitmap_ipmacct_elem, counter); - -			if (!init_map_ipmac(set, map, first_ip, last_ip, -					    elements)) { -				kfree(map); -				return -ENOMEM; -			} -			map->timeout = ip_set_timeout_uget( -				tb[IPSET_ATTR_TIMEOUT]); -			set->extensions |= IPSET_EXT_TIMEOUT; -			bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); -		} else { -			map->dsize = sizeof(struct bitmap_ipmacc_elem); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct bitmap_ipmacc_elem, counter); - -			if (!init_map_ipmac(set, map, first_ip, last_ip, -					    elements)) { -				kfree(map); -				return -ENOMEM; -			} -		} -	} else if (tb[IPSET_ATTR_TIMEOUT]) { -		map->dsize = sizeof(struct bitmap_ipmact_elem); -		map->offset[IPSET_OFFSET_TIMEOUT] = -			offsetof(struct bitmap_ipmact_elem, timeout); - -		if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { -			kfree(map); -			return -ENOMEM; -		} -		map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); -		set->extensions |= IPSET_EXT_TIMEOUT; +	set->dsize = ip_set_elem_len(set, tb, +				     sizeof(struct bitmap_ipmac_elem)); +	if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { +		kfree(map); +		return -ENOMEM; +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);  		bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); -	} else { -		map->dsize = sizeof(struct bitmap_ipmac_elem); - -		if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { -			kfree(map); -			return -ENOMEM; -		} -		set->variant = &bitmap_ipmac;  	}  	return 0;  } @@ -460,8 +375,8 @@ static struct ip_set_type bitmap_ipmac_type = {  	.features	= IPSET_TYPE_IP | IPSET_TYPE_MAC,  	.dimension	= IPSET_DIM_TWO,  	.family		= NFPROTO_IPV4, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= bitmap_ipmac_create,  	.create_policy	= {  		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, @@ -478,6 +393,7 @@ static struct ip_set_type bitmap_ipmac_type = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 8207d1fda52..cf99676e69f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -20,12 +20,13 @@  #include <linux/netfilter/ipset/ip_set_bitmap.h>  #include <linux/netfilter/ipset/ip_set_getport.h> -#define REVISION_MIN	0 -#define REVISION_MAX	1	/* Counter support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counter support added */ +#define IPSET_TYPE_REV_MAX	2	/* Comment support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("bitmap:port", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_bitmap:port");  #define MTYPE		bitmap_port @@ -38,9 +39,6 @@ struct bitmap_port {  	u16 last_port;		
/* host byte order, included in range */  	u32 elements;		/* number of max elements in the set */  	size_t memsize;		/* members size */ -	size_t dsize;		/* extensions struct size */ -	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ -	u32 timeout;		/* timeout parameter */  	struct timer_list gc;	/* garbage collection */  }; @@ -59,20 +57,20 @@ port_to_id(const struct bitmap_port *m, u16 port)  static inline int  bitmap_port_do_test(const struct bitmap_port_adt_elem *e, -		    const struct bitmap_port *map) +		    const struct bitmap_port *map, size_t dsize)  {  	return !!test_bit(e->id, map->members);  }  static inline int -bitmap_port_gc_test(u16 id, const struct bitmap_port *map) +bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize)  {  	return !!test_bit(id, map->members);  }  static inline int  bitmap_port_do_add(const struct bitmap_port_adt_elem *e, -		   struct bitmap_port *map, u32 flags) +		   struct bitmap_port *map, u32 flags, size_t dsize)  {  	return !!test_and_set_bit(e->id, map->members);  } @@ -85,7 +83,8 @@ bitmap_port_do_del(const struct bitmap_port_adt_elem *e,  }  static inline int -bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id) +bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, +		    size_t dsize)  {  	return nla_put_net16(skb, IPSET_ATTR_PORT,  			     htons(map->first_port + id)); @@ -106,7 +105,7 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,  	struct bitmap_port *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct bitmap_port_adt_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	__be16 __port;  	u16 port = 0; @@ -131,7 +130,7 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],  	struct bitmap_port *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct bitmap_port_adt_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(map); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	u32 port;	/* wraparound */  	u16 port_to;  	int ret = 0; @@ -191,7 +190,7 @@ bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)  	return x->first_port == y->first_port &&  	       x->last_port == y->last_port && -	       x->timeout == y->timeout && +	       a->timeout == b->timeout &&  	       a->extensions == b->extensions;  } @@ -200,25 +199,6 @@ bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)  struct bitmap_port_elem {  }; -/* Timeout variant */ - -struct bitmap_portt_elem { -	unsigned long timeout; -}; - -/* Plain variant with counter */ - -struct bitmap_portc_elem { -	struct ip_set_counter counter; -}; - -/* Timeout variant with counter */ - -struct bitmap_portct_elem { -	unsigned long timeout; -	struct ip_set_counter counter; -}; -  #include "ip_set_bitmap_gen.h"  /* Create bitmap:ip type of sets */ @@ -230,8 +210,8 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,  	map->members = ip_set_alloc(map->memsize);  	if (!map->members)  		return false; -	if (map->dsize) { -		map->extensions = ip_set_alloc(map->dsize * map->elements); +	if (set->dsize) { +		map->extensions = ip_set_alloc(set->dsize * map->elements);  		if (!map->extensions) {  			kfree(map->members);  			return false; @@ -239,7 +219,7 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,  	}  	map->first_port = first_port;  	map->last_port = last_port; -	map->timeout = IPSET_NO_TIMEOUT; +	set->timeout = IPSET_NO_TIMEOUT;  	
set->data = map;  	set->family = NFPROTO_UNSPEC; @@ -248,11 +228,11 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,  }  static int -bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], +		   u32 flags)  {  	struct bitmap_port *map;  	u16 first_port, last_port; -	u32 cadt_flags = 0;  	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||  		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -274,55 +254,16 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)  		return -ENOMEM;  	map->elements = last_port - first_port + 1; -	map->memsize = map->elements * sizeof(unsigned long); +	map->memsize = bitmap_bytes(0, map->elements);  	set->variant = &bitmap_port; -	if (tb[IPSET_ATTR_CADT_FLAGS]) -		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); -	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { -		set->extensions |= IPSET_EXT_COUNTER; -		if (tb[IPSET_ATTR_TIMEOUT]) { -			map->dsize = sizeof(struct bitmap_portct_elem); -			map->offset[IPSET_OFFSET_TIMEOUT] = -				offsetof(struct bitmap_portct_elem, timeout); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct bitmap_portct_elem, counter); -			if (!init_map_port(set, map, first_port, last_port)) { -				kfree(map); -				return -ENOMEM; -			} - -			map->timeout = -				ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); -			set->extensions |= IPSET_EXT_TIMEOUT; -			bitmap_port_gc_init(set, bitmap_port_gc); -		} else { -			map->dsize = sizeof(struct bitmap_portc_elem); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct bitmap_portc_elem, counter); -			if (!init_map_port(set, map, first_port, last_port)) { -				kfree(map); -				return -ENOMEM; -			} -		} -	} else if (tb[IPSET_ATTR_TIMEOUT]) { -		map->dsize = sizeof(struct bitmap_portt_elem); -		map->offset[IPSET_OFFSET_TIMEOUT] = -			offsetof(struct bitmap_portt_elem, timeout); -		if (!init_map_port(set, map, first_port, last_port)) { -			kfree(map); -			return -ENOMEM; -		} - -		map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); -		set->extensions |= IPSET_EXT_TIMEOUT; +	set->dsize = ip_set_elem_len(set, tb, 0); +	if (!init_map_port(set, map, first_port, last_port)) { +		kfree(map); +		return -ENOMEM; +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);  		bitmap_port_gc_init(set, bitmap_port_gc); -	} else { -		map->dsize = 0; -		if (!init_map_port(set, map, first_port, last_port)) { -			kfree(map); -			return -ENOMEM; -		} -  	}  	return 0;  } @@ -333,8 +274,8 @@ static struct ip_set_type bitmap_port_type = {  	.features	= IPSET_TYPE_PORT,  	.dimension	= IPSET_DIM_ONE,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= bitmap_port_create,  	.create_policy	= {  		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, @@ -349,6 +290,7 @@ static struct ip_set_type bitmap_port_type = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index f2e30fb31e7..ec8114fae50 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -17,6 +17,8 @@  #include <linux/spinlock.h>  #include <linux/rculist.h>  
#include <net/netlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h>  #include <linux/netfilter.h>  #include <linux/netfilter/x_tables.h> @@ -27,8 +29,17 @@ static LIST_HEAD(ip_set_type_list);		/* all registered set types */  static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */  static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */ -static struct ip_set * __rcu *ip_set_list;	/* all individual sets */ -static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ +struct ip_set_net { +	struct ip_set * __rcu *ip_set_list;	/* all individual sets */ +	ip_set_id_t	ip_set_max;	/* max number of sets */ +	int		is_deleted;	/* deleted by ip_set_net_exit */ +}; +static int ip_set_net_id __read_mostly; + +static inline struct ip_set_net *ip_set_pernet(struct net *net) +{ +	return net_generic(net, ip_set_net_id); +}  #define IP_SET_INC	64  #define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0) @@ -43,10 +54,10 @@ MODULE_DESCRIPTION("core IP set support");  MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);  /* When the nfnl mutex is held: */ -#define nfnl_dereference(p)		\ +#define ip_set_dereference(p)		\  	rcu_dereference_protected(p, 1) -#define nfnl_set(id)			\ -	nfnl_dereference(ip_set_list)[id] +#define ip_set(inst, id)		\ +	ip_set_dereference((inst)->ip_set_list)[id]  /*   * The set types are implemented in modules and registered set types @@ -260,10 +271,7 @@ ip_set_free(void *members)  {  	pr_debug("%p: free with %s\n", members,  		 is_vmalloc_addr(members) ? "vfree" : "kfree"); -	if (is_vmalloc_addr(members)) -		vfree(members); -	else -		kfree(members); +	kvfree(members);  }  EXPORT_SYMBOL_GPL(ip_set_free); @@ -315,6 +323,62 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)  }  EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +typedef void (*destroyer)(void *); +/* ipset data extension types, in size order */ + +const struct ip_set_ext_type ip_set_extensions[] = { +	[IPSET_EXT_ID_COUNTER] = { +		.type	= IPSET_EXT_COUNTER, +		.flag	= IPSET_FLAG_WITH_COUNTERS, +		.len	= sizeof(struct ip_set_counter), +		.align	= __alignof__(struct ip_set_counter), +	}, +	[IPSET_EXT_ID_TIMEOUT] = { +		.type	= IPSET_EXT_TIMEOUT, +		.len	= sizeof(unsigned long), +		.align	= __alignof__(unsigned long), +	}, +	[IPSET_EXT_ID_COMMENT] = { +		.type	 = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, +		.flag	 = IPSET_FLAG_WITH_COMMENT, +		.len	 = sizeof(struct ip_set_comment), +		.align	 = __alignof__(struct ip_set_comment), +		.destroy = (destroyer) ip_set_comment_free, +	}, +}; +EXPORT_SYMBOL_GPL(ip_set_extensions); + +static inline bool +add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) +{ +	return ip_set_extensions[id].flag ? 
+		(flags & ip_set_extensions[id].flag) : +		!!tb[IPSET_ATTR_TIMEOUT]; +} + +size_t +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +{ +	enum ip_set_ext_id id; +	size_t offset = 0; +	u32 cadt_flags = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) +		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +	if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) +		set->flags |= IPSET_CREATE_FLAG_FORCEADD; +	for (id = 0; id < IPSET_EXT_ID_MAX; id++) { +		if (!add_extension(id, cadt_flags, tb)) +			continue; +		offset += ALIGN(len + offset, ip_set_extensions[id].align); +		set->offset[id] = offset; +		set->extensions |= ip_set_extensions[id].type; +		offset += ip_set_extensions[id].len; +	} +	return len + offset; +} +EXPORT_SYMBOL_GPL(ip_set_elem_len); +  int  ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],  		      struct ip_set_ext *ext) @@ -334,6 +398,12 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],  			ext->packets = be64_to_cpu(nla_get_be64(  						   tb[IPSET_ATTR_PACKETS]));  	} +	if (tb[IPSET_ATTR_COMMENT]) { +		if (!(set->extensions & IPSET_EXT_COMMENT)) +			return -IPSET_ERR_COMMENT; +		ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); +	} +  	return 0;  }  EXPORT_SYMBOL_GPL(ip_set_get_extensions); @@ -374,13 +444,14 @@ __ip_set_put(struct ip_set *set)   */  static inline struct ip_set * -ip_set_rcu_get(ip_set_id_t index) +ip_set_rcu_get(struct net *net, ip_set_id_t index)  {  	struct ip_set *set; +	struct ip_set_net *inst = ip_set_pernet(net);  	rcu_read_lock();  	/* ip_set_list itself needs to be protected */ -	set = rcu_dereference(ip_set_list)[index]; +	set = rcu_dereference(inst->ip_set_list)[index];  	rcu_read_unlock();  	return set; @@ -390,7 +461,8 @@ int  ip_set_test(ip_set_id_t index, const struct sk_buff *skb,  	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)  { -	struct ip_set *set = ip_set_rcu_get(index); +	struct ip_set *set = ip_set_rcu_get( +			dev_net(par->in ? par->in : par->out), index);  	int ret = 0;  	BUG_ON(set == NULL); @@ -428,7 +500,8 @@ int  ip_set_add(ip_set_id_t index, const struct sk_buff *skb,  	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)  { -	struct ip_set *set = ip_set_rcu_get(index); +	struct ip_set *set = ip_set_rcu_get( +			dev_net(par->in ? par->in : par->out), index);  	int ret;  	BUG_ON(set == NULL); @@ -436,7 +509,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,  	if (opt->dim < set->type->dimension ||  	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) -		return 0; +		return -IPSET_ERR_TYPE_MISMATCH;  	write_lock_bh(&set->lock);  	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); @@ -450,7 +523,8 @@ int  ip_set_del(ip_set_id_t index, const struct sk_buff *skb,  	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)  { -	struct ip_set *set = ip_set_rcu_get(index); +	struct ip_set *set = ip_set_rcu_get( +			dev_net(par->in ? 
par->in : par->out), index);  	int ret = 0;  	BUG_ON(set == NULL); @@ -458,7 +532,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,  	if (opt->dim < set->type->dimension ||  	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) -		return 0; +		return -IPSET_ERR_TYPE_MISMATCH;  	write_lock_bh(&set->lock);  	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); @@ -474,14 +548,15 @@ EXPORT_SYMBOL_GPL(ip_set_del);   *   */  ip_set_id_t -ip_set_get_byname(const char *name, struct ip_set **set) +ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)  {  	ip_set_id_t i, index = IPSET_INVALID_ID;  	struct ip_set *s; +	struct ip_set_net *inst = ip_set_pernet(net);  	rcu_read_lock(); -	for (i = 0; i < ip_set_max; i++) { -		s = rcu_dereference(ip_set_list)[i]; +	for (i = 0; i < inst->ip_set_max; i++) { +		s = rcu_dereference(inst->ip_set_list)[i];  		if (s != NULL && STREQ(s->name, name)) {  			__ip_set_get(s);  			index = i; @@ -501,17 +576,26 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname);   * to be valid, after calling this function.   *   */ -void -ip_set_put_byindex(ip_set_id_t index) + +static inline void +__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)  {  	struct ip_set *set;  	rcu_read_lock(); -	set = rcu_dereference(ip_set_list)[index]; +	set = rcu_dereference(inst->ip_set_list)[index];  	if (set != NULL)  		__ip_set_put(set);  	rcu_read_unlock();  } + +void +ip_set_put_byindex(struct net *net, ip_set_id_t index) +{ +	struct ip_set_net *inst = ip_set_pernet(net); + +	__ip_set_put_byindex(inst, index); +}  EXPORT_SYMBOL_GPL(ip_set_put_byindex);  /* @@ -522,9 +606,9 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex);   *   */  const char * -ip_set_name_byindex(ip_set_id_t index) +ip_set_name_byindex(struct net *net, ip_set_id_t index)  { -	const struct ip_set *set = ip_set_rcu_get(index); +	const struct ip_set *set = ip_set_rcu_get(net, index);  	BUG_ON(set == NULL);  	BUG_ON(set->ref == 0); @@ -540,48 +624,22 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex);   */  /* - * Find set by name, reference it once. The reference makes sure the - * thing pointed to, does not go away under our feet. - * - * The nfnl mutex is used in the function. - */ -ip_set_id_t -ip_set_nfnl_get(const char *name) -{ -	ip_set_id_t i, index = IPSET_INVALID_ID; -	struct ip_set *s; - -	nfnl_lock(NFNL_SUBSYS_IPSET); -	for (i = 0; i < ip_set_max; i++) { -		s = nfnl_set(i); -		if (s != NULL && STREQ(s->name, name)) { -			__ip_set_get(s); -			index = i; -			break; -		} -	} -	nfnl_unlock(NFNL_SUBSYS_IPSET); - -	return index; -} -EXPORT_SYMBOL_GPL(ip_set_nfnl_get); - -/*   * Find set by index, reference it once. The reference makes sure the   * thing pointed to, does not go away under our feet.   *   * The nfnl mutex is used in the function.   */  ip_set_id_t -ip_set_nfnl_get_byindex(ip_set_id_t index) +ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)  {  	struct ip_set *set; +	struct ip_set_net *inst = ip_set_pernet(net); -	if (index > ip_set_max) +	if (index > inst->ip_set_max)  		return IPSET_INVALID_ID;  	nfnl_lock(NFNL_SUBSYS_IPSET); -	set = nfnl_set(index); +	set = ip_set(inst, index);  	if (set)  		__ip_set_get(set);  	else @@ -600,13 +658,17 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);   * The nfnl mutex is used in the function.   
*/  void -ip_set_nfnl_put(ip_set_id_t index) +ip_set_nfnl_put(struct net *net, ip_set_id_t index)  {  	struct ip_set *set; +	struct ip_set_net *inst = ip_set_pernet(net); +  	nfnl_lock(NFNL_SUBSYS_IPSET); -	set = nfnl_set(index); -	if (set != NULL) -		__ip_set_put(set); +	if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ +		set = ip_set(inst, index); +		if (set != NULL) +			__ip_set_put(set); +	}  	nfnl_unlock(NFNL_SUBSYS_IPSET);  }  EXPORT_SYMBOL_GPL(ip_set_nfnl_put); @@ -664,14 +726,14 @@ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {  };  static struct ip_set * -find_set_and_id(const char *name, ip_set_id_t *id) +find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)  {  	struct ip_set *set = NULL;  	ip_set_id_t i;  	*id = IPSET_INVALID_ID; -	for (i = 0; i < ip_set_max; i++) { -		set = nfnl_set(i); +	for (i = 0; i < inst->ip_set_max; i++) { +		set = ip_set(inst, i);  		if (set != NULL && STREQ(set->name, name)) {  			*id = i;  			break; @@ -681,22 +743,23 @@ find_set_and_id(const char *name, ip_set_id_t *id)  }  static inline struct ip_set * -find_set(const char *name) +find_set(struct ip_set_net *inst, const char *name)  {  	ip_set_id_t id; -	return find_set_and_id(name, &id); +	return find_set_and_id(inst, name, &id);  }  static int -find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) +find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, +	     struct ip_set **set)  {  	struct ip_set *s;  	ip_set_id_t i;  	*index = IPSET_INVALID_ID; -	for (i = 0;  i < ip_set_max; i++) { -		s = nfnl_set(i); +	for (i = 0;  i < inst->ip_set_max; i++) { +		s = ip_set(inst, i);  		if (s == NULL) {  			if (*index == IPSET_INVALID_ID)  				*index = i; @@ -725,6 +788,8 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,  	      const struct nlmsghdr *nlh,  	      const struct nlattr * const attr[])  { +	struct net *net = sock_net(ctnl); +	struct ip_set_net *inst = ip_set_pernet(net);  	struct ip_set *set, *clash = NULL;  	ip_set_id_t index = IPSET_INVALID_ID;  	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; @@ -783,7 +848,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,  		goto put_out;  	} -	ret = set->type->create(set, tb, flags); +	ret = set->type->create(net, set, tb, flags);  	if (ret != 0)  		goto put_out; @@ -794,7 +859,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,  	 * by the nfnl mutex. Find the first free index in ip_set_list  	 * and check clashing.  	 
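find_set_and_id() and find_free_id() walk the per-namespace array under the nfnl mutex, so no RCU read-side lock is needed there. A sketch of the ip_set()/ip_set_dereference() helpers assumed throughout; the constant-true condition mirrors the rcu_dereference_protected(inst->ip_set_list, 1) call on the teardown path below:

	/* access serialized by the nfnl mutex */
	#define ip_set_dereference(p)	rcu_dereference_protected(p, 1)
	#define ip_set(inst, id)	ip_set_dereference((inst)->ip_set_list)[id]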
*/ -	ret = find_free_id(set->name, &index, &clash); +	ret = find_free_id(inst, set->name, &index, &clash);  	if (ret == -EEXIST) {  		/* If this is the same set and requested, ignore error */  		if ((flags & IPSET_FLAG_EXIST) && @@ -807,9 +872,9 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,  		goto cleanup;  	} else if (ret == -IPSET_ERR_MAX_SETS) {  		struct ip_set **list, **tmp; -		ip_set_id_t i = ip_set_max + IP_SET_INC; +		ip_set_id_t i = inst->ip_set_max + IP_SET_INC; -		if (i < ip_set_max || i == IPSET_INVALID_ID) +		if (i < inst->ip_set_max || i == IPSET_INVALID_ID)  			/* Wraparound */  			goto cleanup; @@ -817,14 +882,14 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,  		if (!list)  			goto cleanup;  		/* nfnl mutex is held, both lists are valid */ -		tmp = nfnl_dereference(ip_set_list); -		memcpy(list, tmp, sizeof(struct ip_set *) * ip_set_max); -		rcu_assign_pointer(ip_set_list, list); +		tmp = ip_set_dereference(inst->ip_set_list); +		memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); +		rcu_assign_pointer(inst->ip_set_list, list);  		/* Make sure all current packets have passed through */  		synchronize_net();  		/* Use new list */ -		index = ip_set_max; -		ip_set_max = i; +		index = inst->ip_set_max; +		inst->ip_set_max = i;  		kfree(tmp);  		ret = 0;  	} else if (ret) @@ -834,7 +899,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,  	 * Finally! Add our shiny new set to the list, and be done.  	 */  	pr_debug("create: '%s' created with index %u!\n", set->name, index); -	nfnl_set(index) = set; +	ip_set(inst, index) = set;  	return ret; @@ -857,12 +922,12 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {  };  static void -ip_set_destroy_set(ip_set_id_t index) +ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)  { -	struct ip_set *set = nfnl_set(index); +	struct ip_set *set = ip_set(inst, index);  	pr_debug("set: %s\n",  set->name); -	nfnl_set(index) = NULL; +	ip_set(inst, index) = NULL;  	/* Must call it without holding any lock */  	set->variant->destroy(set); @@ -875,6 +940,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,  	       const struct nlmsghdr *nlh,  	       const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	struct ip_set *s;  	ip_set_id_t i;  	int ret = 0; @@ -894,21 +960,22 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,  	 */  	read_lock_bh(&ip_set_ref_lock);  	if (!attr[IPSET_ATTR_SETNAME]) { -		for (i = 0; i < ip_set_max; i++) { -			s = nfnl_set(i); +		for (i = 0; i < inst->ip_set_max; i++) { +			s = ip_set(inst, i);  			if (s != NULL && s->ref) {  				ret = -IPSET_ERR_BUSY;  				goto out;  			}  		}  		read_unlock_bh(&ip_set_ref_lock); -		for (i = 0; i < ip_set_max; i++) { -			s = nfnl_set(i); +		for (i = 0; i < inst->ip_set_max; i++) { +			s = ip_set(inst, i);  			if (s != NULL) -				ip_set_destroy_set(i); +				ip_set_destroy_set(inst, i);  		}  	} else { -		s = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &i); +		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), +				    &i);  		if (s == NULL) {  			ret = -ENOENT;  			goto out; @@ -918,7 +985,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,  		}  		read_unlock_bh(&ip_set_ref_lock); -		ip_set_destroy_set(i); +		ip_set_destroy_set(inst, i);  	}  	return 0;  out: @@ -943,6 +1010,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,  	     const struct nlmsghdr *nlh,  	     const struct nlattr * const attr[])  { +	struct ip_set_net *inst = 
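The -IPSET_ERR_MAX_SETS branch above grows the array with the standard RCU replace-then-reclaim sequence: publish the larger copy with rcu_assign_pointer(), let in-flight readers drain with synchronize_net(), and only then free the old block. The same pattern in generic form (a sketch, all names hypothetical):

	new = kzalloc(new_len * sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	memcpy(new, old, old_len * sizeof(*old));
	rcu_assign_pointer(shared, new);	/* readers now see the copy */
	synchronize_net();			/* wait out readers of the old array */
	kfree(old);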
ip_set_pernet(sock_net(ctnl));  	struct ip_set *s;  	ip_set_id_t i; @@ -950,13 +1018,13 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,  		return -IPSET_ERR_PROTOCOL;  	if (!attr[IPSET_ATTR_SETNAME]) { -		for (i = 0; i < ip_set_max; i++) { -			s = nfnl_set(i); +		for (i = 0; i < inst->ip_set_max; i++) { +			s = ip_set(inst, i);  			if (s != NULL)  				ip_set_flush_set(s);  		}  	} else { -		s = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); +		s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));  		if (s == NULL)  			return -ENOENT; @@ -982,6 +1050,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,  	      const struct nlmsghdr *nlh,  	      const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	struct ip_set *set, *s;  	const char *name2;  	ip_set_id_t i; @@ -992,7 +1061,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,  		     attr[IPSET_ATTR_SETNAME2] == NULL))  		return -IPSET_ERR_PROTOCOL; -	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));  	if (set == NULL)  		return -ENOENT; @@ -1003,8 +1072,8 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,  	}  	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); -	for (i = 0; i < ip_set_max; i++) { -		s = nfnl_set(i); +	for (i = 0; i < inst->ip_set_max; i++) { +		s = ip_set(inst, i);  		if (s != NULL && STREQ(s->name, name2)) {  			ret = -IPSET_ERR_EXIST_SETNAME2;  			goto out; @@ -1031,6 +1100,7 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,  	    const struct nlmsghdr *nlh,  	    const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	struct ip_set *from, *to;  	ip_set_id_t from_id, to_id;  	char from_name[IPSET_MAXNAMELEN]; @@ -1040,11 +1110,13 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,  		     attr[IPSET_ATTR_SETNAME2] == NULL))  		return -IPSET_ERR_PROTOCOL; -	from = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); +	from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), +			       &from_id);  	if (from == NULL)  		return -ENOENT; -	to = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); +	to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), +			     &to_id);  	if (to == NULL)  		return -IPSET_ERR_EXIST_SETNAME2; @@ -1061,8 +1133,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,  	write_lock_bh(&ip_set_ref_lock);  	swap(from->ref, to->ref); -	nfnl_set(from_id) = to; -	nfnl_set(to_id) = from; +	ip_set(inst, from_id) = to; +	ip_set(inst, to_id) = from;  	write_unlock_bh(&ip_set_ref_lock);  	return 0; @@ -1081,9 +1153,12 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,  static int  ip_set_dump_done(struct netlink_callback *cb)  { -	if (cb->args[2]) { -		pr_debug("release set %s\n", nfnl_set(cb->args[1])->name); -		ip_set_put_byindex((ip_set_id_t) cb->args[1]); +	struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; +	if (cb->args[IPSET_CB_ARG0]) { +		pr_debug("release set %s\n", +			 ip_set(inst, cb->args[IPSET_CB_INDEX])->name); +		__ip_set_put_byindex(inst, +			(ip_set_id_t) cb->args[IPSET_CB_INDEX]);  	}  	return 0;  } @@ -1101,7 +1176,7 @@ dump_attrs(struct nlmsghdr *nlh)  }  static int -dump_init(struct netlink_callback *cb) +dump_init(struct netlink_callback *cb, struct ip_set_net *inst)  {  	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);  	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); @@ -1114,21 +1189,22 @@ dump_init(struct netlink_callback *cb)  	
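ip_set_dump_done() above and dump_init() here index cb->args[] through named slots instead of the bare 0/1/2 used before. The enum assumed by this code, matching the comment in dump_init():

	enum {
		IPSET_CB_NET = 0,	/* net namespace */
		IPSET_CB_DUMP,		/* dump single set/all sets */
		IPSET_CB_INDEX,		/* set index */
		IPSET_CB_ARG0,		/* type specific */
	};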
nla_parse(cda, IPSET_ATTR_CMD_MAX,  		  attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); -	/* cb->args[0] : dump single set/all sets -	 *         [1] : set index -	 *         [..]: type specific +	/* cb->args[IPSET_CB_NET]:	net namespace +	 *         [IPSET_CB_DUMP]:	dump single set/all sets +	 *         [IPSET_CB_INDEX]: 	set index +	 *         [IPSET_CB_ARG0]:	type specific  	 */  	if (cda[IPSET_ATTR_SETNAME]) {  		struct ip_set *set; -		set = find_set_and_id(nla_data(cda[IPSET_ATTR_SETNAME]), +		set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),  				      &index);  		if (set == NULL)  			return -ENOENT;  		dump_type = DUMP_ONE; -		cb->args[1] = index; +		cb->args[IPSET_CB_INDEX] = index;  	} else  		dump_type = DUMP_ALL; @@ -1136,7 +1212,8 @@ dump_init(struct netlink_callback *cb)  		u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);  		dump_type |= (f << 16);  	} -	cb->args[0] = dump_type; +	cb->args[IPSET_CB_NET] = (unsigned long)inst; +	cb->args[IPSET_CB_DUMP] = dump_type;  	return 0;  } @@ -1148,11 +1225,12 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)  	struct ip_set *set = NULL;  	struct nlmsghdr *nlh = NULL;  	unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; +	struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));  	u32 dump_type, dump_flags;  	int ret = 0; -	if (!cb->args[0]) { -		ret = dump_init(cb); +	if (!cb->args[IPSET_CB_DUMP]) { +		ret = dump_init(cb, inst);  		if (ret < 0) {  			nlh = nlmsg_hdr(cb->skb);  			/* We have to create and send the error message @@ -1163,18 +1241,19 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)  		}  	} -	if (cb->args[1] >= ip_set_max) +	if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)  		goto out; -	dump_type = DUMP_TYPE(cb->args[0]); -	dump_flags = DUMP_FLAGS(cb->args[0]); -	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max; +	dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); +	dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); +	max = dump_type == DUMP_ONE ? 
cb->args[IPSET_CB_INDEX] + 1 +				    : inst->ip_set_max;  dump_last: -	pr_debug("args[0]: %u %u args[1]: %ld\n", -		 dump_type, dump_flags, cb->args[1]); -	for (; cb->args[1] < max; cb->args[1]++) { -		index = (ip_set_id_t) cb->args[1]; -		set = nfnl_set(index); +	pr_debug("dump type, flag: %u %u index: %ld\n", +		 dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); +	for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { +		index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; +		set = ip_set(inst, index);  		if (set == NULL) {  			if (dump_type == DUMP_ONE) {  				ret = -ENOENT; @@ -1190,7 +1269,7 @@ dump_last:  		     !!(set->type->features & IPSET_DUMP_LAST)))  			continue;  		pr_debug("List set: %s\n", set->name); -		if (!cb->args[2]) { +		if (!cb->args[IPSET_CB_ARG0]) {  			/* Start listing: make sure set won't be destroyed */  			pr_debug("reference set\n");  			__ip_set_get(set); @@ -1207,7 +1286,7 @@ dump_last:  			goto nla_put_failure;  		if (dump_flags & IPSET_FLAG_LIST_SETNAME)  			goto next_set; -		switch (cb->args[2]) { +		switch (cb->args[IPSET_CB_ARG0]) {  		case 0:  			/* Core header data */  			if (nla_put_string(skb, IPSET_ATTR_TYPENAME, @@ -1227,7 +1306,7 @@ dump_last:  			read_lock_bh(&set->lock);  			ret = set->variant->list(set, skb, cb);  			read_unlock_bh(&set->lock); -			if (!cb->args[2]) +			if (!cb->args[IPSET_CB_ARG0])  				/* Set is done, proceed with next one */  				goto next_set;  			goto release_refcount; @@ -1236,8 +1315,8 @@ dump_last:  	/* If we dump all sets, continue with dumping last ones */  	if (dump_type == DUMP_ALL) {  		dump_type = DUMP_LAST; -		cb->args[0] = dump_type | (dump_flags << 16); -		cb->args[1] = 0; +		cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); +		cb->args[IPSET_CB_INDEX] = 0;  		goto dump_last;  	}  	goto out; @@ -1246,15 +1325,15 @@ nla_put_failure:  	ret = -EFAULT;  next_set:  	if (dump_type == DUMP_ONE) -		cb->args[1] = IPSET_INVALID_ID; +		cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID;  	else -		cb->args[1]++; +		cb->args[IPSET_CB_INDEX]++;  release_refcount:  	/* If there was an error or set is done, release set */ -	if (ret || !cb->args[2]) { -		pr_debug("release set %s\n", nfnl_set(index)->name); -		ip_set_put_byindex(index); -		cb->args[2] = 0; +	if (ret || !cb->args[IPSET_CB_ARG0]) { +		pr_debug("release set %s\n", ip_set(inst, index)->name); +		__ip_set_put_byindex(inst, index); +		cb->args[IPSET_CB_ARG0] = 0;  	}  out:  	if (nlh) { @@ -1356,6 +1435,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,  	    const struct nlmsghdr *nlh,  	    const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	struct ip_set *set;  	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};  	const struct nlattr *nla; @@ -1374,7 +1454,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,  		       attr[IPSET_ATTR_LINENO] == NULL))))  		return -IPSET_ERR_PROTOCOL; -	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));  	if (set == NULL)  		return -ENOENT; @@ -1410,6 +1490,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,  	    const struct nlmsghdr *nlh,  	    const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	struct ip_set *set;  	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};  	const struct nlattr *nla; @@ -1428,7 +1509,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,  		       attr[IPSET_ATTR_LINENO] == NULL))))  		return -IPSET_ERR_PROTOCOL; -	set = 
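A full dump walks every set once and then revisits the IPSET_DUMP_LAST types in a second DUMP_LAST pass; the requested flags ride in the upper 16 bits of the same cb->args slot, as the dump_type | (dump_flags << 16) store shows. Decoding macros consistent with that encoding (a sketch):

	#define DUMP_ALL	1
	#define DUMP_ONE	2
	#define DUMP_LAST	3

	#define DUMP_TYPE(arg)		(((u32)(arg)) & 0x0000FFFF)
	#define DUMP_FLAGS(arg)		(((u32)(arg)) >> 16)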
find_set(nla_data(attr[IPSET_ATTR_SETNAME])); +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));  	if (set == NULL)  		return -ENOENT; @@ -1464,6 +1545,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,  	     const struct nlmsghdr *nlh,  	     const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	struct ip_set *set;  	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};  	int ret = 0; @@ -1474,7 +1556,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,  		     !flag_nested(attr[IPSET_ATTR_DATA])))  		return -IPSET_ERR_PROTOCOL; -	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));  	if (set == NULL)  		return -ENOENT; @@ -1499,6 +1581,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,  	      const struct nlmsghdr *nlh,  	      const struct nlattr * const attr[])  { +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));  	const struct ip_set *set;  	struct sk_buff *skb2;  	struct nlmsghdr *nlh2; @@ -1508,7 +1591,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,  		     attr[IPSET_ATTR_SETNAME] == NULL))  		return -IPSET_ERR_PROTOCOL; -	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));  	if (set == NULL)  		return -ENOENT; @@ -1733,8 +1816,10 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)  	unsigned int *op;  	void *data;  	int copylen = *len, ret = 0; +	struct net *net = sock_net(sk); +	struct ip_set_net *inst = ip_set_pernet(net); -	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	if (optval != SO_IP_SET)  		return -EBADF; @@ -1783,22 +1868,39 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)  		}  		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';  		nfnl_lock(NFNL_SUBSYS_IPSET); -		find_set_and_id(req_get->set.name, &id); +		find_set_and_id(inst, req_get->set.name, &id);  		req_get->set.index = id;  		nfnl_unlock(NFNL_SUBSYS_IPSET);  		goto copy;  	} +	case IP_SET_OP_GET_FNAME: { +		struct ip_set_req_get_set_family *req_get = data; +		ip_set_id_t id; + +		if (*len != sizeof(struct ip_set_req_get_set_family)) { +			ret = -EINVAL; +			goto done; +		} +		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; +		nfnl_lock(NFNL_SUBSYS_IPSET); +		find_set_and_id(inst, req_get->set.name, &id); +		req_get->set.index = id; +		if (id != IPSET_INVALID_ID) +			req_get->family = ip_set(inst, id)->family; +		nfnl_unlock(NFNL_SUBSYS_IPSET); +		goto copy; +	}  	case IP_SET_OP_GET_BYINDEX: {  		struct ip_set_req_get_set *req_get = data;  		struct ip_set *set;  		if (*len != sizeof(struct ip_set_req_get_set) || -		    req_get->set.index >= ip_set_max) { +		    req_get->set.index >= inst->ip_set_max) {  			ret = -EINVAL;  			goto done;  		}  		nfnl_lock(NFNL_SUBSYS_IPSET); -		set = nfnl_set(req_get->set.index); +		set = ip_set(inst, req_get->set.index);  		strncpy(req_get->set.name, set ? 
set->name : "",  			IPSET_MAXNAMELEN);  		nfnl_unlock(NFNL_SUBSYS_IPSET); @@ -1827,49 +1929,81 @@ static struct nf_sockopt_ops so_set __read_mostly = {  	.owner		= THIS_MODULE,  }; -static int __init -ip_set_init(void) +static int __net_init +ip_set_net_init(struct net *net)  { +	struct ip_set_net *inst = ip_set_pernet(net);  	struct ip_set **list; -	int ret; -	if (max_sets) -		ip_set_max = max_sets; -	if (ip_set_max >= IPSET_INVALID_ID) -		ip_set_max = IPSET_INVALID_ID - 1; +	inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX; +	if (inst->ip_set_max >= IPSET_INVALID_ID) +		inst->ip_set_max = IPSET_INVALID_ID - 1; -	list = kzalloc(sizeof(struct ip_set *) * ip_set_max, GFP_KERNEL); +	list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);  	if (!list)  		return -ENOMEM; +	inst->is_deleted = 0; +	rcu_assign_pointer(inst->ip_set_list, list); +	return 0; +} + +static void __net_exit +ip_set_net_exit(struct net *net) +{ +	struct ip_set_net *inst = ip_set_pernet(net); + +	struct ip_set *set = NULL; +	ip_set_id_t i; + +	inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + +	for (i = 0; i < inst->ip_set_max; i++) { +		set = ip_set(inst, i); +		if (set != NULL) +			ip_set_destroy_set(inst, i); +	} +	kfree(rcu_dereference_protected(inst->ip_set_list, 1)); +} + +static struct pernet_operations ip_set_net_ops = { +	.init	= ip_set_net_init, +	.exit   = ip_set_net_exit, +	.id	= &ip_set_net_id, +	.size	= sizeof(struct ip_set_net) +}; -	rcu_assign_pointer(ip_set_list, list); -	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + +static int __init +ip_set_init(void) +{ +	int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);  	if (ret != 0) {  		pr_err("ip_set: cannot register with nfnetlink.\n"); -		kfree(list);  		return ret;  	}  	ret = nf_register_sockopt(&so_set);  	if (ret != 0) {  		pr_err("SO_SET registry failed: %d\n", ret);  		nfnetlink_subsys_unregister(&ip_set_netlink_subsys); -		kfree(list);  		return ret;  	} - -	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL); +	ret = register_pernet_subsys(&ip_set_net_ops); +	if (ret) { +		pr_err("ip_set: cannot register pernet_subsys.\n"); +		nf_unregister_sockopt(&so_set); +		nfnetlink_subsys_unregister(&ip_set_netlink_subsys); +		return ret; +	} +	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);  	return 0;  }  static void __exit  ip_set_fini(void)  { -	struct ip_set **list = rcu_dereference_protected(ip_set_list, 1); - -	/* There can't be any existing set */ +	unregister_pernet_subsys(&ip_set_net_ops);  	nf_unregister_sockopt(&so_set);  	nfnetlink_subsys_unregister(&ip_set_netlink_subsys); -	kfree(list);  	pr_debug("these are the famous last words\n");  } diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index dac156f819a..29fb01ddff9 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -102,9 +102,25 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,  	int protocol = iph->protocol;  	/* See comments at tcp_match in ip_tables.c */ -	if (protocol <= 0 || (ntohs(iph->frag_off) & IP_OFFSET)) +	if (protocol <= 0)  		return false; +	if (ntohs(iph->frag_off) & IP_OFFSET) +		switch (protocol) { +		case IPPROTO_TCP: +		case IPPROTO_SCTP: +		case IPPROTO_UDP: +		case IPPROTO_UDPLITE: +		case IPPROTO_ICMP: +			/* Port info not available for fragment offset > 0 */ +			return false; +		default: +			/* Other protocols don't have ports, +			   so we can match fragments */ +			*proto = protocol; +			return true; +		} +  	return 
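The reworked fragment handling makes ip_set_get_ip4_port() usable for protocols that carry no port data at all: only the port-bearing protocols, plus ICMP (whose type/code get_port() packs into the port slot), have to give up on non-first fragments. The same decision expressed as a stand-alone predicate (a sketch of the switch above):

	static bool frag_matchable(u8 protocol)
	{
		switch (protocol) {
		case IPPROTO_TCP:
		case IPPROTO_SCTP:
		case IPPROTO_UDP:
		case IPPROTO_UDPLITE:
		case IPPROTO_ICMP:
			return false;	/* port/type data sits in the first fragment only */
		default:
			return true;	/* no ports: the protocol number is enough */
		}
	}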
get_port(skb, protocol, protooff, src, port, proto);  }  EXPORT_SYMBOL_GPL(ip_set_get_ip4_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 707bc520d62..61c7fb05280 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -15,8 +15,7 @@  #define rcu_dereference_bh(p)	rcu_dereference(p)  #endif -#define CONCAT(a, b)		a##b -#define TOKEN(a, b)		CONCAT(a, b) +#define rcu_dereference_bh_nfnl(p)	rcu_dereference_bh_check(p, 1)  /* Hashing which uses arrays to resolve clashing. The hash table is resized   * (doubled) when searching becomes too long. @@ -78,10 +77,14 @@ struct htable {  #define hbucket(h, i)		(&((h)->bucket[i])) +#ifndef IPSET_NET_COUNT +#define IPSET_NET_COUNT		1 +#endif +  /* Book-keeping of the prefixes added to the set */  struct net_prefixes { -	u8 cidr;		/* the different cidr values in the set */ -	u32 nets;		/* number of elements per cidr */ +	u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ +	u8 cidr[IPSET_NET_COUNT];  /* the different cidr values in the set */  };  /* Compute the hash table size */ @@ -114,23 +117,6 @@ htable_bits(u32 hashsize)  	return bits;  } -/* Destroy the hashtable part of the set */ -static void -ahash_destroy(struct htable *t) -{ -	struct hbucket *n; -	u32 i; - -	for (i = 0; i < jhash_size(t->htable_bits); i++) { -		n = hbucket(t, i); -		if (n->size) -			/* FIXME: use slab cache */ -			kfree(n->value); -	} - -	ip_set_free(t); -} -  static int  hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)  { @@ -156,30 +142,30 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)  }  #ifdef IP_SET_HASH_WITH_NETS +#if IPSET_NET_COUNT > 1 +#define __CIDR(cidr, i)		(cidr[i]) +#else +#define __CIDR(cidr, i)		(cidr) +#endif  #ifdef IP_SET_HASH_WITH_NETS_PACKED  /* When cidr is packed with nomatch, cidr - 1 is stored in the entry */ -#define CIDR(cidr)		(cidr + 1) +#define CIDR(cidr, i)		(__CIDR(cidr, i) + 1)  #else -#define CIDR(cidr)		(cidr) +#define CIDR(cidr, i)		(__CIDR(cidr, i))  #endif  #define SET_HOST_MASK(family)	(family == AF_INET ? 
32 : 128)  #ifdef IP_SET_HASH_WITH_MULTI -#define NETS_LENGTH(family)	(SET_HOST_MASK(family) + 1) +#define NLEN(family)		(SET_HOST_MASK(family) + 1)  #else -#define NETS_LENGTH(family)	SET_HOST_MASK(family) +#define NLEN(family)		SET_HOST_MASK(family)  #endif  #else -#define NETS_LENGTH(family)	0 +#define NLEN(family)		0  #endif /* IP_SET_HASH_WITH_NETS */ -#define ext_timeout(e, h)	\ -(unsigned long *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_TIMEOUT]) -#define ext_counter(e, h)	\ -(struct ip_set_counter *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_COUNTER]) -  #endif /* _IP_SET_HASH_GEN_H */  /* Family dependent templates */ @@ -194,6 +180,8 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)  #undef mtype_data_next  #undef mtype_elem +#undef mtype_ahash_destroy +#undef mtype_ext_cleanup  #undef mtype_add_cidr  #undef mtype_del_cidr  #undef mtype_ahash_memsize @@ -220,41 +208,43 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)  #undef HKEY -#define mtype_data_equal	TOKEN(MTYPE, _data_equal) +#define mtype_data_equal	IPSET_TOKEN(MTYPE, _data_equal)  #ifdef IP_SET_HASH_WITH_NETS -#define mtype_do_data_match	TOKEN(MTYPE, _do_data_match) +#define mtype_do_data_match	IPSET_TOKEN(MTYPE, _do_data_match)  #else  #define mtype_do_data_match(d)	1  #endif -#define mtype_data_set_flags	TOKEN(MTYPE, _data_set_flags) -#define mtype_data_reset_flags	TOKEN(MTYPE, _data_reset_flags) -#define mtype_data_netmask	TOKEN(MTYPE, _data_netmask) -#define mtype_data_list		TOKEN(MTYPE, _data_list) -#define mtype_data_next		TOKEN(MTYPE, _data_next) -#define mtype_elem		TOKEN(MTYPE, _elem) -#define mtype_add_cidr		TOKEN(MTYPE, _add_cidr) -#define mtype_del_cidr		TOKEN(MTYPE, _del_cidr) -#define mtype_ahash_memsize	TOKEN(MTYPE, _ahash_memsize) -#define mtype_flush		TOKEN(MTYPE, _flush) -#define mtype_destroy		TOKEN(MTYPE, _destroy) -#define mtype_gc_init		TOKEN(MTYPE, _gc_init) -#define mtype_same_set		TOKEN(MTYPE, _same_set) -#define mtype_kadt		TOKEN(MTYPE, _kadt) -#define mtype_uadt		TOKEN(MTYPE, _uadt) +#define mtype_data_set_flags	IPSET_TOKEN(MTYPE, _data_set_flags) +#define mtype_data_reset_elem	IPSET_TOKEN(MTYPE, _data_reset_elem) +#define mtype_data_reset_flags	IPSET_TOKEN(MTYPE, _data_reset_flags) +#define mtype_data_netmask	IPSET_TOKEN(MTYPE, _data_netmask) +#define mtype_data_list		IPSET_TOKEN(MTYPE, _data_list) +#define mtype_data_next		IPSET_TOKEN(MTYPE, _data_next) +#define mtype_elem		IPSET_TOKEN(MTYPE, _elem) +#define mtype_ahash_destroy	IPSET_TOKEN(MTYPE, _ahash_destroy) +#define mtype_ext_cleanup	IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_add_cidr		IPSET_TOKEN(MTYPE, _add_cidr) +#define mtype_del_cidr		IPSET_TOKEN(MTYPE, _del_cidr) +#define mtype_ahash_memsize	IPSET_TOKEN(MTYPE, _ahash_memsize) +#define mtype_flush		IPSET_TOKEN(MTYPE, _flush) +#define mtype_destroy		IPSET_TOKEN(MTYPE, _destroy) +#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_same_set		IPSET_TOKEN(MTYPE, _same_set) +#define mtype_kadt		IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt		IPSET_TOKEN(MTYPE, _uadt)  #define mtype			MTYPE -#define mtype_elem		TOKEN(MTYPE, _elem) -#define mtype_add		TOKEN(MTYPE, _add) -#define mtype_del		TOKEN(MTYPE, _del) -#define mtype_test_cidrs	TOKEN(MTYPE, _test_cidrs) -#define mtype_test		TOKEN(MTYPE, _test) -#define mtype_expire		TOKEN(MTYPE, _expire) -#define mtype_resize		TOKEN(MTYPE, _resize) -#define mtype_head		TOKEN(MTYPE, _head) -#define mtype_list		TOKEN(MTYPE, _list) -#define mtype_gc		TOKEN(MTYPE, _gc) -#define mtype_variant		
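The file-local CONCAT()/TOKEN() glue is gone in favour of IPSET_TOKEN(), shared through ip_set.h so every set type builds its mtype_* identifiers the same way. The expected definition (a sketch; the standard two-level expansion so macro arguments are expanded before pasting):

	#define IPSET_CONCAT(a, b)	a##b
	#define IPSET_TOKEN(a, b)	IPSET_CONCAT(a, b)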
TOKEN(MTYPE, _variant) -#define mtype_data_match	TOKEN(MTYPE, _data_match) +#define mtype_add		IPSET_TOKEN(MTYPE, _add) +#define mtype_del		IPSET_TOKEN(MTYPE, _del) +#define mtype_test_cidrs	IPSET_TOKEN(MTYPE, _test_cidrs) +#define mtype_test		IPSET_TOKEN(MTYPE, _test) +#define mtype_expire		IPSET_TOKEN(MTYPE, _expire) +#define mtype_resize		IPSET_TOKEN(MTYPE, _resize) +#define mtype_head		IPSET_TOKEN(MTYPE, _head) +#define mtype_list		IPSET_TOKEN(MTYPE, _list) +#define mtype_gc		IPSET_TOKEN(MTYPE, _gc) +#define mtype_variant		IPSET_TOKEN(MTYPE, _variant) +#define mtype_data_match	IPSET_TOKEN(MTYPE, _data_match)  #ifndef HKEY_DATALEN  #define HKEY_DATALEN		sizeof(struct mtype_elem) @@ -269,13 +259,13 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)  /* The generic hash structure */  struct htype { -	struct htable *table;	/* the hash table */ +	struct htable __rcu *table; /* the hash table */  	u32 maxelem;		/* max elements in the hash */  	u32 elements;		/* current element (vs timeout) */  	u32 initval;		/* random jhash init value */ -	u32 timeout;		/* timeout value, if enabled */ -	size_t dsize;		/* data struct size */ -	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ +#ifdef IP_SET_HASH_WITH_MARKMASK +	u32 markmask;		/* markmask value for mark mask to store */ +#endif  	struct timer_list gc;	/* garbage collection when timeout enabled */  	struct mtype_elem next; /* temporary storage for uadd */  #ifdef IP_SET_HASH_WITH_MULTI @@ -297,49 +287,49 @@ struct htype {  /* Network cidr size book keeping when the hash stores different   * sized networks */  static void -mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length) +mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)  {  	int i, j;  	/* Add in increasing prefix order, so larger cidr first */ -	for (i = 0, j = -1; i < nets_length && h->nets[i].nets; i++) { +	for (i = 0, j = -1; i < nets_length && h->nets[i].nets[n]; i++) {  		if (j != -1)  			continue; -		else if (h->nets[i].cidr < cidr) +		else if (h->nets[i].cidr[n] < cidr)  			j = i; -		else if (h->nets[i].cidr == cidr) { -			h->nets[i].nets++; +		else if (h->nets[i].cidr[n] == cidr) { +			h->nets[i].nets[n]++;  			return;  		}  	}  	if (j != -1) {  		for (; i > j; i--) { -			h->nets[i].cidr = h->nets[i - 1].cidr; -			h->nets[i].nets = h->nets[i - 1].nets; +			h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; +			h->nets[i].nets[n] = h->nets[i - 1].nets[n];  		}  	} -	h->nets[i].cidr = cidr; -	h->nets[i].nets = 1; +	h->nets[i].cidr[n] = cidr; +	h->nets[i].nets[n] = 1;  }  static void -mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length) +mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)  {  	u8 i, j, net_end = nets_length - 1;  	for (i = 0; i < nets_length; i++) { -	        if (h->nets[i].cidr != cidr) +	        if (h->nets[i].cidr[n] != cidr)  	                continue; -                if (h->nets[i].nets > 1 || i == net_end || -                    h->nets[i + 1].nets == 0) { -                        h->nets[i].nets--; +                if (h->nets[i].nets[n] > 1 || i == net_end || +                    h->nets[i + 1].nets[n] == 0) { +                        h->nets[i].nets[n]--;                          return;                  } -                for (j = i; j < net_end && h->nets[j].nets; j++) { -		        h->nets[j].cidr = h->nets[j + 1].cidr; -		        h->nets[j].nets = h->nets[j + 1].nets; +                for (j = i; j < net_end && h->nets[j].nets[n]; j++) { +		        h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; +		        
h->nets[j].nets[n] = h->nets[j + 1].nets[n];                  } -                h->nets[j].nets = 0; +                h->nets[j].nets[n] = 0;                  return;  	}  } @@ -347,10 +337,10 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length)  /* Calculate the actual memory size of the set data */  static size_t -mtype_ahash_memsize(const struct htype *h, u8 nets_length) +mtype_ahash_memsize(const struct htype *h, const struct htable *t, +		    u8 nets_length, size_t dsize)  {  	u32 i; -	struct htable *t = h->table;  	size_t memsize = sizeof(*h)  			 + sizeof(*t)  #ifdef IP_SET_HASH_WITH_NETS @@ -359,35 +349,70 @@ mtype_ahash_memsize(const struct htype *h, u8 nets_length)  			 + jhash_size(t->htable_bits) * sizeof(struct hbucket);  	for (i = 0; i < jhash_size(t->htable_bits); i++) -		memsize += t->bucket[i].size * h->dsize; +		memsize += t->bucket[i].size * dsize;  	return memsize;  } +/* Get the ith element from the array block n */ +#define ahash_data(n, i, dsize)	\ +	((struct mtype_elem *)((n)->value + ((i) * (dsize)))) + +static void +mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) +{ +	int i; + +	for (i = 0; i < n->pos; i++) +		ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); +} +  /* Flush a hash type of set: destroy all elements */  static void  mtype_flush(struct ip_set *set)  {  	struct htype *h = set->data; -	struct htable *t = h->table; +	struct htable *t;  	struct hbucket *n;  	u32 i; +	t = rcu_dereference_bh_nfnl(h->table);  	for (i = 0; i < jhash_size(t->htable_bits); i++) {  		n = hbucket(t, i);  		if (n->size) { +			if (set->extensions & IPSET_EXT_DESTROY) +				mtype_ext_cleanup(set, n);  			n->size = n->pos = 0;  			/* FIXME: use slab cache */  			kfree(n->value);  		}  	}  #ifdef IP_SET_HASH_WITH_NETS -	memset(h->nets, 0, sizeof(struct net_prefixes) -			   * NETS_LENGTH(set->family)); +	memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));  #endif  	h->elements = 0;  } +/* Destroy the hashtable part of the set */ +static void +mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) +{ +	struct hbucket *n; +	u32 i; + +	for (i = 0; i < jhash_size(t->htable_bits); i++) { +		n = hbucket(t, i); +		if (n->size) { +			if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) +				mtype_ext_cleanup(set, n); +			/* FIXME: use slab cache */ +			kfree(n->value); +		} +	} + +	ip_set_free(t); +} +  /* Destroy a hash type of set */  static void  mtype_destroy(struct ip_set *set) @@ -397,7 +422,7 @@ mtype_destroy(struct ip_set *set)  	if (set->extensions & IPSET_EXT_TIMEOUT)  		del_timer_sync(&h->gc); -	ahash_destroy(h->table); +	mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true);  #ifdef IP_SET_HASH_WITH_RBTREE  	rbtree_destroy(&h->rbtree);  #endif @@ -414,10 +439,10 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))  	init_timer(&h->gc);  	h->gc.data = (unsigned long) set;  	h->gc.function = gc; -	h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ; +	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;  	add_timer(&h->gc);  	pr_debug("gc initialized, run in every %u\n", -		 IPSET_GC_PERIOD(h->timeout)); +		 IPSET_GC_PERIOD(set->timeout));  }  static bool @@ -428,37 +453,43 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)  	/* Resizing changes htable_bits, so we ignore it */  	return x->maxelem == y->maxelem && -	       x->timeout == y->timeout && +	       a->timeout == b->timeout &&  #ifdef IP_SET_HASH_WITH_NETMASK  	       x->netmask == y->netmask &&  #endif 
+#ifdef IP_SET_HASH_WITH_MARKMASK +	       x->markmask == y->markmask && +#endif  	       a->extensions == b->extensions;  } -/* Get the ith element from the array block n */ -#define ahash_data(n, i, dsize)	\ -	((struct mtype_elem *)((n)->value + ((i) * (dsize)))) -  /* Delete expired elements from the hashtable */  static void -mtype_expire(struct htype *h, u8 nets_length, size_t dsize) +mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)  { -	struct htable *t = h->table; +	struct htable *t;  	struct hbucket *n;  	struct mtype_elem *data;  	u32 i;  	int j; +#ifdef IP_SET_HASH_WITH_NETS +	u8 k; +#endif +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table);  	for (i = 0; i < jhash_size(t->htable_bits); i++) {  		n = hbucket(t, i);  		for (j = 0; j < n->pos; j++) {  			data = ahash_data(n, j, dsize); -			if (ip_set_timeout_expired(ext_timeout(data, h))) { +			if (ip_set_timeout_expired(ext_timeout(data, set))) {  				pr_debug("expired %u/%u\n", i, j);  #ifdef IP_SET_HASH_WITH_NETS -				mtype_del_cidr(h, CIDR(data->cidr), -					       nets_length); +				for (k = 0; k < IPSET_NET_COUNT; k++) +					mtype_del_cidr(h, CIDR(data->cidr, k), +						       nets_length, k);  #endif +				ip_set_ext_destroy(set, data);  				if (j != n->pos - 1)  					/* Not last one */  					memcpy(data, @@ -481,6 +512,7 @@ mtype_expire(struct htype *h, u8 nets_length, size_t dsize)  			n->value = tmp;  		}  	} +	rcu_read_unlock_bh();  }  static void @@ -491,10 +523,10 @@ mtype_gc(unsigned long ul_set)  	pr_debug("called\n");  	write_lock_bh(&set->lock); -	mtype_expire(h, NETS_LENGTH(set->family), h->dsize); +	mtype_expire(set, h, NLEN(set->family), set->dsize);  	write_unlock_bh(&set->lock); -	h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ; +	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;  	add_timer(&h->gc);  } @@ -505,7 +537,7 @@ static int  mtype_resize(struct ip_set *set, bool retried)  {  	struct htype *h = set->data; -	struct htable *t, *orig = h->table; +	struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table);  	u8 htable_bits = orig->htable_bits;  #ifdef IP_SET_HASH_WITH_NETS  	u8 flags; @@ -520,8 +552,7 @@ mtype_resize(struct ip_set *set, bool retried)  	if (SET_WITH_TIMEOUT(set) && !retried) {  		i = h->elements;  		write_lock_bh(&set->lock); -		mtype_expire(set->data, NETS_LENGTH(set->family), -			     h->dsize); +		mtype_expire(set, set->data, NLEN(set->family), set->dsize);  		write_unlock_bh(&set->lock);  		if (h->elements < i)  			return 0; @@ -548,25 +579,25 @@ retry:  	for (i = 0; i < jhash_size(orig->htable_bits); i++) {  		n = hbucket(orig, i);  		for (j = 0; j < n->pos; j++) { -			data = ahash_data(n, j, h->dsize); +			data = ahash_data(n, j, set->dsize);  #ifdef IP_SET_HASH_WITH_NETS  			flags = 0;  			mtype_data_reset_flags(data, &flags);  #endif  			m = hbucket(t, HKEY(data, h->initval, htable_bits)); -			ret = hbucket_elem_add(m, AHASH_MAX(h), h->dsize); +			ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize);  			if (ret < 0) {  #ifdef IP_SET_HASH_WITH_NETS  				mtype_data_reset_flags(data, &flags);  #endif  				read_unlock_bh(&set->lock); -				ahash_destroy(t); +				mtype_ahash_destroy(set, t, false);  				if (ret == -EAGAIN)  					goto retry;  				return ret;  			} -			d = ahash_data(m, m->pos++, h->dsize); -			memcpy(d, data, h->dsize); +			d = ahash_data(m, m->pos++, set->dsize); +			memcpy(d, data, set->dsize);  #ifdef IP_SET_HASH_WITH_NETS  			mtype_data_reset_flags(d, &flags);  #endif @@ -581,7 +612,7 @@ retry:  	
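Across these hunks the per-type fields h->timeout, h->dsize and h->offset[] move into struct ip_set itself (set->timeout, set->dsize, set->offset[]), so the extension accessors take the set instead of the private hash data; this is also what lets the per-combination element structs (hash_ip4t_elem, hash_ip4c_elem, hash_ip4ct_elem and their IPv6 twins) be deleted further down. Sketched against the enum ip_set_ext_id ids that ip_set_elem_len() iterates, assuming the obvious member names next to IPSET_EXT_ID_MAX:

	#define ext_timeout(e, s)	\
		((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT]))
	#define ext_counter(e, s)	\
		((struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]))
	#define ext_comment(e, s)	\
		((struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]))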
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,  		 orig->htable_bits, orig, t->htable_bits, t); -	ahash_destroy(orig); +	mtype_ahash_destroy(set, orig, false);  	return 0;  } @@ -602,9 +633,21 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	bool flag_exist = flags & IPSET_FLAG_EXIST;  	u32 key, multi = 0; +	if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) { +		rcu_read_lock_bh(); +		t = rcu_dereference_bh(h->table); +		key = HKEY(value, h->initval, t->htable_bits); +		n = hbucket(t,key); +		if (n->pos) { +			/* Choosing the first entry in the array to replace */ +			j = 0; +			goto reuse_slot; +		} +		rcu_read_unlock_bh(); +	}  	if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)  		/* FIXME: when set is full, we slow down here */ -		mtype_expire(h, NETS_LENGTH(set->family), h->dsize); +		mtype_expire(set, h, NLEN(set->family), set->dsize);  	if (h->elements >= h->maxelem) {  		if (net_ratelimit()) @@ -618,11 +661,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	key = HKEY(value, h->initval, t->htable_bits);  	n = hbucket(t, key);  	for (i = 0; i < n->pos; i++) { -		data = ahash_data(n, i, h->dsize); +		data = ahash_data(n, i, set->dsize);  		if (mtype_data_equal(data, d, &multi)) {  			if (flag_exist ||  			    (SET_WITH_TIMEOUT(set) && -			     ip_set_timeout_expired(ext_timeout(data, h)))) { +			     ip_set_timeout_expired(ext_timeout(data, set)))) {  				/* Just the extensions could be overwritten */  				j = i;  				goto reuse_slot; @@ -633,30 +676,37 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,  		}  		/* Reuse first timed out entry */  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(data, h)) && +		    ip_set_timeout_expired(ext_timeout(data, set)) &&  		    j != AHASH_MAX(h) + 1)  			j = i;  	}  reuse_slot:  	if (j != AHASH_MAX(h) + 1) {  		/* Fill out reused slot */ -		data = ahash_data(n, j, h->dsize); +		data = ahash_data(n, j, set->dsize);  #ifdef IP_SET_HASH_WITH_NETS -		mtype_del_cidr(h, CIDR(data->cidr), NETS_LENGTH(set->family)); -		mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +		for (i = 0; i < IPSET_NET_COUNT; i++) { +			mtype_del_cidr(h, CIDR(data->cidr, i), +				       NLEN(set->family), i); +			mtype_add_cidr(h, CIDR(d->cidr, i), +				       NLEN(set->family), i); +		}  #endif +		ip_set_ext_destroy(set, data);  	} else {  		/* Use/create a new slot */  		TUNE_AHASH_MAX(h, multi); -		ret = hbucket_elem_add(n, AHASH_MAX(h), h->dsize); +		ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize);  		if (ret != 0) {  			if (ret == -EAGAIN)  				mtype_data_next(&h->next, d);  			goto out;  		} -		data = ahash_data(n, n->pos++, h->dsize); +		data = ahash_data(n, n->pos++, set->dsize);  #ifdef IP_SET_HASH_WITH_NETS -		mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +		for (i = 0; i < IPSET_NET_COUNT; i++) +			mtype_add_cidr(h, CIDR(d->cidr, i), NLEN(set->family), +				       i);  #endif  		h->elements++;  	} @@ -665,9 +715,11 @@ reuse_slot:  	mtype_data_set_flags(data, flags);  #endif  	if (SET_WITH_TIMEOUT(set)) -		ip_set_timeout_set(ext_timeout(data, h), ext->timeout); +		ip_set_timeout_set(ext_timeout(data, set), ext->timeout);  	if (SET_WITH_COUNTER(set)) -		ip_set_init_counter(ext_counter(data, h), ext); +		ip_set_init_counter(ext_counter(data, set), ext); +	if (SET_WITH_COMMENT(set)) +		ip_set_init_comment(ext_comment(data, set), ext);  out:  	rcu_read_unlock_bh(); @@ -682,47 +734,60 @@ mtype_del(struct 
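The new forceadd path in mtype_add() only triggers once the set is full: instead of refusing the add, the incoming element overwrites the first entry of the bucket it hashes into. From userspace the behaviour is opted into at create time, for example (assuming an ipset binary recent enough to know the forceadd revisions):

	ipset create foo hash:ip maxelem 1024 forceadd
	# once 1024 entries exist, each further add evicts an entry
	# from the target bucket instead of failing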
ip_set *set, void *value, const struct ip_set_ext *ext,  	  struct ip_set_ext *mext, u32 flags)  {  	struct htype *h = set->data; -	struct htable *t = h->table; +	struct htable *t;  	const struct mtype_elem *d = value;  	struct mtype_elem *data;  	struct hbucket *n; -	int i; +	int i, ret = -IPSET_ERR_EXIST; +#ifdef IP_SET_HASH_WITH_NETS +	u8 j; +#endif  	u32 key, multi = 0; +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table);  	key = HKEY(value, h->initval, t->htable_bits);  	n = hbucket(t, key);  	for (i = 0; i < n->pos; i++) { -		data = ahash_data(n, i, h->dsize); +		data = ahash_data(n, i, set->dsize);  		if (!mtype_data_equal(data, d, &multi))  			continue;  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(data, h))) -			return -IPSET_ERR_EXIST; +		    ip_set_timeout_expired(ext_timeout(data, set))) +			goto out;  		if (i != n->pos - 1)  			/* Not last one */ -			memcpy(data, ahash_data(n, n->pos - 1, h->dsize), -			       h->dsize); +			memcpy(data, ahash_data(n, n->pos - 1, set->dsize), +			       set->dsize);  		n->pos--;  		h->elements--;  #ifdef IP_SET_HASH_WITH_NETS -		mtype_del_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +		for (j = 0; j < IPSET_NET_COUNT; j++) +			mtype_del_cidr(h, CIDR(d->cidr, j), NLEN(set->family), +				       j);  #endif +		ip_set_ext_destroy(set, data);  		if (n->pos + AHASH_INIT_SIZE < n->size) {  			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) -					    * h->dsize, +					    * set->dsize,  					    GFP_ATOMIC); -			if (!tmp) -				return 0; +			if (!tmp) { +				ret = 0; +				goto out; +			}  			n->size -= AHASH_INIT_SIZE; -			memcpy(tmp, n->value, n->size * h->dsize); +			memcpy(tmp, n->value, n->size * set->dsize);  			kfree(n->value);  			n->value = tmp;  		} -		return 0; +		ret = 0; +		goto out;  	} -	return -IPSET_ERR_EXIST; +out: +	rcu_read_unlock_bh(); +	return ret;  }  static inline int @@ -730,8 +795,7 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,  		 struct ip_set_ext *mext, struct ip_set *set, u32 flags)  {  	if (SET_WITH_COUNTER(set)) -		ip_set_update_counter(ext_counter(data, -						  (struct htype *)(set->data)), +		ip_set_update_counter(ext_counter(data, set),  				      ext, mext, flags);  	return mtype_do_data_match(data);  } @@ -745,25 +809,38 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,  		 struct ip_set_ext *mext, u32 flags)  {  	struct htype *h = set->data; -	struct htable *t = h->table; +	struct htable *t = rcu_dereference_bh(h->table);  	struct hbucket *n;  	struct mtype_elem *data; +#if IPSET_NET_COUNT == 2 +	struct mtype_elem orig = *d; +	int i, j = 0, k; +#else  	int i, j = 0; +#endif  	u32 key, multi = 0; -	u8 nets_length = NETS_LENGTH(set->family); +	u8 nets_length = NLEN(set->family);  	pr_debug("test by nets\n"); -	for (; j < nets_length && h->nets[j].nets && !multi; j++) { -		mtype_data_netmask(d, h->nets[j].cidr); +	for (; j < nets_length && h->nets[j].nets[0] && !multi; j++) { +#if IPSET_NET_COUNT == 2 +		mtype_data_reset_elem(d, &orig); +		mtype_data_netmask(d, h->nets[j].cidr[0], false); +		for (k = 0; k < nets_length && h->nets[k].nets[1] && !multi; +		     k++) { +			mtype_data_netmask(d, h->nets[k].cidr[1], true); +#else +		mtype_data_netmask(d, h->nets[j].cidr[0]); +#endif  		key = HKEY(d, h->initval, t->htable_bits);  		n = hbucket(t, key);  		for (i = 0; i < n->pos; i++) { -			data = ahash_data(n, i, h->dsize); +			data = ahash_data(n, i, set->dsize);  			if (!mtype_data_equal(data, d, &multi))  				continue;  			if 
(SET_WITH_TIMEOUT(set)) {  				if (!ip_set_timeout_expired( -							ext_timeout(data, h))) +						ext_timeout(data, set)))  					return mtype_data_match(data, ext,  								mext, set,  								flags); @@ -774,6 +851,9 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,  				return mtype_data_match(data, ext,  							mext, set, flags);  		} +#if IPSET_NET_COUNT == 2 +		} +#endif  	}  	return 0;  } @@ -785,30 +865,41 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	   struct ip_set_ext *mext, u32 flags)  {  	struct htype *h = set->data; -	struct htable *t = h->table; +	struct htable *t;  	struct mtype_elem *d = value;  	struct hbucket *n;  	struct mtype_elem *data; -	int i; +	int i, ret = 0;  	u32 key, multi = 0; +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table);  #ifdef IP_SET_HASH_WITH_NETS  	/* If we test an IP address and not a network address,  	 * try all possible network sizes */ -	if (CIDR(d->cidr) == SET_HOST_MASK(set->family)) -		return mtype_test_cidrs(set, d, ext, mext, flags); +	for (i = 0; i < IPSET_NET_COUNT; i++) +		if (CIDR(d->cidr, i) != SET_HOST_MASK(set->family)) +			break; +	if (i == IPSET_NET_COUNT) { +		ret = mtype_test_cidrs(set, d, ext, mext, flags); +		goto out; +	}  #endif  	key = HKEY(d, h->initval, t->htable_bits);  	n = hbucket(t, key);  	for (i = 0; i < n->pos; i++) { -		data = ahash_data(n, i, h->dsize); +		data = ahash_data(n, i, set->dsize);  		if (mtype_data_equal(data, d, &multi) &&  		    !(SET_WITH_TIMEOUT(set) && -		      ip_set_timeout_expired(ext_timeout(data, h)))) -			return mtype_data_match(data, ext, mext, set, flags); +		      ip_set_timeout_expired(ext_timeout(data, set)))) { +			ret = mtype_data_match(data, ext, mext, set, flags); +			goto out; +		}  	} -	return 0; +out: +	rcu_read_unlock_bh(); +	return ret;  }  /* Reply a HEADER request: fill out the header part of the set */ @@ -816,18 +907,18 @@ static int  mtype_head(struct ip_set *set, struct sk_buff *skb)  {  	const struct htype *h = set->data; +	const struct htable *t;  	struct nlattr *nested;  	size_t memsize; -	read_lock_bh(&set->lock); -	memsize = mtype_ahash_memsize(h, NETS_LENGTH(set->family)); -	read_unlock_bh(&set->lock); +	t = rcu_dereference_bh_nfnl(h->table); +	memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);  	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);  	if (!nested)  		goto nla_put_failure;  	if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, -			  htonl(jhash_size(h->table->htable_bits))) || +			  htonl(jhash_size(t->htable_bits))) ||  	    nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))  		goto nla_put_failure;  #ifdef IP_SET_HASH_WITH_NETMASK @@ -835,13 +926,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)  	    nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))  		goto nla_put_failure;  #endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) +		goto nla_put_failure; +#endif  	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || -	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || -	    ((set->extensions & IPSET_EXT_TIMEOUT) && -	     nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout))) || -	    ((set->extensions & IPSET_EXT_COUNTER) && -	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, -			   htonl(IPSET_FLAG_WITH_COUNTERS)))) +	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) +		goto nla_put_failure; +	if (unlikely(ip_set_put_flags(skb, set)))  		goto nla_put_failure;  	ipset_nest_end(skb, nested); @@ 
-856,11 +948,11 @@ mtype_list(const struct ip_set *set,  	   struct sk_buff *skb, struct netlink_callback *cb)  {  	const struct htype *h = set->data; -	const struct htable *t = h->table; +	const struct htable *t = rcu_dereference_bh_nfnl(h->table);  	struct nlattr *atd, *nested;  	const struct hbucket *n;  	const struct mtype_elem *e; -	u32 first = cb->args[2]; +	u32 first = cb->args[IPSET_CB_ARG0];  	/* We assume that one hash bucket fills into one page */  	void *incomplete;  	int i; @@ -869,20 +961,22 @@ mtype_list(const struct ip_set *set,  	if (!atd)  		return -EMSGSIZE;  	pr_debug("list hash set %s\n", set->name); -	for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) { +	for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); +	     cb->args[IPSET_CB_ARG0]++) {  		incomplete = skb_tail_pointer(skb); -		n = hbucket(t, cb->args[2]); -		pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n); +		n = hbucket(t, cb->args[IPSET_CB_ARG0]); +		pr_debug("cb->arg bucket: %lu, t %p n %p\n", +			 cb->args[IPSET_CB_ARG0], t, n);  		for (i = 0; i < n->pos; i++) { -			e = ahash_data(n, i, h->dsize); +			e = ahash_data(n, i, set->dsize);  			if (SET_WITH_TIMEOUT(set) && -			    ip_set_timeout_expired(ext_timeout(e, h))) +			    ip_set_timeout_expired(ext_timeout(e, set)))  				continue;  			pr_debug("list hash %lu hbucket %p i %u, data %p\n", -				 cb->args[2], n, i, e); +				 cb->args[IPSET_CB_ARG0], n, i, e);  			nested = ipset_nest_start(skb, IPSET_ATTR_DATA);  			if (!nested) { -				if (cb->args[2] == first) { +				if (cb->args[IPSET_CB_ARG0] == first) {  					nla_nest_cancel(skb, atd);  					return -EMSGSIZE;  				} else @@ -890,43 +984,37 @@ mtype_list(const struct ip_set *set,  			}  			if (mtype_data_list(skb, e))  				goto nla_put_failure; -			if (SET_WITH_TIMEOUT(set) && -			    nla_put_net32(skb, IPSET_ATTR_TIMEOUT, -					  htonl(ip_set_timeout_get( -						ext_timeout(e, h))))) -				goto nla_put_failure; -			if (SET_WITH_COUNTER(set) && -			    ip_set_put_counter(skb, ext_counter(e, h))) +			if (ip_set_put_extensions(skb, set, e, true))  				goto nla_put_failure;  			ipset_nest_end(skb, nested);  		}  	}  	ipset_nest_end(skb, atd);  	/* Set listing finished */ -	cb->args[2] = 0; +	cb->args[IPSET_CB_ARG0] = 0;  	return 0;  nla_put_failure:  	nlmsg_trim(skb, incomplete); -	ipset_nest_end(skb, atd); -	if (unlikely(first == cb->args[2])) { +	if (unlikely(first == cb->args[IPSET_CB_ARG0])) {  		pr_warning("Can't list set %s: one bucket does not fit into "  			   "a message. 
Please report it!\n", set->name); -		cb->args[2] = 0; +		cb->args[IPSET_CB_ARG0] = 0;  		return -EMSGSIZE;  	} +	ipset_nest_end(skb, atd);  	return 0;  }  static int -TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, -	      const struct xt_action_param *par, -	      enum ipset_adt adt, struct ip_set_adt_opt *opt); +IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, +	    const struct xt_action_param *par, +	    enum ipset_adt adt, struct ip_set_adt_opt *opt);  static int -TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], -	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); +IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], +	    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);  static const struct ip_set_type_variant mtype_variant = {  	.kadt	= mtype_kadt, @@ -946,19 +1034,27 @@ static const struct ip_set_type_variant mtype_variant = {  #ifdef IP_SET_EMIT_CREATE  static int -TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags) +IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, +			    struct nlattr *tb[], u32 flags)  {  	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; -	u32 cadt_flags = 0; +#ifdef IP_SET_HASH_WITH_MARKMASK +	u32 markmask; +#endif  	u8 hbits;  #ifdef IP_SET_HASH_WITH_NETMASK  	u8 netmask;  #endif  	size_t hsize;  	struct HTYPE *h; +	struct htable *t;  	if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))  		return -IPSET_ERR_INVALID_FAMILY; + +#ifdef IP_SET_HASH_WITH_MARKMASK +	markmask = 0xffffffff; +#endif  #ifdef IP_SET_HASH_WITH_NETMASK  	netmask = set->family == NFPROTO_IPV4 ? 32 : 128;  	pr_debug("Create set %s with family %s\n", @@ -967,6 +1063,9 @@ TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)  	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||  		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || +#ifdef IP_SET_HASH_WITH_MARKMASK +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || +#endif  		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||  		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))  		return -IPSET_ERR_PROTOCOL; @@ -990,6 +1089,14 @@ TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)  			return -IPSET_ERR_INVALID_NETMASK;  	}  #endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	if (tb[IPSET_ATTR_MARKMASK]) { +		markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + +		if ((markmask > 4294967295u) || markmask == 0) +			return -IPSET_ERR_INVALID_MARKMASK; +	} +#endif  	hsize = sizeof(*h);  #ifdef IP_SET_HASH_WITH_NETS @@ -1004,8 +1111,11 @@ TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)  #ifdef IP_SET_HASH_WITH_NETMASK  	h->netmask = netmask;  #endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	h->markmask = markmask; +#endif  	get_random_bytes(&h->initval, sizeof(h->initval)); -	h->timeout = IPSET_NO_TIMEOUT; +	set->timeout = IPSET_NO_TIMEOUT;  	hbits = htable_bits(hashsize);  	hsize = htable_size(hbits); @@ -1013,91 +1123,37 @@ TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)  		kfree(h);  		return -ENOMEM;  	} -	h->table = ip_set_alloc(hsize); -	if (!h->table) { +	t = ip_set_alloc(hsize); +	if (!t) {  		kfree(h);  		return -ENOMEM;  	} -	h->table->htable_bits = hbits; +	t->htable_bits = hbits; +	rcu_assign_pointer(h->table, t);  	set->data = h; -	if (set->family ==  NFPROTO_IPV4) -		set->variant = &TOKEN(HTYPE, 4_variant); -	else -		set->variant = 
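One nit in the markmask validation above: markmask is a u32, so the (markmask > 4294967295u) half of the test can never be true and only the zero check actually rejects anything. A tightened sketch of the same validation:

	if (tb[IPSET_ATTR_MARKMASK]) {
		markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
		if (markmask == 0)	/* a u32 cannot exceed 0xffffffff */
			return -IPSET_ERR_INVALID_MARKMASK;
	}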
&TOKEN(HTYPE, 6_variant); - -	if (tb[IPSET_ATTR_CADT_FLAGS]) -		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); -	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { -		set->extensions |= IPSET_EXT_COUNTER; -		if (tb[IPSET_ATTR_TIMEOUT]) { -			h->timeout = -				ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); -			set->extensions |= IPSET_EXT_TIMEOUT; -			if (set->family == NFPROTO_IPV4) { -				h->dsize = -					sizeof(struct TOKEN(HTYPE, 4ct_elem)); -				h->offset[IPSET_OFFSET_TIMEOUT] = -					offsetof(struct TOKEN(HTYPE, 4ct_elem), -						 timeout); -				h->offset[IPSET_OFFSET_COUNTER] = -					offsetof(struct TOKEN(HTYPE, 4ct_elem), -						 counter); -				TOKEN(HTYPE, 4_gc_init)(set, -					TOKEN(HTYPE, 4_gc)); -			} else { -				h->dsize = -					sizeof(struct TOKEN(HTYPE, 6ct_elem)); -				h->offset[IPSET_OFFSET_TIMEOUT] = -					offsetof(struct TOKEN(HTYPE, 6ct_elem), -						 timeout); -				h->offset[IPSET_OFFSET_COUNTER] = -					offsetof(struct TOKEN(HTYPE, 6ct_elem), -						 counter); -				TOKEN(HTYPE, 6_gc_init)(set, -					TOKEN(HTYPE, 6_gc)); -			} -		} else { -			if (set->family == NFPROTO_IPV4) { -				h->dsize = -					sizeof(struct TOKEN(HTYPE, 4c_elem)); -				h->offset[IPSET_OFFSET_COUNTER] = -					offsetof(struct TOKEN(HTYPE, 4c_elem), -						 counter); -			} else { -				h->dsize = -					sizeof(struct TOKEN(HTYPE, 6c_elem)); -				h->offset[IPSET_OFFSET_COUNTER] = -					offsetof(struct TOKEN(HTYPE, 6c_elem), -						 counter); -			} -		} -	} else if (tb[IPSET_ATTR_TIMEOUT]) { -		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); -		set->extensions |= IPSET_EXT_TIMEOUT; -		if (set->family == NFPROTO_IPV4) { -			h->dsize = sizeof(struct TOKEN(HTYPE, 4t_elem)); -			h->offset[IPSET_OFFSET_TIMEOUT] = -				offsetof(struct TOKEN(HTYPE, 4t_elem), -					 timeout); -			TOKEN(HTYPE, 4_gc_init)(set, TOKEN(HTYPE, 4_gc)); -		} else { -			h->dsize = sizeof(struct TOKEN(HTYPE, 6t_elem)); -			h->offset[IPSET_OFFSET_TIMEOUT] = -				offsetof(struct TOKEN(HTYPE, 6t_elem), -					 timeout); -			TOKEN(HTYPE, 6_gc_init)(set, TOKEN(HTYPE, 6_gc)); -		} +	if (set->family == NFPROTO_IPV4) { +		set->variant = &IPSET_TOKEN(HTYPE, 4_variant); +		set->dsize = ip_set_elem_len(set, tb, +				sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));  	} else { +		set->variant = &IPSET_TOKEN(HTYPE, 6_variant); +		set->dsize = ip_set_elem_len(set, tb, +				sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);  		if (set->family == NFPROTO_IPV4) -			h->dsize = sizeof(struct TOKEN(HTYPE, 4_elem)); +			IPSET_TOKEN(HTYPE, 4_gc_init)(set, +				IPSET_TOKEN(HTYPE, 4_gc));  		else -			h->dsize = sizeof(struct TOKEN(HTYPE, 6_elem)); +			IPSET_TOKEN(HTYPE, 6_gc_init)(set, +				IPSET_TOKEN(HTYPE, 6_gc));  	}  	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", -		 set->name, jhash_size(h->table->htable_bits), -		 h->table->htable_bits, h->maxelem, set->data, h->table); +		 set->name, jhash_size(t->htable_bits), +		 t->htable_bits, h->maxelem, set->data, t);  	return 0;  } diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index c74e6e14cd9..dd40607f878 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -23,19 +23,21 @@  #include <linux/netfilter/ipset/ip_set.h>  #include <linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -#define REVISION_MAX	1	/* Counters support */ +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counters support */ +/*				2	   Comments 
support */ +#define IPSET_TYPE_REV_MAX	3	/* Forceadd support */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:ip", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:ip");  /* Type specific function prefix */  #define HTYPE		hash_ip  #define IP_SET_HASH_WITH_NETMASK -/* IPv4 variants */ +/* IPv4 variant */  /* Member elements */  struct hash_ip4_elem { @@ -43,22 +45,6 @@ struct hash_ip4_elem {  	__be32 ip;  }; -struct hash_ip4t_elem { -	__be32 ip; -	unsigned long timeout; -}; - -struct hash_ip4c_elem { -	__be32 ip; -	struct ip_set_counter counter; -}; - -struct hash_ip4ct_elem { -	__be32 ip; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -99,7 +85,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_ip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ip4_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	__be32 ip;  	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); @@ -118,8 +104,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ip4_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 ip, ip_to, hosts; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, hosts;  	int ret = 0;  	if (unlikely(!tb[IPSET_ATTR_IP] || @@ -178,29 +164,13 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  /* Member elements */  struct hash_ip6_elem {  	union nf_inet_addr ip;  }; -struct hash_ip6t_elem { -	union nf_inet_addr ip; -	unsigned long timeout; -}; - -struct hash_ip6c_elem { -	union nf_inet_addr ip; -	struct ip_set_counter counter; -}; - -struct hash_ip6ct_elem { -	union nf_inet_addr ip; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -253,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_ip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ip6_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);  	hash_ip6_netmask(&e.ip, h->netmask); @@ -270,7 +240,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ip6_elem e = {}; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	int ret;  	if (unlikely(!tb[IPSET_ATTR_IP] || @@ -304,8 +274,8 @@ static struct ip_set_type hash_ip_type __read_mostly = {  	.features	= IPSET_TYPE_IP,  	.dimension	= IPSET_DIM_ONE,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_ip_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -324,6 +294,7 @@ static struct ip_set_type hash_ip_type __read_mostly = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		
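/*
 * New in this series: the NUL-terminated string attribute carrying the
 * per-element comment extension (revision 2 in the history above).
 * Declaring it in adt_policy is what lets the generic netlink parsing
 * validate a comment string on add/del/test requests for this type.
 */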
[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c new file mode 100644 index 00000000000..4eff0a29725 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -0,0 +1,321 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:ip,mark type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +#define IPSET_TYPE_REV_MAX	1	/* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); +IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,mark"); + +/* Type specific function prefix */ +#define HTYPE		hash_ipmark +#define IP_SET_HASH_WITH_MARKMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipmark4_elem { +	__be32 ip; +	__u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, +			const struct hash_ipmark4_elem *ip2, +			u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->mark == ip2->mark; +} + +static bool +hash_ipmark4_data_list(struct sk_buff *skb, +		       const struct hash_ipmark4_elem *data) +{ +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipmark4_data_next(struct hash_ipmark4_elem *next, +		       const struct hash_ipmark4_elem *d) +{ +	next->ip = d->ip; +} + +#define MTYPE           hash_ipmark4 +#define PF              4 +#define HOST_MASK       32 +#define HKEY_DATALEN	sizeof(struct hash_ipmark4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.mark = skb->mark; +	e.mark &= h->markmask; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip, ip_to = 0; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     
!ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); +	e.mark &= h->markmask; + +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ip_to = ip = ntohl(e.ip); +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} + +	if (retried) +		ip = ntohl(h->next.ip); +	for (; !before(ip_to, ip); ip++) { +		e.ip = htonl(ip); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_ipmark6_elem { +	union nf_inet_addr ip; +	__u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, +			const struct hash_ipmark6_elem *ip2, +			u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ip1->mark == ip2->mark; +} + +static bool +hash_ipmark6_data_list(struct sk_buff *skb, +		       const struct hash_ipmark6_elem *data) +{ +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipmark6_data_next(struct hash_ipmark4_elem *next, +		       const struct hash_ipmark6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE		hash_ipmark6 +#define PF		6 +#define HOST_MASK	128 +#define HKEY_DATALEN	sizeof(struct hash_ipmark6_elem) +#define	IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + + +static int +hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.mark = skb->mark; +	e.mark &= h->markmask; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || +		     tb[IPSET_ATTR_IP_TO] || +		     tb[IPSET_ATTR_CIDR])) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = 
nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); +	e.mark &= h->markmask; + +	if (adt == IPSET_TEST) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ret = adtfn(set, &e, &ext, &ext, flags); +	if (ret && !ip_set_eexist(ret, flags)) +		return ret; +	else +		ret = 0; + +	return ret; +} + +static struct ip_set_type hash_ipmark_type __read_mostly = { +	.name		= "hash:ip,mark", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_MARK, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_ipmark_create, +	.create_policy	= { +		[IPSET_ATTR_MARKMASK]	= { .type = NLA_U32 }, +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_MARK]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_ipmark_init(void) +{ +	return ip_set_type_register(&hash_ipmark_type); +} + +static void __exit +hash_ipmark_fini(void) +{ +	ip_set_type_unregister(&hash_ipmark_type); +} + +module_init(hash_ipmark_init); +module_exit(hash_ipmark_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7a2d2bd98d0..7597b82a8b0 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -24,19 +24,21 @@  #include <linux/netfilter/ipset/ip_set_getport.h>  #include <linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -/*			1    SCTP and UDPLITE support added */ -#define REVISION_MAX	2 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Counters support added */ +/*				3    Comments support added */ +#define IPSET_TYPE_REV_MAX	4 /* Forceadd support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:ip,port", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:ip,port");  /* Type specific function prefix */  #define HTYPE		hash_ipport -/* IPv4 variants */ +/* IPv4 variant */  /* Member elements */  struct hash_ipport4_elem { @@ -46,31 +48,6 @@ struct hash_ipport4_elem {  	u8 padding;  }; -struct hash_ipport4t_elem { -	__be32 ip; -	__be16 port; -	u8 proto; -	u8 padding; -	unsigned long timeout; -}; - -struct hash_ipport4c_elem { -	__be32 ip; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -}; - -struct hash_ipport4ct_elem { -	__be32 ip; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -116,10 +93,9 @@ 
hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,  		  const struct xt_action_param *par,  		  enum ipset_adt adt, struct ip_set_adt_opt *opt)  { -	const struct hash_ipport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipport4_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,  				 &e.port, &e.proto)) @@ -136,8 +112,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ipport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipport4_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 ip, ip_to, p = 0, port, port_to; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip, ip_to = 0, p = 0, port, port_to;  	bool with_ports = false;  	int ret; @@ -222,7 +198,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  struct hash_ipport6_elem {  	union nf_inet_addr ip; @@ -231,31 +207,6 @@ struct hash_ipport6_elem {  	u8 padding;  }; -struct hash_ipport6t_elem { -	union nf_inet_addr ip; -	__be16 port; -	u8 proto; -	u8 padding; -	unsigned long timeout; -}; - -struct hash_ipport6c_elem { -	union nf_inet_addr ip; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -}; - -struct hash_ipport6ct_elem { -	union nf_inet_addr ip; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -306,10 +257,9 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,  		  const struct xt_action_param *par,  		  enum ipset_adt adt, struct ip_set_adt_opt *opt)  { -	const struct hash_ipport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipport6_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,  				 &e.port, &e.proto)) @@ -326,7 +276,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ipport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipport6_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	u32 port, port_to;  	bool with_ports = false;  	int ret; @@ -396,8 +346,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {  	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,  	.dimension	= IPSET_DIM_TWO,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_ipport_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -419,6 +369,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 34e8a1acce4..672655ffd57 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -24,19 +24,21 @@  #include <linux/netfilter/ipset/ip_set_getport.h>  #include 
<linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -/*			1    SCTP and UDPLITE support added */ -#define REVISION_MAX	2 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Counters support added */ +/*				3    Comments support added */ +#define IPSET_TYPE_REV_MAX	4 /* Forceadd support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:ip,port,ip", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:ip,port,ip");  /* Type specific function prefix */  #define HTYPE		hash_ipportip -/* IPv4 variants */ +/* IPv4 variant */  /* Member elements  */  struct hash_ipportip4_elem { @@ -47,34 +49,6 @@ struct hash_ipportip4_elem {  	u8 padding;  }; -struct hash_ipportip4t_elem { -	__be32 ip; -	__be32 ip2; -	__be16 port; -	u8 proto; -	u8 padding; -	unsigned long timeout; -}; - -struct hash_ipportip4c_elem { -	__be32 ip; -	__be32 ip2; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -}; - -struct hash_ipportip4ct_elem { -	__be32 ip; -	__be32 ip2; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  static inline bool  hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,  			  const struct hash_ipportip4_elem *ip2, @@ -120,10 +94,9 @@ hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,  		    const struct xt_action_param *par,  		    enum ipset_adt adt, struct ip_set_adt_opt *opt)  { -	const struct hash_ipportip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportip4_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,  				 &e.port, &e.proto)) @@ -141,8 +114,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ipportip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportip4_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 ip, ip_to, p = 0, port, port_to; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip, ip_to = 0, p = 0, port, port_to;  	bool with_ports = false;  	int ret; @@ -231,7 +204,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  struct hash_ipportip6_elem {  	union nf_inet_addr ip; @@ -241,34 +214,6 @@ struct hash_ipportip6_elem {  	u8 padding;  }; -struct hash_ipportip6t_elem { -	union nf_inet_addr ip; -	union nf_inet_addr ip2; -	__be16 port; -	u8 proto; -	u8 padding; -	unsigned long timeout; -}; - -struct hash_ipportip6c_elem { -	union nf_inet_addr ip; -	union nf_inet_addr ip2; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -}; - -struct hash_ipportip6ct_elem { -	union nf_inet_addr ip; -	union nf_inet_addr ip2; -	__be16 port; -	u8 proto; -	u8 padding; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -319,10 +264,9 @@ hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,  		    const struct xt_action_param *par,  		    enum ipset_adt adt, struct ip_set_adt_opt *opt)  { -	const struct hash_ipportip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportip6_elem e = { }; -	struct ip_set_ext ext = 
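/*
 * Same pattern as in the other hash types in this series: the
 * extension initializers now take the set itself instead of the
 * type-private hash structure, because timeout and extension
 * bookkeeping moved into the generic struct ip_set.  That is also why
 * several kadt()/uadt() functions here can drop their now-unused "h"
 * locals.
 */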
IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,  				 &e.port, &e.proto)) @@ -340,7 +284,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ipportip *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportip6_elem e = { }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	u32 port, port_to;  	bool with_ports = false;  	int ret; @@ -414,8 +358,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {  	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,  	.dimension	= IPSET_DIM_THREE,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_ipportip_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -437,6 +381,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index f15f3e28b9c..7308d84f927 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -24,15 +24,17 @@  #include <linux/netfilter/ipset/ip_set_getport.h>  #include <linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -/*			1    SCTP and UDPLITE support added */ -/*			2    Range as input support for IPv4 added */ -/*			3    nomatch flag support added */ -#define REVISION_MAX	4 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Range as input support for IPv4 added */ +/*				3    nomatch flag support added */ +/*				4    Counters support added */ +/*				5    Comments support added */ +#define IPSET_TYPE_REV_MAX	6 /* Forceadd support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:ip,port,net", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:ip,port,net");  /* Type specific function prefix */ @@ -46,7 +48,7 @@ MODULE_ALIAS("ip_set_hash:ip,port,net");  #define IP_SET_HASH_WITH_PROTO  #define IP_SET_HASH_WITH_NETS -/* IPv4 variants */ +/* IPv4 variant */  /* Member elements */  struct hash_ipportnet4_elem { @@ -58,37 +60,6 @@ struct hash_ipportnet4_elem {  	u8 proto;  }; -struct hash_ipportnet4t_elem { -	__be32 ip; -	__be32 ip2; -	__be16 port; -	u8 cidr:7; -	u8 nomatch:1; -	u8 proto; -	unsigned long timeout; -}; - -struct hash_ipportnet4c_elem { -	__be32 ip; -	__be32 ip2; -	__be16 port; -	u8 cidr:7; -	u8 nomatch:1; -	u8 proto; -	struct ip_set_counter counter; -}; - -struct hash_ipportnet4ct_elem { -	__be32 ip; -	__be32 ip2; -	__be16 port; -	u8 cidr:7; -	u8 nomatch:1; -	u8 proto; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -170,9 +141,9 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_ipportnet *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportnet4_elem e = { -		.cidr = h->nets[0].cidr ? 
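/*
 * h->nets[] tracks which prefix lengths are present in the set, so
 * kernel-side lookups can start from a stored cidr instead of the host
 * mask.  Its per-slot cidr field becomes a small array (cidr[0], plus
 * cidr[1] for the two-net types further down that define
 * IPSET_NET_COUNT 2), which is why the open-coded "first stored cidr
 * or host mask" test is replaced by the IP_SET_INIT_CIDR() helper
 * throughout this series.
 */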
h->nets[0].cidr - 1 : HOST_MASK - 1 +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (adt == IPSET_TEST)  		e.cidr = HOST_MASK - 1; @@ -195,9 +166,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ipportnet *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 ip, ip_to, p = 0, port, port_to; -	u32 ip2_from, ip2_to, ip2_last, ip2; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, p = 0, port, port_to; +	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;  	bool with_ports = false;  	u8 cidr;  	int ret; @@ -272,7 +243,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],  		if (ip > ip_to)  			swap(ip, ip_to);  	} else if (tb[IPSET_ATTR_CIDR]) { -		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);  		if (!cidr || cidr > 32)  			return -IPSET_ERR_INVALID_CIDR; @@ -306,9 +277,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],  						       : port;  		for (; p <= port_to; p++) {  			e.port = htons(p); -			ip2 = retried -			      && ip == ntohl(h->next.ip) -			      && p == ntohs(h->next.port) +			ip2 = retried && +			      ip == ntohl(h->next.ip) && +			      p == ntohs(h->next.port)  				? ntohl(h->next.ip2) : ip2_from;  			while (!after(ip2, ip2_to)) {  				e.ip2 = htonl(ip2); @@ -328,7 +299,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  struct hash_ipportnet6_elem {  	union nf_inet_addr ip; @@ -339,37 +310,6 @@ struct hash_ipportnet6_elem {  	u8 proto;  }; -struct hash_ipportnet6t_elem { -	union nf_inet_addr ip; -	union nf_inet_addr ip2; -	__be16 port; -	u8 cidr:7; -	u8 nomatch:1; -	u8 proto; -	unsigned long timeout; -}; - -struct hash_ipportnet6c_elem { -	union nf_inet_addr ip; -	union nf_inet_addr ip2; -	__be16 port; -	u8 cidr:7; -	u8 nomatch:1; -	u8 proto; -	struct ip_set_counter counter; -}; - -struct hash_ipportnet6ct_elem { -	union nf_inet_addr ip; -	union nf_inet_addr ip2; -	__be16 port; -	u8 cidr:7; -	u8 nomatch:1; -	u8 proto; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -454,9 +394,9 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_ipportnet *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportnet6_elem e = { -		.cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1 +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (adt == IPSET_TEST)  		e.cidr = HOST_MASK - 1; @@ -479,7 +419,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_ipportnet *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	u32 port, port_to;  	bool with_ports = false;  	u8 cidr; @@ -574,8 +514,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {  			  IPSET_TYPE_NOMATCH,  	.dimension	= IPSET_DIM_THREE,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_ipportnet_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -600,6 +540,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 223e9f546d0..4c7d495783a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -22,21 +22,23 @@  #include <linux/netfilter/ipset/ip_set.h>  #include <linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -/*			1    Range as input support for IPv4 added */ -/*			2    nomatch flag support added */ -#define REVISION_MAX	3 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    Range as input support for IPv4 added */ +/*				2    nomatch flag support added */ +/*				3    Counters support added */ +/*				4    Comments support added */ +#define IPSET_TYPE_REV_MAX	5 /* Forceadd support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:net", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:net");  /* Type specific function prefix */  #define HTYPE		hash_net  #define IP_SET_HASH_WITH_NETS -/* IPv4 variants */ +/* IPv4 variant */  /* Member elements  */  struct hash_net4_elem { @@ -46,31 +48,6 @@ struct hash_net4_elem {  	u8 cidr;  }; -struct hash_net4t_elem { -	__be32 ip; -	u16 padding0; -	u8 nomatch; -	u8 cidr; -	unsigned long timeout; -}; - -struct hash_net4c_elem { -	__be32 ip; -	u16 padding0; -	u8 nomatch; -	u8 cidr; -	struct ip_set_counter counter; -}; - -struct hash_net4ct_elem { -	__be32 ip; -	u16 padding0; -	u8 nomatch; -	u8 cidr; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -143,9 +120,9 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_net *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_net4_elem e = { -		.cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (e.cidr == 0)  		return -EINVAL; @@ -165,8 +142,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_net *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_net4_elem e = { .cidr = HOST_MASK }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 ip = 0, ip_to, last; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, last;  	int ret;  	if (unlikely(!tb[IPSET_ATTR_IP] || @@ -228,7 +205,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  struct hash_net6_elem {  	union nf_inet_addr ip; @@ -237,31 +214,6 @@ struct hash_net6_elem {  	u8 cidr;  }; -struct hash_net6t_elem { -	union nf_inet_addr ip; -	u16 padding0; -	u8 nomatch; -	u8 cidr; -	unsigned long timeout; -}; - -struct hash_net6c_elem { -	union nf_inet_addr ip; -	u16 padding0; -	u8 nomatch; -	u8 cidr; -	struct ip_set_counter counter; -}; - -struct hash_net6ct_elem { -	union nf_inet_addr ip; -	u16 padding0; -	u8 nomatch; -	u8 cidr; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -338,9 +290,9 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_net *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_net6_elem e = { -		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (e.cidr == 0)  		return -EINVAL; @@ -357,10 +309,9 @@ static int  hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],  	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)  { -	const struct hash_net *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_net6_elem e = { .cidr = HOST_MASK }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	int ret;  	if (unlikely(!tb[IPSET_ATTR_IP] || @@ -406,8 +357,8 @@ static struct ip_set_type hash_net_type __read_mostly = {  	.features	= IPSET_TYPE_IP | IPSET_TYPE_NOMATCH,  	.dimension	= IPSET_DIM_ONE,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_net_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -425,6 +376,7 @@ static struct ip_set_type hash_net_type __read_mostly = {  		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 7d798d5d5cd..db2606805b3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -23,14 +23,16 @@  #include <linux/netfilter/ipset/ip_set.h>  #include <linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -/*			1    nomatch flag support added */ -/*			2    /0 support added */ -#define REVISION_MAX	3 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    nomatch 
flag support added */ +/*				2    /0 support added */ +/*				3    Counters support added */ +/*				4    Comments support added */ +#define IPSET_TYPE_REV_MAX	5 /* Forceadd support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:net,iface", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:net,iface");  /* Interface name rbtree */ @@ -45,31 +47,12 @@ struct iface_node {  static void  rbtree_destroy(struct rb_root *root)  { -	struct rb_node *p, *n = root->rb_node; -	struct iface_node *node; - -	/* Non-recursive destroy, like in ext3 */ -	while (n) { -		if (n->rb_left) { -			n = n->rb_left; -			continue; -		} -		if (n->rb_right) { -			n = n->rb_right; -			continue; -		} -		p = rb_parent(n); -		node = rb_entry(n, struct iface_node, node); -		if (!p) -			*root = RB_ROOT; -		else if (p->rb_left == n) -			p->rb_left = NULL; -		else if (p->rb_right == n) -			p->rb_right = NULL; +	struct iface_node *node, *next; +	rbtree_postorder_for_each_entry_safe(node, next, root, node)  		kfree(node); -		n = p; -	} + +	*root = RB_ROOT;  }  static int @@ -134,7 +117,7 @@ iface_add(struct rb_root *root, const char **iface)  #define STREQ(a, b)	(strcmp(a, b) == 0) -/* IPv4 variants */ +/* IPv4 variant */  struct hash_netiface4_elem_hashed {  	__be32 ip; @@ -144,7 +127,7 @@ struct hash_netiface4_elem_hashed {  	u8 elem;  }; -/* Member elements without timeout */ +/* Member elements */  struct hash_netiface4_elem {  	__be32 ip;  	u8 physdev; @@ -154,37 +137,6 @@ struct hash_netiface4_elem {  	const char *iface;  }; -struct hash_netiface4t_elem { -	__be32 ip; -	u8 physdev; -	u8 cidr; -	u8 nomatch; -	u8 elem; -	const char *iface; -	unsigned long timeout; -}; - -struct hash_netiface4c_elem { -	__be32 ip; -	u8 physdev; -	u8 cidr; -	u8 nomatch; -	u8 elem; -	const char *iface; -	struct ip_set_counter counter; -}; - -struct hash_netiface4ct_elem { -	__be32 ip; -	u8 physdev; -	u8 cidr; -	u8 nomatch; -	u8 elem; -	const char *iface; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -265,10 +217,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,  	struct hash_netiface *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netiface4_elem e = { -		.cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK, +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),  		.elem = 1,  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	int ret;  	if (e.cidr == 0) @@ -319,8 +271,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],  	struct hash_netiface *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 ip = 0, ip_to, last; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, last;  	char iface[IFNAMSIZ];  	int ret; @@ -399,7 +351,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  struct hash_netiface6_elem_hashed {  	union nf_inet_addr ip; @@ -418,37 +370,6 @@ struct hash_netiface6_elem {  	const char *iface;  }; -struct hash_netiface6t_elem { -	union nf_inet_addr ip; -	u8 physdev; -	u8 cidr; -	u8 nomatch; -	u8 elem; -	const char *iface; -	unsigned long timeout; -}; - -struct hash_netiface6c_elem { -	union nf_inet_addr ip; -	u8 physdev; -	u8 cidr; -	u8 nomatch; -	u8 elem; -	const char *iface; -	struct ip_set_counter counter; -}; - -struct hash_netiface6ct_elem { -	union nf_inet_addr ip; -	u8 physdev; -	u8 cidr; -	u8 nomatch; -	u8 elem; -	const char *iface; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -534,10 +455,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,  	struct hash_netiface *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netiface6_elem e = { -		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK, +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),  		.elem = 1,  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	int ret;  	if (e.cidr == 0) @@ -584,7 +505,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],  	struct hash_netiface *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	char iface[IFNAMSIZ];  	int ret; @@ -645,8 +566,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = {  			  IPSET_TYPE_NOMATCH,  	.dimension	= IPSET_DIM_TWO,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_netiface_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -668,6 +589,7 @@ static struct ip_set_type hash_netiface_type __read_mostly = {  		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c new file mode 100644 index 00000000000..3e99987e4bf --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -0,0 +1,481 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> + * + * This program is free software; you can redistribute it and/or modify + * it under 
the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +#define IPSET_TYPE_REV_MAX	1	/* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,net"); + +/* Type specific function prefix */ +#define HTYPE		hash_netnet +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variants */ + +/* Member elements  */ +struct hash_netnet4_elem { +	union { +		__be32 ip[2]; +		__be64 ipcmp; +	}; +	u8 nomatch; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +}; + +/* Common functions */ + +static inline bool +hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, +		     const struct hash_netnet4_elem *ip2, +		     u32 *multi) +{ +	return ip1->ipcmp == ip2->ipcmp && +	       ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, +			  struct hash_netnet4_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) +{ +	if (inner) { +		elem->ip[1] &= ip_set_netmask(cidr); +		elem->cidr[1] = cidr; +	} else { +		elem->ip[0] &= ip_set_netmask(cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netnet4_data_list(struct sk_buff *skb, +		    const struct hash_netnet4_elem *data) +{ +	u32 flags = data->nomatch ? 
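/*
 * Dump-side mirror of hash_netnet4_data_set_flags() above: the nomatch
 * bit stored in the element is translated back into the
 * IPSET_ATTR_CADT_FLAGS netlink attribute, and only emitted when it is
 * actually set (see the "flags && nla_put_net32(...)" test below).
 */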
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || +	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return false; + +nla_put_failure: +	return true; +} + +static inline void +hash_netnet4_data_next(struct hash_netnet4_elem *next, +		    const struct hash_netnet4_elem *d) +{ +	next->ipcmp = d->ipcmp; +} + +#define MTYPE		hash_netnet4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); +	ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); +	e.ip[0] &= ip_set_netmask(e.cidr[0]); +	e.ip[1] &= ip_set_netmask(e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, last; +	u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; +	u8 cidr, cidr2; +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[0] = cidr; +	} + +	if (tb[IPSET_ATTR_CIDR2]) { +		cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); +		if (!cidr2 || cidr2 > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[1] = cidr2; +	} + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] && +				   tb[IPSET_ATTR_IP2_TO])) { +		e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); +		e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 
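/*
 * Error post-processing, presumably: ip_set_eexist() eats
 * -IPSET_ERR_EXIST when the request carried IPSET_FLAG_EXIST (the
 * "-exist" option), so re-adding an element is not treated as an
 * error; the ip_set_enomatch() branch appears to cover TEST requests
 * that hit a nomatch entry, where the sign of the -ENOTEMPTY result
 * from the do_data_match() helper above is flipped so the caller can
 * report "not matched".
 */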
0 : ret; +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip_to < ip) +			swap(ip, ip_to); +		if (ip + UINT_MAX == ip_to) +			return -IPSET_ERR_HASH_RANGE; +	} + +	ip2_to = ip2_from; +	if (tb[IPSET_ATTR_IP2_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); +		if (ret) +			return ret; +		if (ip2_to < ip2_from) +			swap(ip2_from, ip2_to); +		if (ip2_from + UINT_MAX == ip2_to) +			return -IPSET_ERR_HASH_RANGE; + +	} + +	if (retried) +		ip = ntohl(h->next.ip[0]); + +	while (!after(ip, ip_to)) { +		e.ip[0] = htonl(ip); +		last = ip_set_range_to_cidr(ip, ip_to, &cidr); +		e.cidr[0] = cidr; +		ip2 = (retried && +		       ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) +						   : ip2_from; +		while (!after(ip2, ip2_to)) { +			e.ip[1] = htonl(ip2); +			last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); +			e.cidr[1] = cidr2; +			ret = adtfn(set, &e, &ext, &ext, flags); +			if (ret && !ip_set_eexist(ret, flags)) +				return ret; +			else +				ret = 0; +			ip2 = last2 + 1; +		} +		ip = last + 1; +	} +	return ret; +} + +/* IPv6 variants */ + +struct hash_netnet6_elem { +	union nf_inet_addr ip[2]; +	u8 nomatch; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +}; + +/* Common functions */ + +static inline bool +hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, +		     const struct hash_netnet6_elem *ip2, +		     u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && +	       ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && +	       ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, +			  struct hash_netnet6_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) +{ +	if (inner) { +		ip6_netmask(&elem->ip[1], cidr); +		elem->cidr[1] = cidr; +	} else { +		ip6_netmask(&elem->ip[0], cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netnet6_data_list(struct sk_buff *skb, +		    const struct hash_netnet6_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || +	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return false; + +nla_put_failure: +	return true; +} + +static inline void +hash_netnet6_data_next(struct hash_netnet4_elem *next, +		    const struct hash_netnet6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_netnet6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); +	ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || +	      ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) +		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +	if (tb[IPSET_ATTR_CIDR2]) +		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + +	if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || +	    e.cidr[1] > HOST_MASK) +		return -IPSET_ERR_INVALID_CIDR; + +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	ret = adtfn(set, &e, &ext, &ext, flags); + +	return ip_set_enomatch(ret, flags, adt, set) ? -ret : +	       ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static struct ip_set_type hash_netnet_type __read_mostly = { +	.name		= "hash:net,net", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_netnet_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CIDR2]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_netnet_init(void) +{ +	return ip_set_type_register(&hash_netnet_type); +} + +static void __exit +hash_netnet_fini(void) +{ +	ip_set_type_unregister(&hash_netnet_type); +} + +module_init(hash_netnet_init); +module_exit(hash_netnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 09d6690bee6..1c645fbd09c 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -23,15 +23,17 @@  #include <linux/netfilter/ipset/ip_set_getport.h>  #include <linux/netfilter/ipset/ip_set_hash.h> -#define REVISION_MIN	0 -/*			1    SCTP and UDPLITE support added */ -/*			2    Range as input support for IPv4 added */ -/*			3    nomatch flag support added */ -#define REVISION_MAX	4 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Range as input support for IPv4 added */ +/*				3    nomatch flag support added */ +/*				4    Counters support added */ +/*				5    Comments support added */ +#define IPSET_TYPE_REV_MAX	6 /* Forceadd support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("hash:net,port", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_hash:net,port");  /* Type specific function prefix */ @@ -45,7 +47,7 @@ MODULE_ALIAS("ip_set_hash:net,port");   */  #define IP_SET_HASH_WITH_NETS_PACKED -/* IPv4 variants */ +/* IPv4 variant */  /* Member elements */  struct hash_netport4_elem { @@ -56,34 +58,6 @@ struct hash_netport4_elem {  	u8 nomatch:1;  }; -struct hash_netport4t_elem { -	__be32 ip; -	__be16 port; -	u8 proto; -	u8 cidr:7; -	u8 nomatch:1; -	unsigned long timeout; -}; - -struct hash_netport4c_elem { -	__be32 ip; -	__be16 port; -	u8 proto; -	u8 cidr:7; -	u8 nomatch:1; -	struct ip_set_counter counter; -}; - -struct hash_netport4ct_elem { -	__be32 ip; -	__be16 port; -	u8 proto; -	u8 cidr:7; -	u8 nomatch:1; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -162,9 +136,9 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_netport *h = set->data;  
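	/*
	 * IP_SET_INIT_CIDR() is new in this series; judging from the
	 * open-coded expressions it replaces (here and in the hunks
	 * above), it is presumably just
	 *
	 *	#define IP_SET_INIT_CIDR(cidr, host_mask)	\
	 *		((cidr) ? (cidr) : (host_mask))
	 *
	 * i.e. "the first prefix length stored in the set, or the host
	 * mask while the set is still empty".  The "- 1" kept at the
	 * call sites below preserves the convention of the packed
	 * 7-bit cidr field in this type, which stores the prefix
	 * length minus one so that /128 still fits.
	 */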
	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netport4_elem e = { -		.cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1 +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (adt == IPSET_TEST)  		e.cidr = HOST_MASK - 1; @@ -186,8 +160,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_netport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); -	u32 port, port_to, p = 0, ip = 0, ip_to, last; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to, p = 0, ip = 0, ip_to = 0, last;  	bool with_ports = false;  	u8 cidr;  	int ret; @@ -287,7 +261,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],  	return ret;  } -/* IPv6 variants */ +/* IPv6 variant */  struct hash_netport6_elem {  	union nf_inet_addr ip; @@ -297,34 +271,6 @@ struct hash_netport6_elem {  	u8 nomatch:1;  }; -struct hash_netport6t_elem { -	union nf_inet_addr ip; -	__be16 port; -	u8 proto; -	u8 cidr:7; -	u8 nomatch:1; -	unsigned long timeout; -}; - -struct hash_netport6c_elem { -	union nf_inet_addr ip; -	__be16 port; -	u8 proto; -	u8 cidr:7; -	u8 nomatch:1; -	struct ip_set_counter counter; -}; - -struct hash_netport6ct_elem { -	union nf_inet_addr ip; -	__be16 port; -	u8 proto; -	u8 cidr:7; -	u8 nomatch:1; -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  /* Common functions */  static inline bool @@ -407,9 +353,9 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,  	const struct hash_netport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netport6_elem e = { -		.cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1, +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,  	}; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	if (adt == IPSET_TEST)  		e.cidr = HOST_MASK - 1; @@ -431,7 +377,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],  	const struct hash_netport *h = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct hash_netport6_elem e = { .cidr = HOST_MASK  - 1 }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(h); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	u32 port, port_to;  	bool with_ports = false;  	u8 cidr; @@ -518,8 +464,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {  	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH,  	.dimension	= IPSET_DIM_TWO,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= hash_netport_create,  	.create_policy	= {  		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, @@ -542,6 +488,7 @@ static struct ip_set_type hash_netport_type __read_mostly = {  		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c new file mode 100644 index 00000000000..c0d2ba73f8b --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -0,0 +1,587 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net,port,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				0    Comments support added */ +#define IPSET_TYPE_REV_MAX	1 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,port,net"); + +/* Type specific function prefix */ +#define HTYPE		hash_netportnet +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variant */ + +/* Member elements */ +struct hash_netportnet4_elem { +	union { +		__be32 ip[2]; +		__be64 ipcmp; +	}; +	__be16 port; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +	u8 nomatch:1; +	u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, +			   const struct hash_netportnet4_elem *ip2, +			   u32 *multi) +{ +	return ip1->ipcmp == ip2->ipcmp && +	       ip1->ccmp == ip2->ccmp && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, +				struct hash_netportnet4_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, +			      u8 cidr, bool inner) +{ +	if (inner) { +		elem->ip[1] &= ip_set_netmask(cidr); +		elem->cidr[1] = cidr; +	} else { +		elem->ip[0] &= ip_set_netmask(cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netportnet4_data_list(struct sk_buff *skb, +			  const struct hash_netportnet4_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || +	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netportnet4_data_next(struct hash_netportnet4_elem *next, +			  const struct hash_netportnet4_elem *d) +{ +	next->ipcmp = d->ipcmp; +	next->port = d->port; +} + +#define MTYPE		hash_netportnet4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, +		     const struct xt_action_param *par, +		     enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + +	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); +	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]); +	e.ip[0] &= ip_set_netmask(e.cidr[0]); +	e.ip[1] &= ip_set_netmask(e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], +		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; +	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; +	bool with_ports = false; +	u8 cidr, cidr2; +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[0] = cidr; +	} + +	if (tb[IPSET_ATTR_CIDR2]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[1] = cidr; +	} + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = 
nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMP)) +		e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { +		e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); +		e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +		if (unlikely(ip + UINT_MAX == ip_to)) +			return -IPSET_ERR_HASH_RANGE; +	} + +	port_to = port = ntohs(e.port); +	if (tb[IPSET_ATTR_PORT_TO]) { +		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +		if (port > port_to) +			swap(port, port_to); +	} + +	ip2_to = ip2_from; +	if (tb[IPSET_ATTR_IP2_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); +		if (ret) +			return ret; +		if (ip2_from > ip2_to) +			swap(ip2_from, ip2_to); +		if (unlikely(ip2_from + UINT_MAX == ip2_to)) +			return -IPSET_ERR_HASH_RANGE; +	} + +	if (retried) +		ip = ntohl(h->next.ip[0]); + +	while (!after(ip, ip_to)) { +		e.ip[0] = htonl(ip); +		ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); +		e.cidr[0] = cidr; +		p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) +							  : port; +		for (; p <= port_to; p++) { +			e.port = htons(p); +			ip2 = (retried && ip == ntohl(h->next.ip[0]) && +			       p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) +							 : ip2_from; +			while (!after(ip2, ip2_to)) { +				e.ip[1] = htonl(ip2); +				ip2_last = ip_set_range_to_cidr(ip2, ip2_to, +								&cidr2); +				e.cidr[1] = cidr2; +				ret = adtfn(set, &e, &ext, &ext, flags); +				if (ret && !ip_set_eexist(ret, flags)) +					return ret; +				else +					ret = 0; +				ip2 = ip2_last + 1; +			} +		} +		ip = ip_last + 1; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_netportnet6_elem { +	union nf_inet_addr ip[2]; +	__be16 port; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +	u8 nomatch:1; +	u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, +			   const struct hash_netportnet6_elem *ip2, +			   u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && +	       ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && +	       ip1->ccmp == ip2->ccmp && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) +{ +	return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, +				struct hash_netportnet6_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, +			      u8 cidr, bool inner) +{ +	if (inner) { +		ip6_netmask(&elem->ip[1], cidr); +		elem->cidr[1] = cidr; +	} else { +		ip6_netmask(&elem->ip[0], cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netportnet6_data_list(struct sk_buff *skb, +			  const struct hash_netportnet6_elem *data) +{ +	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || +	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netportnet6_data_next(struct hash_netportnet4_elem *next, +			  const struct hash_netportnet6_elem *d) +{ +	next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_netportnet6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, +		     const struct xt_action_param *par, +		     enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; + +	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); +	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6); +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], +		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to; +	bool with_ports = false; +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     
!ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || +	      ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) +		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +	if (tb[IPSET_ATTR_CIDR2]) +		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + +	if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || +		     e.cidr[1] > HOST_MASK)) +		return -IPSET_ERR_INVALID_CIDR; + +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMPV6)) +		e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = ntohs(e.port); +	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +	if (port > port_to) +		swap(port, port_to); + +	if (retried) +		port = ntohs(h->next.port); +	for (; port <= port_to; port++) { +		e.port = htons(port); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static struct ip_set_type hash_netportnet_type __read_mostly = { +	.name		= "hash:net,port,net", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | +			  IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_THREE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_netportnet_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CIDR2]	= { .type = NLA_U8 }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_netportnet_init(void) +{ +	return 
ip_set_type_register(&hash_netportnet_type); +} + +static void __exit +hash_netportnet_fini(void) +{ +	ip_set_type_unregister(&hash_netportnet_type); +} + +module_init(hash_netportnet_init); +module_exit(hash_netportnet_fini); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 979b8c90e42..3e2317f3cf6 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -15,12 +15,13 @@  #include <linux/netfilter/ipset/ip_set.h>  #include <linux/netfilter/ipset/ip_set_list.h> -#define REVISION_MIN	0 -#define REVISION_MAX	1 /* Counters support added */ +#define IPSET_TYPE_REV_MIN	0 +/*				1    Counters support added */ +#define IPSET_TYPE_REV_MAX	2 /* Comments support added */  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -IP_SET_MODULE_DESC("list:set", REVISION_MIN, REVISION_MAX); +IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);  MODULE_ALIAS("ip_set_list:set");  /* Member elements  */ @@ -28,28 +29,6 @@ struct set_elem {  	ip_set_id_t id;  }; -struct sett_elem { -	struct { -		ip_set_id_t id; -	} __attribute__ ((aligned)); -	unsigned long timeout; -}; - -struct setc_elem { -	struct { -		ip_set_id_t id; -	} __attribute__ ((aligned)); -	struct ip_set_counter counter; -}; - -struct setct_elem { -	struct { -		ip_set_id_t id; -	} __attribute__ ((aligned)); -	struct ip_set_counter counter; -	unsigned long timeout; -}; -  struct set_adt_elem {  	ip_set_id_t id;  	ip_set_id_t refid; @@ -58,24 +37,14 @@ struct set_adt_elem {  /* Type structure */  struct list_set { -	size_t dsize;		/* element size */ -	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */  	u32 size;		/* size of set list array */ -	u32 timeout;		/* timeout value */  	struct timer_list gc;	/* garbage collection */ +	struct net *net;	/* namespace */  	struct set_elem members[0]; /* the set members */  }; -static inline struct set_elem * -list_set_elem(const struct list_set *map, u32 id) -{ -	return (struct set_elem *)((void *)map->members + id * map->dsize); -} - -#define ext_timeout(e, m)	\ -(unsigned long *)((void *)(e) + (m)->offset[IPSET_OFFSET_TIMEOUT]) -#define ext_counter(e, m)	\ -(struct ip_set_counter *)((void *)(e) + (m)->offset[IPSET_OFFSET_COUNTER]) +#define list_set_elem(set, map, id)	\ +	(struct set_elem *)((void *)(map)->members + (id) * (set)->dsize)  static int  list_set_ktest(struct ip_set *set, const struct sk_buff *skb, @@ -92,16 +61,16 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,  	if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)  		opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			return 0;  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(e, map))) +		    ip_set_timeout_expired(ext_timeout(e, set)))  			continue;  		ret = ip_set_test(e->id, skb, par, opt);  		if (ret > 0) {  			if (SET_WITH_COUNTER(set)) -				ip_set_update_counter(ext_counter(e, map), +				ip_set_update_counter(ext_counter(e, set),  						      ext, &opt->ext,  						      cmdflags);  			return ret; @@ -121,11 +90,11 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb,  	int ret;  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			return 0;  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(e, 
map))) +		    ip_set_timeout_expired(ext_timeout(e, set)))  			continue;  		ret = ip_set_add(e->id, skb, par, opt);  		if (ret == 0) @@ -145,11 +114,11 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb,  	int ret;  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			return 0;  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(e, map))) +		    ip_set_timeout_expired(ext_timeout(e, set)))  			continue;  		ret = ip_set_del(e->id, skb, par, opt);  		if (ret == 0) @@ -163,8 +132,7 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,  	      const struct xt_action_param *par,  	      enum ipset_adt adt, struct ip_set_adt_opt *opt)  { -	struct list_set *map = set->data; -	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);  	switch (adt) {  	case IPSET_TEST: @@ -188,10 +156,10 @@ id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)  	if (i >= map->size)  		return 0; -	e = list_set_elem(map, i); +	e = list_set_elem(set, map, i);  	return !!(e->id == id &&  		 !(SET_WITH_TIMEOUT(set) && -		   ip_set_timeout_expired(ext_timeout(e, map)))); +		   ip_set_timeout_expired(ext_timeout(e, set))));  }  static int @@ -199,28 +167,36 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,  	     const struct ip_set_ext *ext)  {  	struct list_set *map = set->data; -	struct set_elem *e = list_set_elem(map, i); +	struct set_elem *e = list_set_elem(set, map, i);  	if (e->id != IPSET_INVALID_ID) { -		if (i == map->size - 1) +		if (i == map->size - 1) {  			/* Last element replaced: e.g. add new,before,last */ -			ip_set_put_byindex(e->id); -		else { -			struct set_elem *x = list_set_elem(map, map->size - 1); +			ip_set_put_byindex(map->net, e->id); +			ip_set_ext_destroy(set, e); +		} else { +			struct set_elem *x = list_set_elem(set, map, +							   map->size - 1);  			/* Last element pushed off */ -			if (x->id != IPSET_INVALID_ID) -				ip_set_put_byindex(x->id); -			memmove(list_set_elem(map, i + 1), e, -				map->dsize * (map->size - (i + 1))); +			if (x->id != IPSET_INVALID_ID) { +				ip_set_put_byindex(map->net, x->id); +				ip_set_ext_destroy(set, x); +			} +			memmove(list_set_elem(set, map, i + 1), e, +				set->dsize * (map->size - (i + 1))); +			/* Extensions must be initialized to zero */ +			memset(e, 0, set->dsize);  		}  	}  	e->id = d->id;  	if (SET_WITH_TIMEOUT(set)) -		ip_set_timeout_set(ext_timeout(e, map), ext->timeout); +		ip_set_timeout_set(ext_timeout(e, set), ext->timeout);  	if (SET_WITH_COUNTER(set)) -		ip_set_init_counter(ext_counter(e, map), ext); +		ip_set_init_counter(ext_counter(e, set), ext); +	if (SET_WITH_COMMENT(set)) +		ip_set_init_comment(ext_comment(e, set), ext);  	return 0;  } @@ -228,16 +204,17 @@ static int  list_set_del(struct ip_set *set, u32 i)  {  	struct list_set *map = set->data; -	struct set_elem *e = list_set_elem(map, i); +	struct set_elem *e = list_set_elem(set, map, i); -	ip_set_put_byindex(e->id); +	ip_set_put_byindex(map->net, e->id); +	ip_set_ext_destroy(set, e);  	if (i < map->size - 1) -		memmove(e, list_set_elem(map, i + 1), -			map->dsize * (map->size - (i + 1))); +		memmove(e, list_set_elem(set, map, i + 1), +			set->dsize * (map->size - (i + 1)));  	/* Last element */ -	e = list_set_elem(map, map->size - 1); +	e = list_set_elem(set, map, map->size - 1);  	e->id = IPSET_INVALID_ID;  	return 0;  } @@ -247,13 +224,16 @@ set_cleanup_entries(struct 
ip_set *set)  {  	struct list_set *map = set->data;  	struct set_elem *e; -	u32 i; +	u32 i = 0; -	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +	while (i < map->size) { +		e = list_set_elem(set, map, i);  		if (e->id != IPSET_INVALID_ID && -		    ip_set_timeout_expired(ext_timeout(e, map))) +		    ip_set_timeout_expired(ext_timeout(e, set)))  			list_set_del(set, i); +			/* Check element moved to position i in next loop */ +		else +			i++;  	}  } @@ -268,11 +248,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	int ret;  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			return 0;  		else if (SET_WITH_TIMEOUT(set) && -			 ip_set_timeout_expired(ext_timeout(e, map))) +			 ip_set_timeout_expired(ext_timeout(e, set)))  			continue;  		else if (e->id != d->id)  			continue; @@ -299,14 +279,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	bool flag_exist = flags & IPSET_FLAG_EXIST;  	u32 i, ret = 0; +	if (SET_WITH_TIMEOUT(set)) +		set_cleanup_entries(set); +  	/* Check already added element */  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			goto insert; -		else if (SET_WITH_TIMEOUT(set) && -			 ip_set_timeout_expired(ext_timeout(e, map))) -			continue;  		else if (e->id != d->id)  			continue; @@ -319,18 +299,22 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,  			/* Can't re-add */  			return -IPSET_ERR_EXIST;  		/* Update extensions */ +		ip_set_ext_destroy(set, e); +  		if (SET_WITH_TIMEOUT(set)) -			ip_set_timeout_set(ext_timeout(e, map), ext->timeout); +			ip_set_timeout_set(ext_timeout(e, set), ext->timeout);  		if (SET_WITH_COUNTER(set)) -			ip_set_init_counter(ext_counter(e, map), ext); +			ip_set_init_counter(ext_counter(e, set), ext); +		if (SET_WITH_COMMENT(set)) +			ip_set_init_comment(ext_comment(e, set), ext);  		/* Set is already added to the list */ -		ip_set_put_byindex(d->id); +		ip_set_put_byindex(map->net, d->id);  		return 0;  	}  insert:  	ret = -IPSET_ERR_LIST_FULL;  	for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			ret = d->before != 0 ? -IPSET_ERR_REF_EXIST  				: list_set_add(set, i, d, ext); @@ -355,12 +339,12 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	u32 i;  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			return d->before != 0 ? 
-IPSET_ERR_REF_EXIST  					      : -IPSET_ERR_EXIST;  		else if (SET_WITH_TIMEOUT(set) && -			 ip_set_timeout_expired(ext_timeout(e, map))) +			 ip_set_timeout_expired(ext_timeout(e, set)))  			continue;  		else if (e->id != d->id)  			continue; @@ -386,7 +370,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],  	struct list_set *map = set->data;  	ipset_adtfn adtfn = set->variant->adt[adt];  	struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; -	struct ip_set_ext ext = IP_SET_INIT_UEXT(map); +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);  	struct ip_set *s;  	int ret = 0; @@ -403,7 +387,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],  	ret = ip_set_get_extensions(set, tb, &ext);  	if (ret)  		return ret; -	e.id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); +	e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s);  	if (e.id == IPSET_INVALID_ID)  		return -IPSET_ERR_NAME;  	/* "Loop detection" */ @@ -423,7 +407,8 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],  	}  	if (tb[IPSET_ATTR_NAMEREF]) { -		e.refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), +		e.refid = ip_set_get_byname(map->net, +					    nla_data(tb[IPSET_ATTR_NAMEREF]),  					    &s);  		if (e.refid == IPSET_INVALID_ID) {  			ret = -IPSET_ERR_NAMEREF; @@ -439,9 +424,9 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],  finish:  	if (e.refid != IPSET_INVALID_ID) -		ip_set_put_byindex(e.refid); +		ip_set_put_byindex(map->net, e.refid);  	if (adt != IPSET_ADD || ret) -		ip_set_put_byindex(e.id); +		ip_set_put_byindex(map->net, e.id);  	return ip_set_eexist(ret, flags) ? 0 : ret;  } @@ -454,9 +439,10 @@ list_set_flush(struct ip_set *set)  	u32 i;  	for (i = 0; i < map->size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		if (e->id != IPSET_INVALID_ID) { -			ip_set_put_byindex(e->id); +			ip_set_put_byindex(map->net, e->id); +			ip_set_ext_destroy(set, e);  			e->id = IPSET_INVALID_ID;  		}  	} @@ -485,14 +471,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)  	if (!nested)  		goto nla_put_failure;  	if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || -	    (SET_WITH_TIMEOUT(set) && -	     nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) || -	    (SET_WITH_COUNTER(set) && -	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, -			   htonl(IPSET_FLAG_WITH_COUNTERS))) ||  	    nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||  	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, -			  htonl(sizeof(*map) + map->size * map->dsize))) +			  htonl(sizeof(*map) + map->size * set->dsize))) +		goto nla_put_failure; +	if (unlikely(ip_set_put_flags(skb, set)))  		goto nla_put_failure;  	ipset_nest_end(skb, nested); @@ -507,19 +490,20 @@ list_set_list(const struct ip_set *set,  {  	const struct list_set *map = set->data;  	struct nlattr *atd, *nested; -	u32 i, first = cb->args[2]; +	u32 i, first = cb->args[IPSET_CB_ARG0];  	const struct set_elem *e;  	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);  	if (!atd)  		return -EMSGSIZE; -	for (; cb->args[2] < map->size; cb->args[2]++) { -		i = cb->args[2]; -		e = list_set_elem(map, i); +	for (; cb->args[IPSET_CB_ARG0] < map->size; +	     cb->args[IPSET_CB_ARG0]++) { +		i = cb->args[IPSET_CB_ARG0]; +		e = list_set_elem(set, map, i);  		if (e->id == IPSET_INVALID_ID)  			goto finish;  		if (SET_WITH_TIMEOUT(set) && -		    ip_set_timeout_expired(ext_timeout(e, map))) +		    ip_set_timeout_expired(ext_timeout(e, set)))  			continue;  		nested = ipset_nest_start(skb, 
IPSET_ATTR_DATA);  		if (!nested) { @@ -530,31 +514,25 @@ list_set_list(const struct ip_set *set,  				goto nla_put_failure;  		}  		if (nla_put_string(skb, IPSET_ATTR_NAME, -				   ip_set_name_byindex(e->id))) -			goto nla_put_failure; -		if (SET_WITH_TIMEOUT(set) && -		    nla_put_net32(skb, IPSET_ATTR_TIMEOUT, -				  htonl(ip_set_timeout_get( -						ext_timeout(e, map))))) +				   ip_set_name_byindex(map->net, e->id)))  			goto nla_put_failure; -		if (SET_WITH_COUNTER(set) && -		    ip_set_put_counter(skb, ext_counter(e, map))) +		if (ip_set_put_extensions(skb, set, e, true))  			goto nla_put_failure;  		ipset_nest_end(skb, nested);  	}  finish:  	ipset_nest_end(skb, atd);  	/* Set listing finished */ -	cb->args[2] = 0; +	cb->args[IPSET_CB_ARG0] = 0;  	return 0;  nla_put_failure:  	nla_nest_cancel(skb, nested); -	ipset_nest_end(skb, atd);  	if (unlikely(i == first)) { -		cb->args[2] = 0; +		cb->args[IPSET_CB_ARG0] = 0;  		return -EMSGSIZE;  	} +	ipset_nest_end(skb, atd);  	return 0;  } @@ -565,7 +543,7 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)  	const struct list_set *y = b->data;  	return x->size == y->size && -	       x->timeout == y->timeout && +	       a->timeout == b->timeout &&  	       a->extensions == b->extensions;  } @@ -594,7 +572,7 @@ list_set_gc(unsigned long ul_set)  	set_cleanup_entries(set);  	write_unlock_bh(&set->lock); -	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;  	add_timer(&map->gc);  } @@ -606,43 +584,40 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))  	init_timer(&map->gc);  	map->gc.data = (unsigned long) set;  	map->gc.function = gc; -	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;  	add_timer(&map->gc);  }  /* Create list:set type of sets */ -static struct list_set * -init_list_set(struct ip_set *set, u32 size, size_t dsize, -	      unsigned long timeout) +static bool +init_list_set(struct net *net, struct ip_set *set, u32 size)  {  	struct list_set *map;  	struct set_elem *e;  	u32 i; -	map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL); +	map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL);  	if (!map) -		return NULL; +		return false;  	map->size = size; -	map->dsize = dsize; -	map->timeout = timeout; +	map->net = net;  	set->data = map;  	for (i = 0; i < size; i++) { -		e = list_set_elem(map, i); +		e = list_set_elem(set, map, i);  		e->id = IPSET_INVALID_ID;  	} -	return map; +	return true;  }  static int -list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], +		u32 flags)  { -	struct list_set *map; -	u32 size = IP_SET_LIST_DEFAULT_SIZE, cadt_flags = 0; -	unsigned long timeout = 0; +	u32 size = IP_SET_LIST_DEFAULT_SIZE;  	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||  		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || @@ -654,45 +629,13 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)  	if (size < IP_SET_LIST_MIN_SIZE)  		size = IP_SET_LIST_MIN_SIZE; -	if (tb[IPSET_ATTR_CADT_FLAGS]) -		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); -	if (tb[IPSET_ATTR_TIMEOUT]) -		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);  	set->variant = &set_variant; -	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { -		set->extensions |= IPSET_EXT_COUNTER; -		if (tb[IPSET_ATTR_TIMEOUT]) { -			map = 
init_list_set(set, size, -					sizeof(struct setct_elem), timeout); -			if (!map) -				return -ENOMEM; -			set->extensions |= IPSET_EXT_TIMEOUT; -			map->offset[IPSET_OFFSET_TIMEOUT] = -				offsetof(struct setct_elem, timeout); -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct setct_elem, counter); -			list_set_gc_init(set, list_set_gc); -		} else { -			map = init_list_set(set, size, -					    sizeof(struct setc_elem), 0); -			if (!map) -				return -ENOMEM; -			map->offset[IPSET_OFFSET_COUNTER] = -				offsetof(struct setc_elem, counter); -		} -	} else if (tb[IPSET_ATTR_TIMEOUT]) { -		map = init_list_set(set, size, -				    sizeof(struct sett_elem), timeout); -		if (!map) -			return -ENOMEM; -		set->extensions |= IPSET_EXT_TIMEOUT; -		map->offset[IPSET_OFFSET_TIMEOUT] = -			offsetof(struct sett_elem, timeout); +	set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); +	if (!init_list_set(net, set, size)) +		return -ENOMEM; +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);  		list_set_gc_init(set, list_set_gc); -	} else { -		map = init_list_set(set, size, sizeof(struct set_elem), 0); -		if (!map) -			return -ENOMEM;  	}  	return 0;  } @@ -703,8 +646,8 @@ static struct ip_set_type list_set_type __read_mostly = {  	.features	= IPSET_TYPE_NAME | IPSET_DUMP_LAST,  	.dimension	= IPSET_DIM_ONE,  	.family		= NFPROTO_UNSPEC, -	.revision_min	= REVISION_MIN, -	.revision_max	= REVISION_MAX, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX,  	.create		= list_set_create,  	.create_policy	= {  		[IPSET_ATTR_SIZE]	= { .type = NLA_U32 }, @@ -721,6 +664,7 @@ static struct ip_set_type list_set_type __read_mostly = {  		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 },  		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },  		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },  	},  	.me		= THIS_MODULE,  }; diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 4f29fa97044..04d15fdc99e 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -7,8 +7,8 @@  #define E(a, b, c, d) \  	{.ip6 = { \ -		__constant_htonl(a), __constant_htonl(b), \ -		__constant_htonl(c), __constant_htonl(d), \ +		htonl(a), htonl(b), \ +		htonl(c), htonl(d), \  	} }  /* diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 4c8e5c0aa1a..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -797,7 +797,6 @@ static void ip_vs_conn_expire(unsigned long data)  			ip_vs_control_del(cp);  		if (cp->flags & IP_VS_CONN_F_NFCT) { -			ip_vs_conn_drop_conntrack(cp);  			/* Do not access conntracks during subsys cleanup  			 * because nf_conntrack_find_get can not be used after  			 * conntrack cleanup for the net. @@ -871,11 +870,11 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,  	cp->protocol	   = p->protocol;  	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);  	cp->cport	   = p->cport; -	ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); -	cp->vport	   = p->vport; -	/* proto should only be IPPROTO_IP if d_addr is a fwmark */ +	/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */  	ip_vs_addr_set(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, -		       &cp->daddr, daddr); +		       &cp->vaddr, p->vaddr); +	cp->vport	   = p->vport; +	ip_vs_addr_set(p->af, &cp->daddr, daddr);  	cp->dport          = dport;  	cp->flags	   = flags;  	cp->fwmark         = fwmark; @@ -1209,7 +1208,7 @@ void ip_vs_random_dropentry(struct net *net)  	 * Randomly scan 1/32 of the whole table every second  	 */  	for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { -		unsigned int hash = net_random() & ip_vs_conn_tab_mask; +		unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;  		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {  			if (cp->flags & IP_VS_CONN_F_TEMPLATE) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 74fd00c2721..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -97,7 +97,7 @@ const char *ip_vs_proto_name(unsigned int proto)  		return "ICMPv6";  #endif  	default: -		sprintf(buf, "IP_%d", proto); +		sprintf(buf, "IP_%u", proto);  		return buf;  	}  } @@ -1139,12 +1139,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)  	ip_vs_fill_iph_skb(af, skb, &iph);  #ifdef CONFIG_IP_VS_IPV6  	if (af == AF_INET6) { -		if (!iph.fragoffs && skb_nfct_reasm(skb)) { -			struct sk_buff *reasm = skb_nfct_reasm(skb); -			/* Save fw mark for coming frags */ -			reasm->ipvs_property = 1; -			reasm->mark = skb->mark; -		}  		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {  			int related;  			int verdict = ip_vs_out_icmp_v6(skb, &related, @@ -1239,11 +1233,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)   *	Check if packet is reply for established ip_vs_conn.   */  static unsigned int -ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,  	     const struct net_device *in, const struct net_device *out,  	     int (*okfn)(struct sk_buff *))  { -	return ip_vs_out(hooknum, skb, AF_INET); +	return ip_vs_out(ops->hooknum, skb, AF_INET);  }  /* @@ -1251,11 +1245,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,   *	Check if packet is reply for established ip_vs_conn.   */  static unsigned int -ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,  		   const struct net_device *in, const struct net_device *out,  		   int (*okfn)(struct sk_buff *))  { -	return ip_vs_out(hooknum, skb, AF_INET); +	return ip_vs_out(ops->hooknum, skb, AF_INET);  }  #ifdef CONFIG_IP_VS_IPV6 @@ -1266,11 +1260,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,   *	Check if packet is reply for established ip_vs_conn.   */  static unsigned int -ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,  	     const struct net_device *in, const struct net_device *out,  	     int (*okfn)(struct sk_buff *))  { -	return ip_vs_out(hooknum, skb, AF_INET6); +	return ip_vs_out(ops->hooknum, skb, AF_INET6);  }  /* @@ -1278,11 +1272,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,   *	Check if packet is reply for established ip_vs_conn.   
*/  static unsigned int -ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		   const struct net_device *in, const struct net_device *out,  		   int (*okfn)(struct sk_buff *))  { -	return ip_vs_out(hooknum, skb, AF_INET6); +	return ip_vs_out(ops->hooknum, skb, AF_INET6);  }  #endif @@ -1398,15 +1392,19 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  	if (ipip) {  		__be32 info = ic->un.gateway; +		__u8 type = ic->type; +		__u8 code = ic->code;  		/* Update the MTU */  		if (ic->type == ICMP_DEST_UNREACH &&  		    ic->code == ICMP_FRAG_NEEDED) {  			struct ip_vs_dest *dest = cp->dest;  			u32 mtu = ntohs(ic->un.frag.mtu); +			__be16 frag_off = cih->frag_off;  			/* Strip outer IP and ICMP, go to IPIP header */ -			__skb_pull(skb, ihl + sizeof(_icmph)); +			if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) +				goto ignore_ipip;  			offset2 -= ihl + sizeof(_icmph);  			skb_reset_network_header(skb);  			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", @@ -1414,7 +1412,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  			ipv4_update_pmtu(skb, dev_net(skb->dev),  					 mtu, 0, 0, 0, 0);  			/* Client uses PMTUD? */ -			if (!(cih->frag_off & htons(IP_DF))) +			if (!(frag_off & htons(IP_DF)))  				goto ignore_ipip;  			/* Prefer the resulting PMTU */  			if (dest) { @@ -1433,12 +1431,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  		/* Strip outer IP, ICMP and IPIP, go to IP header of  		 * original request.  		 */ -		__skb_pull(skb, offset2); +		if (pskb_pull(skb, offset2) == NULL) +			goto ignore_ipip;  		skb_reset_network_header(skb);  		IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",  			&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, -			ic->type, ic->code, ntohl(info)); -		icmp_send(skb, ic->type, ic->code, info); +			type, code, ntohl(info)); +		icmp_send(skb, type, code, info);  		/* ICMP can be shorter but anyways, account it */  		ip_vs_out_stats(cp, skb); @@ -1614,12 +1613,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  #ifdef CONFIG_IP_VS_IPV6  	if (af == AF_INET6) { -		if (!iph.fragoffs && skb_nfct_reasm(skb)) { -			struct sk_buff *reasm = skb_nfct_reasm(skb); -			/* Save fw mark for coming frags. 
*/ -			reasm->ipvs_property = 1; -			reasm->mark = skb->mark; -		}  		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {  			int related;  			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, @@ -1671,9 +1664,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  		/* sorry, all this trouble for a no-hit :) */  		IP_VS_DBG_PKT(12, af, pp, skb, 0,  			      "ip_vs_in: packet continues traversal as normal"); -		if (iph.fragoffs && !skb_nfct_reasm(skb)) { +		if (iph.fragoffs) {  			/* Fragment that couldn't be mapped to a conn entry -			 * and don't have any pointer to a reasm skb  			 * is missing module nf_defrag_ipv6  			 */  			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); @@ -1733,12 +1725,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)   *	Schedule and forward packets from remote clients   */  static unsigned int -ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in,  		      const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  { -	return ip_vs_in(hooknum, skb, AF_INET); +	return ip_vs_in(ops->hooknum, skb, AF_INET);  }  /* @@ -1746,58 +1738,26 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,   *	Schedule and forward packets from local clients   */  static unsigned int -ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,  		     const struct net_device *in, const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  { -	return ip_vs_in(hooknum, skb, AF_INET); +	return ip_vs_in(ops->hooknum, skb, AF_INET);  }  #ifdef CONFIG_IP_VS_IPV6  /* - * AF_INET6 fragment handling - * Copy info from first fragment, to the rest of them. - */ -static unsigned int -ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb, -		     const struct net_device *in, -		     const struct net_device *out, -		     int (*okfn)(struct sk_buff *)) -{ -	struct sk_buff *reasm = skb_nfct_reasm(skb); -	struct net *net; - -	/* Skip if not a "replay" from nf_ct_frag6_output or first fragment. -	 * ipvs_property is set when checking first fragment -	 * in ip_vs_in() and ip_vs_out(). 
-	 */ -	if (reasm) -		IP_VS_DBG(2, "Fragment recv prop:%d\n", reasm->ipvs_property); -	if (!reasm || !reasm->ipvs_property) -		return NF_ACCEPT; - -	net = skb_net(skb); -	if (!net_ipvs(net)->enable) -		return NF_ACCEPT; - -	/* Copy stored fw mark, saved in ip_vs_{in,out} */ -	skb->mark = reasm->mark; - -	return NF_ACCEPT; -} - -/*   *	AF_INET6 handler in NF_INET_LOCAL_IN chain   *	Schedule and forward packets from remote clients   */  static unsigned int -ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in,  		      const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  { -	return ip_vs_in(hooknum, skb, AF_INET6); +	return ip_vs_in(ops->hooknum, skb, AF_INET6);  }  /* @@ -1805,11 +1765,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,   *	Schedule and forward packets from local clients   */  static unsigned int -ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		     const struct net_device *in, const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  { -	return ip_vs_in(hooknum, skb, AF_INET6); +	return ip_vs_in(ops->hooknum, skb, AF_INET6);  }  #endif @@ -1825,7 +1785,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,   *      and send them to ip_vs_in_icmp.   */  static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,  		   const struct net_device *in, const struct net_device *out,  		   int (*okfn)(struct sk_buff *))  { @@ -1842,12 +1802,12 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,  	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))  		return NF_ACCEPT; -	return ip_vs_in_icmp(skb, &r, hooknum); +	return ip_vs_in_icmp(skb, &r, ops->hooknum);  }  #ifdef CONFIG_IP_VS_IPV6  static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in, const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  { @@ -1866,7 +1826,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,  	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))  		return NF_ACCEPT; -	return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr); +	return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);  }  #endif @@ -1924,14 +1884,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  		.priority	= 100,  	},  #ifdef CONFIG_IP_VS_IPV6 -	/* After mangle & nat fetch 2:nd fragment and following */ -	{ -		.hook		= ip_vs_preroute_frag6, -		.owner		= THIS_MODULE, -		.pf		= NFPROTO_IPV6, -		.hooknum	= NF_INET_PRE_ROUTING, -		.priority	= NF_IP6_PRI_NAT_DST + 1, -	},  	/* After packet filtering, change source only for VS/NAT */  	{  		.hook		= ip_vs_reply6, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index a3df9bddc4f..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -704,7 +704,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)  	__ip_vs_dst_cache_reset(dest);  	__ip_vs_svc_put(svc, false);  	free_percpu(dest->stats.cpustats); -	kfree(dest); +	ip_vs_dest_put_and_free(dest);  }  /* @@ -842,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,  	       
struct ip_vs_dest **dest_p)  {  	struct ip_vs_dest *dest; -	unsigned int atype; +	unsigned int atype, i;  	EnterFunction(2); @@ -869,6 +869,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,  	if (!dest->stats.cpustats)  		goto err_alloc; +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *ip_vs_dest_stats; +		ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); +		u64_stats_init(&ip_vs_dest_stats->syncp); +	} +  	dest->af = svc->af;  	dest->protocol = svc->protocol;  	dest->vaddr = svc->addr; @@ -1134,7 +1140,7 @@ static int  ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,  		  struct ip_vs_service **svc_p)  { -	int ret = 0; +	int ret = 0, i;  	struct ip_vs_scheduler *sched = NULL;  	struct ip_vs_pe *pe = NULL;  	struct ip_vs_service *svc = NULL; @@ -1184,6 +1190,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,  		goto out_err;  	} +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *ip_vs_stats; +		ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); +		u64_stats_init(&ip_vs_stats->syncp); +	} + +  	/* I'm the first user of the service */  	atomic_set(&svc->refcnt, 0); @@ -2164,10 +2177,10 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)  		__u64 inbytes, outbytes;  		do { -			start = u64_stats_fetch_begin_bh(&u->syncp); +			start = u64_stats_fetch_begin_irq(&u->syncp);  			inbytes = u->ustats.inbytes;  			outbytes = u->ustats.outbytes; -		} while (u64_stats_fetch_retry_bh(&u->syncp, start)); +		} while (u64_stats_fetch_retry_irq(&u->syncp, start));  		seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",  			   i, u->ustats.conns, u->ustats.inpkts, @@ -3567,7 +3580,7 @@ out:  } -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = {  	{  		.cmd	= IPVS_CMD_NEW_SERVICE,  		.flags	= GENL_ADMIN_PERM, @@ -3666,7 +3679,7 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {  static int __init ip_vs_genl_register(void)  {  	return genl_register_family_with_ops(&ip_vs_genl_family, -		ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops)); +					     ip_vs_genl_ops);  }  static void ip_vs_genl_unregister(void) @@ -3765,6 +3778,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)  	cancel_delayed_work_sync(&ipvs->defense_work);  	cancel_work_sync(&ipvs->defense_work.work);  	unregister_net_sysctl_table(ipvs->sysctl_hdr); +	ip_vs_stop_estimator(net, &ipvs->tot_stats);  }  #else @@ -3780,7 +3794,7 @@ static struct notifier_block ip_vs_dst_notifier = {  int __net_init ip_vs_control_net_init(struct net *net)  { -	int idx; +	int i, idx;  	struct netns_ipvs *ipvs = net_ipvs(net);  	/* Initialize rs_table */ @@ -3799,6 +3813,12 @@ int __net_init ip_vs_control_net_init(struct net *net)  	if (!ipvs->tot_stats.cpustats)  		return -ENOMEM; +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *ipvs_tot_stats; +		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); +		u64_stats_init(&ipvs_tot_stats->syncp); +	} +  	spin_lock_init(&ipvs->tot_stats.lock);  	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); @@ -3820,12 +3840,7 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)  {  	struct netns_ipvs *ipvs = net_ipvs(net); -	/* Some dest can be in grace period even before cleanup, we have to -	 * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called. 
-	 */ -	rcu_barrier();  	ip_vs_trash_cleanup(net); -	ip_vs_stop_estimator(net, &ipvs->tot_stats);  	ip_vs_control_net_cleanup_sysctl(net);  	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);  	remove_proc_entry("ip_vs_stats", net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index eff13c94498..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -136,7 +136,7 @@ static void ip_vs_lblc_rcu_free(struct rcu_head *head)  						   struct ip_vs_lblc_entry,  						   rcu_head); -	ip_vs_dest_put(en->dest); +	ip_vs_dest_put_and_free(en->dest);  	kfree(en);  } @@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)  	spin_lock_bh(&svc->sched_lock);  	tbl->dead = 1; -	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { +	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {  		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {  			ip_vs_lblc_del(en);  			atomic_dec(&tbl->entries); @@ -265,7 +265,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)  	unsigned long now = jiffies;  	int i, j; -	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { +	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {  		j = (j + 1) & IP_VS_LBLC_TAB_MASK;  		spin_lock(&svc->sched_lock); @@ -321,7 +321,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)  	if (goal > tbl->max_size/2)  		goal = tbl->max_size/2; -	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { +	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {  		j = (j + 1) & IP_VS_LBLC_TAB_MASK;  		spin_lock(&svc->sched_lock); @@ -340,7 +340,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)  	tbl->rover = j;    out: -	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);  } @@ -363,7 +363,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)  	/*  	 *    Initialize the hash buckets  	 */ -	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { +	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {  		INIT_HLIST_HEAD(&tbl->bucket[i]);  	}  	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; @@ -536,8 +536,7 @@ out:  /*   *      IPVS LBLC Scheduler structure   */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = {  	.name =			"lblc",  	.refcnt =		ATOMIC_INIT(0),  	.module =		THIS_MODULE, diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0b8550089a2..3f21a2f47de 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -130,7 +130,7 @@ static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)  	struct ip_vs_dest_set_elem *e;  	e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); -	ip_vs_dest_put(e->dest); +	ip_vs_dest_put_and_free(e->dest);  	kfree(e);  } diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index c8beafd401a..5882bbfd198 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -19,8 +19,7 @@   * GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   
*   *   * Authors: @@ -63,6 +62,7 @@  #include <net/ip_vs.h>  #include <net/netfilter/nf_conntrack_core.h>  #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h>  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_zones.h> @@ -97,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)  	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)  		return; +	/* Applications may adjust TCP seqs */ +	if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && +	    !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) +		return; +  	/*  	 * The connection is not yet in the hashtable, so we update it.  	 * CIP->VIP will remain the same, so leave the tuple in diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 9ef22bdce9f..bed5f704252 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -65,7 +65,6 @@ static int get_callid(const char *dptr, unsigned int dataoff,  static int  ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)  { -	struct sk_buff *reasm = skb_nfct_reasm(skb);  	struct ip_vs_iphdr iph;  	unsigned int dataoff, datalen, matchoff, matchlen;  	const char *dptr; @@ -79,15 +78,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)  	/* todo: IPv6 fragments:  	 *       I think this only should be done for the first fragment. /HS  	 */ -	if (reasm) { -		skb = reasm; -		dataoff = iph.thoff_reasm + sizeof(struct udphdr); -	} else -		dataoff = iph.len + sizeof(struct udphdr); +	dataoff = iph.len + sizeof(struct udphdr);  	if (dataoff >= skb->len)  		return -EINVAL; -	/* todo: Check if this will mess-up the reasm skb !!! /HS */  	retc = skb_linearize(skb);  	if (retc < 0)  		return retc; diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 23e596e438b..2f7ea756404 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -20,13 +20,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,  	sctp_sctphdr_t *sh, _sctph;  	sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); -	if (sh == NULL) +	if (sh == NULL) { +		*verdict = NF_DROP;  		return 0; +	}  	sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),  				 sizeof(_schunkh), &_schunkh); -	if (sch == NULL) +	if (sch == NULL) { +		*verdict = NF_DROP;  		return 0; +	} +  	net = skb_net(skb);  	ipvs = net_ipvs(net);  	rcu_read_lock(); @@ -76,6 +81,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,  {  	sctp_sctphdr_t *sctph;  	unsigned int sctphoff = iph->len; +	bool payload_csum = false;  #ifdef CONFIG_IP_VS_IPV6  	if (cp->af == AF_INET6 && iph->fragoffs) @@ -87,19 +93,31 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,  		return 0;  	if (unlikely(cp->app != NULL)) { +		int ret; +  		/* Some checks before mangling */  		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))  			return 0;  		/* Call application helper if needed */ -		if (!ip_vs_app_pkt_out(cp, skb)) +		ret = ip_vs_app_pkt_out(cp, skb); +		if (ret == 0)  			return 0; +		/* ret=2: csum update is needed after payload mangling */ +		if (ret == 2) +			payload_csum = true;  	}  	sctph = (void *) skb_network_header(skb) + sctphoff; -	sctph->source = cp->vport; -	sctp_nat_csum(skb, sctph, sctphoff); +	/* Only update csum if we really have to */ +	if (sctph->source != cp->vport || payload_csum || +	    skb->ip_summed == 
CHECKSUM_PARTIAL) { +		sctph->source = cp->vport; +		sctp_nat_csum(skb, sctph, sctphoff); +	} else { +		skb->ip_summed = CHECKSUM_UNNECESSARY; +	}  	return 1;  } @@ -110,6 +128,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,  {  	sctp_sctphdr_t *sctph;  	unsigned int sctphoff = iph->len; +	bool payload_csum = false;  #ifdef CONFIG_IP_VS_IPV6  	if (cp->af == AF_INET6 && iph->fragoffs) @@ -121,19 +140,32 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,  		return 0;  	if (unlikely(cp->app != NULL)) { +		int ret; +  		/* Some checks before mangling */  		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))  			return 0;  		/* Call application helper if needed */ -		if (!ip_vs_app_pkt_in(cp, skb)) +		ret = ip_vs_app_pkt_in(cp, skb); +		if (ret == 0)  			return 0; +		/* ret=2: csum update is needed after payload mangling */ +		if (ret == 2) +			payload_csum = true;  	}  	sctph = (void *) skb_network_header(skb) + sctphoff; -	sctph->dest = cp->dport; -	sctp_nat_csum(skb, sctph, sctphoff); +	/* Only update csum if we really have to */ +	if (sctph->dest != cp->dport || payload_csum || +	    (skb->ip_summed == CHECKSUM_PARTIAL && +	     !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { +		sctph->dest = cp->dport; +		sctp_nat_csum(skb, sctph, sctphoff); +	} else if (skb->ip_summed != CHECKSUM_PARTIAL) { +		skb->ip_summed = CHECKSUM_UNNECESSARY; +	}  	return 1;  } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 3588faebe52..cc65b2f42cd 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -115,27 +115,46 @@ ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,  } -/* As ip_vs_sh_get, but with fallback if selected server is unavailable */ +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. 
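
/* [Editor's note] A compact illustration of the fallback walk described in
 * the comment above: probe the bucket of the original hash first, then step
 * deterministically through the remaining buckets with wraparound. This is a
 * simplified model, not the patch itself -- the real code recomputes
 * ip_vs_sh_hashkey() with the rotated offset instead of indexing
 * (ihash + offset) directly -- and "server", "lookup" and "available" are
 * placeholder names.
 */
#include <stdbool.h>
#include <stddef.h>

#define TAB_SIZE 256				/* stands in for IP_VS_SH_TAB_SIZE */

struct server;
struct server *lookup(unsigned int slot);	/* bucket lookup, placeholder */
bool available(const struct server *s);		/* models !is_unavailable() */

static struct server *pick_with_fallback(unsigned int ihash)
{
	unsigned int offset;

	for (offset = 0; offset < TAB_SIZE; offset++) {
		/* start at the original hash so the walk is deterministic */
		unsigned int slot = (ihash + offset) % TAB_SIZE;
		struct server *s = lookup(slot);

		if (!s)
			return NULL;	/* empty bucket ends the search, as above */
		if (available(s))
			return s;	/* first reachable server wins */
	}
	return NULL;			/* table exhausted, all unavailable */
}
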
+ */  static inline struct ip_vs_dest *  ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,  		      const union nf_inet_addr *addr, __be16 port)  { -	unsigned int offset; -	unsigned int hash; +	unsigned int offset, roffset; +	unsigned int hash, ihash;  	struct ip_vs_dest *dest; +	/* first try the dest it's supposed to go to */ +	ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); +	dest = rcu_dereference(s->buckets[ihash].dest); +	if (!dest) +		return NULL; +	if (!is_unavailable(dest)) +		return dest; + +	IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", +		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + +	/* if the original dest is unavailable, loop around the table +	 * starting from ihash to find a new dest +	 */  	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { -		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset); +		roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; +		hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);  		dest = rcu_dereference(s->buckets[hash].dest);  		if (!dest)  			break; -		if (is_unavailable(dest)) -			IP_VS_DBG_BUF(6, "SH: selected unavailable server " -				      "%s:%d (offset %d)", -				      IP_VS_DBG_ADDR(svc->af, &dest->addr), -				      ntohs(dest->port), offset); -		else +		if (!is_unavailable(dest))  			return dest; +		IP_VS_DBG_BUF(6, "SH: selected unavailable " +			      "server %s:%d (offset %d), reselecting", +			      IP_VS_DBG_ADDR(svc->af, &dest->addr), +			      ntohs(dest->port), roffset);  	}  	return NULL; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f4484719f3e..db801263ee9 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1637,12 +1637,12 @@ static int sync_thread_master(void *data)  			continue;  		}  		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { -			int ret = 0; - +			/* (Ab)use interruptible sleep to avoid increasing +			 * the load avg. +			 */  			__wait_event_interruptible(*sk_sleep(sk),  						   sock_writeable(sk) || -						   kthread_should_stop(), -						   ret); +						   kthread_should_stop());  			if (unlikely(kthread_should_stop()))  				goto done;  		} diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index c47444e4cf8..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -562,7 +562,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	ip_send_check(iph);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);  	rcu_read_unlock(); @@ -590,7 +590,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  		goto tx_error;  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);  	rcu_read_unlock(); @@ -684,7 +684,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	   MTU problem. */  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);  	rcu_read_unlock(); @@ -774,7 +774,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	   MTU problem. 
*/  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);  	rcu_read_unlock(); @@ -883,10 +883,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	iph->daddr		=	cp->daddr.ip;  	iph->saddr		=	saddr;  	iph->ttl		=	old_iph->ttl; -	ip_select_ident(skb, &rt->dst, NULL); +	ip_select_ident(skb, NULL);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	ret = ip_vs_tunnel_xmit_prepare(skb, cp);  	if (ret == NF_ACCEPT) @@ -974,7 +974,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	iph->hop_limit		=	old_iph->hop_limit;  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	ret = ip_vs_tunnel_xmit_prepare(skb, cp);  	if (ret == NF_ACCEPT) @@ -1023,7 +1023,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	ip_send_check(ip_hdr(skb));  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);  	rcu_read_unlock(); @@ -1060,7 +1060,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	}  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);  	rcu_read_unlock(); @@ -1157,7 +1157,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	ip_vs_nat_icmp(skb, pp, cp, 0);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);  	rcu_read_unlock(); @@ -1249,7 +1249,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	ip_vs_nat_icmp_v6(skb, pp, cp, 0);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1;  	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);  	rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 2d3030ab5b6..a4b5e2a435a 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -39,21 +39,23 @@ static struct ctl_table acct_sysctl_table[] = {  unsigned int  seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)  { -	struct nf_conn_counter *acct; +	struct nf_conn_acct *acct; +	struct nf_conn_counter *counter;  	acct = nf_conn_acct_find(ct);  	if (!acct)  		return 0; +	counter = acct->counter;  	return seq_printf(s, "packets=%llu bytes=%llu ", -			  (unsigned long long)atomic64_read(&acct[dir].packets), -			  (unsigned long long)atomic64_read(&acct[dir].bytes)); +			  (unsigned long long)atomic64_read(&counter[dir].packets), +			  (unsigned long long)atomic64_read(&counter[dir].bytes));  };  EXPORT_SYMBOL_GPL(seq_print_acct);  static struct nf_ct_ext_type acct_extend __read_mostly = { -	.len	= sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]), -	.align	= __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]), +	.len	= sizeof(struct nf_conn_acct), +	.align	= __alignof__(struct nf_conn_acct),  	.id	= NF_CT_EXT_ACCT,  }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5d892febd64..1f4f954c4b4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -60,14 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,  				      const struct nlattr *attr) __read_mostly;  EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); -int 
(*nf_nat_seq_adjust_hook)(struct sk_buff *skb, -			      struct nf_conn *ct, -			      enum ip_conntrack_info ctinfo, -			      unsigned int protoff); -EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); +__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; +EXPORT_SYMBOL_GPL(nf_conntrack_locks); -DEFINE_SPINLOCK(nf_conntrack_lock); -EXPORT_SYMBOL_GPL(nf_conntrack_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); + +static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) +{ +	h1 %= CONNTRACK_LOCKS; +	h2 %= CONNTRACK_LOCKS; +	spin_unlock(&nf_conntrack_locks[h1]); +	if (h1 != h2) +		spin_unlock(&nf_conntrack_locks[h2]); +} + +/* return true if we need to recompute hashes (in case hash table was resized) */ +static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, +				     unsigned int h2, unsigned int sequence) +{ +	h1 %= CONNTRACK_LOCKS; +	h2 %= CONNTRACK_LOCKS; +	if (h1 <= h2) { +		spin_lock(&nf_conntrack_locks[h1]); +		if (h1 != h2) +			spin_lock_nested(&nf_conntrack_locks[h2], +					 SINGLE_DEPTH_NESTING); +	} else { +		spin_lock(&nf_conntrack_locks[h2]); +		spin_lock_nested(&nf_conntrack_locks[h1], +				 SINGLE_DEPTH_NESTING); +	} +	if (read_seqcount_retry(&net->ct.generation, sequence)) { +		nf_conntrack_double_unlock(h1, h2); +		return true; +	} +	return false; +} + +static void nf_conntrack_all_lock(void) +{ +	int i; + +	for (i = 0; i < CONNTRACK_LOCKS; i++) +		spin_lock_nested(&nf_conntrack_locks[i], i); +} + +static void nf_conntrack_all_unlock(void) +{ +	int i; + +	for (i = 0; i < CONNTRACK_LOCKS; i++) +		spin_unlock(&nf_conntrack_locks[i]); +}  unsigned int nf_conntrack_htable_size __read_mostly;  EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -198,6 +243,50 @@ clean_from_lists(struct nf_conn *ct)  	nf_ct_remove_expectations(ct);  } +/* must be called with local_bh_disable */ +static void nf_ct_add_to_dying_list(struct nf_conn *ct) +{ +	struct ct_pcpu *pcpu; + +	/* add this conntrack to the (per cpu) dying list */ +	ct->cpu = smp_processor_id(); +	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + +	spin_lock(&pcpu->lock); +	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, +			     &pcpu->dying); +	spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) +{ +	struct ct_pcpu *pcpu; + +	/* add this conntrack to the (per cpu) unconfirmed list */ +	ct->cpu = smp_processor_id(); +	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + +	spin_lock(&pcpu->lock); +	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, +			     &pcpu->unconfirmed); +	spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) +{ +	struct ct_pcpu *pcpu; + +	/* We overload first tuple to link into unconfirmed or dying list.*/ +	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + +	spin_lock(&pcpu->lock); +	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); +	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); +	spin_unlock(&pcpu->lock); +} +  static void  destroy_conntrack(struct nf_conntrack *nfct)  { @@ -209,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct)  	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);  	NF_CT_ASSERT(!timer_pending(&ct->timeout)); -	/* To make sure we don't get any weird locking issues here: -	 * 
destroy_conntrack() MUST NOT be called with a write lock -	 * to nf_conntrack_lock!!! -HW */  	rcu_read_lock();  	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));  	if (l4proto && l4proto->destroy) @@ -219,19 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct)  	rcu_read_unlock(); -	spin_lock_bh(&nf_conntrack_lock); +	local_bh_disable();  	/* Expectations will have been removed in clean_from_lists,  	 * except TFTP can create an expectation on the first packet,  	 * before connection is in the list, so we need to clean here, -	 * too. */ +	 * too. +	 */  	nf_ct_remove_expectations(ct); -	/* We overload first tuple to link into unconfirmed or dying list.*/ -	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); -	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); +	nf_ct_del_from_dying_or_unconfirmed_list(ct);  	NF_CT_STAT_INC(net, delete); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	if (ct->master)  		nf_ct_put(ct->master); @@ -243,17 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)  static void nf_ct_delete_from_lists(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); +	unsigned int hash, reply_hash; +	u16 zone = nf_ct_zone(ct); +	unsigned int sequence;  	nf_ct_helper_destroy(ct); -	spin_lock_bh(&nf_conntrack_lock); -	/* Inside lock so preempt is disabled on module removal path. -	 * Otherwise we can get spurious warnings. */ -	NF_CT_STAT_INC(net, delete_list); + +	local_bh_disable(); +	do { +		sequence = read_seqcount_begin(&net->ct.generation); +		hash = hash_conntrack(net, zone, +				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +		reply_hash = hash_conntrack(net, zone, +					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); +  	clean_from_lists(ct); -	/* add this conntrack to the dying list */ -	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, -			     &net->ct.dying); -	spin_unlock_bh(&nf_conntrack_lock); +	nf_conntrack_double_unlock(hash, reply_hash); + +	nf_ct_add_to_dying_list(ct); + +	NF_CT_STAT_INC(net, delete_list); +	local_bh_enable();  }  static void death_by_event(unsigned long ul_conntrack) @@ -318,12 +414,25 @@ static void death_by_timeout(unsigned long ul_conntrack)  	nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);  } +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, +			const struct nf_conntrack_tuple *tuple, +			u16 zone) +{ +	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + +	/* A conntrack can be recreated with the equal tuple, +	 * so we need to check that the conntrack is confirmed +	 */ +	return nf_ct_tuple_equal(tuple, &h->tuple) && +		nf_ct_zone(ct) == zone && +		nf_ct_is_confirmed(ct); +} +  /*   * Warning :   * - Caller must take a reference on returned object   *   and recheck nf_ct_tuple_equal(tuple, &h->tuple) - * OR - * - Caller must lock nf_conntrack_lock before calling this function   */  static struct nf_conntrack_tuple_hash *  ____nf_conntrack_find(struct net *net, u16 zone, @@ -339,8 +448,7 @@ ____nf_conntrack_find(struct net *net, u16 zone,  	local_bh_disable();  begin:  	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { -		if (nf_ct_tuple_equal(tuple, &h->tuple) && -		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { +		if (nf_ct_key_equal(h, tuple, zone)) {  			NF_CT_STAT_INC(net, found);  			local_bh_enable();  			return h; @@ -361,15 +469,6 @@ begin:  	return NULL;  } -struct nf_conntrack_tuple_hash * 
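
/* [Editor's note] nf_ct_key_equal() above gains an nf_ct_is_confirmed() check
 * because conntrack objects live in a SLAB_DESTROY_BY_RCU cache: an entry
 * found under rcu_read_lock() may already have been freed and reused for a
 * new connection with the same tuple. The safe lookup idiom is "find, take a
 * reference only if the refcount is non-zero, then re-validate the key".
 * The sketch below is illustrative only; "obj", "find_raw" and "key_matches"
 * are placeholders, not kernel API.
 */
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rcupdate.h>

struct obj {
	atomic_t use;
	unsigned long key;
};

struct obj *find_raw(unsigned long key);		/* RCU hash walk, placeholder */
bool key_matches(const struct obj *o, unsigned long key);

static struct obj *find_get(unsigned long key)
{
	struct obj *o;

	rcu_read_lock();
again:
	o = find_raw(key);
	if (o) {
		/* refuse objects already on their way to being freed */
		if (!atomic_inc_not_zero(&o->use)) {
			o = NULL;
		} else if (!key_matches(o, key)) {
			/* slab reuse raced with us: drop the ref and retry */
			atomic_dec(&o->use);
			goto again;
		}
	}
	rcu_read_unlock();
	return o;
}
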
-__nf_conntrack_find(struct net *net, u16 zone, -		    const struct nf_conntrack_tuple *tuple) -{ -	return ____nf_conntrack_find(net, zone, tuple, -				     hash_conntrack_raw(tuple, zone)); -} -EXPORT_SYMBOL_GPL(__nf_conntrack_find); -  /* Find a connection corresponding to a tuple. */  static struct nf_conntrack_tuple_hash *  __nf_conntrack_find_get(struct net *net, u16 zone, @@ -387,8 +486,7 @@ begin:  			     !atomic_inc_not_zero(&ct->ct_general.use)))  			h = NULL;  		else { -			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || -				     nf_ct_zone(ct) != zone)) { +			if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {  				nf_ct_put(ct);  				goto begin;  			} @@ -410,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get);  static void __nf_conntrack_hash_insert(struct nf_conn *ct,  				       unsigned int hash, -				       unsigned int repl_hash) +				       unsigned int reply_hash)  {  	struct net *net = nf_ct_net(ct);  	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,  			   &net->ct.hash[hash]);  	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, -			   &net->ct.hash[repl_hash]); +			   &net->ct.hash[reply_hash]);  }  int  nf_conntrack_hash_check_insert(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); -	unsigned int hash, repl_hash; +	unsigned int hash, reply_hash;  	struct nf_conntrack_tuple_hash *h;  	struct hlist_nulls_node *n;  	u16 zone; +	unsigned int sequence;  	zone = nf_ct_zone(ct); -	hash = hash_conntrack(net, zone, -			      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -	repl_hash = hash_conntrack(net, zone, -				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); -	spin_lock_bh(&nf_conntrack_lock); +	local_bh_disable(); +	do { +		sequence = read_seqcount_begin(&net->ct.generation); +		hash = hash_conntrack(net, zone, +				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +		reply_hash = hash_conntrack(net, zone, +					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));  	/* See if there's one in the list already, including reverse */  	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) @@ -443,32 +545,57 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)  				      &h->tuple) &&  		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))  			goto out; -	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) +	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)  		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,  				      &h->tuple) &&  		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))  			goto out;  	add_timer(&ct->timeout); -	nf_conntrack_get(&ct->ct_general); -	__nf_conntrack_hash_insert(ct, hash, repl_hash); +	smp_wmb(); +	/* The caller holds a reference to this object */ +	atomic_set(&ct->ct_general.use, 2); +	__nf_conntrack_hash_insert(ct, hash, reply_hash); +	nf_conntrack_double_unlock(hash, reply_hash);  	NF_CT_STAT_INC(net, insert); -	spin_unlock_bh(&nf_conntrack_lock); - +	local_bh_enable();  	return 0;  out: +	nf_conntrack_double_unlock(hash, reply_hash);  	NF_CT_STAT_INC(net, insert_failed); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	return -EEXIST;  }  EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ +	struct ct_pcpu *pcpu; + +	__set_bit(IPS_TEMPLATE_BIT, &tmpl->status); +	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status); +	
nf_conntrack_get(&tmpl->ct_general); + +	/* add this conntrack to the (per cpu) tmpl list */ +	local_bh_disable(); +	tmpl->cpu = smp_processor_id(); +	pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); + +	spin_lock(&pcpu->lock); +	/* Overload tuple linked list to put us in template list. */ +	hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, +				 &pcpu->tmpl); +	spin_unlock_bh(&pcpu->lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); +  /* Confirm a connection given skb; places it in hash table */  int  __nf_conntrack_confirm(struct sk_buff *skb)  { -	unsigned int hash, repl_hash; +	unsigned int hash, reply_hash;  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct;  	struct nf_conn_help *help; @@ -477,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)  	enum ip_conntrack_info ctinfo;  	struct net *net;  	u16 zone; +	unsigned int sequence;  	ct = nf_ct_get(skb, &ctinfo);  	net = nf_ct_net(ct); @@ -489,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)  		return NF_ACCEPT;  	zone = nf_ct_zone(ct); -	/* reuse the hash saved before */ -	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; -	hash = hash_bucket(hash, net); -	repl_hash = hash_conntrack(net, zone, -				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +	local_bh_disable(); + +	do { +		sequence = read_seqcount_begin(&net->ct.generation); +		/* reuse the hash saved before */ +		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; +		hash = hash_bucket(hash, net); +		reply_hash = hash_conntrack(net, zone, +					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + +	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));  	/* We're not in hash table, and we refuse to set up related -	   connections for unconfirmed conns.  But packet copies and -	   REJECT will give spurious warnings here. */ +	 * connections for unconfirmed conns.  But packet copies and +	 * REJECT will give spurious warnings here. +	 */  	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */  	/* No external references means no one else could have -	   confirmed us. */ +	 * confirmed us. 
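
/* [Editor's note] Both nf_conntrack_hash_check_insert() and the confirm path
 * above now bracket their bucket updates with nf_conntrack_double_lock(),
 * which retries if the generation seqcount shows a concurrent resize and
 * always acquires the lower-numbered lock first, so two CPUs locking the same
 * pair in opposite order cannot ABBA-deadlock. A standalone sketch of that
 * ordering rule follows; LOCKS, locks[] and pair_lock() are illustrative
 * names, and pthreads stand in for kernel spinlocks.
 */
#include <pthread.h>

#define LOCKS 1024			/* stands in for CONNTRACK_LOCKS */
static pthread_mutex_t locks[LOCKS];

static void locks_init(void)
{
	for (int i = 0; i < LOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);
}

static void pair_lock(unsigned int h1, unsigned int h2)
{
	h1 %= LOCKS;
	h2 %= LOCKS;
	if (h1 == h2) {			/* same bucket: one lock is enough */
		pthread_mutex_lock(&locks[h1]);
		return;
	}
	if (h1 > h2) {			/* canonical order defeats ABBA deadlock */
		unsigned int tmp = h1;
		h1 = h2;
		h2 = tmp;
	}
	pthread_mutex_lock(&locks[h1]);
	pthread_mutex_lock(&locks[h2]);
}

static void pair_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= LOCKS;
	h2 %= LOCKS;
	pthread_mutex_unlock(&locks[h1]);
	if (h1 != h2)
		pthread_mutex_unlock(&locks[h2]);
}
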
+	 */  	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));  	pr_debug("Confirming conntrack %p\n", ct); - -	spin_lock_bh(&nf_conntrack_lock); -  	/* We have to check the DYING flag inside the lock to prevent  	   a race against nf_ct_get_next_corpse() possibly called from  	   user context, else we insert an already 'dead' hash, blocking  	   further use of that particular connection -JM */  	if (unlikely(nf_ct_is_dying(ct))) { -		spin_unlock_bh(&nf_conntrack_lock); +		nf_conntrack_double_unlock(hash, reply_hash); +		local_bh_enable();  		return NF_ACCEPT;  	} @@ -525,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb)  				      &h->tuple) &&  		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))  			goto out; -	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) +	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)  		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,  				      &h->tuple) &&  		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))  			goto out; -	/* Remove from unconfirmed list */ -	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); +	nf_ct_del_from_dying_or_unconfirmed_list(ct);  	/* Timer relative to confirmation time, not original  	   setting time, otherwise we'd get timer wrap in @@ -555,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)  	 * guarantee that no other CPU can find the conntrack before the above  	 * stores are visible.  	 */ -	__nf_conntrack_hash_insert(ct, hash, repl_hash); +	__nf_conntrack_hash_insert(ct, hash, reply_hash); +	nf_conntrack_double_unlock(hash, reply_hash);  	NF_CT_STAT_INC(net, insert); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	help = nfct_help(ct);  	if (help && help->helper) @@ -568,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)  	return NF_ACCEPT;  out: +	nf_conntrack_double_unlock(hash, reply_hash);  	NF_CT_STAT_INC(net, insert_failed); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	return NF_DROP;  }  EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -612,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);  /* There's a small race here where we may free a just-assured     connection.  Too bad: we're in trouble anyway. 
*/ -static noinline int early_drop(struct net *net, unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int _hash)  {  	/* Use oldest entry, which is roughly LRU */  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct = NULL, *tmp;  	struct hlist_nulls_node *n; -	unsigned int i, cnt = 0; +	unsigned int i = 0, cnt = 0;  	int dropped = 0; +	unsigned int hash, sequence; +	spinlock_t *lockp; -	rcu_read_lock(); -	for (i = 0; i < net->ct.htable_size; i++) { +	local_bh_disable(); +restart: +	sequence = read_seqcount_begin(&net->ct.generation); +	hash = hash_bucket(_hash, net); +	for (; i < net->ct.htable_size; i++) { +		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; +		spin_lock(lockp); +		if (read_seqcount_retry(&net->ct.generation, sequence)) { +			spin_unlock(lockp); +			goto restart; +		}  		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],  					 hnnode) {  			tmp = nf_ct_tuplehash_to_ctrack(h); -			if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) +			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && +			    !nf_ct_is_dying(tmp) && +			    atomic_inc_not_zero(&tmp->ct_general.use)) {  				ct = tmp; +				break; +			}  			cnt++;  		} -		if (ct != NULL) { -			if (likely(!nf_ct_is_dying(ct) && -				   atomic_inc_not_zero(&ct->ct_general.use))) -				break; -			else -				ct = NULL; -		} +		hash = (hash + 1) % net->ct.htable_size; +		spin_unlock(lockp); -		if (cnt >= NF_CT_EVICTION_RANGE) +		if (ct || cnt >= NF_CT_EVICTION_RANGE)  			break; -		hash = (hash + 1) % net->ct.htable_size;  	} -	rcu_read_unlock(); +	local_bh_enable();  	if (!ct)  		return dropped; @@ -693,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  	if (nf_conntrack_max &&  	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { -		if (!early_drop(net, hash_bucket(hash, net))) { +		if (!early_drop(net, hash)) {  			atomic_dec(&net->ct.count);  			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");  			return ERR_PTR(-ENOMEM); @@ -735,11 +879,10 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  		nf_ct_zone->id = zone;  	}  #endif -	/* -	 * changes to lookup keys must be done before setting refcnt to 1 +	/* Because we use RCU lookups, we set ct_general.use to zero before +	 * this is inserted in any list.  	 */ -	smp_wmb(); -	atomic_set(&ct->ct_general.use, 1); +	atomic_set(&ct->ct_general.use, 0);  	return ct;  #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -763,10 +906,16 @@ void nf_conntrack_free(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); +	/* A freed object has refcnt == 0, that's +	 * the golden rule for SLAB_DESTROY_BY_RCU +	 */ +	NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); +  	nf_ct_ext_destroy(ct); -	atomic_dec(&net->ct.count);  	nf_ct_ext_free(ct);  	kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +	smp_mb__before_atomic(); +	atomic_dec(&net->ct.count);  }  EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -785,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,  	struct nf_conn_help *help;  	struct nf_conntrack_tuple repl_tuple;  	struct nf_conntrack_ecache *ecache; -	struct nf_conntrack_expect *exp; +	struct nf_conntrack_expect *exp = NULL;  	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;  	struct nf_conn_timeout *timeout_ext;  	unsigned int *timeouts; @@ -829,39 +978,44 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,  				 ecache ? 
ecache->expmask : 0,  			     GFP_ATOMIC); -	spin_lock_bh(&nf_conntrack_lock); -	exp = nf_ct_find_expectation(net, zone, tuple); -	if (exp) { -		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", -			 ct, exp); -		/* Welcome, Mr. Bond.  We've been expecting you... */ -		__set_bit(IPS_EXPECTED_BIT, &ct->status); -		ct->master = exp->master; -		if (exp->helper) { -			help = nf_ct_helper_ext_add(ct, exp->helper, -						    GFP_ATOMIC); -			if (help) -				rcu_assign_pointer(help->helper, exp->helper); -		} +	local_bh_disable(); +	if (net->ct.expect_count) { +		spin_lock(&nf_conntrack_expect_lock); +		exp = nf_ct_find_expectation(net, zone, tuple); +		if (exp) { +			pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", +				 ct, exp); +			/* Welcome, Mr. Bond.  We've been expecting you... */ +			__set_bit(IPS_EXPECTED_BIT, &ct->status); +			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */ +			ct->master = exp->master; +			if (exp->helper) { +				help = nf_ct_helper_ext_add(ct, exp->helper, +							    GFP_ATOMIC); +				if (help) +					rcu_assign_pointer(help->helper, exp->helper); +			}  #ifdef CONFIG_NF_CONNTRACK_MARK -		ct->mark = exp->master->mark; +			ct->mark = exp->master->mark;  #endif  #ifdef CONFIG_NF_CONNTRACK_SECMARK -		ct->secmark = exp->master->secmark; +			ct->secmark = exp->master->secmark;  #endif -		nf_conntrack_get(&ct->master->ct_general); -		NF_CT_STAT_INC(net, expect_new); -	} else { +			NF_CT_STAT_INC(net, expect_new); +		} +		spin_unlock(&nf_conntrack_expect_lock); +	} +	if (!exp) {  		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);  		NF_CT_STAT_INC(net, new);  	} -	/* Overload tuple linked list to put us in unconfirmed list. */ -	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, -		       &net->ct.unconfirmed); +	/* Now it is inserted into the unconfirmed list, bump refcount */ +	nf_conntrack_get(&ct->ct_general); +	nf_ct_add_to_unconfirmed_list(ct); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	if (exp) {  		if (exp->expectfn) @@ -1109,12 +1263,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,  acct:  	if (do_acct) { -		struct nf_conn_counter *acct; +		struct nf_conn_acct *acct;  		acct = nf_conn_acct_find(ct);  		if (acct) { -			atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets); -			atomic64_add(skb->len, &acct[CTINFO2DIR(ctinfo)].bytes); +			struct nf_conn_counter *counter = acct->counter; + +			atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); +			atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);  		}  	}  } @@ -1126,13 +1282,15 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,  		       int do_acct)  {  	if (do_acct) { -		struct nf_conn_counter *acct; +		struct nf_conn_acct *acct;  		acct = nf_conn_acct_find(ct);  		if (acct) { -			atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets); +			struct nf_conn_counter *counter = acct->counter; + +			atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);  			atomic64_add(skb->len - skb_network_offset(skb), -				     &acct[CTINFO2DIR(ctinfo)].bytes); +				     &counter[CTINFO2DIR(ctinfo)].bytes);  		}  	} @@ -1227,27 +1385,42 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct;  	struct hlist_nulls_node *n; +	int cpu; +	spinlock_t *lockp; -	spin_lock_bh(&nf_conntrack_lock);  	for (; *bucket < net->ct.htable_size; (*bucket)++) { -		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { -			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) -				continue; +		lockp = 
&nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; +		local_bh_disable(); +		spin_lock(lockp); +		if (*bucket < net->ct.htable_size) { +			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { +				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) +					continue; +				ct = nf_ct_tuplehash_to_ctrack(h); +				if (iter(ct, data)) +					goto found; +			} +		} +		spin_unlock(lockp); +		local_bh_enable(); +	} + +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_bh(&pcpu->lock); +		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {  			ct = nf_ct_tuplehash_to_ctrack(h);  			if (iter(ct, data)) -				goto found; +				set_bit(IPS_DYING_BIT, &ct->status);  		} +		spin_unlock_bh(&pcpu->lock);  	} -	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { -		ct = nf_ct_tuplehash_to_ctrack(h); -		if (iter(ct, data)) -			set_bit(IPS_DYING_BIT, &ct->status); -	} -	spin_unlock_bh(&nf_conntrack_lock);  	return NULL;  found:  	atomic_inc(&ct->ct_general.use); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock(lockp); +	local_bh_enable();  	return ct;  } @@ -1296,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net)  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct;  	struct hlist_nulls_node *n; +	int cpu; -	spin_lock_bh(&nf_conntrack_lock); -	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { -		ct = nf_ct_tuplehash_to_ctrack(h); -		/* never fails to remove them, no listeners at this point */ -		nf_ct_kill(ct); +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_bh(&pcpu->lock); +		hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { +			ct = nf_ct_tuplehash_to_ctrack(h); +			/* never fails to remove them, no listeners at this point */ +			nf_ct_kill(ct); +		} +		spin_unlock_bh(&pcpu->lock);  	} -	spin_unlock_bh(&nf_conntrack_lock);  }  static int untrack_refs(void) @@ -1390,6 +1568,7 @@ i_see_dead_people:  		kmem_cache_destroy(net->ct.nf_conntrack_cachep);  		kfree(net->ct.slabname);  		free_percpu(net->ct.stat); +		free_percpu(net->ct.pcpu_lists);  	}  } @@ -1442,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)  	if (!hash)  		return -ENOMEM; +	local_bh_disable(); +	nf_conntrack_all_lock(); +	write_seqcount_begin(&init_net.ct.generation); +  	/* Lookups in the old hash might happen in parallel, which means we  	 * might get false negatives during connection lookup. New connections  	 * created because of a false negative won't make it into the hash -	 * though since that required taking the lock. +	 * though since that required taking the locks.  	 
*/ -	spin_lock_bh(&nf_conntrack_lock); +  	for (i = 0; i < init_net.ct.htable_size; i++) {  		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {  			h = hlist_nulls_entry(init_net.ct.hash[i].first, @@ -1464,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)  	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;  	init_net.ct.hash = hash; -	spin_unlock_bh(&nf_conntrack_lock); + +	write_seqcount_end(&init_net.ct.generation); +	nf_conntrack_all_unlock(); +	local_bh_enable();  	nf_ct_free_hashtable(old_hash, old_size);  	return 0; @@ -1486,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);  int nf_conntrack_init_start(void)  {  	int max_factor = 8; -	int ret, cpu; +	int i, ret, cpu; + +	for (i = 0; i < CONNTRACK_LOCKS; i++) +		spin_lock_init(&nf_conntrack_locks[i]);  	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB  	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1602,37 +1791,44 @@ void nf_conntrack_init_end(void)  int nf_conntrack_init_net(struct net *net)  { -	int ret; +	int ret = -ENOMEM; +	int cpu;  	atomic_set(&net->ct.count, 0); -	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); -	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); -	INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL); -	net->ct.stat = alloc_percpu(struct ip_conntrack_stat); -	if (!net->ct.stat) { -		ret = -ENOMEM; +	seqcount_init(&net->ct.generation); + +	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); +	if (!net->ct.pcpu_lists)  		goto err_stat; + +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_init(&pcpu->lock); +		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); +		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); +		INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);  	} +	net->ct.stat = alloc_percpu(struct ip_conntrack_stat); +	if (!net->ct.stat) +		goto err_pcpu_lists; +  	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); -	if (!net->ct.slabname) { -		ret = -ENOMEM; +	if (!net->ct.slabname)  		goto err_slabname; -	}  	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,  							sizeof(struct nf_conn), 0,  							SLAB_DESTROY_BY_RCU, NULL);  	if (!net->ct.nf_conntrack_cachep) {  		printk(KERN_ERR "Unable to create nf_conn slab cache\n"); -		ret = -ENOMEM;  		goto err_cache;  	}  	net->ct.htable_size = nf_conntrack_htable_size;  	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);  	if (!net->ct.hash) { -		ret = -ENOMEM;  		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");  		goto err_hash;  	} @@ -1674,6 +1870,8 @@ err_cache:  	kfree(net->ct.slabname);  err_slabname:  	free_percpu(net->ct.stat); +err_pcpu_lists: +	free_percpu(net->ct.pcpu_lists);  err_stat:  	return ret;  } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4fd1ca94fd4..f87e8f68ad4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -66,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)  {  	struct nf_conntrack_expect *exp = (void *)ul_expect; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	nf_ct_unlink_expect(exp); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	nf_ct_expect_put(exp);  } @@ -155,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone,  	if (!nf_ct_is_confirmed(exp->master))  		return NULL; +	/* Avoid 
race with other CPUs, that for exp->master ct, is +	 * about to invoke ->destroy(), or nf_ct_delete() via timeout +	 * or early_drop(). +	 * +	 * The atomic_inc_not_zero() check tells:  If that fails, we +	 * know that the ct is being destroyed.  If it succeeds, we +	 * can be sure the ct cannot disappear underneath. +	 */ +	if (unlikely(nf_ct_is_dying(exp->master) || +		     !atomic_inc_not_zero(&exp->master->ct_general.use))) +		return NULL; +  	if (exp->flags & NF_CT_EXPECT_PERMANENT) {  		atomic_inc(&exp->use);  		return exp; @@ -162,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone,  		nf_ct_unlink_expect(exp);  		return exp;  	} +	/* Undo exp->master refcnt increase, if del_timer() failed */ +	nf_ct_put(exp->master);  	return NULL;  } @@ -177,12 +191,14 @@ void nf_ct_remove_expectations(struct nf_conn *ct)  	if (!help)  		return; +	spin_lock_bh(&nf_conntrack_expect_lock);  	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {  		if (del_timer(&exp->timeout)) {  			nf_ct_unlink_expect(exp);  			nf_ct_expect_put(exp);  		}  	} +	spin_unlock_bh(&nf_conntrack_expect_lock);  }  EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); @@ -217,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,  /* Generally a bad idea to call this: could have matched already. */  void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)  { -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	if (del_timer(&exp->timeout)) {  		nf_ct_unlink_expect(exp);  		nf_ct_expect_put(exp);  	} -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  }  EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); @@ -335,7 +351,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)  	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,  		    (unsigned long)exp);  	helper = rcu_dereference_protected(master_help->helper, -					   lockdep_is_held(&nf_conntrack_lock)); +					   lockdep_is_held(&nf_conntrack_expect_lock));  	if (helper) {  		exp->timeout.expires = jiffies +  			helper->expect_policy[exp->class].timeout * HZ; @@ -395,7 +411,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)  	}  	/* Will be over limit? 
*/  	helper = rcu_dereference_protected(master_help->helper, -					   lockdep_is_held(&nf_conntrack_lock)); +					   lockdep_is_held(&nf_conntrack_expect_lock));  	if (helper) {  		p = &helper->expect_policy[expect->class];  		if (p->max_expected && @@ -417,12 +433,12 @@ out:  	return ret;  } -int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  				u32 portid, int report)  {  	int ret; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	ret = __nf_ct_expect_check(expect);  	if (ret <= 0)  		goto out; @@ -430,11 +446,11 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  	ret = nf_ct_expect_insert(expect);  	if (ret < 0)  		goto out; -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);  	return ret;  out: -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return ret;  }  EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index bdebd03bc8c..3a3a60b126e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -778,8 +778,8 @@ static int callforward_do_filter(const union nf_inet_addr *src,  				   flowi6_to_flowi(&fl1), false)) {  			if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,  					   flowi6_to_flowi(&fl2), false)) { -				if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, -					    sizeof(rt1->rt6i_gateway)) && +				if (ipv6_addr_equal(rt6_nexthop(rt1), +						    rt6_nexthop(rt2)) &&  				    rt1->dst.dev == rt2->dst.dev)  					ret = 1;  				dst_release(&rt2->dst); @@ -1476,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  		nf_ct_refresh(ct, skb, info->timeout * HZ);  		/* Set expect timeout */ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,  				  info->sig_port[!dir]);  		if (exp) { @@ -1486,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  			nf_ct_dump_tuple(&exp->tuple);  			set_expect_timeout(exp, info->timeout);  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  	}  	return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 974a2a4adef..5b3eae7d4c9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -250,16 +250,14 @@ out:  }  EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* appropiate ct lock protecting must be taken by caller */  static inline int unhelp(struct nf_conntrack_tuple_hash *i,  			 const struct nf_conntrack_helper *me)  {  	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);  	struct nf_conn_help *help = nfct_help(ct); -	if (help && rcu_dereference_protected( -			help->helper, -			lockdep_is_held(&nf_conntrack_lock) -			) == me) { +	if (help && rcu_dereference_raw(help->helper) == me) {  		nf_conntrack_event(IPCT_HELPER, ct);  		RCU_INIT_POINTER(help->helper, NULL);  	} @@ -284,17 +282,17 @@ static LIST_HEAD(nf_ct_helper_expectfn_list);  void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n)  { -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); -	spin_unlock_bh(&nf_conntrack_lock); +	
spin_unlock_bh(&nf_conntrack_expect_lock);  }  EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register);  void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)  { -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	list_del_rcu(&n->head); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  }  EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); @@ -396,15 +394,17 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,  	const struct hlist_node *next;  	const struct hlist_nulls_node *nn;  	unsigned int i; +	int cpu;  	/* Get rid of expectations */ +	spin_lock_bh(&nf_conntrack_expect_lock);  	for (i = 0; i < nf_ct_expect_hsize; i++) {  		hlist_for_each_entry_safe(exp, next,  					  &net->ct.expect_hash[i], hnode) {  			struct nf_conn_help *help = nfct_help(exp->master);  			if ((rcu_dereference_protected(  					help->helper, -					lockdep_is_held(&nf_conntrack_lock) +					lockdep_is_held(&nf_conntrack_expect_lock)  					) == me || exp->helper == me) &&  			    del_timer(&exp->timeout)) {  				nf_ct_unlink_expect(exp); @@ -412,14 +412,27 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,  			}  		}  	} +	spin_unlock_bh(&nf_conntrack_expect_lock);  	/* Get rid of expecteds, set helpers to NULL. */ -	hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) -		unhelp(h, me); -	for (i = 0; i < net->ct.htable_size; i++) { -		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_bh(&pcpu->lock); +		hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)  			unhelp(h, me); +		spin_unlock_bh(&pcpu->lock); +	} +	local_bh_disable(); +	for (i = 0; i < net->ct.htable_size; i++) { +		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); +		if (i < net->ct.htable_size) { +			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) +				unhelp(h, me); +		} +		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);  	} +	local_bh_enable();  }  void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) @@ -437,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)  	synchronize_rcu();  	rtnl_lock(); -	spin_lock_bh(&nf_conntrack_lock);  	for_each_net(net)  		__nf_conntrack_helper_unregister(me, net); -	spin_unlock_bh(&nf_conntrack_lock);  	rtnl_unlock();  }  EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eea936b70d1..300ed1eec72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -211,13 +211,23 @@ nla_put_failure:  }  static int -dump_counters(struct sk_buff *skb, u64 pkts, u64 bytes, -	      enum ip_conntrack_dir dir) +dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, +	      enum ip_conntrack_dir dir, int type)  { -	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; +	enum ctattr_type attr = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; +	struct nf_conn_counter *counter = acct->counter;  	struct nlattr *nest_count; +	u64 pkts, bytes; + +	if (type == IPCTNL_MSG_CT_GET_CTRZERO) { +		pkts = atomic64_xchg(&counter[dir].packets, 0); +		bytes = atomic64_xchg(&counter[dir].bytes, 0); +	} else { +		pkts = atomic64_read(&counter[dir].packets); +		bytes = atomic64_read(&counter[dir].bytes); +	} -	nest_count = nla_nest_start(skb, type | NLA_F_NESTED); +	nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);  	if (!nest_count)  		goto nla_put_failure; @@ -234,24 +244,19 @@ nla_put_failure:  }  static int -ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, -			enum ip_conntrack_dir dir, int type) +ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)  { -	struct nf_conn_counter *acct; -	u64 pkts, bytes; +	struct nf_conn_acct *acct = nf_conn_acct_find(ct); -	acct = nf_conn_acct_find(ct);  	if (!acct)  		return 0; -	if (type == IPCTNL_MSG_CT_GET_CTRZERO) { -		pkts = atomic64_xchg(&acct[dir].packets, 0); -		bytes = atomic64_xchg(&acct[dir].bytes, 0); -	} else { -		pkts = atomic64_read(&acct[dir].packets); -		bytes = atomic64_read(&acct[dir].bytes); -	} -	return dump_counters(skb, pkts, bytes, dir); +	if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) +		return -1; +	if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) +		return -1; + +	return 0;  }  static int @@ -488,8 +493,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,  	if (ctnetlink_dump_status(skb, ct) < 0 ||  	    ctnetlink_dump_timeout(skb, ct) < 0 || -	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, type) < 0 || -	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, type) < 0 || +	    ctnetlink_dump_acct(skb, ct, type) < 0 ||  	    ctnetlink_dump_timestamp(skb, ct) < 0 ||  	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||  	    ctnetlink_dump_helpinfo(skb, ct) < 0 || @@ -530,7 +534,7 @@ ctnetlink_proto_size(const struct nf_conn *ct)  }  static inline size_t -ctnetlink_counters_size(const struct nf_conn *ct) +ctnetlink_acct_size(const struct nf_conn *ct)  {  	if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))  		return 0; @@ -579,7 +583,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)  	       + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ -	       + ctnetlink_counters_size(ct) +	       + ctnetlink_acct_size(ct)  	       + ctnetlink_timestamp_size(ct)  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */  	       + nla_total_size(0) /* CTA_PROTOINFO */ @@ -593,6 +597,9 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)  #ifdef CONFIG_NF_CONNTRACK_MARK  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */  #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif  	       + ctnetlink_proto_size(ct)  	       + ctnetlink_label_size(ct)  	       ; @@ -673,10 +680,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  		goto nla_put_failure;  	if (events & (1 << IPCT_DESTROY)) { -		if (ctnetlink_dump_counters(skb, ct, -					    IP_CT_DIR_ORIGINAL, type) < 0 || -		    ctnetlink_dump_counters(skb, ct, -					    IP_CT_DIR_REPLY, type) < 0 || +		if (ctnetlink_dump_acct(skb, ct, type) < 0 ||  		    ctnetlink_dump_timestamp(skb, ct) < 0)  			goto nla_put_failure;  	} else { @@ -763,14 +767,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct 
netlink_callback *cb)  	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);  	u_int8_t l3proto = nfmsg->nfgen_family;  	int res; +	spinlock_t *lockp; +  #ifdef CONFIG_NF_CONNTRACK_MARK  	const struct ctnetlink_dump_filter *filter = cb->data;  #endif -	spin_lock_bh(&nf_conntrack_lock);  	last = (struct nf_conn *)cb->args[1]; + +	local_bh_disable();  	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {  restart: +		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; +		spin_lock(lockp); +		if (cb->args[0] >= net->ct.htable_size) { +			spin_unlock(lockp); +			goto out; +		}  		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],  					 hnnode) {  			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -802,16 +815,18 @@ restart:  			if (res < 0) {  				nf_conntrack_get(&ct->ct_general);  				cb->args[1] = (unsigned long)ct; +				spin_unlock(lockp);  				goto out;  			}  		} +		spin_unlock(lockp);  		if (cb->args[1]) {  			cb->args[1] = 0;  			goto restart;  		}  	}  out: -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	if (last)  		nf_ct_put(last); @@ -965,7 +980,6 @@ ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,  	return 0;  } -#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)  static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {  	[CTA_TUPLE_ORIG]	= { .type = NLA_NESTED },  	[CTA_TUPLE_REPLY]	= { .type = NLA_NESTED }, @@ -983,9 +997,9 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {  	[CTA_ZONE]		= { .type = NLA_U16 },  	[CTA_MARK_MASK]		= { .type = NLA_U32 },  	[CTA_LABELS]		= { .type = NLA_BINARY, -				    .len = __CTA_LABELS_MAX_LENGTH }, +				    .len = NF_CT_LABELS_MAX_SIZE },  	[CTA_LABELS_MASK]	= { .type = NLA_BINARY, -				    .len = __CTA_LABELS_MAX_LENGTH }, +				    .len = NF_CT_LABELS_MAX_SIZE },  };  static int @@ -1137,8 +1151,7 @@ static int ctnetlink_done_list(struct netlink_callback *cb)  }  static int -ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, -		    struct hlist_nulls_head *list) +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)  {  	struct nf_conn *ct, *last;  	struct nf_conntrack_tuple_hash *h; @@ -1146,41 +1159,57 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,  	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);  	u_int8_t l3proto = nfmsg->nfgen_family;  	int res; +	int cpu; +	struct hlist_nulls_head *list; +	struct net *net = sock_net(skb->sk);  	if (cb->args[2])  		return 0; -	spin_lock_bh(&nf_conntrack_lock);  	last = (struct nf_conn *)cb->args[1]; -restart: -	hlist_nulls_for_each_entry(h, n, list, hnnode) { -		ct = nf_ct_tuplehash_to_ctrack(h); -		if (l3proto && nf_ct_l3num(ct) != l3proto) + +	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { +		struct ct_pcpu *pcpu; + +		if (!cpu_possible(cpu))  			continue; -		if (cb->args[1]) { -			if (ct != last) + +		pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); +		spin_lock_bh(&pcpu->lock); +		list = dying ? 
&pcpu->dying : &pcpu->unconfirmed; +restart: +		hlist_nulls_for_each_entry(h, n, list, hnnode) { +			ct = nf_ct_tuplehash_to_ctrack(h); +			if (l3proto && nf_ct_l3num(ct) != l3proto)  				continue; -			cb->args[1] = 0; +			if (cb->args[1]) { +				if (ct != last) +					continue; +				cb->args[1] = 0; +			} +			rcu_read_lock(); +			res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, +						  cb->nlh->nlmsg_seq, +						  NFNL_MSG_TYPE(cb->nlh->nlmsg_type), +						  ct); +			rcu_read_unlock(); +			if (res < 0) { +				if (!atomic_inc_not_zero(&ct->ct_general.use)) +					continue; +				cb->args[0] = cpu; +				cb->args[1] = (unsigned long)ct; +				spin_unlock_bh(&pcpu->lock); +				goto out; +			}  		} -		rcu_read_lock(); -		res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, -					  cb->nlh->nlmsg_seq, -					  NFNL_MSG_TYPE(cb->nlh->nlmsg_type), -					  ct); -		rcu_read_unlock(); -		if (res < 0) { -			nf_conntrack_get(&ct->ct_general); -			cb->args[1] = (unsigned long)ct; -			goto out; +		if (cb->args[1]) { +			cb->args[1] = 0; +			goto restart;  		} +		spin_unlock_bh(&pcpu->lock);  	} -	if (cb->args[1]) { -		cb->args[1] = 0; -		goto restart; -	} else -		cb->args[2] = 1; +	cb->args[2] = 1;  out: -	spin_unlock_bh(&nf_conntrack_lock);  	if (last)  		nf_ct_put(last); @@ -1190,9 +1219,7 @@ out:  static int  ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)  { -	struct net *net = sock_net(skb->sk); - -	return ctnetlink_dump_list(skb, cb, &net->ct.dying); +	return ctnetlink_dump_list(skb, cb, true);  }  static int @@ -1214,9 +1241,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,  static int  ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)  { -	struct net *net = sock_net(skb->sk); - -	return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed); +	return ctnetlink_dump_list(skb, cb, false);  }  static int @@ -1309,27 +1334,25 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])  }  static int -ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])  {  #ifdef CONFIG_NF_NAT_NEEDED  	int ret; -	if (cda[CTA_NAT_DST]) { -		ret = ctnetlink_parse_nat_setup(ct, -						NF_NAT_MANIP_DST, -						cda[CTA_NAT_DST]); -		if (ret < 0) -			return ret; -	} -	if (cda[CTA_NAT_SRC]) { -		ret = ctnetlink_parse_nat_setup(ct, -						NF_NAT_MANIP_SRC, -						cda[CTA_NAT_SRC]); -		if (ret < 0) -			return ret; -	} -	return 0; +	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) +		return 0; + +	ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, +					cda[CTA_NAT_DST]); +	if (ret < 0) +		return ret; + +	ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, +					cda[CTA_NAT_SRC]); +	return ret;  #else +	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) +		return 0;  	return -EOPNOTSUPP;  #endif  } @@ -1365,14 +1388,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])  					    nf_ct_protonum(ct));  	if (helper == NULL) {  #ifdef CONFIG_MODULES -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		if (request_module("nfct-helper-%s", helpname) < 0) { -			spin_lock_bh(&nf_conntrack_lock); +			spin_lock_bh(&nf_conntrack_expect_lock);  			return -EOPNOTSUPP;  		} -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),  						    nf_ct_protonum(ct));  		if (helper) @@ -1658,11 +1681,9 @@ 
ctnetlink_create_conntrack(struct net *net, u16 zone,  			goto err2;  	} -	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { -		err = ctnetlink_change_nat(ct, cda); -		if (err < 0) -			goto err2; -	} +	err = ctnetlink_setup_nat(ct, cda); +	if (err < 0) +		goto err2;  	nf_ct_acct_ext_add(ct, GFP_ATOMIC);  	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); @@ -1810,9 +1831,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,  	err = -EEXIST;  	ct = nf_ct_tuplehash_to_ctrack(h);  	if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		err = ctnetlink_change_conntrack(ct, cda); -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		if (err == 0) {  			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |  						      (1 << IPCT_ASSURED) | @@ -2022,6 +2043,9 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct)  #ifdef CONFIG_NF_CONNTRACK_MARK  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */  #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif  	       + ctnetlink_proto_size(ct)  	       ;  } @@ -2117,8 +2141,16 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)  			return err;  	}  #if defined(CONFIG_NF_CONNTRACK_MARK) -	if (cda[CTA_MARK]) -		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); +	if (cda[CTA_MARK]) { +		u32 mask = 0, mark, newmark; +		if (cda[CTA_MARK_MASK]) +			mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + +		mark = ntohl(nla_get_be32(cda[CTA_MARK])); +		newmark = (ct->mark & mask) ^ mark; +		if (newmark != ct->mark) +			ct->mark = newmark; +	}  #endif  	return 0;  } @@ -2133,9 +2165,9 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)  	if (ret < 0)  		return ret; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return ret;  } @@ -2690,13 +2722,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  		}  		/* after list removal, usage count == 1 */ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		if (del_timer(&exp->timeout)) {  			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,  						   nlmsg_report(nlh));  			nf_ct_expect_put(exp);  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		/* have to put what we 'get' above.  		 
* after this line usage count == 0 */  		nf_ct_expect_put(exp); @@ -2705,7 +2737,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  		struct nf_conn_help *m_help;  		/* delete all expectations for this helper */ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		for (i = 0; i < nf_ct_expect_hsize; i++) {  			hlist_for_each_entry_safe(exp, next,  						  &net->ct.expect_hash[i], @@ -2720,10 +2752,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  				}  			}  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  	} else {  		/* This basically means we have to flush everything*/ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		for (i = 0; i < nf_ct_expect_hsize; i++) {  			hlist_for_each_entry_safe(exp, next,  						  &net->ct.expect_hash[i], @@ -2736,7 +2768,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  				}  			}  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  	}  	return 0; @@ -2962,11 +2994,11 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,  	if (err < 0)  		return err; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	exp = __nf_ct_expect_find(net, zone, &tuple);  	if (!exp) { -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		err = -ENOENT;  		if (nlh->nlmsg_flags & NLM_F_CREATE) {  			err = ctnetlink_create_expect(net, zone, cda, @@ -2980,7 +3012,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,  	err = -EEXIST;  	if (!(nlh->nlmsg_flags & NLM_F_EXCL))  		err = ctnetlink_change_expect(exp, cda); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return err;  } diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7bd03decd36..825c3e3f830 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -605,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = {  	.expect_policy		= &pptp_exp_policy,  }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ -	nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { -	.exit = nf_conntrack_pptp_net_exit, -}; -  static int __init nf_conntrack_pptp_init(void)  { -	int rv; - -	rv = nf_conntrack_helper_register(&pptp); -	if (rv < 0) -		return rv; -	rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); -	if (rv < 0) -		nf_conntrack_helper_unregister(&pptp); -	return rv; +	return nf_conntrack_helper_register(&pptp);  }  static void __exit nf_conntrack_pptp_fini(void)  {  	nf_conntrack_helper_unregister(&pptp); -	unregister_pernet_subsys(&nf_conntrack_pptp_net_ops);  }  module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ce3004156ee..b65d5864b6d 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -92,12 +92,6 @@ nf_ct_l3proto_find_get(u_int16_t l3proto)  }  EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); -void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) -{ -	module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); -  int  nf_ct_l3proto_try_module_get(unsigned short l3proto)  { diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index a99b6c3427b..cb372f96f10 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ 
b/net/netfilter/nf_conntrack_proto_dccp.c @@ -428,7 +428,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,  	const char *msg;  	u_int8_t state; -	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); +	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);  	BUG_ON(dh == NULL);  	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; @@ -457,7 +457,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,  out_invalid:  	if (LOG_INVALID(net, IPPROTO_DCCP))  		nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, -			      NULL, msg); +			      NULL, "%s", msg);  	return false;  } @@ -486,7 +486,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,  	u_int8_t type, old_state, new_state;  	enum ct_dccp_roles role; -	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); +	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);  	BUG_ON(dh == NULL);  	type = dh->dccph_type; @@ -577,7 +577,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,  	unsigned int cscov;  	const char *msg; -	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); +	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);  	if (dh == NULL) {  		msg = "nf_ct_dccp: short packet ";  		goto out_invalid; @@ -614,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,  out_invalid:  	if (LOG_INVALID(net, IPPROTO_DCCP)) -		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg); +		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg);  	return -NF_ACCEPT;  } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9d9c0dade60..d5665739e3b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -66,7 +66,7 @@ static inline struct netns_proto_gre *gre_pernet(struct net *net)  	return net_generic(net, proto_gre_net_id);  } -void nf_ct_gre_keymap_flush(struct net *net) +static void nf_ct_gre_keymap_flush(struct net *net)  {  	struct netns_proto_gre *net_gre = gre_pernet(net);  	struct nf_ct_gre_keymap *km, *tmp; @@ -78,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net)  	}  	write_unlock_bh(&net_gre->keymap_lock);  } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush);  static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,  				const struct nf_conntrack_tuple *t) diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index 5f9bfd060de..f6e2ae91a80 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -36,13 +36,18 @@ int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,  	if (off == 0)  		return 0; +	if (unlikely(!seqadj)) { +		WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); +		return 0; +	} +  	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);  	spin_lock_bh(&ct->lock);  	this_way = &seqadj->seq[dir];  	if (this_way->offset_before == this_way->offset_after || -	    before(this_way->correction_pos, seq)) { -		this_way->correction_pos = seq; +	    before(this_way->correction_pos, ntohl(seq))) { +		this_way->correction_pos = ntohl(seq);  		this_way->offset_before	 = this_way->offset_after;  		this_way->offset_after	+= off;  	} diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index e0c4373b474..4c3ba1c8d68 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -52,66 +52,8 @@ module_param(sip_direct_media, int, 0600);  MODULE_PARM_DESC(sip_direct_media, 
"Expect Media streams between signalling "  				   "endpoints only (default 1)"); -unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int protoff, -				unsigned int dataoff, const char **dptr, -				unsigned int *datalen) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_hook); - -void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, unsigned int protoff, -				   s16 off) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook); - -unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb, -				       unsigned int protoff, -				       unsigned int dataoff, -				       const char **dptr, -				       unsigned int *datalen, -				       struct nf_conntrack_expect *exp, -				       unsigned int matchoff, -				       unsigned int matchlen) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook); - -unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int protoff, -				     unsigned int dataoff, -				     const char **dptr, -				     unsigned int *datalen, -				     unsigned int sdpoff, -				     enum sdp_header_types type, -				     enum sdp_header_types term, -				     const union nf_inet_addr *addr) -				     __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook); - -unsigned int (*nf_nat_sdp_port_hook)(struct sk_buff *skb, unsigned int protoff, -				     unsigned int dataoff, -				     const char **dptr, -				     unsigned int *datalen, -				     unsigned int matchoff, -				     unsigned int matchlen, -				     u_int16_t port) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook); - -unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb, -					unsigned int protoff, -					unsigned int dataoff, -					const char **dptr, -					unsigned int *datalen, -					unsigned int sdpoff, -					const union nf_inet_addr *addr) -					__read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook); - -unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int protoff, -				      unsigned int dataoff, -				      const char **dptr, -				      unsigned int *datalen, -				      struct nf_conntrack_expect *rtp_exp, -				      struct nf_conntrack_expect *rtcp_exp, -				      unsigned int mediaoff, -				      unsigned int medialen, -				      union nf_inet_addr *rtp_addr) -				      __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook); +const struct nf_nat_sip_hooks *nf_nat_sip_hooks; +EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);  static int string_len(const struct nf_conn *ct, const char *dptr,  		      const char *limit, int *shift) @@ -858,7 +800,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,  	struct hlist_node *next;  	int found = 0; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {  		if (exp->class != SIP_EXPECT_SIGNALLING ||  		    !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || @@ -873,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,  		found = 1;  		break;  	} -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return found;  } @@ -883,7 +825,7 @@ static void flush_expectations(struct nf_conn *ct, bool media)  	struct nf_conntrack_expect *exp;  	struct hlist_node *next; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {  		if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)  			continue; @@ -894,7 +836,7 @@ static void flush_expectations(struct nf_conn *ct, bool media)  		if (!media)  			break;  	} -	
spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  }  static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, @@ -914,8 +856,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,  	int direct_rtp = 0, skip_expect = 0, ret = NF_DROP;  	u_int16_t base_port;  	__be16 rtp_port, rtcp_port; -	typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port; -	typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media; +	const struct nf_nat_sip_hooks *hooks;  	saddr = NULL;  	if (sip_direct_media) { @@ -966,22 +907,23 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,  #endif  			skip_expect = 1;  	} while (!skip_expect); -	rcu_read_unlock();  	base_port = ntohs(tuple.dst.u.udp.port) & ~1;  	rtp_port = htons(base_port);  	rtcp_port = htons(base_port + 1);  	if (direct_rtp) { -		nf_nat_sdp_port = rcu_dereference(nf_nat_sdp_port_hook); -		if (nf_nat_sdp_port && -		    !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen, +		hooks = rcu_dereference(nf_nat_sip_hooks); +		if (hooks && +		    !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen,  				     mediaoff, medialen, ntohs(rtp_port)))  			goto err1;  	} -	if (skip_expect) +	if (skip_expect) { +		rcu_read_unlock();  		return NF_ACCEPT; +	}  	rtp_exp = nf_ct_expect_alloc(ct);  	if (rtp_exp == NULL) @@ -995,10 +937,10 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,  	nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr,  			  IPPROTO_UDP, NULL, &rtcp_port); -	nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook); -	if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp) -		ret = nf_nat_sdp_media(skb, protoff, dataoff, dptr, datalen, -				       rtp_exp, rtcp_exp, +	hooks = rcu_dereference(nf_nat_sip_hooks); +	if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp) +		ret = hooks->sdp_media(skb, protoff, dataoff, dptr, +				       datalen, rtp_exp, rtcp_exp,  				       mediaoff, medialen, daddr);  	else {  		if (nf_ct_expect_related(rtp_exp) == 0) { @@ -1012,6 +954,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,  err2:  	nf_ct_expect_put(rtp_exp);  err1: +	rcu_read_unlock();  	return ret;  } @@ -1051,13 +994,12 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff,  	unsigned int caddr_len, maddr_len;  	unsigned int i;  	union nf_inet_addr caddr, maddr, rtp_addr; +	const struct nf_nat_sip_hooks *hooks;  	unsigned int port;  	const struct sdp_media_type *t;  	int ret = NF_ACCEPT; -	typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr; -	typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session; -	nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook); +	hooks = rcu_dereference(nf_nat_sip_hooks);  	/* Find beginning of session description */  	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, @@ -1125,10 +1067,11 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff,  		}  		/* Update media connection address if present */ -		if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) { -			ret = nf_nat_sdp_addr(skb, protoff, dataoff, +		if (maddr_len && hooks && ct->status & IPS_NAT_MASK) { +			ret = hooks->sdp_addr(skb, protoff, dataoff,  					      dptr, datalen, mediaoff, -					      SDP_HDR_CONNECTION, SDP_HDR_MEDIA, +					      SDP_HDR_CONNECTION, +					      SDP_HDR_MEDIA,  					      &rtp_addr);  			if (ret != NF_ACCEPT) {  				nf_ct_helper_log(skb, ct, "cannot mangle SDP"); @@ -1139,10 +1082,11 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, 
 	}  	/* Update session connection and owner addresses */ -	nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook); -	if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK) -		ret = nf_nat_sdp_session(skb, protoff, dataoff, -					 dptr, datalen, sdpoff, &rtp_addr); +	hooks = rcu_dereference(nf_nat_sip_hooks); +	if (hooks && ct->status & IPS_NAT_MASK) +		ret = hooks->sdp_session(skb, protoff, dataoff, +					 dptr, datalen, sdpoff, +					 &rtp_addr);  	return ret;  } @@ -1242,11 +1186,11 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,  	unsigned int matchoff, matchlen;  	struct nf_conntrack_expect *exp;  	union nf_inet_addr *saddr, daddr; +	const struct nf_nat_sip_hooks *hooks;  	__be16 port;  	u8 proto;  	unsigned int expires = 0;  	int ret; -	typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect;  	/* Expected connections can not register again. */  	if (ct->status & IPS_EXPECTED) @@ -1309,10 +1253,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,  	exp->helper = nfct_help(ct)->helper;  	exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; -	nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook); -	if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK) -		ret = nf_nat_sip_expect(skb, protoff, dataoff, dptr, datalen, -					exp, matchoff, matchlen); +	hooks = rcu_dereference(nf_nat_sip_hooks); +	if (hooks && ct->status & IPS_NAT_MASK) +		ret = hooks->expect(skb, protoff, dataoff, dptr, datalen, +				    exp, matchoff, matchlen);  	else {  		if (nf_ct_expect_related(exp) != 0) {  			nf_ct_helper_log(skb, ct, "cannot add expectation"); @@ -1515,7 +1459,7 @@ static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,  			   unsigned int protoff, unsigned int dataoff,  			   const char **dptr, unsigned int *datalen)  { -	typeof(nf_nat_sip_hook) nf_nat_sip; +	const struct nf_nat_sip_hooks *hooks;  	int ret;  	if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0) @@ -1524,9 +1468,9 @@ static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,  		ret = process_sip_response(skb, protoff, dataoff, dptr, datalen);  	if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { -		nf_nat_sip = rcu_dereference(nf_nat_sip_hook); -		if (nf_nat_sip && !nf_nat_sip(skb, protoff, dataoff, -					      dptr, datalen)) { +		hooks = rcu_dereference(nf_nat_sip_hooks); +		if (hooks && !hooks->msg(skb, protoff, dataoff, +					 dptr, datalen)) {  			nf_ct_helper_log(skb, ct, "cannot NAT SIP message");  			ret = NF_DROP;  		} @@ -1546,7 +1490,6 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,  	s16 diff, tdiff = 0;  	int ret = NF_ACCEPT;  	bool term; -	typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;  	if (ctinfo != IP_CT_ESTABLISHED &&  	    ctinfo != IP_CT_ESTABLISHED_REPLY) @@ -1610,9 +1553,11 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,  	}  	if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { -		nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook); -		if (nf_nat_sip_seq_adjust) -			nf_nat_sip_seq_adjust(skb, protoff, tdiff); +		const struct nf_nat_sip_hooks *hooks; + +		hooks = rcu_dereference(nf_nat_sip_hooks); +		if (hooks) +			hooks->seq_adjust(skb, protoff, tdiff);  	}  	return ret; diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 902fb0a6b38..7a394df0deb 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -97,7 +97,6 @@ int 
nf_conntrack_tstamp_pernet_init(struct net *net)  void nf_conntrack_tstamp_pernet_fini(struct net *net)  {  	nf_conntrack_tstamp_fini_sysctl(net); -	nf_ct_extend_unregister(&tstamp_extend);  }  int nf_conntrack_tstamp_init(void) diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 3deec997be8..61a3c927e63 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,26 +13,20 @@  /* core.c */ -extern unsigned int nf_iterate(struct list_head *head, -				struct sk_buff *skb, -				unsigned int hook, -				const struct net_device *indev, -				const struct net_device *outdev, -				struct nf_hook_ops **elemp, -				int (*okfn)(struct sk_buff *), -				int hook_thresh); +unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, +			unsigned int hook, const struct net_device *indev, +			const struct net_device *outdev, +			struct nf_hook_ops **elemp, +			int (*okfn)(struct sk_buff *), int hook_thresh);  /* nf_queue.c */ -extern int nf_queue(struct sk_buff *skb, -		    struct nf_hook_ops *elem, -		    u_int8_t pf, unsigned int hook, -		    struct net_device *indev, -		    struct net_device *outdev, -		    int (*okfn)(struct sk_buff *), -		    unsigned int queuenum); -extern int __init netfilter_queue_init(void); +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, +	     unsigned int hook, struct net_device *indev, +	     struct net_device *outdev, int (*okfn)(struct sk_buff *), +	     unsigned int queuenum); +int __init netfilter_queue_init(void);  /* nf_log.c */ -extern int __init netfilter_log_init(void); +int __init netfilter_log_init(void);  #endif diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6f0f4f7f68a..a49907b1dab 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -315,7 +315,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,  	 * manips not an issue.  	 
 */
 	if (maniptype == NF_NAT_MANIP_SRC &&
-	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
+	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
 		/* try the original tuple first */
 		if (in_range(l3proto, l4proto, orig_tuple, range)) {
 			if (!nf_nat_used_tuple(orig_tuple, ct)) {
@@ -339,7 +339,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	 */
 
 	/* Only bother mapping if it's not already in range and unique */
-	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
+	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
 		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
 			if (l4proto->in_range(tuple, maniptype,
 					      &range->min_proto,
@@ -358,6 +358,19 @@ out:
 	rcu_read_unlock();
 }
 
+struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	if (nat)
+		return nat;
+
+	if (!nf_ct_is_confirmed(ct))
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+
+	return nat;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
+
 unsigned int
 nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
@@ -368,14 +381,9 @@ nf_nat_setup_info(struct nf_conn *ct,
 	struct nf_conn_nat *nat;
 
 	/* NAT helpers and ctnetlink may also set up the binding. */
-	nat = nfct_nat(ct);
-	if (!nat) {
-		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-		if (nat == NULL) {
-			pr_debug("failed to add NAT extension\n");
-			return NF_ACCEPT;
-		}
-	}
+	nat = nf_ct_nat_ext_add(ct);
+	if (nat == NULL)
+		return NF_ACCEPT;
 
 	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
 		     maniptype == NF_NAT_MANIP_DST);
@@ -432,6 +440,32 @@ nf_nat_setup_info(struct nf_conn *ct,
 }
 EXPORT_SYMBOL(nf_nat_setup_info);
 
+static unsigned int
+__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
+{
+	/* Force range to this IP; let proto decide mapping for
+	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	 * Use reply in case it's already been mangled (e.g. local packet).
+	 */
+	union nf_inet_addr ip =
+		(manip == NF_NAT_MANIP_SRC ?
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
+	struct nf_nat_range range = {
+		.flags		= NF_NAT_RANGE_MAP_IPS,
+		.min_addr	= ip,
+		.max_addr	= ip,
+	};
+	return nf_nat_setup_info(ct, &range, manip);
+}
+
+unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
+}
+EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+
 /* Do packet manipulations according to nf_nat_setup_info. */
 unsigned int nf_nat_packet(struct nf_conn *ct,
 			   enum ip_conntrack_info ctinfo,
@@ -491,6 +525,39 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
 	return i->status & IPS_NAT_MASK ? 1 : 0;
 }
 
+static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+
+	if (nf_nat_proto_remove(ct, data))
+		return 1;
+
+	if (!nat || !nat->ct)
+		return 0;
+
+	/* This netns is being destroyed, and the conntrack has a NAT null
+	 * binding; remove it from the bysource hash, as the table will be
+	 * freed soon.
+	 *
+	 * Otherwise, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
+	 * would delete the entry from the already-freed table.
+	 */
+	if (!del_timer(&ct->timeout))
+		return 1;
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&nat->bysource);
+	ct->status &= ~IPS_NAT_DONE_MASK;
+	nat->ct = NULL;
+	spin_unlock_bh(&nf_nat_lock);
+
+	add_timer(&ct->timeout);
+
+	/* Don't delete the conntrack. Although that would make things a lot
+	 * simpler, we'd end up flushing all conntracks on nat rmmod.
+	 */
+	return 0;
+}
+
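The pair of helpers added above gives NAT backends a single entry point for attaching a "do nothing" binding. As a rough sketch of the intended call pattern (example_nat_hook() is a hypothetical caller invented for illustration; only nf_nat_alloc_null_binding(), nf_nat_initialized() and HOOK2MANIP() come from the NAT core):

	/* Sketch, not part of the patch; assumes <net/netfilter/nf_nat.h>.
	 * Mark the conntrack as "NAT decided" for this hook even when no
	 * rule rewrote the tuple, so replies keep going through the NAT
	 * engine consistently.
	 */
	static unsigned int example_nat_hook(struct nf_conn *ct,
					     unsigned int hooknum)
	{
		if (nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
			return NF_ACCEPT;	/* a binding already exists */

		return nf_nat_alloc_null_binding(ct, hooknum);
	}
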
 static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
 {
 	struct nf_nat_proto_clean clean = {
@@ -682,9 +749,9 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
 
 static int
 nfnetlink_parse_nat(const struct nlattr *nat,
-		    const struct nf_conn *ct, struct nf_nat_range *range)
+		    const struct nf_conn *ct, struct nf_nat_range *range,
+		    const struct nf_nat_l3proto *l3proto)
 {
-	const struct nf_nat_l3proto *l3proto;
 	struct nlattr *tb[CTA_NAT_MAX+1];
 	int err;
 
@@ -694,38 +761,46 @@ nfnetlink_parse_nat(const struct nlattr *nat,
 	if (err < 0)
 		return err;
 
-	rcu_read_lock();
-	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
-	if (l3proto == NULL) {
-		err = -EAGAIN;
-		goto out;
-	}
 	err = l3proto->nlattr_to_range(tb, range);
 	if (err < 0)
-		goto out;
+		return err;
 
 	if (!tb[CTA_NAT_PROTO])
-		goto out;
+		return 0;
 
-	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
-out:
-	rcu_read_unlock();
-	return err;
+	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
 }
 
+/* This function is called under rcu_read_lock() */
 static int
 nfnetlink_parse_nat_setup(struct nf_conn *ct,
 			  enum nf_nat_manip_type manip,
 			  const struct nlattr *attr)
 {
 	struct nf_nat_range range;
+	const struct nf_nat_l3proto *l3proto;
 	int err;
 
-	err = nfnetlink_parse_nat(attr, ct, &range);
+	/* Should not happen, restricted to creating new conntracks
+	 * via ctnetlink.
+	 */
+	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
+		return -EEXIST;
+
+	/* Make sure that L3 NAT is there by the time we call
+	 * nf_nat_setup_info to attach the null binding, otherwise
+	 * this may oops.
+	 */
+	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+	if (l3proto == NULL)
+		return -EAGAIN;
+
+	/* No NAT information has been passed, allocate the null binding. */
+	if (attr == NULL)
+		return __nf_nat_alloc_null_binding(ct, manip);
+
+	err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
 	if (err < 0)
 		return err;
 
-	if (nf_nat_initialized(ct, manip))
-		return -EEXIST;
-
 	return nf_nat_setup_info(ct, &range, manip);
 }
 
@@ -753,7 +828,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
 {
 	struct nf_nat_proto_clean clean = {};
 
-	nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0);
+	nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
 	synchronize_rcu();
 	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
 }
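Earlier in this file the range checks widen from NF_NAT_RANGE_PROTO_RANDOM to NF_NAT_RANGE_PROTO_RANDOM_ALL, which also covers the new fully-random port mode selected with prandom_u32(). A minimal sketch of how a caller could request it (example_masq_src() is a hypothetical wrapper; struct nf_nat_range, the flags and nf_nat_setup_info() are the real interfaces used above):

	/* Sketch, assuming NF_NAT_RANGE_PROTO_RANDOM_FULLY from the uapi
	 * headers: pin the source address to 'addr' and let the L4 proto
	 * pick a fully random port.
	 */
	static unsigned int example_masq_src(struct nf_conn *ct,
					     const union nf_inet_addr *addr)
	{
		struct nf_nat_range range = {
			.flags		= NF_NAT_RANGE_MAP_IPS |
					  NF_NAT_RANGE_PROTO_RANDOM_FULLY,
			.min_addr	= *addr,
			.max_addr	= *addr,
		};

		return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
	}
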
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index f02b3605823..1fb2258c353 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -34,10 +34,14 @@ static unsigned int help(struct sk_buff *skb,
 			 struct nf_conntrack_expect *exp)
 {
 	char buffer[sizeof("4294967295 65535")];
+	struct nf_conn *ct = exp->master;
+	union nf_inet_addr newaddr;
 	u_int16_t port;
 	unsigned int ret;
 
 	/* Reply comes from server. */
+	newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3;
+
 	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
 	exp->dir = IP_CT_DIR_REPLY;
 	exp->expectfn = nf_nat_follow_master;
@@ -57,17 +61,35 @@ static unsigned int help(struct sk_buff *skb,
 	}
 
 	if (port == 0) {
-		nf_ct_helper_log(skb, exp->master, "all ports in use");
+		nf_ct_helper_log(skb, ct, "all ports in use");
 		return NF_DROP;
 	}
 
-	ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
-				       protoff, matchoff, matchlen, buffer,
-				       strlen(buffer));
+	/* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+	 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+	 * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+	 * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+	 * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+	 *
+	 * AAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+	 *                       255.255.255.255==4294967295, 10 digits)
+	 * P:        bound port (min 1 digit, max 5 digits (65535))
+	 * F:        filename   (min 1 char)
+	 * S:        size       (min 1 digit)
+	 * 0x01, \n: terminators
+	 */
+	/* AAA = "us", i.e. where the server normally talks to. */
+	snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port);
+	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
+		 buffer, &newaddr.ip, port);
+
+	ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+				       matchlen, buffer, strlen(buffer));
 	if (ret != NF_ACCEPT) {
-		nf_ct_helper_log(skb, exp->master, "cannot mangle packet");
+		nf_ct_helper_log(skb, ct, "cannot mangle packet");
 		nf_ct_unexpect_related(exp);
 	}
+
 	return ret;
 }
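With the corrected bounds, the buffer arithmetic above works out exactly: the widest payload is the all-ones address and port. A quick stand-alone check (plain userspace C, illustrative only and not part of the patch):

	#include <stdio.h>

	/* Worst case for the IRC helper's buffer: "4294967295 65535" is
	 * 10 + 1 + 5 = 16 characters, so sizeof() == 17 including the NUL,
	 * and the snprintf() in the helper can never truncate.
	 */
	int main(void)
	{
		char buffer[sizeof("4294967295 65535")];
		int n = snprintf(buffer, sizeof(buffer), "%u %u",
				 4294967295u, 65535u);

		printf("wrote %d of %zu\n", n, sizeof(buffer)); /* 16 of 17 */
		return n >= (int)sizeof(buffer);	/* 0 == no truncation */
	}

diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 9baaf734c14..83a72a235ca 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -74,22 +74,24 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		range_size = ntohs(range->max_proto.all) - min + 1;
 	}
 
-	if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
+	if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
 		off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
						  ? 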
tuple->dst.u.all  						  : tuple->src.u.all); -	else +	} else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { +		off = prandom_u32(); +	} else {  		off = *rover; +	}  	for (i = 0; ; ++off) {  		*portptr = htons(min + off % range_size);  		if (++i != range_size && nf_nat_used_tuple(tuple, ct))  			continue; -		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) +		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))  			*rover = off;  		return;  	} -	return;  }  EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index f9790405b7f..b4d691db955 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -625,33 +625,26 @@ static struct nf_ct_helper_expectfn sip_nat = {  static void __exit nf_nat_sip_fini(void)  { -	RCU_INIT_POINTER(nf_nat_sip_hook, NULL); -	RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL); -	RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL); -	RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL); -	RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL); -	RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL); -	RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL); +	RCU_INIT_POINTER(nf_nat_sip_hooks, NULL); +  	nf_ct_helper_expectfn_unregister(&sip_nat);  	synchronize_rcu();  } +static const struct nf_nat_sip_hooks sip_hooks = { +	.msg		= nf_nat_sip, +	.seq_adjust	= nf_nat_sip_seq_adjust, +	.expect		= nf_nat_sip_expect, +	.sdp_addr	= nf_nat_sdp_addr, +	.sdp_port	= nf_nat_sdp_port, +	.sdp_session	= nf_nat_sdp_session, +	.sdp_media	= nf_nat_sdp_media, +}; +  static int __init nf_nat_sip_init(void)  { -	BUG_ON(nf_nat_sip_hook != NULL); -	BUG_ON(nf_nat_sip_seq_adjust_hook != NULL); -	BUG_ON(nf_nat_sip_expect_hook != NULL); -	BUG_ON(nf_nat_sdp_addr_hook != NULL); -	BUG_ON(nf_nat_sdp_port_hook != NULL); -	BUG_ON(nf_nat_sdp_session_hook != NULL); -	BUG_ON(nf_nat_sdp_media_hook != NULL); -	RCU_INIT_POINTER(nf_nat_sip_hook, nf_nat_sip); -	RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, nf_nat_sip_seq_adjust); -	RCU_INIT_POINTER(nf_nat_sip_expect_hook, nf_nat_sip_expect); -	RCU_INIT_POINTER(nf_nat_sdp_addr_hook, nf_nat_sdp_addr); -	RCU_INIT_POINTER(nf_nat_sdp_port_hook, nf_nat_sdp_port); -	RCU_INIT_POINTER(nf_nat_sdp_session_hook, nf_nat_sdp_session); -	RCU_INIT_POINTER(nf_nat_sdp_media_hook, nf_nat_sdp_media); +	BUG_ON(nf_nat_sip_hooks != NULL); +	RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks);  	nf_ct_helper_expectfn_register(&sip_nat);  	return 0;  } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index cdf4567ba9b..52e20c9a46a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -151,9 +151,10 @@ void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,  	opts->tsecr = opts->tsval;  	opts->tsval = tcp_time_stamp & ~0x3f; -	if (opts->options & XT_SYNPROXY_OPT_WSCALE) -		opts->tsval |= info->wscale; -	else +	if (opts->options & XT_SYNPROXY_OPT_WSCALE) { +		opts->tsval |= opts->wscale; +		opts->wscale = info->wscale; +	} else  		opts->tsval |= 0xf;  	if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) @@ -362,9 +363,8 @@ static int __net_init synproxy_net_init(struct net *net)  		goto err2;  	if (!nfct_synproxy_ext_add(ct))  		goto err2; -	__set_bit(IPS_TEMPLATE_BIT, &ct->status); -	__set_bit(IPS_CONFIRMED_BIT, &ct->status); +	nf_conntrack_tmpl_insert(net, ct);  	snet->tmpl = ct;  	snet->stats = alloc_percpu(struct synproxy_stats); @@ -389,7 +389,7 @@ static void __net_exit synproxy_net_exit(struct net *net)  {  	struct synproxy_net *snet = 
synproxy_pernet(net);
 
-	nf_conntrack_free(snet->tmpl);
+	nf_ct_put(snet->tmpl);
 
 	synproxy_proc_exit(net);
 	free_percpu(snet->stats);
 }
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
new file mode 100644
index 00000000000..8746ff9a835
--- /dev/null
+++ b/net/netfilter/nf_tables_api.c
@@ -0,0 +1,4041 @@
+/*
+ * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+static LIST_HEAD(nf_tables_expressions);
+
+/**
+ *	nft_register_afinfo - register nf_tables address family info
+ *
+ *	@net: network namespace to register the address family in
+ *	@afi: address family info to register
+ *
+ *	Register the address family for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
+{
+	INIT_LIST_HEAD(&afi->tables);
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail_rcu(&afi->list, &net->nft.af_info);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_afinfo);
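A protocol-family backend is expected to call this once per namespace. As a rough sketch of the calling convention (nft_af_example and example_init_net are invented names; only nft_register_afinfo() and the nft_af_info fields used elsewhere in this file are real):

	/* Sketch only; a real family module would duplicate a template
	 * nft_af_info per namespace rather than share one static object
	 * across all of them.
	 */
	static struct nft_af_info nft_af_example __read_mostly = {
		.family	= NFPROTO_IPV4,
		.nhooks	= NF_INET_NUMHOOKS,
		.owner	= THIS_MODULE,
	};

	static int __net_init example_init_net(struct net *net)
	{
		return nft_register_afinfo(net, &nft_af_example);
	}

+
+/**
+ *	nft_unregister_afinfo - unregister nf_tables address family info
+ *
+ *	@afi: address family info to unregister
+ *
+ *	Unregister the address family for use with nf_tables.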
+ */ +void nft_unregister_afinfo(struct nft_af_info *afi) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_del_rcu(&afi->list); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_afinfo); + +static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family) +{ +	struct nft_af_info *afi; + +	list_for_each_entry(afi, &net->nft.af_info, list) { +		if (afi->family == family) +			return afi; +	} +	return NULL; +} + +static struct nft_af_info * +nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) +{ +	struct nft_af_info *afi; + +	afi = nft_afinfo_lookup(net, family); +	if (afi != NULL) +		return afi; +#ifdef CONFIG_MODULES +	if (autoload) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-afinfo-%u", family); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		afi = nft_afinfo_lookup(net, family); +		if (afi != NULL) +			return ERR_PTR(-EAGAIN); +	} +#endif +	return ERR_PTR(-EAFNOSUPPORT); +} + +static void nft_ctx_init(struct nft_ctx *ctx, +			 const struct sk_buff *skb, +			 const struct nlmsghdr *nlh, +			 struct nft_af_info *afi, +			 struct nft_table *table, +			 struct nft_chain *chain, +			 const struct nlattr * const *nla) +{ +	ctx->net	= sock_net(skb->sk); +	ctx->afi	= afi; +	ctx->table	= table; +	ctx->chain	= chain; +	ctx->nla   	= nla; +	ctx->portid	= NETLINK_CB(skb).portid; +	ctx->report	= nlmsg_report(nlh); +	ctx->seq	= nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, +					 u32 size) +{ +	struct nft_trans *trans; + +	trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); +	if (trans == NULL) +		return NULL; + +	trans->msg_type = msg_type; +	trans->ctx	= *ctx; + +	return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ +	list_del(&trans->list); +	kfree(trans); +} + +/* + * Tables + */ + +static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, +					  const struct nlattr *nla) +{ +	struct nft_table *table; + +	list_for_each_entry(table, &afi->tables, list) { +		if (!nla_strcmp(nla, table->name)) +			return table; +	} +	return NULL; +} + +static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, +						const struct nlattr *nla) +{ +	struct nft_table *table; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	table = nft_table_lookup(afi, nla); +	if (table != NULL) +		return table; + +	return ERR_PTR(-ENOENT); +} + +static inline u64 nf_tables_alloc_handle(struct nft_table *table) +{ +	return ++table->hgenerator; +} + +static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; + +static const struct nf_chain_type * +__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +{ +	int i; + +	for (i = 0; i < NFT_CHAIN_T_MAX; i++) { +		if (chain_type[family][i] != NULL && +		    !nla_strcmp(nla, chain_type[family][i]->name)) +			return chain_type[family][i]; +	} +	return NULL; +} + +static const struct nf_chain_type * +nf_tables_chain_type_lookup(const struct nft_af_info *afi, +			    const struct nlattr *nla, +			    bool autoload) +{ +	const struct nf_chain_type *type; + +	type = __nf_tables_chain_type_lookup(afi->family, nla); +	if (type != NULL) +		return type; +#ifdef CONFIG_MODULES +	if (autoload) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-chain-%u-%.*s", afi->family, +			       nla_len(nla), (const char *)nla_data(nla)); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		type = __nf_tables_chain_type_lookup(afi->family, nla); +		if (type != NULL) +			return ERR_PTR(-EAGAIN); +	} +#endif +	return 
ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { +	[NFTA_TABLE_NAME]	= { .type = NLA_STRING }, +	[NFTA_TABLE_FLAGS]	= { .type = NLA_U32 }, +}; + +static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, +				     int event, u32 flags, int family, +				     const struct nft_table *table) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || +	    nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || +	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +{ +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, +					ctx->afi->family, ctx->table); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, +			     ctx->report, GFP_KERNEL); +err: +	if (err < 0) { +		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, +				  err); +	} +	return err; +} + +static int nf_tables_dump_tables(struct sk_buff *skb, +				 struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	unsigned int idx = 0, s_idx = cb->args[0]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (family != NFPROTO_UNSPEC && family != afi->family) +			continue; + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			if (idx < s_idx) +				goto cont; +			if (idx > s_idx) +				memset(&cb->args[1], 0, +				       sizeof(cb->args) - sizeof(cb->args[0])); +			if (nf_tables_fill_table_info(skb, +						      NETLINK_CB(cb->skb).portid, +						      cb->nlh->nlmsg_seq, +						      NFT_MSG_NEWTABLE, +						      NLM_F_MULTI, +						      afi->family, table) < 0) +				goto done; + +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +	} +done: +	rcu_read_unlock(); +	cb->args[0] = idx; +	return skb->len; +} + +/* Internal table flags */ +#define NFT_TABLE_INACTIVE	(1 << 15) + +static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	struct sk_buff *skb2; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	int err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_tables, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, 
nla[NFTA_TABLE_NAME]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb2) +		return -ENOMEM; + +	err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, +					nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, +					family, table); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static int nf_tables_table_enable(const struct nft_af_info *afi, +				  struct nft_table *table) +{ +	struct nft_chain *chain; +	int err, i = 0; + +	list_for_each_entry(chain, &table->chains, list) { +		if (!(chain->flags & NFT_BASE_CHAIN)) +			continue; + +		err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); +		if (err < 0) +			goto err; + +		i++; +	} +	return 0; +err: +	list_for_each_entry(chain, &table->chains, list) { +		if (!(chain->flags & NFT_BASE_CHAIN)) +			continue; + +		if (i-- <= 0) +			break; + +		nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); +	} +	return err; +} + +static void nf_tables_table_disable(const struct nft_af_info *afi, +				   struct nft_table *table) +{ +	struct nft_chain *chain; + +	list_for_each_entry(chain, &table->chains, list) { +		if (chain->flags & NFT_BASE_CHAIN) +			nf_unregister_hooks(nft_base_chain(chain)->ops, +					    afi->nops); +	} +} + +static int nf_tables_updtable(struct nft_ctx *ctx) +{ +	struct nft_trans *trans; +	u32 flags; +	int ret = 0; + +	if (!ctx->nla[NFTA_TABLE_FLAGS]) +		return 0; + +	flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); +	if (flags & ~NFT_TABLE_F_DORMANT) +		return -EINVAL; + +	if (flags == ctx->table->flags) +		return 0; + +	trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, +				sizeof(struct nft_trans_table)); +	if (trans == NULL) +		return -ENOMEM; + +	if ((flags & NFT_TABLE_F_DORMANT) && +	    !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { +		nft_trans_table_enable(trans) = false; +	} else if (!(flags & NFT_TABLE_F_DORMANT) && +		   ctx->table->flags & NFT_TABLE_F_DORMANT) { +		ret = nf_tables_table_enable(ctx->afi, ctx->table); +		if (ret >= 0) { +			ctx->table->flags &= ~NFT_TABLE_F_DORMANT; +			nft_trans_table_enable(trans) = true; +		} +	} +	if (ret < 0) +		goto err; + +	nft_trans_table_update(trans) = true; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; +err: +	nft_trans_destroy(trans); +	return ret; +} + +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); +	if (trans == NULL) +		return -ENOMEM; + +	if (msg_type == NFT_MSG_NEWTABLE) +		ctx->table->flags |= NFT_TABLE_INACTIVE; + +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; +} + +static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nlattr *name; +	struct nft_af_info *afi; +	struct nft_table *table; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	u32 flags = 0; +	struct nft_ctx ctx; +	int err; + +	afi = nf_tables_afinfo_lookup(net, family, true); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	name = nla[NFTA_TABLE_NAME]; +	table = nf_tables_table_lookup(afi, name); +	if (IS_ERR(table)) { +		if (PTR_ERR(table) != -ENOENT) +			return PTR_ERR(table); +		table = NULL; +	} + +	if (table != NULL) { +		if 
(table->flags & NFT_TABLE_INACTIVE) +			return -ENOENT; +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EOPNOTSUPP; + +		nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); +		return nf_tables_updtable(&ctx); +	} + +	if (nla[NFTA_TABLE_FLAGS]) { +		flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); +		if (flags & ~NFT_TABLE_F_DORMANT) +			return -EINVAL; +	} + +	if (!try_module_get(afi->owner)) +		return -EAFNOSUPPORT; + +	table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); +	if (table == NULL) { +		module_put(afi->owner); +		return -ENOMEM; +	} + +	nla_strlcpy(table->name, name, nla_len(name)); +	INIT_LIST_HEAD(&table->chains); +	INIT_LIST_HEAD(&table->sets); +	table->flags = flags; + +	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); +	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); +	if (err < 0) { +		kfree(table); +		module_put(afi->owner); +		return err; +	} +	list_add_tail_rcu(&table->list, &afi->tables); +	return 0; +} + +static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family, err; +	struct nft_ctx ctx; + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; +	if (table->use > 0) +		return -EBUSY; + +	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); +	err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); +	if (err < 0) +		return err; + +	list_del_rcu(&table->list); +	return 0; +} + +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ +	BUG_ON(ctx->table->use > 0); + +	kfree(ctx->table); +	module_put(ctx->afi->owner); +} + +int nft_register_chain_type(const struct nf_chain_type *ctype) +{ +	int err = 0; + +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	if (chain_type[ctype->family][ctype->type] != NULL) { +		err = -EBUSY; +		goto out; +	} +	chain_type[ctype->family][ctype->type] = ctype; +out: +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +	return err; +} +EXPORT_SYMBOL_GPL(nft_register_chain_type); + +void nft_unregister_chain_type(const struct nf_chain_type *ctype) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	chain_type[ctype->family][ctype->type] = NULL; +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_chain_type); + +/* + * Chains + */ + +static struct nft_chain * +nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) +{ +	struct nft_chain *chain; + +	list_for_each_entry(chain, &table->chains, list) { +		if (chain->handle == handle) +			return chain; +	} + +	return ERR_PTR(-ENOENT); +} + +static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, +						const struct nlattr *nla) +{ +	struct nft_chain *chain; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	list_for_each_entry(chain, &table->chains, list) { +		if (!nla_strcmp(nla, chain->name)) +			return chain; +	} + +	return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { +	[NFTA_CHAIN_TABLE]	= { .type = NLA_STRING }, +	[NFTA_CHAIN_HANDLE]	= { .type = NLA_U64 }, +	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING, +				    .len = NFT_CHAIN_MAXNAMELEN - 1 }, +	[NFTA_CHAIN_HOOK]	
= { .type = NLA_NESTED }, +	[NFTA_CHAIN_POLICY]	= { .type = NLA_U32 }, +	[NFTA_CHAIN_TYPE]	= { .type = NLA_STRING }, +	[NFTA_CHAIN_COUNTERS]	= { .type = NLA_NESTED }, +}; + +static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { +	[NFTA_HOOK_HOOKNUM]	= { .type = NLA_U32 }, +	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 }, +}; + +static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +{ +	struct nft_stats *cpu_stats, total; +	struct nlattr *nest; +	unsigned int seq; +	u64 pkts, bytes; +	int cpu; + +	memset(&total, 0, sizeof(total)); +	for_each_possible_cpu(cpu) { +		cpu_stats = per_cpu_ptr(stats, cpu); +		do { +			seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); +			pkts = cpu_stats->pkts; +			bytes = cpu_stats->bytes; +		} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); +		total.pkts += pkts; +		total.bytes += bytes; +	} +	nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); +	if (nest == NULL) +		goto nla_put_failure; + +	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) || +	    nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes))) +		goto nla_put_failure; + +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, +				     int event, u32 flags, int family, +				     const struct nft_table *table, +				     const struct nft_chain *chain) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) +		goto nla_put_failure; +	if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle))) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) +		goto nla_put_failure; + +	if (chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = nft_base_chain(chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; +		struct nlattr *nest; + +		nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); +		if (nest == NULL) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) +			goto nla_put_failure; +		nla_nest_end(skb, nest); + +		if (nla_put_be32(skb, NFTA_CHAIN_POLICY, +				 htonl(basechain->policy))) +			goto nla_put_failure; + +		if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) +			goto nla_put_failure; + +		if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) +			goto nla_put_failure; +	} + +	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +{ +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, +					ctx->afi->family, ctx->table, +					ctx->chain); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, ctx->portid, 
NFNLGRP_NFTABLES, +			     ctx->report, GFP_KERNEL); +err: +	if (err < 0) { +		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, +				  err); +	} +	return err; +} + +static int nf_tables_dump_chains(struct sk_buff *skb, +				 struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	unsigned int idx = 0, s_idx = cb->args[0]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (family != NFPROTO_UNSPEC && family != afi->family) +			continue; + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			list_for_each_entry_rcu(chain, &table->chains, list) { +				if (idx < s_idx) +					goto cont; +				if (idx > s_idx) +					memset(&cb->args[1], 0, +					       sizeof(cb->args) - sizeof(cb->args[0])); +				if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, +							      cb->nlh->nlmsg_seq, +							      NFT_MSG_NEWCHAIN, +							      NLM_F_MULTI, +							      afi->family, table, chain) < 0) +					goto done; + +				nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +				idx++; +			} +		} +	} +done: +	rcu_read_unlock(); +	cb->args[0] = idx; +	return skb->len; +} + +static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	struct sk_buff *skb2; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	int err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_chains, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); +	if (chain->flags & NFT_CHAIN_INACTIVE) +		return -ENOENT; + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb2) +		return -ENOMEM; + +	err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, +					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, +					family, table, chain); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { +	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 }, +	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 }, +}; + +static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) +{ +	struct nlattr *tb[NFTA_COUNTER_MAX+1]; +	struct nft_stats __percpu *newstats; +	struct nft_stats *stats; +	int err; + +	err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); +	if (err < 0) +		return ERR_PTR(err); + +	if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) +		return ERR_PTR(-EINVAL); + +	newstats = netdev_alloc_pcpu_stats(struct nft_stats); +	if (newstats == NULL) +		return ERR_PTR(-ENOMEM); + +	/* Restore old counters on this cpu, no problem. Per-cpu statistics +	 * are not exposed to userspace. 
+	 */ +	stats = this_cpu_ptr(newstats); +	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); +	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + +	return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, +				    struct nft_stats __percpu *newstats) +{ +	if (chain->stats) { +		struct nft_stats __percpu *oldstats = +				nft_dereference(chain->stats); + +		rcu_assign_pointer(chain->stats, newstats); +		synchronize_rcu(); +		free_percpu(oldstats); +	} else +		rcu_assign_pointer(chain->stats, newstats); +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); +	if (trans == NULL) +		return -ENOMEM; + +	if (msg_type == NFT_MSG_NEWCHAIN) +		ctx->chain->flags |= NFT_CHAIN_INACTIVE; + +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; +} + +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ +	BUG_ON(chain->use > 0); + +	if (chain->flags & NFT_BASE_CHAIN) { +		module_put(nft_base_chain(chain)->type->owner); +		free_percpu(nft_base_chain(chain)->stats); +		kfree(nft_base_chain(chain)); +	} else { +		kfree(chain); +	} +} + +static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nlattr * uninitialized_var(name); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct nft_chain *chain; +	struct nft_base_chain *basechain = NULL; +	struct nlattr *ha[NFTA_HOOK_MAX + 1]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	u8 policy = NF_ACCEPT; +	u64 handle = 0; +	unsigned int i; +	struct nft_stats __percpu *stats; +	int err; +	bool create; +	struct nft_ctx ctx; + +	create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + +	afi = nf_tables_afinfo_lookup(net, family, true); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); + +	chain = NULL; +	name = nla[NFTA_CHAIN_NAME]; + +	if (nla[NFTA_CHAIN_HANDLE]) { +		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); +		chain = nf_tables_chain_lookup_byhandle(table, handle); +		if (IS_ERR(chain)) +			return PTR_ERR(chain); +	} else { +		chain = nf_tables_chain_lookup(table, name); +		if (IS_ERR(chain)) { +			if (PTR_ERR(chain) != -ENOENT) +				return PTR_ERR(chain); +			chain = NULL; +		} +	} + +	if (nla[NFTA_CHAIN_POLICY]) { +		if ((chain != NULL && +		    !(chain->flags & NFT_BASE_CHAIN)) || +		    nla[NFTA_CHAIN_HOOK] == NULL) +			return -EOPNOTSUPP; + +		policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); +		switch (policy) { +		case NF_DROP: +		case NF_ACCEPT: +			break; +		default: +			return -EINVAL; +		} +	} + +	if (chain != NULL) { +		struct nft_stats *stats = NULL; +		struct nft_trans *trans; + +		if (chain->flags & NFT_CHAIN_INACTIVE) +			return -ENOENT; +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EOPNOTSUPP; + +		if (nla[NFTA_CHAIN_HANDLE] && name && +		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) +			return -EEXIST; + +		if (nla[NFTA_CHAIN_COUNTERS]) { +			if (!(chain->flags & NFT_BASE_CHAIN)) +				return -EOPNOTSUPP; + +			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); +			if (IS_ERR(stats)) +				return PTR_ERR(stats); +		} + +		nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); +		trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, +					sizeof(struct nft_trans_chain)); +		if (trans == NULL) +			return -ENOMEM; + +		nft_trans_chain_stats(trans) = stats; +		nft_trans_chain_update(trans) = true; + +		if (nla[NFTA_CHAIN_POLICY]) +			nft_trans_chain_policy(trans) = policy; +		else +			nft_trans_chain_policy(trans) = -1; + +		if (nla[NFTA_CHAIN_HANDLE] && name) { +			nla_strlcpy(nft_trans_chain_name(trans), name, +				    NFT_CHAIN_MAXNAMELEN); +		} +		list_add_tail(&trans->list, &net->nft.commit_list); +		return 0; +	} + +	if (table->use == UINT_MAX) +		return -EOVERFLOW; + +	if (nla[NFTA_CHAIN_HOOK]) { +		const struct nf_chain_type *type; +		struct nf_hook_ops *ops; +		nf_hookfn *hookfn; +		u32 hooknum, priority; + +		type = chain_type[family][NFT_CHAIN_T_DEFAULT]; +		if (nla[NFTA_CHAIN_TYPE]) { +			type = nf_tables_chain_type_lookup(afi, +							   nla[NFTA_CHAIN_TYPE], +							   create); +			if (IS_ERR(type)) +				return PTR_ERR(type); +		} + +		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], +				       nft_hook_policy); +		if (err < 0) +			return err; +		if (ha[NFTA_HOOK_HOOKNUM] == NULL || +		    ha[NFTA_HOOK_PRIORITY] == NULL) +			return -EINVAL; + +		hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); +		if (hooknum >= afi->nhooks) +			return -EINVAL; +		priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + +		if (!(type->hook_mask & (1 << hooknum))) +			return -EOPNOTSUPP; +		if (!try_module_get(type->owner)) +			return -ENOENT; +		hookfn = type->hooks[hooknum]; + +		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); +		if (basechain == NULL) +			return -ENOMEM; + +		if (nla[NFTA_CHAIN_COUNTERS]) { +			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); +			if (IS_ERR(stats)) { +				module_put(type->owner); +				kfree(basechain); +				return PTR_ERR(stats); +			} +			basechain->stats = stats; +		} else { +		
	stats = netdev_alloc_pcpu_stats(struct nft_stats); +			if (stats == NULL) { +				module_put(type->owner); +				kfree(basechain); +				return -ENOMEM; +			} +			rcu_assign_pointer(basechain->stats, stats); +		} + +		basechain->type = type; +		chain = &basechain->chain; + +		for (i = 0; i < afi->nops; i++) { +			ops = &basechain->ops[i]; +			ops->pf		= family; +			ops->owner	= afi->owner; +			ops->hooknum	= hooknum; +			ops->priority	= priority; +			ops->priv	= chain; +			ops->hook	= afi->hooks[ops->hooknum]; +			if (hookfn) +				ops->hook = hookfn; +			if (afi->hook_ops_init) +				afi->hook_ops_init(ops, i); +		} + +		chain->flags |= NFT_BASE_CHAIN; +		basechain->policy = policy; +	} else { +		chain = kzalloc(sizeof(*chain), GFP_KERNEL); +		if (chain == NULL) +			return -ENOMEM; +	} + +	INIT_LIST_HEAD(&chain->rules); +	chain->handle = nf_tables_alloc_handle(table); +	chain->net = net; +	chain->table = table; +	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + +	if (!(table->flags & NFT_TABLE_F_DORMANT) && +	    chain->flags & NFT_BASE_CHAIN) { +		err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); +		if (err < 0) +			goto err1; +	} + +	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); +	err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); +	if (err < 0) +		goto err2; + +	table->use++; +	list_add_tail_rcu(&chain->list, &table->chains); +	return 0; +err2: +	if (!(table->flags & NFT_TABLE_F_DORMANT) && +	    chain->flags & NFT_BASE_CHAIN) { +		nf_unregister_hooks(nft_base_chain(chain)->ops, +				    afi->nops); +	} +err1: +	nf_tables_chain_destroy(chain); +	return err; +} + +static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct nft_chain *chain; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	struct nft_ctx ctx; +	int err; + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); +	if (chain->flags & NFT_CHAIN_INACTIVE) +		return -ENOENT; +	if (chain->use > 0) +		return -EBUSY; + +	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); +	err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); +	if (err < 0) +		return err; + +	table->use--; +	list_del_rcu(&chain->list); +	return 0; +} + +/* + * Expressions + */ + +/** + *	nft_register_expr - register nf_tables expr type + *	@type: expr type + * + *	Registers the expr type for use with nf_tables. Returns zero on + *	success or a negative errno code otherwise. + */ +int nft_register_expr(struct nft_expr_type *type) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	if (type->family == NFPROTO_UNSPEC) +		list_add_tail_rcu(&type->list, &nf_tables_expressions); +	else +		list_add_rcu(&type->list, &nf_tables_expressions); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +	return 0; +} +EXPORT_SYMBOL_GPL(nft_register_expr); + +/** + *	nft_unregister_expr - unregister nf_tables expr type + *	@type: expr type + * + * 	Unregisters the expr type for use with nf_tables.
+ */ +void nft_unregister_expr(struct nft_expr_type *type) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_del_rcu(&type->list); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_expr); + +static const struct nft_expr_type *__nft_expr_type_get(u8 family, +						       struct nlattr *nla) +{ +	const struct nft_expr_type *type; + +	list_for_each_entry(type, &nf_tables_expressions, list) { +		if (!nla_strcmp(nla, type->name) && +		    (!type->family || type->family == family)) +			return type; +	} +	return NULL; +} + +static const struct nft_expr_type *nft_expr_type_get(u8 family, +						     struct nlattr *nla) +{ +	const struct nft_expr_type *type; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	type = __nft_expr_type_get(family, nla); +	if (type != NULL && try_module_get(type->owner)) +		return type; + +#ifdef CONFIG_MODULES +	if (type == NULL) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-expr-%u-%.*s", family, +			       nla_len(nla), (char *)nla_data(nla)); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		if (__nft_expr_type_get(family, nla)) +			return ERR_PTR(-EAGAIN); + +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-expr-%.*s", +			       nla_len(nla), (char *)nla_data(nla)); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		if (__nft_expr_type_get(family, nla)) +			return ERR_PTR(-EAGAIN); +	} +#endif +	return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { +	[NFTA_EXPR_NAME]	= { .type = NLA_STRING }, +	[NFTA_EXPR_DATA]	= { .type = NLA_NESTED }, +}; + +static int nf_tables_fill_expr_info(struct sk_buff *skb, +				    const struct nft_expr *expr) +{ +	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) +		goto nla_put_failure; + +	if (expr->ops->dump) { +		struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA); +		if (data == NULL) +			goto nla_put_failure; +		if (expr->ops->dump(skb, expr) < 0) +			goto nla_put_failure; +		nla_nest_end(skb, data); +	} + +	return skb->len; + +nla_put_failure: +	return -1; +}; + +struct nft_expr_info { +	const struct nft_expr_ops	*ops; +	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nf_tables_expr_parse(const struct nft_ctx *ctx, +				const struct nlattr *nla, +				struct nft_expr_info *info) +{ +	const struct nft_expr_type *type; +	const struct nft_expr_ops *ops; +	struct nlattr *tb[NFTA_EXPR_MAX + 1]; +	int err; + +	err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy); +	if (err < 0) +		return err; + +	type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); +	if (IS_ERR(type)) +		return PTR_ERR(type); + +	if (tb[NFTA_EXPR_DATA]) { +		err = nla_parse_nested(info->tb, type->maxattr, +				       tb[NFTA_EXPR_DATA], type->policy); +		if (err < 0) +			goto err1; +	} else +		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1)); + +	if (type->select_ops != NULL) { +		ops = type->select_ops(ctx, +				       (const struct nlattr * const *)info->tb); +		if (IS_ERR(ops)) { +			err = PTR_ERR(ops); +			goto err1; +		} +	} else +		ops = type->ops; + +	info->ops = ops; +	return 0; + +err1: +	module_put(type->owner); +	return err; +} + +static int nf_tables_newexpr(const struct nft_ctx *ctx, +			     const struct nft_expr_info *info, +			     struct nft_expr *expr) +{ +	const struct nft_expr_ops *ops = info->ops; +	int err; + +	expr->ops = ops; +	if (ops->init) { +		err = ops->init(ctx, expr, (const struct nlattr **)info->tb); +		if (err < 0) +			goto err1; +	} + +	return 0; + +err1: +	expr->ops = NULL; +	return err; +} + 
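For context, the nft_register_expr()/nft_expr_type_get() interface above is what out-of-tree and in-tree expression modules build against. The following is an illustrative sketch only, not part of this patch: a minimal no-op expression module. The "nop" name is hypothetical, and the eval/ops signatures are assumptions modeled on this kernel era's in-tree expressions (e.g. nft_counter); other kernel versions differ.

/* Illustrative sketch; not part of this diff. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>

static void nft_nop_eval(const struct nft_expr *expr,
			 struct nft_data data[NFT_REG_MAX + 1],
			 const struct nft_pktinfo *pkt)
{
	/* Match unconditionally: leave the verdict register untouched,
	 * so evaluation continues with the next expression in the rule.
	 */
}

static struct nft_expr_type nft_nop_type;
static const struct nft_expr_ops nft_nop_ops = {
	.type	= &nft_nop_type,
	.size	= NFT_EXPR_SIZE(0),	/* no per-rule private data */
	.eval	= nft_nop_eval,
};

static struct nft_expr_type nft_nop_type __read_mostly = {
	.name	= "nop",
	.ops	= &nft_nop_ops,
	.owner	= THIS_MODULE,
};

static int __init nft_nop_module_init(void)
{
	return nft_register_expr(&nft_nop_type);
}

static void __exit nft_nop_module_exit(void)
{
	nft_unregister_expr(&nft_nop_type);
}

module_init(nft_nop_module_init);
module_exit(nft_nop_module_exit);

MODULE_LICENSE("GPL");
/* Matches the request_module("nft-expr-%.*s", ...) autoload path above. */
MODULE_ALIAS_NFT_EXPR("nop");

A rule referencing "nop" would then resolve its type via __nft_expr_type_get(), falling back to the nft-expr module alias autoload shown in nft_expr_type_get().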
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx, +				   struct nft_expr *expr) +{ +	if (expr->ops->destroy) +		expr->ops->destroy(ctx, expr); +	module_put(expr->ops->type->owner); +} + +/* + * Rules + */ + +static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, +						u64 handle) +{ +	struct nft_rule *rule; + +	// FIXME: this sucks +	list_for_each_entry(rule, &chain->rules, list) { +		if (handle == rule->handle) +			return rule; +	} + +	return ERR_PTR(-ENOENT); +} + +static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, +					      const struct nlattr *nla) +{ +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); +} + +static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { +	[NFTA_RULE_TABLE]	= { .type = NLA_STRING }, +	[NFTA_RULE_CHAIN]	= { .type = NLA_STRING, +				    .len = NFT_CHAIN_MAXNAMELEN - 1 }, +	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 }, +	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED }, +	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED }, +	[NFTA_RULE_POSITION]	= { .type = NLA_U64 }, +	[NFTA_RULE_USERDATA]	= { .type = NLA_BINARY, +				    .len = NFT_USERDATA_MAXLEN }, +}; + +static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, +				    int event, u32 flags, int family, +				    const struct nft_table *table, +				    const struct nft_chain *chain, +				    const struct nft_rule *rule) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	const struct nft_expr *expr, *next; +	struct nlattr *list; +	const struct nft_rule *prule; +	int type = event | NFNL_SUBSYS_NFTABLES << 8; + +	nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), +			flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) +		goto nla_put_failure; +	if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle))) +		goto nla_put_failure; + +	if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { +		prule = list_entry(rule->list.prev, struct nft_rule, list); +		if (nla_put_be64(skb, NFTA_RULE_POSITION, +				 cpu_to_be64(prule->handle))) +			goto nla_put_failure; +	} + +	list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS); +	if (list == NULL) +		goto nla_put_failure; +	nft_rule_for_each_expr(expr, next, rule) { +		struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM); +		if (elem == NULL) +			goto nla_put_failure; +		if (nf_tables_fill_expr_info(skb, expr) < 0) +			goto nla_put_failure; +		nla_nest_end(skb, elem); +	} +	nla_nest_end(skb, list); + +	if (rule->ulen && +	    nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_rule_notify(const struct nft_ctx *ctx, +				 const struct nft_rule *rule, +				 int event) +{ +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, +				       ctx->afi->family, ctx->table, +				       ctx->chain, rule); +	if (err < 0) { +		
kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, +			     ctx->report, GFP_KERNEL); +err: +	if (err < 0) { +		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, +				  err); +	} +	return err; +} + +static inline bool +nft_rule_is_active(struct net *net, const struct nft_rule *rule) +{ +	return (rule->genmask & (1 << net->nft.gencursor)) == 0; +} + +static inline int gencursor_next(struct net *net) +{ +	return net->nft.gencursor+1 == 1 ? 1 : 0; +} + +static inline int +nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) +{ +	return (rule->genmask & (1 << gencursor_next(net))) == 0; +} + +static inline void +nft_rule_activate_next(struct net *net, struct nft_rule *rule) +{ +	/* Now inactive, will be active in the future */ +	rule->genmask = (1 << net->nft.gencursor); +} + +static inline void +nft_rule_disactivate_next(struct net *net, struct nft_rule *rule) +{ +	rule->genmask = (1 << gencursor_next(net)); +} + +static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) +{ +	rule->genmask = 0; +} + +static int nf_tables_dump_rules(struct sk_buff *skb, +				struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	const struct nft_rule *rule; +	unsigned int idx = 0, s_idx = cb->args[0]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (family != NFPROTO_UNSPEC && family != afi->family) +			continue; + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			list_for_each_entry_rcu(chain, &table->chains, list) { +				list_for_each_entry_rcu(rule, &chain->rules, list) { +					if (!nft_rule_is_active(net, rule)) +						goto cont; +					if (idx < s_idx) +						goto cont; +					if (idx > s_idx) +						memset(&cb->args[1], 0, +						       sizeof(cb->args) - sizeof(cb->args[0])); +					if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, +								      cb->nlh->nlmsg_seq, +								      NFT_MSG_NEWRULE, +								      NLM_F_MULTI | NLM_F_APPEND, +								      afi->family, table, chain, rule) < 0) +						goto done; + +					nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +					idx++; +				} +			} +		} +	} +done: +	rcu_read_unlock(); + +	cb->args[0] = idx; +	return skb->len; +} + +static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	const struct nft_rule *rule; +	struct sk_buff *skb2; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	int err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_rules, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); +	if (chain->flags & NFT_CHAIN_INACTIVE) +		return -ENOENT; + +	
rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); +	if (IS_ERR(rule)) +		return PTR_ERR(rule); + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb2) +		return -ENOMEM; + +	err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, +				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, +				       family, table, chain, rule); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static void nf_tables_rule_destroy(const struct nft_ctx *ctx, +				   struct nft_rule *rule) +{ +	struct nft_expr *expr; + +	/* +	 * Careful: some expressions might not be initialized in case this +	 * is called on error from nf_tables_newrule(). +	 */ +	expr = nft_expr_first(rule); +	while (expr->ops && expr != nft_expr_last(rule)) { +		nf_tables_expr_destroy(ctx, expr); +		expr = nft_expr_next(expr); +	} +	kfree(rule); +} + +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, +					    struct nft_rule *rule) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); +	if (trans == NULL) +		return NULL; + +	nft_trans_rule(trans) = rule; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); + +	return trans; +} + +#define NFT_RULE_MAXEXPRS	128 + +static struct nft_expr_info *info; + +static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct net *net = sock_net(skb->sk); +	struct nft_table *table; +	struct nft_chain *chain; +	struct nft_rule *rule, *old_rule = NULL; +	struct nft_trans *trans = NULL; +	struct nft_expr *expr; +	struct nft_ctx ctx; +	struct nlattr *tmp; +	unsigned int size, i, n, ulen = 0; +	int err, rem; +	bool create; +	u64 handle, pos_handle; + +	create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + +	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); + +	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); + +	if (nla[NFTA_RULE_HANDLE]) { +		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); +		rule = __nf_tables_rule_lookup(chain, handle); +		if (IS_ERR(rule)) +			return PTR_ERR(rule); + +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			old_rule = rule; +		else +			return -EOPNOTSUPP; +	} else { +		if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EINVAL; +		handle = nf_tables_alloc_handle(table); + +		if (chain->use == UINT_MAX) +			return -EOVERFLOW; +	} + +	if (nla[NFTA_RULE_POSITION]) { +		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) +			return -EOPNOTSUPP; + +		pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); +		old_rule = __nf_tables_rule_lookup(chain, pos_handle); +		if (IS_ERR(old_rule)) +			return PTR_ERR(old_rule); +	} + +	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + +	n = 0; +	size = 0; +	if (nla[NFTA_RULE_EXPRESSIONS]) { +		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { +			err = -EINVAL; +			if (nla_type(tmp) != NFTA_LIST_ELEM) +				goto err1; +			if (n == NFT_RULE_MAXEXPRS) +				goto err1; +			err = nf_tables_expr_parse(&ctx, tmp, &info[n]); +			if (err < 0) +				goto err1; +			size += info[n].ops->size; +			n++; +		} +	} + +	if (nla[NFTA_RULE_USERDATA]) +		ulen = nla_len(nla[NFTA_RULE_USERDATA]); + +	err = -ENOMEM; +	rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); +	if (rule == NULL) +		goto err1; + +	nft_rule_activate_next(net, rule); + +	rule->handle = handle; +	rule->dlen   = size; +	rule->ulen   = ulen; + +	if (ulen) +		nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + +	expr = nft_expr_first(rule); +	for (i = 0; i < n; i++) { +		err = nf_tables_newexpr(&ctx, &info[i], expr); +		if (err < 0) +			goto err2; +		info[i].ops = NULL; +		expr = nft_expr_next(expr); +	} + +	if (nlh->nlmsg_flags & NLM_F_REPLACE) { +		if (nft_rule_is_active_next(net, old_rule)) { +			trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, +						   old_rule); +			if (trans == NULL) { +				err = -ENOMEM; +				goto err2; +			} +			nft_rule_disactivate_next(net, old_rule); +			chain->use--; +			list_add_tail_rcu(&rule->list, &old_rule->list); +		} else { +			err = -ENOENT; +			goto err2; +		} +	} else if (nlh->nlmsg_flags & NLM_F_APPEND) { +		if (old_rule) +			list_add_rcu(&rule->list, &old_rule->list); +		else +			list_add_tail_rcu(&rule->list, &chain->rules); +	} else { +		if (old_rule) +			list_add_tail_rcu(&rule->list, &old_rule->list); +		else +			list_add_rcu(&rule->list, &chain->rules); +	} + +	if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { +		err = -ENOMEM; +		goto err3; +	} +	chain->use++; +	return 0; + +err3: +	list_del_rcu(&rule->list); +	if (trans) { +		list_del_rcu(&nft_trans_rule(trans)->list); +		nft_rule_clear(net, nft_trans_rule(trans)); +		nft_trans_destroy(trans); +		chain->use++; +	} +err2: +	nf_tables_rule_destroy(&ctx, rule); +err1: +	for (i = 0; i < n; i++) { +		if (info[i].ops != NULL) +			module_put(info[i].ops->type->owner); +	} +	return err; +} + +static int +nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) +{ +	/* You cannot delete the same rule twice */ +	if
(nft_rule_is_active_next(ctx->net, rule)) { +		if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) +			return -ENOMEM; +		nft_rule_disactivate_next(ctx->net, rule); +		ctx->chain->use--; +		return 0; +	} +	return -ENOENT; +} + +static int nf_table_delrule_by_chain(struct nft_ctx *ctx) +{ +	struct nft_rule *rule; +	int err; + +	list_for_each_entry(rule, &ctx->chain->rules, list) { +		err = nf_tables_delrule_one(ctx, rule); +		if (err < 0) +			return err; +	} +	return 0; +} + +static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct net *net = sock_net(skb->sk); +	struct nft_table *table; +	struct nft_chain *chain = NULL; +	struct nft_rule *rule; +	int family = nfmsg->nfgen_family, err = 0; +	struct nft_ctx ctx; + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	if (nla[NFTA_RULE_CHAIN]) { +		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); +		if (IS_ERR(chain)) +			return PTR_ERR(chain); +	} + +	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + +	if (chain) { +		if (nla[NFTA_RULE_HANDLE]) { +			rule = nf_tables_rule_lookup(chain, +						     nla[NFTA_RULE_HANDLE]); +			if (IS_ERR(rule)) +				return PTR_ERR(rule); + +			err = nf_tables_delrule_one(&ctx, rule); +		} else { +			err = nf_table_delrule_by_chain(&ctx); +		} +	} else { +		list_for_each_entry(chain, &table->chains, list) { +			ctx.chain = chain; +			err = nf_table_delrule_by_chain(&ctx); +			if (err < 0) +				break; +		} +	} + +	return err; +} + +/* + * Sets + */ + +static LIST_HEAD(nf_tables_set_ops); + +int nft_register_set(struct nft_set_ops *ops) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_add_tail_rcu(&ops->list, &nf_tables_set_ops); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +	return 0; +} +EXPORT_SYMBOL_GPL(nft_register_set); + +void nft_unregister_set(struct nft_set_ops *ops) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_del_rcu(&ops->list); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_set); + +/* + * Select a set implementation based on the data characteristics and the + * given policy. The total memory use might not be known if no size is + * given, in that case the amount of memory per element is used. 
+ */ +static const struct nft_set_ops * +nft_select_set_ops(const struct nlattr * const nla[], +		   const struct nft_set_desc *desc, +		   enum nft_set_policies policy) +{ +	const struct nft_set_ops *ops, *bops; +	struct nft_set_estimate est, best; +	u32 features; + +#ifdef CONFIG_MODULES +	if (list_empty(&nf_tables_set_ops)) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-set"); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		if (!list_empty(&nf_tables_set_ops)) +			return ERR_PTR(-EAGAIN); +	} +#endif +	features = 0; +	if (nla[NFTA_SET_FLAGS] != NULL) { +		features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); +		features &= NFT_SET_INTERVAL | NFT_SET_MAP; +	} + +	bops	   = NULL; +	best.size  = ~0; +	best.class = ~0; + +	list_for_each_entry(ops, &nf_tables_set_ops, list) { +		if ((ops->features & features) != features) +			continue; +		if (!ops->estimate(desc, features, &est)) +			continue; + +		switch (policy) { +		case NFT_SET_POL_PERFORMANCE: +			if (est.class < best.class) +				break; +			if (est.class == best.class && est.size < best.size) +				break; +			continue; +		case NFT_SET_POL_MEMORY: +			if (est.size < best.size) +				break; +			if (est.size == best.size && est.class < best.class) +				break; +			continue; +		default: +			break; +		} + +		if (!try_module_get(ops->owner)) +			continue; +		if (bops != NULL) +			module_put(bops->owner); + +		bops = ops; +		best = est; +	} + +	if (bops != NULL) +		return bops; + +	return ERR_PTR(-EOPNOTSUPP); +} + +static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { +	[NFTA_SET_TABLE]		= { .type = NLA_STRING }, +	[NFTA_SET_NAME]			= { .type = NLA_STRING, +					    .len = IFNAMSIZ - 1 }, +	[NFTA_SET_FLAGS]		= { .type = NLA_U32 }, +	[NFTA_SET_KEY_TYPE]		= { .type = NLA_U32 }, +	[NFTA_SET_KEY_LEN]		= { .type = NLA_U32 }, +	[NFTA_SET_DATA_TYPE]		= { .type = NLA_U32 }, +	[NFTA_SET_DATA_LEN]		= { .type = NLA_U32 }, +	[NFTA_SET_POLICY]		= { .type = NLA_U32 }, +	[NFTA_SET_DESC]			= { .type = NLA_NESTED }, +	[NFTA_SET_ID]			= { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { +	[NFTA_SET_DESC_SIZE]		= { .type = NLA_U32 }, +}; + +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, +				     const struct sk_buff *skb, +				     const struct nlmsghdr *nlh, +				     const struct nlattr * const nla[]) +{ +	struct net *net = sock_net(skb->sk); +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi = NULL; +	struct nft_table *table = NULL; + +	if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { +		afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); +		if (IS_ERR(afi)) +			return PTR_ERR(afi); +	} + +	if (nla[NFTA_SET_TABLE] != NULL) { +		if (afi == NULL) +			return -EAFNOSUPPORT; + +		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); +		if (IS_ERR(table)) +			return PTR_ERR(table); +		if (table->flags & NFT_TABLE_INACTIVE) +			return -ENOENT; +	} + +	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); +	return 0; +} + +struct nft_set *nf_tables_set_lookup(const struct nft_table *table, +				     const struct nlattr *nla) +{ +	struct nft_set *set; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	list_for_each_entry(set, &table->sets, list) { +		if (!nla_strcmp(nla, set->name)) +			return set; +	} +	return ERR_PTR(-ENOENT); +} + +struct nft_set *nf_tables_set_lookup_byid(const struct net *net, +					  const struct nlattr *nla) +{ +	struct nft_trans *trans; +	u32 id = ntohl(nla_get_be32(nla)); + +	list_for_each_entry(trans, 
&net->nft.commit_list, list) { +		if (trans->msg_type == NFT_MSG_NEWSET && +		    id == nft_trans_set_id(trans)) +			return nft_trans_set(trans); +	} +	return ERR_PTR(-ENOENT); +} + +static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, +				    const char *name) +{ +	const struct nft_set *i; +	const char *p; +	unsigned long *inuse; +	unsigned int n = 0, min = 0; + +	p = strnchr(name, IFNAMSIZ, '%'); +	if (p != NULL) { +		if (p[1] != 'd' || strchr(p + 2, '%')) +			return -EINVAL; + +		inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); +		if (inuse == NULL) +			return -ENOMEM; +cont: +		list_for_each_entry(i, &ctx->table->sets, list) { +			int tmp; + +			if (!sscanf(i->name, name, &tmp)) +				continue; +			if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) +				continue; + +			set_bit(tmp - min, inuse); +		} + +		n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); +		if (n >= BITS_PER_BYTE * PAGE_SIZE) { +			min += BITS_PER_BYTE * PAGE_SIZE; +			memset(inuse, 0, PAGE_SIZE); +			goto cont; +		} +		free_page((unsigned long)inuse); +	} + +	snprintf(set->name, sizeof(set->name), name, min + n); +	list_for_each_entry(i, &ctx->table->sets, list) { +		if (!strcmp(set->name, i->name)) +			return -ENFILE; +	} +	return 0; +} + +static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, +			      const struct nft_set *set, u16 event, u16 flags) +{ +	struct nfgenmsg *nfmsg; +	struct nlmsghdr *nlh; +	struct nlattr *desc; +	u32 portid = ctx->portid; +	u32 seq = ctx->seq; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), +			flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= ctx->afi->family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_SET_NAME, set->name)) +		goto nla_put_failure; +	if (set->flags != 0) +		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) +			goto nla_put_failure; + +	if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen))) +		goto nla_put_failure; +	if (set->flags & NFT_SET_MAP) { +		if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) +			goto nla_put_failure; +	} + +	desc = nla_nest_start(skb, NFTA_SET_DESC); +	if (desc == NULL) +		goto nla_put_failure; +	if (set->size && +	    nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) +		goto nla_put_failure; +	nla_nest_end(skb, desc); + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_set_notify(const struct nft_ctx *ctx, +				const struct nft_set *set, +				int event, gfp_t gfp_flags) +{ +	struct sk_buff *skb; +	u32 portid = ctx->portid; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_set(skb, ctx, set, event, 0); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, +			     ctx->report, gfp_flags); +err: +	if (err < 0) +		nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); +	return err; +} + +static int 
nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, +				     struct netlink_callback *cb) +{ +	const struct nft_set *set; +	unsigned int idx = 0, s_idx = cb->args[0]; + +	if (cb->args[1]) +		return skb->len; + +	rcu_read_lock(); +	cb->seq = ctx->net->nft.base_seq; + +	list_for_each_entry_rcu(set, &ctx->table->sets, list) { +		if (idx < s_idx) +			goto cont; +		if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, +				       NLM_F_MULTI) < 0) { +			cb->args[0] = idx; +			goto done; +		} +		nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +		idx++; +	} +	cb->args[1] = 1; +done: +	rcu_read_unlock(); +	return skb->len; +} + +static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, +				      struct netlink_callback *cb) +{ +	const struct nft_set *set; +	unsigned int idx, s_idx = cb->args[0]; +	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + +	if (cb->args[1]) +		return skb->len; + +	rcu_read_lock(); +	cb->seq = ctx->net->nft.base_seq; + +	list_for_each_entry_rcu(table, &ctx->afi->tables, list) { +		if (cur_table) { +			if (cur_table != table) +				continue; + +			cur_table = NULL; +		} +		ctx->table = table; +		idx = 0; +		list_for_each_entry_rcu(set, &ctx->table->sets, list) { +			if (idx < s_idx) +				goto cont; +			if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, +					       NLM_F_MULTI) < 0) { +				cb->args[0] = idx; +				cb->args[2] = (unsigned long) table; +				goto done; +			} +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +	} +	cb->args[1] = 1; +done: +	rcu_read_unlock(); +	return skb->len; +} + +static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, +				   struct netlink_callback *cb) +{ +	const struct nft_set *set; +	unsigned int idx, s_idx = cb->args[0]; +	struct nft_af_info *afi; +	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; +	struct net *net = sock_net(skb->sk); +	int cur_family = cb->args[3]; + +	if (cb->args[1]) +		return skb->len; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (cur_family) { +			if (afi->family != cur_family) +				continue; + +			cur_family = 0; +		} + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			if (cur_table) { +				if (cur_table != table) +					continue; + +				cur_table = NULL; +			} + +			ctx->table = table; +			ctx->afi = afi; +			idx = 0; +			list_for_each_entry_rcu(set, &ctx->table->sets, list) { +				if (idx < s_idx) +					goto cont; +				if (nf_tables_fill_set(skb, ctx, set, +						       NFT_MSG_NEWSET, +						       NLM_F_MULTI) < 0) { +					cb->args[0] = idx; +					cb->args[2] = (unsigned long) table; +					cb->args[3] = afi->family; +					goto done; +				} +				nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +				idx++; +			} +			if (s_idx) +				s_idx = 0; +		} +	} +	cb->args[1] = 1; +done: +	rcu_read_unlock(); +	return skb->len; +} + +static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	struct nlattr *nla[NFTA_SET_MAX + 1]; +	struct nft_ctx ctx; +	int err, ret; + +	err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX, +			  nft_set_policy); +	if (err < 0) +		return err; + +	err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla); +	if (err < 0) +		return err; + +	if (ctx.table == NULL) { +		if (ctx.afi == NULL) +			ret = nf_tables_dump_sets_all(&ctx, skb, cb); +		else +			ret = 
nf_tables_dump_sets_family(&ctx, skb, cb); +	} else +		ret = nf_tables_dump_sets_table(&ctx, skb, cb); + +	return ret; +} + +#define NFT_SET_INACTIVE	(1 << 15)	/* Internal set flag */ + +static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, +			    const struct nlmsghdr *nlh, +			    const struct nlattr * const nla[]) +{ +	const struct nft_set *set; +	struct nft_ctx ctx; +	struct sk_buff *skb2; +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	int err; + +	/* Verify existence before starting dump */ +	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); +	if (err < 0) +		return err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_sets, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	/* Only accept unspec with dump */ +	if (nfmsg->nfgen_family == NFPROTO_UNSPEC) +		return -EAFNOSUPPORT; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		return -ENOENT; + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, +				    struct nft_set_desc *desc, +				    const struct nlattr *nla) +{ +	struct nlattr *da[NFTA_SET_DESC_MAX + 1]; +	int err; + +	err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy); +	if (err < 0) +		return err; + +	if (da[NFTA_SET_DESC_SIZE] != NULL) +		desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + +	return 0; +} + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, +			     struct nft_set *set) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); +	if (trans == NULL) +		return -ENOMEM; + +	if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { +		nft_trans_set_id(trans) = +			ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); +		set->flags |= NFT_SET_INACTIVE; +	} +	nft_trans_set(trans) = set; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); + +	return 0; +} + +static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, +			    const struct nlmsghdr *nlh, +			    const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_set_ops *ops; +	struct nft_af_info *afi; +	struct net *net = sock_net(skb->sk); +	struct nft_table *table; +	struct nft_set *set; +	struct nft_ctx ctx; +	char name[IFNAMSIZ]; +	unsigned int size; +	bool create; +	u32 ktype, dtype, flags, policy; +	struct nft_set_desc desc; +	int err; + +	if (nla[NFTA_SET_TABLE] == NULL || +	    nla[NFTA_SET_NAME] == NULL || +	    nla[NFTA_SET_KEY_LEN] == NULL || +	    nla[NFTA_SET_ID] == NULL) +		return -EINVAL; + +	memset(&desc, 0, sizeof(desc)); + +	ktype = NFT_DATA_VALUE; +	if (nla[NFTA_SET_KEY_TYPE] != NULL) { +		ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); +		if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) +			return -EINVAL; +	} + +	desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); +	if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) +		return -EINVAL; + +	flags = 0; +	if (nla[NFTA_SET_FLAGS] != NULL) { +		flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); +		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | +			      NFT_SET_INTERVAL
| NFT_SET_MAP)) +			return -EINVAL; +	} + +	dtype = 0; +	if (nla[NFTA_SET_DATA_TYPE] != NULL) { +		if (!(flags & NFT_SET_MAP)) +			return -EINVAL; + +		dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); +		if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && +		    dtype != NFT_DATA_VERDICT) +			return -EINVAL; + +		if (dtype != NFT_DATA_VERDICT) { +			if (nla[NFTA_SET_DATA_LEN] == NULL) +				return -EINVAL; +			desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); +			if (desc.dlen == 0 || +			    desc.dlen > FIELD_SIZEOF(struct nft_data, data)) +				return -EINVAL; +		} else +			desc.dlen = sizeof(struct nft_data); +	} else if (flags & NFT_SET_MAP) +		return -EINVAL; + +	policy = NFT_SET_POL_PERFORMANCE; +	if (nla[NFTA_SET_POLICY] != NULL) +		policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + +	if (nla[NFTA_SET_DESC] != NULL) { +		err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); +		if (err < 0) +			return err; +	} + +	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + +	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); + +	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + +	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); +	if (IS_ERR(set)) { +		if (PTR_ERR(set) != -ENOENT) +			return PTR_ERR(set); +		set = NULL; +	} + +	if (set != NULL) { +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EOPNOTSUPP; +		return 0; +	} + +	if (!(nlh->nlmsg_flags & NLM_F_CREATE)) +		return -ENOENT; + +	ops = nft_select_set_ops(nla, &desc, policy); +	if (IS_ERR(ops)) +		return PTR_ERR(ops); + +	size = 0; +	if (ops->privsize != NULL) +		size = ops->privsize(nla); + +	err = -ENOMEM; +	set = kzalloc(sizeof(*set) + size, GFP_KERNEL); +	if (set == NULL) +		goto err1; + +	nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); +	err = nf_tables_set_alloc_name(&ctx, set, name); +	if (err < 0) +		goto err2; + +	INIT_LIST_HEAD(&set->bindings); +	set->ops   = ops; +	set->ktype = ktype; +	set->klen  = desc.klen; +	set->dtype = dtype; +	set->dlen  = desc.dlen; +	set->flags = flags; +	set->size  = desc.size; + +	err = ops->init(set, &desc, nla); +	if (err < 0) +		goto err2; + +	err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); +	if (err < 0) +		goto err2; + +	list_add_tail_rcu(&set->list, &table->sets); +	table->use++; +	return 0; + +err2: +	kfree(set); +err1: +	module_put(ops->owner); +	return err; +} + +static void nft_set_destroy(struct nft_set *set) +{ +	set->ops->destroy(set); +	module_put(set->ops->owner); +	kfree(set); +} + +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ +	list_del_rcu(&set->list); +	nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); +	nft_set_destroy(set); +} + +static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, +			    const struct nlmsghdr *nlh, +			    const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_set *set; +	struct nft_ctx ctx; +	int err; + +	if (nfmsg->nfgen_family == NFPROTO_UNSPEC) +		return -EAFNOSUPPORT; +	if (nla[NFTA_SET_TABLE] == NULL) +		return -EINVAL; + +	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		
return -ENOENT; +	if (!list_empty(&set->bindings)) +		return -EBUSY; + +	err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); +	if (err < 0) +		return err; + +	list_del_rcu(&set->list); +	ctx.table->use--; +	return 0; +} + +static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, +					const struct nft_set *set, +					const struct nft_set_iter *iter, +					const struct nft_set_elem *elem) +{ +	enum nft_registers dreg; + +	dreg = nft_type_to_reg(set->dtype); +	return nft_validate_data_load(ctx, dreg, &elem->data, +				      set->dtype == NFT_DATA_VERDICT ? +				      NFT_DATA_VERDICT : NFT_DATA_VALUE); +} + +int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, +		       struct nft_set_binding *binding) +{ +	struct nft_set_binding *i; +	struct nft_set_iter iter; + +	if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) +		return -EBUSY; + +	if (set->flags & NFT_SET_MAP) { +		/* If the set is already bound to the same chain all +		 * jumps are already validated for that chain. +		 */ +		list_for_each_entry(i, &set->bindings, list) { +			if (i->chain == binding->chain) +				goto bind; +		} + +		iter.skip 	= 0; +		iter.count	= 0; +		iter.err	= 0; +		iter.fn		= nf_tables_bind_check_setelem; + +		set->ops->walk(ctx, set, &iter); +		if (iter.err < 0) { +			/* Destroy anonymous sets if binding fails */ +			if (set->flags & NFT_SET_ANONYMOUS) +				nf_tables_set_destroy(ctx, set); + +			return iter.err; +		} +	} +bind: +	binding->chain = ctx->chain; +	list_add_tail_rcu(&binding->list, &set->bindings); +	return 0; +} + +void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, +			  struct nft_set_binding *binding) +{ +	list_del_rcu(&binding->list); + +	if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && +	    !(set->flags & NFT_SET_INACTIVE)) +		nf_tables_set_destroy(ctx, set); +} + +/* + * Set elements + */ + +static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { +	[NFTA_SET_ELEM_KEY]		= { .type = NLA_NESTED }, +	[NFTA_SET_ELEM_DATA]		= { .type = NLA_NESTED }, +	[NFTA_SET_ELEM_FLAGS]		= { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { +	[NFTA_SET_ELEM_LIST_TABLE]	= { .type = NLA_STRING }, +	[NFTA_SET_ELEM_LIST_SET]	= { .type = NLA_STRING }, +	[NFTA_SET_ELEM_LIST_ELEMENTS]	= { .type = NLA_NESTED }, +	[NFTA_SET_ELEM_LIST_SET_ID]	= { .type = NLA_U32 }, +}; + +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +				      const struct sk_buff *skb, +				      const struct nlmsghdr *nlh, +				      const struct nlattr * const nla[], +				      bool trans) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct net *net = sock_net(skb->sk); + +	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (!trans && (table->flags & NFT_TABLE_INACTIVE)) +		return -ENOENT; + +	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); +	return 0; +} + +static int nf_tables_fill_setelem(struct sk_buff *skb, +				  const struct nft_set *set, +				  const struct nft_set_elem *elem) +{ +	unsigned char *b = skb_tail_pointer(skb); +	struct nlattr *nest; + +	nest = nla_nest_start(skb, NFTA_LIST_ELEM); +	if (nest == NULL) +		goto nla_put_failure; + +	if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE, +			  
set->klen) < 0) +		goto nla_put_failure; + +	if (set->flags & NFT_SET_MAP && +	    !(elem->flags & NFT_SET_ELEM_INTERVAL_END) && +	    nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data, +			  set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, +			  set->dlen) < 0) +		goto nla_put_failure; + +	if (elem->flags != 0) +		if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags))) +			goto nla_put_failure; + +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	nlmsg_trim(skb, b); +	return -EMSGSIZE; +} + +struct nft_set_dump_args { +	const struct netlink_callback	*cb; +	struct nft_set_iter		iter; +	struct sk_buff			*skb; +}; + +static int nf_tables_dump_setelem(const struct nft_ctx *ctx, +				  const struct nft_set *set, +				  const struct nft_set_iter *iter, +				  const struct nft_set_elem *elem) +{ +	struct nft_set_dump_args *args; + +	args = container_of(iter, struct nft_set_dump_args, iter); +	return nf_tables_fill_setelem(args->skb, set, elem); +} + +static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) +{ +	const struct nft_set *set; +	struct nft_set_dump_args args; +	struct nft_ctx ctx; +	struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; +	struct nfgenmsg *nfmsg; +	struct nlmsghdr *nlh; +	struct nlattr *nest; +	u32 portid, seq; +	int event, err; + +	err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, +			  NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy); +	if (err < 0) +		return err; + +	err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, +					 false); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		return -ENOENT; + +	event  = NFT_MSG_NEWSETELEM; +	event |= NFNL_SUBSYS_NFTABLES << 8; +	portid = NETLINK_CB(cb->skb).portid; +	seq    = cb->nlh->nlmsg_seq; + +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), +			NLM_F_MULTI); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = ctx.afi->family; +	nfmsg->version      = NFNETLINK_V0; +	nfmsg->res_id       = 0; + +	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) +		goto nla_put_failure; + +	nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); +	if (nest == NULL) +		goto nla_put_failure; + +	args.cb		= cb; +	args.skb	= skb; +	args.iter.skip	= cb->args[0]; +	args.iter.count	= 0; +	args.iter.err   = 0; +	args.iter.fn	= nf_tables_dump_setelem; +	set->ops->walk(&ctx, set, &args.iter); + +	nla_nest_end(skb, nest); +	nlmsg_end(skb, nlh); + +	if (args.iter.err && args.iter.err != -EMSGSIZE) +		return args.iter.err; +	if (args.iter.count == cb->args[0]) +		return 0; + +	cb->args[0] = args.iter.count; +	return skb->len; + +nla_put_failure: +	return -ENOSPC; +} + +static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				const struct nlattr * const nla[]) +{ +	const struct nft_set *set; +	struct nft_ctx ctx; +	int err; + +	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		return -ENOENT; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_set, +		}; +		return 
netlink_dump_start(nlsk, skb, nlh, &c); +	} +	return -EOPNOTSUPP; +} + +static int nf_tables_fill_setelem_info(struct sk_buff *skb, +				       const struct nft_ctx *ctx, u32 seq, +				       u32 portid, int event, u16 flags, +				       const struct nft_set *set, +				       const struct nft_set_elem *elem) +{ +	struct nfgenmsg *nfmsg; +	struct nlmsghdr *nlh; +	struct nlattr *nest; +	int err; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), +			flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= ctx->afi->family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_SET_NAME, set->name)) +		goto nla_put_failure; + +	nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); +	if (nest == NULL) +		goto nla_put_failure; + +	err = nf_tables_fill_setelem(skb, set, elem); +	if (err < 0) +		goto nla_put_failure; + +	nla_nest_end(skb, nest); + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, +				    const struct nft_set *set, +				    const struct nft_set_elem *elem, +				    int event, u16 flags) +{ +	struct net *net = ctx->net; +	u32 portid = ctx->portid; +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, +					  set, elem); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, +			     GFP_KERNEL); +err: +	if (err < 0) +		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); +	return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, +					      int msg_type, +					      struct nft_set *set) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); +	if (trans == NULL) +		return NULL; + +	nft_trans_elem_set(trans) = set; +	return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, +			    const struct nlattr *attr) +{ +	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; +	struct nft_data_desc d1, d2; +	struct nft_set_elem elem; +	struct nft_set_binding *binding; +	enum nft_registers dreg; +	struct nft_trans *trans; +	int err; + +	if (set->size && set->nelems == set->size) +		return -ENFILE; + +	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, +			       nft_set_elem_policy); +	if (err < 0) +		return err; + +	if (nla[NFTA_SET_ELEM_KEY] == NULL) +		return -EINVAL; + +	elem.flags = 0; +	if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { +		elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); +		if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END) +			return -EINVAL; +	} + +	if (set->flags & NFT_SET_MAP) { +		if (nla[NFTA_SET_ELEM_DATA] == NULL && +		    !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) +			return -EINVAL; +		if (nla[NFTA_SET_ELEM_DATA] != NULL && +		    elem.flags & NFT_SET_ELEM_INTERVAL_END) +			return -EINVAL; +	} else { +		if (nla[NFTA_SET_ELEM_DATA] != NULL) +			return -EINVAL; +	} + +	err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); +	if (err < 0) +		goto err1; +	err = -EINVAL; +	if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) +		
goto err2; + +	err = -EEXIST; +	if (set->ops->get(set, &elem) == 0) +		goto err2; + +	if (nla[NFTA_SET_ELEM_DATA] != NULL) { +		err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]); +		if (err < 0) +			goto err2; + +		err = -EINVAL; +		if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) +			goto err3; + +		dreg = nft_type_to_reg(set->dtype); +		list_for_each_entry(binding, &set->bindings, list) { +			struct nft_ctx bind_ctx = { +				.afi	= ctx->afi, +				.table	= ctx->table, +				.chain	= (struct nft_chain *)binding->chain, +			}; + +			err = nft_validate_data_load(&bind_ctx, dreg, +						     &elem.data, d2.type); +			if (err < 0) +				goto err3; +		} +	} + +	err = -ENOMEM; +	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); +	if (trans == NULL) +		goto err3; + +	err = set->ops->insert(set, &elem); +	if (err < 0) +		goto err4; + +	nft_trans_elem(trans) = elem; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; + +err4: +	kfree(trans); +err3: +	if (nla[NFTA_SET_ELEM_DATA] != NULL) +		nft_data_uninit(&elem.data, d2.type); +err2: +	nft_data_uninit(&elem.key, d1.type); +err1: +	return err; +} + +static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				const struct nlattr * const nla[]) +{ +	struct net *net = sock_net(skb->sk); +	const struct nlattr *attr; +	struct nft_set *set; +	struct nft_ctx ctx; +	int rem, err = 0; + +	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) { +		if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { +			set = nf_tables_set_lookup_byid(net, +					nla[NFTA_SET_ELEM_LIST_SET_ID]); +		} +		if (IS_ERR(set)) +			return PTR_ERR(set); +	} + +	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) +		return -EBUSY; + +	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { +		err = nft_add_set_elem(&ctx, set, attr); +		if (err < 0) +			break; + +		set->nelems++; +	} +	return err; +} + +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, +			   const struct nlattr *attr) +{ +	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; +	struct nft_data_desc desc; +	struct nft_set_elem elem; +	struct nft_trans *trans; +	int err; + +	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, +			       nft_set_elem_policy); +	if (err < 0) +		goto err1; + +	err = -EINVAL; +	if (nla[NFTA_SET_ELEM_KEY] == NULL) +		goto err1; + +	err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]); +	if (err < 0) +		goto err1; + +	err = -EINVAL; +	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) +		goto err2; + +	err = set->ops->get(set, &elem); +	if (err < 0) +		goto err2; + +	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); +	if (trans == NULL) { +		err = -ENOMEM; +		goto err2; +	} + +	nft_trans_elem(trans) = elem; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); + +	nft_data_uninit(&elem.key, NFT_DATA_VALUE); +	if (set->flags & NFT_SET_MAP) +		nft_data_uninit(&elem.data, set->dtype); + +err2: +	nft_data_uninit(&elem.key, desc.type); +err1: +	return err; +} + +static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				const struct nlattr * const nla[]) +{ +	const struct nlattr *attr; +	struct nft_set *set; +	struct nft_ctx ctx; +	int rem, err = 0; + +	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table,
nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+		return -EBUSY;
+
+	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+		err = nft_del_setelem(&ctx, set, attr);
+		if (err < 0)
+			break;
+
+		set->nelems--;
+	}
+	return err;
+}
+
+static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
+	[NFT_MSG_NEWTABLE] = {
+		.call_batch	= nf_tables_newtable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_GETTABLE] = {
+		.call		= nf_tables_gettable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_DELTABLE] = {
+		.call_batch	= nf_tables_deltable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_NEWCHAIN] = {
+		.call_batch	= nf_tables_newchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_GETCHAIN] = {
+		.call		= nf_tables_getchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_DELCHAIN] = {
+		.call_batch	= nf_tables_delchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_NEWRULE] = {
+		.call_batch	= nf_tables_newrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_GETRULE] = {
+		.call		= nf_tables_getrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_DELRULE] = {
+		.call_batch	= nf_tables_delrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_NEWSET] = {
+		.call_batch	= nf_tables_newset,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
+	[NFT_MSG_GETSET] = {
+		.call		= nf_tables_getset,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
+	[NFT_MSG_DELSET] = {
+		.call_batch	= nf_tables_delset,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
+	[NFT_MSG_NEWSETELEM] = {
+		.call_batch	= nf_tables_newsetelem,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
+	[NFT_MSG_GETSETELEM] = {
+		.call		= nf_tables_getsetelem,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
+	[NFT_MSG_DELSETELEM] = {
+		.call_batch	= nf_tables_delsetelem,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
+};
+
+static void nft_chain_commit_update(struct nft_trans *trans)
+{
+	struct nft_base_chain *basechain;
+
+	if (nft_trans_chain_name(trans)[0])
+		strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+
+	if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN))
+		return;
+
+	basechain = nft_base_chain(trans->ctx.chain);
+	nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+
+	switch (nft_trans_chain_policy(trans)) {
+	case NF_DROP:
+	case NF_ACCEPT:
+		basechain->policy = nft_trans_chain_policy(trans);
+		break;
+	}
+}
+
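
The nf_tables_cb[] table above is a plain dispatch table: nfnetlink strips the subsystem byte from nlmsg_type and uses the low byte to index both the handler and the nla_policy the attributes are parsed against; handlers that may run inside a transaction batch hook up .call_batch instead of .call. A minimal self-contained sketch of the same pattern (message names, handlers and the 'batchable' flag are illustrative only):

#include <stdio.h>

/* sketch of type-indexed dispatch as in nf_tables_cb[]; everything
 * here is hypothetical except the overall shape */
enum { MSG_NEWFOO, MSG_GETFOO, MSG_DELFOO, MSG_MAX };

struct callback {
	int (*call)(const char *payload);
	int batchable;		/* roughly: .call_batch vs .call */
};

static int newfoo(const char *p) { printf("new %s\n", p); return 0; }
static int getfoo(const char *p) { printf("get %s\n", p); return 0; }
static int delfoo(const char *p) { printf("del %s\n", p); return 0; }

static const struct callback cb[MSG_MAX] = {
	[MSG_NEWFOO] = { .call = newfoo, .batchable = 1 },
	[MSG_GETFOO] = { .call = getfoo },
	[MSG_DELFOO] = { .call = delfoo, .batchable = 1 },
};

int main(void)
{
	int type = MSG_NEWFOO;	/* kernel: NFNL_MSG_TYPE(nlh->nlmsg_type) */

	if (type >= MSG_MAX || !cb[type].call)
		return 1;
	return cb[type].call("foo");
}
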
+/* Schedule objects for release via rcu to make sure no packets are accessing
+ * removed rules.
+ */
+static void nf_tables_commit_release_rcu(struct rcu_head *rt)
+{
+	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+	switch (trans->msg_type) {
+	case NFT_MSG_DELTABLE:
+		nf_tables_table_destroy(&trans->ctx);
+		break;
+	case NFT_MSG_DELCHAIN:
+		nf_tables_chain_destroy(trans->ctx.chain);
+		break;
+	case NFT_MSG_DELRULE:
+		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+		break;
+	case NFT_MSG_DELSET:
+		nft_set_destroy(nft_trans_set(trans));
+		break;
+	}
+	kfree(trans);
+}
+
+static int nf_tables_commit(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nft_trans *trans, *next;
+	struct nft_set *set;
+
+	/* Bump generation counter, invalidate any dump in progress */
+	while (++net->nft.base_seq == 0);
+
+	/* A new generation has just started */
+	net->nft.gencursor = gencursor_next(net);
+
+	/* Make sure all packets have left the previous generation before
+	 * purging old rules.
+	 */
+	synchronize_rcu();
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWTABLE:
+			if (nft_trans_table_update(trans)) {
+				if (!nft_trans_table_enable(trans)) {
+					nf_tables_table_disable(trans->ctx.afi,
+								trans->ctx.table);
+					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+				}
+			} else {
+				trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+			}
+			nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELTABLE:
+			nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+			break;
+		case NFT_MSG_NEWCHAIN:
+			if (nft_trans_chain_update(trans))
+				nft_chain_commit_update(trans);
+			else
+				trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+
+			nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELCHAIN:
+			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
+			if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+			    trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+				nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+						    trans->ctx.afi->nops);
+			}
+			break;
+		case NFT_MSG_NEWRULE:
+			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+			nf_tables_rule_notify(&trans->ctx,
+					      nft_trans_rule(trans),
+					      NFT_MSG_NEWRULE);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELRULE:
+			list_del_rcu(&nft_trans_rule(trans)->list);
+			nf_tables_rule_notify(&trans->ctx,
+					      nft_trans_rule(trans),
+					      NFT_MSG_DELRULE);
+			break;
+		case NFT_MSG_NEWSET:
+			nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+			/* This avoids hitting -EBUSY when deleting the table
+			 * from the transaction.
+			 */
+			if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS &&
+			    !list_empty(&nft_trans_set(trans)->bindings))
+				trans->ctx.table->use--;
+
+			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+					     NFT_MSG_NEWSET, GFP_KERNEL);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELSET:
+			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+					     NFT_MSG_DELSET, GFP_KERNEL);
+			break;
+		case NFT_MSG_NEWSETELEM:
+			nf_tables_setelem_notify(&trans->ctx,
+						 nft_trans_elem_set(trans),
+						 &nft_trans_elem(trans),
+						 NFT_MSG_NEWSETELEM, 0);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELSETELEM:
+			nf_tables_setelem_notify(&trans->ctx,
+						 nft_trans_elem_set(trans),
+						 &nft_trans_elem(trans),
+						 NFT_MSG_DELSETELEM, 0);
+			set = nft_trans_elem_set(trans);
+			set->ops->get(set, &nft_trans_elem(trans));
+			set->ops->remove(set, &nft_trans_elem(trans));
+			nft_trans_destroy(trans);
+			break;
+		}
+	}
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		list_del(&trans->list);
+		trans->ctx.nla = NULL;
+		call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu);
+	}
+
+	return 0;
+}
+
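
nf_tables_commit() switches rule generations before it frees anything: bumping base_seq invalidates dumps in progress, the gencursor flip makes the new generation visible, and synchronize_rcu() waits out packets still walking the old one. Rules carry a two-bit genmask, and the packet path skips a rule whose bit for the current generation is set. A compilable sketch of that visibility scheme (gencursor_next() assumed to be the 0/1 flip):

#include <stdbool.h>
#include <stdio.h>

/* sketch: two-generation rule visibility, following the patch's
 * rule->genmask / net->nft.gencursor convention: a set bit means the
 * rule is NOT active in that generation */
static unsigned int gencursor;

struct rule { unsigned int genmask; };

static bool rule_active(const struct rule *r)
{
	return !(r->genmask & (1U << gencursor));
}

int main(void)
{
	struct rule old = { .genmask = 1U << 1 };	/* deleted in gen 1 */
	struct rule new = { .genmask = 1U << 0 };	/* added for gen 1 */

	printf("gen0: old=%d new=%d\n", rule_active(&old), rule_active(&new));
	gencursor = 1;			/* what the commit path flips */
	printf("gen1: old=%d new=%d\n", rule_active(&old), rule_active(&new));
	return 0;
}
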
+/* Schedule objects for release via rcu to make sure no packets are accessing
+ * aborted rules.
+ */
+static void nf_tables_abort_release_rcu(struct rcu_head *rt)
+{
+	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+	switch (trans->msg_type) {
+	case NFT_MSG_NEWTABLE:
+		nf_tables_table_destroy(&trans->ctx);
+		break;
+	case NFT_MSG_NEWCHAIN:
+		nf_tables_chain_destroy(trans->ctx.chain);
+		break;
+	case NFT_MSG_NEWRULE:
+		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+		break;
+	case NFT_MSG_NEWSET:
+		nft_set_destroy(nft_trans_set(trans));
+		break;
+	}
+	kfree(trans);
+}
+
+static int nf_tables_abort(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nft_trans *trans, *next;
+	struct nft_set *set;
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWTABLE:
+			if (nft_trans_table_update(trans)) {
+				if (nft_trans_table_enable(trans)) {
+					nf_tables_table_disable(trans->ctx.afi,
+								trans->ctx.table);
+					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+				}
+				nft_trans_destroy(trans);
+			} else {
+				list_del_rcu(&trans->ctx.table->list);
+			}
+			break;
+		case NFT_MSG_DELTABLE:
+			list_add_tail_rcu(&trans->ctx.table->list,
+					  &trans->ctx.afi->tables);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWCHAIN:
+			if (nft_trans_chain_update(trans)) {
+				if (nft_trans_chain_stats(trans))
+					free_percpu(nft_trans_chain_stats(trans));
+
+				nft_trans_destroy(trans);
+			} else {
+				trans->ctx.table->use--;
+				list_del_rcu(&trans->ctx.chain->list);
+				if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+				    trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+					nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+							    trans->ctx.afi->nops);
+				}
+			}
+			break;
+		case NFT_MSG_DELCHAIN:
+			trans->ctx.table->use++;
+			list_add_tail_rcu(&trans->ctx.chain->list,
+					  &trans->ctx.table->chains);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWRULE:
+			trans->ctx.chain->use--;
+			list_del_rcu(&nft_trans_rule(trans)->list);
+			break;
+		case NFT_MSG_DELRULE:
+			trans->ctx.chain->use++;
+			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWSET:
+			trans->ctx.table->use--;
+			list_del_rcu(&nft_trans_set(trans)->list);
+			break;
+		case NFT_MSG_DELSET:
+			trans->ctx.table->use++;
+			list_add_tail_rcu(&nft_trans_set(trans)->list,
+					  &trans->ctx.table->sets);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWSETELEM:
+			nft_trans_elem_set(trans)->nelems--;
+			set = nft_trans_elem_set(trans);
+			set->ops->get(set, &nft_trans_elem(trans));
+			set->ops->remove(set, &nft_trans_elem(trans));
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELSETELEM:
+			nft_trans_elem_set(trans)->nelems++;
+			nft_trans_destroy(trans);
+			break;
+		}
+	}
+
+	list_for_each_entry_safe_reverse(trans, next,
+					 &net->nft.commit_list, list) {
+		list_del(&trans->list);
+		trans->ctx.nla = NULL;
+		call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu);
+	}
+
+	return 0;
+}
+
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+	.name		= "nf_tables",
+	.subsys_id	= NFNL_SUBSYS_NFTABLES,
+	.cb_count	= NFT_MSG_MAX,
+	.cb		= nf_tables_cb,
+	.commit		= nf_tables_commit,
+	.abort		= nf_tables_abort,
+};
+
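
Registering .commit and .abort is what turns the handlers above into a transactional API: each NEW*/DEL* request only queues a struct nft_trans on net->nft.commit_list, and the batch is either replayed as a whole on commit or unwound (in reverse) on abort. A self-contained sketch of that prepare/commit/abort shape (list handling reduced to a singly linked list, so it pops in LIFO order where the kernel walks a tail-added list):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* sketch of the queued-transaction pattern; only the shape mirrors
 * the patch, the names and payload are illustrative */
struct trans {
	int		msg_type;	/* e.g. NEWRULE, DELRULE */
	char		what[16];
	struct trans	*next;
};

static struct trans *commit_list;

static int queue_change(int type, const char *what)
{
	struct trans *t = calloc(1, sizeof(*t));

	if (!t)
		return -1;
	t->msg_type = type;
	strncpy(t->what, what, sizeof(t->what) - 1);
	t->next = commit_list;
	commit_list = t;
	return 0;
}

static void run_batch(int ok)
{
	struct trans *t, *next;

	for (t = commit_list; t; t = next) {
		next = t->next;
		printf("%s %s (type %d)\n",
		       ok ? "commit" : "undo", t->what, t->msg_type);
		free(t);	/* the kernel defers this via call_rcu() */
	}
	commit_list = NULL;
}

int main(void)
{
	queue_change(1, "rule A");
	queue_change(2, "rule B");
	run_batch(1);		/* ss->commit(); on failure: run_batch(0) */
	return 0;
}
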
+/*
+ * Loop detection - walk through the ruleset beginning at the destination chain
+ * of a new jump until either the source chain is reached (loop) or all
+ * reachable chains have been traversed.
+ *
+ * The loop check is performed whenever a new jump verdict is added to an
+ * expression or verdict map or a verdict map is bound to a new chain.
+ */
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+				 const struct nft_chain *chain);
+
+static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+					const struct nft_set *set,
+					const struct nft_set_iter *iter,
+					const struct nft_set_elem *elem)
+{
+	if (elem->flags & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	switch (elem->data.verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		return nf_tables_check_loops(ctx, elem->data.chain);
+	default:
+		return 0;
+	}
+}
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+				 const struct nft_chain *chain)
+{
+	const struct nft_rule *rule;
+	const struct nft_expr *expr, *last;
+	const struct nft_set *set;
+	struct nft_set_binding *binding;
+	struct nft_set_iter iter;
+
+	if (ctx->chain == chain)
+		return -ELOOP;
+
+	list_for_each_entry(rule, &chain->rules, list) {
+		nft_rule_for_each_expr(expr, last, rule) {
+			const struct nft_data *data = NULL;
+			int err;
+
+			if (!expr->ops->validate)
+				continue;
+
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+
+			if (data == NULL)
+				continue;
+
+			switch (data->verdict) {
+			case NFT_JUMP:
+			case NFT_GOTO:
+				err = nf_tables_check_loops(ctx, data->chain);
+				if (err < 0)
+					return err;
+			default:
+				break;
+			}
+		}
+	}
+
+	list_for_each_entry(set, &ctx->table->sets, list) {
+		if (!(set->flags & NFT_SET_MAP) ||
+		    set->dtype != NFT_DATA_VERDICT)
+			continue;
+
+		list_for_each_entry(binding, &set->bindings, list) {
+			if (binding->chain != chain)
+				continue;
+
+			iter.skip	= 0;
+			iter.count	= 0;
+			iter.err	= 0;
+			iter.fn		= nf_tables_loop_check_setelem;
+
+			set->ops->walk(ctx, set, &iter);
+			if (iter.err < 0)
+				return iter.err;
+		}
+	}
+
+	return 0;
+}
+
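
nf_tables_check_loops() is a depth-first walk over the chain graph: starting from the target of a new jump, it follows every jump/goto verdict (including those stored in verdict maps) and fails with -ELOOP if the originating chain becomes reachable. A toy version over an adjacency matrix, which relies, as the kernel does, on the invariant that the pre-existing ruleset is already loop-free:

#include <stdio.h>

/* sketch: same reachability walk on a toy chain graph; a jump from
 * src must never be able to reach src again */
#define MAXCHAIN 4

static int jump[MAXCHAIN][MAXCHAIN];	/* jump[a][b]: chain a jumps to b */

static int check_loops(int src, int cur)
{
	int next;

	if (cur == src)
		return -1;	/* -ELOOP in the patch */
	for (next = 0; next < MAXCHAIN; next++)
		if (jump[cur][next] && check_loops(src, next) < 0)
			return -1;
	return 0;
}

int main(void)
{
	jump[1][2] = jump[2][3] = 1;
	printf("jump 0->1: %d\n", check_loops(0, 1));	/* ok */
	jump[3][0] = 1;
	printf("jump 0->1 after adding 3->0: %d\n", check_loops(0, 1));
	return 0;
}
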
+/**
+ *	nft_validate_input_register - validate an expression's input register
+ *
+ *	@reg: the register number
+ *
+ *	Validate that the input register is one of the general purpose
+ *	registers.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+	if (reg <= NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ *	nft_validate_output_register - validate an expression's output register
+ *
+ *	@reg: the register number
+ *
+ *	Validate that the output register is one of the general purpose
+ *	registers or the verdict register.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+	if (reg < NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+/**
+ *	nft_validate_data_load - validate an expression's data load
+ *
+ *	@ctx: context of the expression performing the load
+ *	@reg: the destination register number
+ *	@data: the data to load
+ *	@type: the data type
+ *
+ *	Validate that a data load uses the appropriate data type for
+ *	the destination register. A value of NULL for the data means
+ *	that it is runtime-gathered data, which is always of type
+ *	NFT_DATA_VALUE.
+ */
+int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
+			   const struct nft_data *data,
+			   enum nft_data_types type)
+{
+	int err;
+
+	switch (reg) {
+	case NFT_REG_VERDICT:
+		if (data == NULL || type != NFT_DATA_VERDICT)
+			return -EINVAL;
+
+		if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) {
+			err = nf_tables_check_loops(ctx, data->chain);
+			if (err < 0)
+				return err;
+
+			if (ctx->chain->level + 1 > data->chain->level) {
+				if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
+					return -EMLINK;
+				data->chain->level = ctx->chain->level + 1;
+			}
+		}
+
+		return 0;
+	default:
+		if (data != NULL && type != NFT_DATA_VALUE)
+			return -EINVAL;
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(nft_validate_data_load);
+
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+	[NFTA_VERDICT_CODE]	= { .type = NLA_U32 },
+	[NFTA_VERDICT_CHAIN]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+};
+
+static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
+			    struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_VERDICT_MAX + 1];
+	struct nft_chain *chain;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_VERDICT_CODE])
+		return -EINVAL;
+	data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
+
+	switch (data->verdict) {
+	default:
+		switch (data->verdict & NF_VERDICT_MASK) {
+		case NF_ACCEPT:
+		case NF_DROP:
+		case NF_QUEUE:
+			break;
+		default:
+			return -EINVAL;
+		}
+		/* fall through */
+	case NFT_CONTINUE:
+	case NFT_BREAK:
+	case NFT_RETURN:
+		desc->len = sizeof(data->verdict);
+		break;
+	case NFT_JUMP:
+	case NFT_GOTO:
+		if (!tb[NFTA_VERDICT_CHAIN])
+			return -EINVAL;
+		chain = nf_tables_chain_lookup(ctx->table,
+					       tb[NFTA_VERDICT_CHAIN]);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+		if (chain->flags & NFT_BASE_CHAIN)
+			return -EOPNOTSUPP;
+
+		chain->use++;
+		data->chain = chain;
+		desc->len = sizeof(data->verdict);
+		break;
+	}
+
+	desc->type = NFT_DATA_VERDICT;
+	return 0;
+}
+
+static void nft_verdict_uninit(const struct nft_data *data)
+{
+	switch (data->verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		data->chain->use--;
+		break;
+	}
+}
+
+static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+{
struct nlattr *nest; + +	nest = nla_nest_start(skb, NFTA_DATA_VERDICT); +	if (!nest) +		goto nla_put_failure; + +	if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict))) +		goto nla_put_failure; + +	switch (data->verdict) { +	case NFT_JUMP: +	case NFT_GOTO: +		if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name)) +			goto nla_put_failure; +	} +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, +			  struct nft_data_desc *desc, const struct nlattr *nla) +{ +	unsigned int len; + +	len = nla_len(nla); +	if (len == 0) +		return -EINVAL; +	if (len > sizeof(data->data)) +		return -EOVERFLOW; + +	nla_memcpy(data->data, nla, sizeof(data->data)); +	desc->type = NFT_DATA_VALUE; +	desc->len  = len; +	return 0; +} + +static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data, +			  unsigned int len) +{ +	return nla_put(skb, NFTA_DATA_VALUE, len, data->data); +} + +static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { +	[NFTA_DATA_VALUE]	= { .type = NLA_BINARY, +				    .len  = FIELD_SIZEOF(struct nft_data, data) }, +	[NFTA_DATA_VERDICT]	= { .type = NLA_NESTED }, +}; + +/** + *	nft_data_init - parse nf_tables data netlink attributes + * + *	@ctx: context of the expression using the data + *	@data: destination struct nft_data + *	@desc: data description + *	@nla: netlink attribute containing data + * + *	Parse the netlink data attributes and initialize a struct nft_data. + *	The type and length of data are returned in the data description. + * + *	The caller can indicate that it only wants to accept data of type + *	NFT_DATA_VALUE by passing NULL for the ctx argument. + */ +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, +		  struct nft_data_desc *desc, const struct nlattr *nla) +{ +	struct nlattr *tb[NFTA_DATA_MAX + 1]; +	int err; + +	err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy); +	if (err < 0) +		return err; + +	if (tb[NFTA_DATA_VALUE]) +		return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); +	if (tb[NFTA_DATA_VERDICT] && ctx != NULL) +		return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); +	return -EINVAL; +} +EXPORT_SYMBOL_GPL(nft_data_init); + +/** + *	nft_data_uninit - release a nft_data item + * + *	@data: struct nft_data to release + *	@type: type of data + * + *	Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, + *	all others need to be released by calling this function. 
+ */ +void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +{ +	switch (type) { +	case NFT_DATA_VALUE: +		return; +	case NFT_DATA_VERDICT: +		return nft_verdict_uninit(data); +	default: +		WARN_ON(1); +	} +} +EXPORT_SYMBOL_GPL(nft_data_uninit); + +int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, +		  enum nft_data_types type, unsigned int len) +{ +	struct nlattr *nest; +	int err; + +	nest = nla_nest_start(skb, attr); +	if (nest == NULL) +		return -1; + +	switch (type) { +	case NFT_DATA_VALUE: +		err = nft_value_dump(skb, data, len); +		break; +	case NFT_DATA_VERDICT: +		err = nft_verdict_dump(skb, data); +		break; +	default: +		err = -EINVAL; +		WARN_ON(1); +	} + +	nla_nest_end(skb, nest); +	return err; +} +EXPORT_SYMBOL_GPL(nft_data_dump); + +static int nf_tables_init_net(struct net *net) +{ +	INIT_LIST_HEAD(&net->nft.af_info); +	INIT_LIST_HEAD(&net->nft.commit_list); +	net->nft.base_seq = 1; +	return 0; +} + +static struct pernet_operations nf_tables_net_ops = { +	.init	= nf_tables_init_net, +}; + +static int __init nf_tables_module_init(void) +{ +	int err; + +	info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, +		       GFP_KERNEL); +	if (info == NULL) { +		err = -ENOMEM; +		goto err1; +	} + +	err = nf_tables_core_module_init(); +	if (err < 0) +		goto err2; + +	err = nfnetlink_subsys_register(&nf_tables_subsys); +	if (err < 0) +		goto err3; + +	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n"); +	return register_pernet_subsys(&nf_tables_net_ops); +err3: +	nf_tables_core_module_exit(); +err2: +	kfree(info); +err1: +	return err; +} + +static void __exit nf_tables_module_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_net_ops); +	nfnetlink_subsys_unregister(&nf_tables_subsys); +	nf_tables_core_module_exit(); +	kfree(info); +} + +module_init(nf_tables_module_init); +module_exit(nf_tables_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c new file mode 100644 index 00000000000..3b90eb2b2c5 --- /dev/null +++ b/net/netfilter/nf_tables_core.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+
+static void nft_cmp_fast_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1])
+{
+	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+	u32 mask = nft_cmp_fast_mask(priv->len);
+
+	if ((data[priv->sreg].data[0] & mask) == priv->data)
+		return;
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static bool nft_payload_fast_eval(const struct nft_expr *expr,
+				  struct nft_data data[NFT_REG_MAX + 1],
+				  const struct nft_pktinfo *pkt)
+{
+	const struct nft_payload *priv = nft_expr_priv(expr);
+	const struct sk_buff *skb = pkt->skb;
+	struct nft_data *dest = &data[priv->dreg];
+	unsigned char *ptr;
+
+	if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
+		ptr = skb_network_header(skb);
+	else
+		ptr = skb_network_header(skb) + pkt->xt.thoff;
+
+	ptr += priv->offset;
+
+	if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+		return false;
+
+	if (priv->len == 2)
+		*(u16 *)dest->data = *(u16 *)ptr;
+	else if (priv->len == 4)
+		*(u32 *)dest->data = *(u32 *)ptr;
+	else
+		*(u8 *)dest->data = *(u8 *)ptr;
+	return true;
+}
+
+struct nft_jumpstack {
+	const struct nft_chain	*chain;
+	const struct nft_rule	*rule;
+	int			rulenum;
+};
+
+enum nft_trace {
+	NFT_TRACE_RULE,
+	NFT_TRACE_RETURN,
+	NFT_TRACE_POLICY,
+};
+
+static const char *const comments[] = {
+	[NFT_TRACE_RULE]	= "rule",
+	[NFT_TRACE_RETURN]	= "return",
+	[NFT_TRACE_POLICY]	= "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+	.type = NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level = 4,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+static void nft_trace_packet(const struct nft_pktinfo *pkt,
+			     const struct nft_chain *chain,
+			     int rulenum, enum nft_trace type)
+{
+	struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+	nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+		      pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
+		      chain->table->name, chain->name, comments[type],
+		      rulenum);
+}
+
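
nft_do_chain() below is a small bytecode interpreter: expressions write into the register file data[], the verdict register drives control flow, and a jump pushes its return position onto a fixed-size jumpstack. The stack cannot overflow at runtime because nft_validate_data_load() above already bounds chain nesting at NFT_JUMP_STACK_SIZE when rules are loaded. A stripped-down sketch of the push/pop discipline (chains reduced to integers):

#include <stdio.h>

/* sketch: bounded jump stack as used by nft_do_chain(); sizes and
 * the toy "program" are illustrative */
#define STACK_SIZE 16

struct frame { int chain, rule; };

int main(void)
{
	struct frame stack[STACK_SIZE];
	unsigned int sp = 0;
	int chain = 0, rule = 3;

	/* a rule in chain 0 issues "jump chain 1": save our position */
	if (sp < STACK_SIZE) {		/* load-time check; BUG_ON() here */
		stack[sp].chain = chain;
		stack[sp].rule	= rule;
		sp++;
		chain = 1;
		rule = 0;
	}

	/* chain 1 falls off its end (NFT_CONTINUE/NFT_RETURN): pop */
	if (sp > 0) {
		sp--;
		chain = stack[sp].chain;
		rule  = stack[sp].rule;
	}
	printf("resumed at chain %d after rule %d\n", chain, rule);
	return 0;
}

A goto, by contrast, overwrites the current chain without pushing a frame, which is why only jump depth is bounded.
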
+unsigned int
+nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+{
+	const struct nft_chain *chain = ops->priv, *basechain = chain;
+	const struct nft_rule *rule;
+	const struct nft_expr *expr, *last;
+	struct nft_data data[NFT_REG_MAX + 1];
+	unsigned int stackptr = 0;
+	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
+	struct nft_stats *stats;
+	int rulenum;
+	/*
+	 * Cache the cursor to avoid problems if it is updated while we are
+	 * traversing the ruleset.
+	 */
+	unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor);
+
+do_chain:
+	rulenum = 0;
+	rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+	data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+	list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+
+		/* This rule is not active, skip. */
+		if (unlikely(rule->genmask & (1 << gencursor)))
+			continue;
+
+		rulenum++;
+
+		nft_rule_for_each_expr(expr, last, rule) {
+			if (expr->ops == &nft_cmp_fast_ops)
+				nft_cmp_fast_eval(expr, data);
+			else if (expr->ops != &nft_payload_fast_ops ||
+				 !nft_payload_fast_eval(expr, data, pkt))
+				expr->ops->eval(expr, data, pkt);
+
+			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
+				break;
+		}
+
+		switch (data[NFT_REG_VERDICT].verdict) {
+		case NFT_BREAK:
+			data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+			continue;
+		case NFT_CONTINUE:
+			if (unlikely(pkt->skb->nf_trace))
+				nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+			continue;
+		}
+		break;
+	}
+
+	switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) {
+	case NF_ACCEPT:
+	case NF_DROP:
+	case NF_QUEUE:
+		if (unlikely(pkt->skb->nf_trace))
+			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+		return data[NFT_REG_VERDICT].verdict;
+	}
+
+	switch (data[NFT_REG_VERDICT].verdict) {
+	case NFT_JUMP:
+		if (unlikely(pkt->skb->nf_trace))
+			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+		jumpstack[stackptr].chain = chain;
+		jumpstack[stackptr].rule  = rule;
+		jumpstack[stackptr].rulenum = rulenum;
+		stackptr++;
+		chain = data[NFT_REG_VERDICT].chain;
+		goto do_chain;
+	case NFT_GOTO:
+		if (unlikely(pkt->skb->nf_trace))
+			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+		chain = data[NFT_REG_VERDICT].chain;
+		goto do_chain;
+	case NFT_RETURN:
+		if (unlikely(pkt->skb->nf_trace))
+			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
+		break;
+	case NFT_CONTINUE:
+		if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN)))
+			nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	if (stackptr > 0) {
+		stackptr--;
+		chain = jumpstack[stackptr].chain;
+		rule  = jumpstack[stackptr].rule;
+		rulenum = jumpstack[stackptr].rulenum;
+		goto next_rule;
+	}
+
+	if (unlikely(pkt->skb->nf_trace))
+		nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY);
+
+	rcu_read_lock_bh();
+	stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats));
+	u64_stats_update_begin(&stats->syncp);
+	stats->pkts++;
+	stats->bytes += pkt->skb->len;
+	u64_stats_update_end(&stats->syncp);
+	rcu_read_unlock_bh();
+
+	return nft_base_chain(basechain)->policy;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+int __init nf_tables_core_module_init(void)
+{
+	int err;
+
+	err = nft_immediate_module_init();
+	if (err < 0)
+		goto err1;
+
+	err = nft_cmp_module_init();
+	if (err < 0)
+		goto err2;
+
+	err = nft_lookup_module_init();
+	if (err < 0)
+		goto err3;
+
+	err = nft_bitwise_module_init();
+	if (err < 0)
+		goto err4;
+
+	err = nft_byteorder_module_init();
+	if (err < 0)
+		goto err5;
+
+	err = nft_payload_module_init();
+	if (err < 0)
+		goto err6;
+
+	return 0;
+
+err6:
+	nft_byteorder_module_exit();
+err5:
+	nft_bitwise_module_exit();
+err4:
+	nft_lookup_module_exit();
+err3:
+	nft_cmp_module_exit();
+err2:
+	nft_immediate_module_exit();
+err1:
+	return err;
+}
+
+void nf_tables_core_module_exit(void)
+{
+	nft_payload_module_exit();
+	nft_byteorder_module_exit();
+	nft_bitwise_module_exit();
+	nft_lookup_module_exit();
+	nft_cmp_module_exit();
+	nft_immediate_module_exit();
+}
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
new file mode 100644
index 00000000000..9dd2d216cfc
--- /dev/null
+++ b/net/netfilter/nf_tables_inet.c
@@ -0,0 +1,104 @@
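
The new file nf_tables_inet.c that follows adds the mixed-family table: an "inet" base chain is backed by two nf_hook_ops, one registered on the IPv4 hooks and one on the IPv6 hooks, which nft_inet_hook_ops_init() patches up per family (hence nft_af_inet.nops == 2). Roughly, per hook (a sketch; field names simplified and NFPROTO_* written as their numeric values for brevity):

#include <stdio.h>

/* sketch: one logical "inet" hook expands to two registrations */
struct hook_ops { int pf, hooknum; };

static void inet_expand(struct hook_ops *ops, int hooknum)
{
	int i;

	for (i = 0; i < 2; i++) {
		ops[i].pf	= i == 0 ? 2 /* NFPROTO_IPV4 */
					 : 10 /* NFPROTO_IPV6 */;
		ops[i].hooknum	= hooknum;
	}
}

int main(void)
{
	struct hook_ops ops[2];

	inet_expand(ops, 1 /* e.g. NF_INET_LOCAL_IN */);
	printf("registered pf=%d and pf=%d for hook %d\n",
	       ops[0].pf, ops[1].pf, ops[0].hooknum);
	return 0;
}
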
+/* + * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/ip.h> + +static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n) +{ +	struct nft_af_info *afi; + +	if (n == 1) +		afi = &nft_af_ipv4; +	else +		afi = &nft_af_ipv6; + +	ops->pf = afi->family; +	if (afi->hooks[ops->hooknum]) +		ops->hook = afi->hooks[ops->hooknum]; +} + +static struct nft_af_info nft_af_inet __read_mostly = { +	.family		= NFPROTO_INET, +	.nhooks		= NF_INET_NUMHOOKS, +	.owner		= THIS_MODULE, +	.nops		= 2, +	.hook_ops_init	= nft_inet_hook_ops_init, +}; + +static int __net_init nf_tables_inet_init_net(struct net *net) +{ +	net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); +	if (net->nft.inet == NULL) +		return -ENOMEM; +	memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); + +	if (nft_register_afinfo(net, net->nft.inet) < 0) +		goto err; + +	return 0; + +err: +	kfree(net->nft.inet); +	return -ENOMEM; +} + +static void __net_exit nf_tables_inet_exit_net(struct net *net) +{ +	nft_unregister_afinfo(net->nft.inet); +	kfree(net->nft.inet); +} + +static struct pernet_operations nf_tables_inet_net_ops = { +	.init	= nf_tables_inet_init_net, +	.exit	= nf_tables_inet_exit_net, +}; + +static const struct nf_chain_type filter_inet = { +	.name		= "filter", +	.type		= NFT_CHAIN_T_DEFAULT, +	.family		= NFPROTO_INET, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_LOCAL_IN) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_FORWARD) | +			  (1 << NF_INET_PRE_ROUTING) | +			  (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_inet_init(void) +{ +	int ret; + +	nft_register_chain_type(&filter_inet); +	ret = register_pernet_subsys(&nf_tables_inet_net_ops); +	if (ret < 0) +		nft_unregister_chain_type(&filter_inet); + +	return ret; +} + +static void __exit nf_tables_inet_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_inet_net_ops); +	nft_unregister_chain_type(&filter_inet); +} + +module_init(nf_tables_inet_init); +module_exit(nf_tables_inet_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(1); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 572d87dc116..c138b8fbe28 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -61,6 +61,14 @@ void nfnl_unlock(__u8 subsys_id)  }  EXPORT_SYMBOL_GPL(nfnl_unlock); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(u8 subsys_id) +{ +	return lockdep_is_held(&table[subsys_id].mutex); +} +EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); +#endif +  int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)  {  	nfnl_lock(n->subsys_id); @@ -147,9 +155,6 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	const struct nfnetlink_subsystem *ss;  	int type, err; -	if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) -		return -EPERM; -  	/* All the messages must at least contain nfgenmsg */  	if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))  		return 0; @@ -217,25 +222,194 @@ replay:  	}  } +static void nfnetlink_rcv_batch(struct sk_buff 
*skb, struct nlmsghdr *nlh, +				u_int16_t subsys_id) +{ +	struct sk_buff *nskb, *oskb = skb; +	struct net *net = sock_net(skb->sk); +	const struct nfnetlink_subsystem *ss; +	const struct nfnl_callback *nc; +	bool success = true, done = false; +	int err; + +	if (subsys_id >= NFNL_SUBSYS_COUNT) +		return netlink_ack(skb, nlh, -EINVAL); +replay: +	nskb = netlink_skb_clone(oskb, GFP_KERNEL); +	if (!nskb) +		return netlink_ack(oskb, nlh, -ENOMEM); + +	nskb->sk = oskb->sk; +	skb = nskb; + +	nfnl_lock(subsys_id); +	ss = rcu_dereference_protected(table[subsys_id].subsys, +				       lockdep_is_held(&table[subsys_id].mutex)); +	if (!ss) { +#ifdef CONFIG_MODULES +		nfnl_unlock(subsys_id); +		request_module("nfnetlink-subsys-%d", subsys_id); +		nfnl_lock(subsys_id); +		ss = rcu_dereference_protected(table[subsys_id].subsys, +					       lockdep_is_held(&table[subsys_id].mutex)); +		if (!ss) +#endif +		{ +			nfnl_unlock(subsys_id); +			netlink_ack(skb, nlh, -EOPNOTSUPP); +			return kfree_skb(nskb); +		} +	} + +	if (!ss->commit || !ss->abort) { +		nfnl_unlock(subsys_id); +		netlink_ack(skb, nlh, -EOPNOTSUPP); +		return kfree_skb(skb); +	} + +	while (skb->len >= nlmsg_total_size(0)) { +		int msglen, type; + +		nlh = nlmsg_hdr(skb); +		err = 0; + +		if (nlh->nlmsg_len < NLMSG_HDRLEN) { +			err = -EINVAL; +			goto ack; +		} + +		/* Only requests are handled by the kernel */ +		if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { +			err = -EINVAL; +			goto ack; +		} + +		type = nlh->nlmsg_type; +		if (type == NFNL_MSG_BATCH_BEGIN) { +			/* Malformed: Batch begin twice */ +			success = false; +			goto done; +		} else if (type == NFNL_MSG_BATCH_END) { +			done = true; +			goto done; +		} else if (type < NLMSG_MIN_TYPE) { +			err = -EINVAL; +			goto ack; +		} + +		/* We only accept a batch with messages for the same +		 * subsystem. +		 */ +		if (NFNL_SUBSYS_ID(type) != subsys_id) { +			err = -EINVAL; +			goto ack; +		} + +		nc = nfnetlink_find_client(type, ss); +		if (!nc) { +			err = -EINVAL; +			goto ack; +		} + +		{ +			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); +			u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); +			struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; +			struct nlattr *attr = (void *)nlh + min_len; +			int attrlen = nlh->nlmsg_len - min_len; + +			err = nla_parse(cda, ss->cb[cb_id].attr_count, +					attr, attrlen, ss->cb[cb_id].policy); +			if (err < 0) +				goto ack; + +			if (nc->call_batch) { +				err = nc->call_batch(net->nfnl, skb, nlh, +						     (const struct nlattr **)cda); +			} + +			/* The lock was released to autoload some module, we +			 * have to abort and start from scratch using the +			 * original skb. +			 */ +			if (err == -EAGAIN) { +				ss->abort(skb); +				nfnl_unlock(subsys_id); +				kfree_skb(nskb); +				goto replay; +			} +		} +ack: +		if (nlh->nlmsg_flags & NLM_F_ACK || err) { +			/* We don't stop processing the batch on errors, thus, +			 * userspace gets all the errors that the batch +			 * triggers. 
+			 */ +			netlink_ack(skb, nlh, err); +			if (err) +				success = false; +		} + +		msglen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (msglen > skb->len) +			msglen = skb->len; +		skb_pull(skb, msglen); +	} +done: +	if (success && done) +		ss->commit(skb); +	else +		ss->abort(skb); + +	nfnl_unlock(subsys_id); +	kfree_skb(nskb); +} +  static void nfnetlink_rcv(struct sk_buff *skb)  { -	netlink_rcv_skb(skb, &nfnetlink_rcv_msg); +	struct nlmsghdr *nlh = nlmsg_hdr(skb); +	int msglen; + +	if (nlh->nlmsg_len < NLMSG_HDRLEN || +	    skb->len < nlh->nlmsg_len) +		return; + +	if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { +		netlink_ack(skb, nlh, -EPERM); +		return; +	} + +	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { +		struct nfgenmsg *nfgenmsg; + +		msglen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (msglen > skb->len) +			msglen = skb->len; + +		if (nlh->nlmsg_len < NLMSG_HDRLEN || +		    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) +			return; + +		nfgenmsg = nlmsg_data(nlh); +		skb_pull(skb, msglen); +		nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); +	} else { +		netlink_rcv_skb(skb, &nfnetlink_rcv_msg); +	}  }  #ifdef CONFIG_MODULES -static void nfnetlink_bind(int group) +static int nfnetlink_bind(int group)  {  	const struct nfnetlink_subsystem *ss;  	int type = nfnl_group2type[group];  	rcu_read_lock();  	ss = nfnetlink_get_subsys(type); -	if (!ss) { -		rcu_read_unlock(); -		request_module("nfnetlink-subsys-%d", type); -		return; -	}  	rcu_read_unlock(); +	if (!ss) +		request_module("nfnetlink-subsys-%d", type); +	return 0;  }  #endif diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c7b6d466a66..2baa125c2e8 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -32,18 +32,24 @@ static LIST_HEAD(nfnl_acct_list);  struct nf_acct {  	atomic64_t		pkts;  	atomic64_t		bytes; +	unsigned long		flags;  	struct list_head	head;  	atomic_t		refcnt;  	char			name[NFACCT_NAME_MAX];  	struct rcu_head		rcu_head; +	char			data[0];  }; +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) +  static int  nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,  	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])  {  	struct nf_acct *nfacct, *matching = NULL;  	char *acct_name; +	unsigned int size = 0; +	u32 flags = 0;  	if (!tb[NFACCT_NAME])  		return -EINVAL; @@ -68,15 +74,38 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,  			/* reset counters if you request a replacement. */  			atomic64_set(&matching->pkts, 0);  			atomic64_set(&matching->bytes, 0); +			smp_mb__before_atomic(); +			/* reset overquota flag if quota is enabled. 
*/ +			if ((matching->flags & NFACCT_F_QUOTA)) +				clear_bit(NFACCT_F_OVERQUOTA, &matching->flags);  			return 0;  		}  		return -EBUSY;  	} -	nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); +	if (tb[NFACCT_FLAGS]) { +		flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); +		if (flags & ~NFACCT_F_QUOTA) +			return -EOPNOTSUPP; +		if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) +			return -EINVAL; +		if (flags & NFACCT_F_OVERQUOTA) +			return -EINVAL; + +		size += sizeof(u64); +	} + +	nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL);  	if (nfacct == NULL)  		return -ENOMEM; +	if (flags & NFACCT_F_QUOTA) { +		u64 *quota = (u64 *)nfacct->data; + +		*quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); +		nfacct->flags = flags; +	} +  	strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);  	if (tb[NFACCT_BYTES]) { @@ -117,6 +146,9 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,  	if (type == NFNL_MSG_ACCT_GET_CTRZERO) {  		pkts = atomic64_xchg(&acct->pkts, 0);  		bytes = atomic64_xchg(&acct->bytes, 0); +		smp_mb__before_atomic(); +		if (acct->flags & NFACCT_F_QUOTA) +			clear_bit(NFACCT_F_OVERQUOTA, &acct->flags);  	} else {  		pkts = atomic64_read(&acct->pkts);  		bytes = atomic64_read(&acct->bytes); @@ -125,7 +157,13 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,  	    nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) ||  	    nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))))  		goto nla_put_failure; +	if (acct->flags & NFACCT_F_QUOTA) { +		u64 *quota = (u64 *)acct->data; +		if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || +		    nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) +			goto nla_put_failure; +	}  	nlmsg_end(skb, nlh);  	return skb->len; @@ -270,6 +308,8 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {  	[NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 },  	[NFACCT_BYTES] = { .type = NLA_U64 },  	[NFACCT_PKTS] = { .type = NLA_U64 }, +	[NFACCT_FLAGS] = { .type = NLA_U32 }, +	[NFACCT_QUOTA] = { .type = NLA_U64 },  };  static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { @@ -336,6 +376,50 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)  }  EXPORT_SYMBOL_GPL(nfnl_acct_update); +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ +	int ret; +	struct sk_buff *skb; + +	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); +	if (skb == NULL) +		return; + +	ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, +				  nfacct); +	if (ret <= 0) { +		kfree_skb(skb); +		return; +	} +	netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, +			  GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ +	u64 now; +	u64 *quota; +	int ret = NFACCT_UNDERQUOTA; + +	/* no place here if we don't have a quota */ +	if (!(nfacct->flags & NFACCT_F_QUOTA)) +		return NFACCT_NO_QUOTA; + +	quota = (u64 *)nfacct->data; +	now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? 
+	       atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + +	ret = now > *quota; + +	if (now >= *quota && +	    !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { +		nfnl_overquota_report(nfacct); +	} + +	return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); +  static int __init nfnl_acct_init(void)  {  	int ret; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 50580494148..476accd1714 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -49,10 +49,8 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {  };  static int -ctnl_timeout_parse_policy(struct ctnl_timeout *timeout, -			  struct nf_conntrack_l4proto *l4proto, -			  struct net *net, -			  const struct nlattr *attr) +ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, +			  struct net *net, const struct nlattr *attr)  {  	int ret = 0; @@ -64,8 +62,7 @@ ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,  		if (ret < 0)  			return ret; -		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, -							  &timeout->data); +		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);  	}  	return ret;  } @@ -123,7 +120,8 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,  				goto err_proto_put;  			} -			ret = ctnl_timeout_parse_policy(matching, l4proto, net, +			ret = ctnl_timeout_parse_policy(&matching->data, +							l4proto, net,  							cda[CTA_TIMEOUT_DATA]);  			return ret;  		} @@ -138,7 +136,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,  		goto err_proto_put;  	} -	ret = ctnl_timeout_parse_policy(timeout, l4proto, net, +	ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net,  					cda[CTA_TIMEOUT_DATA]);  	if (ret < 0)  		goto err; @@ -342,6 +340,147 @@ cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,  	return ret;  } +static int +cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb, +		      const struct nlmsghdr *nlh, +		      const struct nlattr * const cda[]) +{ +	__u16 l3num; +	__u8 l4num; +	struct nf_conntrack_l4proto *l4proto; +	struct net *net = sock_net(skb->sk); +	unsigned int *timeouts; +	int ret; + +	if (!cda[CTA_TIMEOUT_L3PROTO] || +	    !cda[CTA_TIMEOUT_L4PROTO] || +	    !cda[CTA_TIMEOUT_DATA]) +		return -EINVAL; + +	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); +	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); +	l4proto = nf_ct_l4proto_find_get(l3num, l4num); + +	/* This protocol is not supported, skip. */ +	if (l4proto->l4proto != l4num) { +		ret = -EOPNOTSUPP; +		goto err; +	} + +	timeouts = l4proto->get_timeouts(net); + +	ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, +					cda[CTA_TIMEOUT_DATA]); +	if (ret < 0) +		goto err; + +	nf_ct_l4proto_put(l4proto); +	return 0; +err: +	nf_ct_l4proto_put(l4proto); +	return ret; +} + +static int +cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, +			    u32 seq, u32 type, int event, +			    struct nf_conntrack_l4proto *l4proto) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? 
NLM_F_MULTI : 0; + +	event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || +	    nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) +		goto nla_put_failure; + +	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { +		struct nlattr *nest_parms; +		unsigned int *timeouts = l4proto->get_timeouts(net); +		int ret; + +		nest_parms = nla_nest_start(skb, +					    CTA_TIMEOUT_DATA | NLA_F_NESTED); +		if (!nest_parms) +			goto nla_put_failure; + +		ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); +		if (ret < 0) +			goto nla_put_failure; + +		nla_nest_end(skb, nest_parms); +	} + +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb, +				 const struct nlmsghdr *nlh, +				 const struct nlattr * const cda[]) +{ +	__u16 l3num; +	__u8 l4num; +	struct nf_conntrack_l4proto *l4proto; +	struct net *net = sock_net(skb->sk); +	struct sk_buff *skb2; +	int ret, err; + +	if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) +		return -EINVAL; + +	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); +	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); +	l4proto = nf_ct_l4proto_find_get(l3num, l4num); + +	/* This protocol is not supported, skip. */ +	if (l4proto->l4proto != l4num) { +		err = -EOPNOTSUPP; +		goto err; +	} + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) { +		err = -ENOMEM; +		goto err; +	} + +	ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, +					  nlh->nlmsg_seq, +					  NFNL_MSG_TYPE(nlh->nlmsg_type), +					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET, +					  l4proto); +	if (ret <= 0) { +		kfree_skb(skb2); +		err = -ENOMEM; +		goto err; +	} +	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (ret > 0) +		ret = 0; + +	/* this avoids a loop in nfnetlink. */ +	return ret == -EAGAIN ? 
-ENOBUFS : ret; +err: +	nf_ct_l4proto_put(l4proto); +	return err; +} +  #ifdef CONFIG_NF_CONNTRACK_TIMEOUT  static struct ctnl_timeout *ctnl_timeout_find_get(const char *name)  { @@ -384,6 +523,12 @@ static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {  	[IPCTNL_MSG_TIMEOUT_DELETE]	= { .call = cttimeout_del_timeout,  					    .attr_count = CTA_TIMEOUT_MAX,  					    .policy = cttimeout_nla_policy }, +	[IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy }, +	[IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy },  };  static const struct nfnetlink_subsystem cttimeout_subsys = { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d92cc317bf8..d292c8d286e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -28,8 +28,6 @@  #include <linux/proc_fs.h>  #include <linux/security.h>  #include <linux/list.h> -#include <linux/jhash.h> -#include <linux/random.h>  #include <linux/slab.h>  #include <net/sock.h>  #include <net/netfilter/nf_log.h> @@ -75,7 +73,6 @@ struct nfulnl_instance {  };  #define INSTANCE_BUCKETS	16 -static unsigned int hash_init;  static int nfnl_log_net_id __read_mostly; @@ -319,7 +316,8 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)  }  static struct sk_buff * -nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, +		 unsigned int pkt_size)  {  	struct sk_buff *skb;  	unsigned int n; @@ -328,13 +326,13 @@ nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size)  	 * message.  
WARNING: has to be <= 128k due to slab restrictions */  	n = max(inst_size, pkt_size); -	skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC); +	skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);  	if (!skb) {  		if (n > pkt_size) {  			/* try to allocate only as much as we need for current  			 * packet */ -			skb = nfnetlink_alloc_skb(&init_net, pkt_size, +			skb = nfnetlink_alloc_skb(net, pkt_size,  						  peer_portid, GFP_ATOMIC);  			if (!skb)  				pr_err("nfnetlink_log: can't even alloc %u bytes\n", @@ -702,8 +700,8 @@ nfulnl_log_packet(struct net *net,  	}  	if (!inst->skb) { -		inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz, -					     size); +		inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, +					     inst->nlbufsiz, size);  		if (!inst->skb)  			goto alloc_failure;  	} @@ -1052,6 +1050,7 @@ static void __net_exit nfnl_log_net_exit(struct net *net)  #ifdef CONFIG_PROC_FS  	remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);  #endif +	nf_log_unset(net, &nfulnl_logger);  }  static struct pernet_operations nfnl_log_net_ops = { @@ -1065,11 +1064,6 @@ static int __init nfnetlink_log_init(void)  {  	int status = -ENOMEM; -	/* it's not really all that important to have a random value, so -	 * we can do this from the init function, even if there hasn't -	 * been that much entropy yet */ -	get_random_bytes(&hash_init, sizeof(hash_init)); -  	netlink_register_notifier(&nfulnl_rtnl_notifier);  	status = nfnetlink_subsys_register(&nfulnl_subsys);  	if (status < 0) { diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index ae2e5c11d01..108120f216b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -29,6 +29,7 @@  #include <linux/netfilter/nfnetlink_queue.h>  #include <linux/list.h>  #include <net/sock.h> +#include <net/tcp_states.h>  #include <net/netfilter/nf_queue.h>  #include <net/netns/generic.h>  #include <net/netfilter/nfnetlink_queue.h> @@ -235,51 +236,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)  	spin_unlock_bh(&queue->lock);  } -static void -nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) -{ -	int i, j = 0; -	int plen = 0; /* length of skb->head fragment */ -	struct page *page; -	unsigned int offset; - -	/* dont bother with small payloads */ -	if (len <= skb_tailroom(to)) { -		skb_copy_bits(from, 0, skb_put(to, len), len); -		return; -	} - -	if (hlen) { -		skb_copy_bits(from, 0, skb_put(to, hlen), hlen); -		len -= hlen; -	} else { -		plen = min_t(int, skb_headlen(from), len); -		if (plen) { -			page = virt_to_head_page(from->head); -			offset = from->data - (unsigned char *)page_address(page); -			__skb_fill_page_desc(to, 0, page, offset, plen); -			get_page(page); -			j = 1; -			len -= plen; -		} -	} - -	to->truesize += len + plen; -	to->len += len + plen; -	to->data_len += len + plen; - -	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { -		if (!len) -			break; -		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; -		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); -		len -= skb_shinfo(to)->frags[j].size; -		skb_frag_ref(to, j); -		j++; -	} -	skb_shinfo(to)->nr_frags = j; -} -  static int  nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,  		      bool csum_verify) @@ -297,14 +253,39 @@ nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,  	return flags ? 
nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0;  } +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ +	const struct cred *cred; + +	if (sk->sk_state == TCP_TIME_WAIT) +		return 0; + +	read_lock_bh(&sk->sk_callback_lock); +	if (sk->sk_socket && sk->sk_socket->file) { +		cred = sk->sk_socket->file->f_cred; +		if (nla_put_be32(skb, NFQA_UID, +		    htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFQA_GID, +		    htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) +			goto nla_put_failure; +	} +	read_unlock_bh(&sk->sk_callback_lock); +	return 0; + +nla_put_failure: +	read_unlock_bh(&sk->sk_callback_lock); +	return -1; +} +  static struct sk_buff * -nfqnl_build_packet_message(struct nfqnl_instance *queue, +nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,  			   struct nf_queue_entry *entry,  			   __be32 **packet_id_ptr)  {  	size_t size;  	size_t data_len = 0, cap_len = 0; -	int hlen = 0; +	unsigned int hlen = 0;  	struct sk_buff *skb;  	struct nlattr *nla;  	struct nfqnl_msg_packet_hdr *pmsg; @@ -356,14 +337,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,  		if (data_len > entskb->len)  			data_len = entskb->len; -		if (!entskb->head_frag || -		    skb_headlen(entskb) < L1_CACHE_BYTES || -		    skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS) -			hlen = skb_headlen(entskb); - -		if (skb_has_frag_list(entskb)) -			hlen = entskb->len; -		hlen = min_t(int, data_len, hlen); +		hlen = skb_zerocopy_headlen(entskb); +		hlen = min_t(unsigned int, hlen, data_len);  		size += sizeof(struct nlattr) + hlen;  		cap_len = entskb->len;  		break; @@ -372,15 +347,23 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,  	if (queue->flags & NFQA_CFG_F_CONNTRACK)  		ct = nfqnl_ct_get(entskb, &size, &ctinfo); -	skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid, +	if (queue->flags & NFQA_CFG_F_UID_GID) { +		size +=  (nla_total_size(sizeof(u_int32_t))	/* uid */ +			+ nla_total_size(sizeof(u_int32_t)));	/* gid */ +	} + +	skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,  				  GFP_ATOMIC); -	if (!skb) +	if (!skb) { +		skb_tx_error(entskb);  		return NULL; +	}  	nlh = nlmsg_put(skb, 0, 0,  			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,  			sizeof(struct nfgenmsg), 0);  	if (!nlh) { +		skb_tx_error(entskb);  		kfree_skb(skb);  		return NULL;  	} @@ -484,6 +467,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,  			goto nla_put_failure;  	} +	if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && +	    nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) +		goto nla_put_failure; +  	if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)  		goto nla_put_failure; @@ -504,13 +491,15 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,  		nla->nla_type = NFQA_PAYLOAD;  		nla->nla_len = nla_attr_size(data_len); -		nfqnl_zcopy(skb, entskb, data_len, hlen); +		if (skb_zerocopy(skb, entskb, data_len, hlen)) +			goto nla_put_failure;  	}  	nlh->nlmsg_len = skb->len;  	return skb;  nla_put_failure: +	skb_tx_error(entskb);  	kfree_skb(skb);  	net_err_ratelimited("nf_queue: error creating packet message\n");  	return NULL; @@ -525,7 +514,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,  	__be32 *packet_id_ptr;  	int failopen = 0; -	nskb = nfqnl_build_packet_message(queue, entry, &packet_id_ptr); +	nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);  	if (nskb == NULL) {  		err = -ENOMEM;  		goto err_out; diff --git 
a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
new file mode 100644
index 00000000000..4fb6ee2c110
--- /dev/null
+++ b/net/netfilter/nft_bitwise.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_bitwise {
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			len;
+	struct nft_data		mask;
+	struct nft_data		xor;
+};
+
+static void nft_bitwise_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	const struct nft_data *src = &data[priv->sreg];
+	struct nft_data *dst = &data[priv->dreg];
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) {
+		dst->data[i] = (src->data[i] & priv->mask.data[i]) ^
+			       priv->xor.data[i];
+	}
+}
+
+static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
+	[NFTA_BITWISE_SREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_DREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_LEN]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_MASK]	= { .type = NLA_NESTED },
+	[NFTA_BITWISE_XOR]	= { .type = NLA_NESTED },
+};
+
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_bitwise *priv = nft_expr_priv(expr);
+	struct nft_data_desc d1, d2;
+	int err;
+
+	if (tb[NFTA_BITWISE_SREG] == NULL ||
+	    tb[NFTA_BITWISE_DREG] == NULL ||
+	    tb[NFTA_BITWISE_LEN] == NULL ||
+	    tb[NFTA_BITWISE_MASK] == NULL ||
+	    tb[NFTA_BITWISE_XOR] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+
+	priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+
+	err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+	if (err < 0)
+		return err;
+	if (d1.len != priv->len)
+		return -EINVAL;
+
+	err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+	if (err < 0)
+		return err;
+	if (d2.len != priv->len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_bitwise_type;
+static const struct nft_expr_ops nft_bitwise_ops = {
+	.type		= &nft_bitwise_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
+	.eval		= nft_bitwise_eval,
+	.init		= nft_bitwise_init,
+	.dump		= nft_bitwise_dump,
+};
+
+static struct nft_expr_type nft_bitwise_type __read_mostly = {
+	.name		= "bitwise",
+	.ops		= &nft_bitwise_ops,
+	.policy		= nft_bitwise_policy,
+	.maxattr	= NFTA_BITWISE_MAX,
+	.owner		= THIS_MODULE,
+};
+
+int __init nft_bitwise_module_init(void)
+{
+	return nft_register_expr(&nft_bitwise_type);
+}
+
+void nft_bitwise_module_exit(void)
+{
+	nft_unregister_expr(&nft_bitwise_type);
+}
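
Worth noting about nft_bitwise_eval() above: the single per-word formula dst = (src & mask) ^ xor is enough to express all the usual bitwise operations, which is why the expression only carries a mask and an xor constant. A quick demonstration:

#include <stdint.h>
#include <stdio.h>

/* sketch: how (val & mask) ^ xor encodes the common bitwise ops,
 * matching nft_bitwise_eval()'s per-word formula */
static uint32_t bw(uint32_t val, uint32_t mask, uint32_t xor)
{
	return (val & mask) ^ xor;
}

int main(void)
{
	uint32_t v = 0x00f0ff0f, m = 0x0000ffff;

	printf("and: %08x\n", bw(v, m, 0));	/* v & m            */
	printf("or:  %08x\n", bw(v, ~m, m));	/* (v & ~m) ^ m     */
	printf("xor: %08x\n", bw(v, ~0U, m));	/* v ^ m            */
	printf("not: %08x\n", bw(v, ~0U, ~0U));	/* ~v               */
	return 0;
}

The OR identity holds because bits set in m are forced to 0 by the mask and then flipped to 1 by the xor, while the remaining bits pass through unchanged.
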
return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_bitwise_type; +static const struct nft_expr_ops nft_bitwise_ops = { +	.type		= &nft_bitwise_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), +	.eval		= nft_bitwise_eval, +	.init		= nft_bitwise_init, +	.dump		= nft_bitwise_dump, +}; + +static struct nft_expr_type nft_bitwise_type __read_mostly = { +	.name		= "bitwise", +	.ops		= &nft_bitwise_ops, +	.policy		= nft_bitwise_policy, +	.maxattr	= NFTA_BITWISE_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_bitwise_module_init(void) +{ +	return nft_register_expr(&nft_bitwise_type); +} + +void nft_bitwise_module_exit(void) +{ +	nft_unregister_expr(&nft_bitwise_type); +} diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c new file mode 100644 index 00000000000..c39ed8d29df --- /dev/null +++ b/net/netfilter/nft_byteorder.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_byteorder { +	enum nft_registers	sreg:8; +	enum nft_registers	dreg:8; +	enum nft_byteorder_ops	op:8; +	u8			len; +	u8			size; +}; + +static void nft_byteorder_eval(const struct nft_expr *expr, +			       struct nft_data data[NFT_REG_MAX + 1], +			       const struct nft_pktinfo *pkt) +{ +	const struct nft_byteorder *priv = nft_expr_priv(expr); +	struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg]; +	union { u32 u32; u16 u16; } *s, *d; +	unsigned int i; + +	s = (void *)src->data; +	d = (void *)dst->data; + +	switch (priv->size) { +	case 4: +		switch (priv->op) { +		case NFT_BYTEORDER_NTOH: +			for (i = 0; i < priv->len / 4; i++) +				d[i].u32 = ntohl((__force __be32)s[i].u32); +			break; +		case NFT_BYTEORDER_HTON: +			for (i = 0; i < priv->len / 4; i++) +				d[i].u32 = (__force __u32)htonl(s[i].u32); +			break; +		} +		break; +	case 2: +		switch (priv->op) { +		case NFT_BYTEORDER_NTOH: +			for (i = 0; i < priv->len / 2; i++) +				d[i].u16 = ntohs((__force __be16)s[i].u16); +			break; +		case NFT_BYTEORDER_HTON: +			for (i = 0; i < priv->len / 2; i++) +				d[i].u16 = (__force __u16)htons(s[i].u16); +			break; +		} +		break; +	} +} + +static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { +	[NFTA_BYTEORDER_SREG]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_DREG]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_OP]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_LEN]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_SIZE]	= { .type = NLA_U32 }, +}; + +static int nft_byteorder_init(const struct nft_ctx *ctx, +			      const struct nft_expr *expr, +			      const struct nlattr * const tb[]) +{ +	struct nft_byteorder *priv = nft_expr_priv(expr); +	int err; + +	if (tb[NFTA_BYTEORDER_SREG] == NULL || +	    tb[NFTA_BYTEORDER_DREG] == NULL || +	    tb[NFTA_BYTEORDER_LEN] == NULL || +	    tb[NFTA_BYTEORDER_SIZE] == NULL || +	    tb[NFTA_BYTEORDER_OP] == NULL) +		return -EINVAL; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG])); +	err = nft_validate_input_register(priv->sreg); +	
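/* sreg must name a readable data register; typically a payload or
+	 * exthdr expression has loaded a big-endian field into it, and
+	 * op = NFT_BYTEORDER_NTOH with size = 4 and len = 4 converts that
+	 * one 32-bit value to host byte order for later cmp expressions.
+	 */
+	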
if (err < 0) +		return err; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); +	switch (priv->op) { +	case NFT_BYTEORDER_NTOH: +	case NFT_BYTEORDER_HTON: +		break; +	default: +		return -EINVAL; +	} + +	priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); +	if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data)) +		return -EINVAL; + +	priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); +	switch (priv->size) { +	case 2: +	case 4: +		break; +	default: +		return -EINVAL; +	} + +	return 0; +} + +static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_byteorder *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_byteorder_type; +static const struct nft_expr_ops nft_byteorder_ops = { +	.type		= &nft_byteorder_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), +	.eval		= nft_byteorder_eval, +	.init		= nft_byteorder_init, +	.dump		= nft_byteorder_dump, +}; + +static struct nft_expr_type nft_byteorder_type __read_mostly = { +	.name		= "byteorder", +	.ops		= &nft_byteorder_ops, +	.policy		= nft_byteorder_policy, +	.maxattr	= NFTA_BYTEORDER_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_byteorder_module_init(void) +{ +	return nft_register_expr(&nft_byteorder_type); +} + +void nft_byteorder_module_exit(void) +{ +	nft_unregister_expr(&nft_byteorder_type); +} diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c new file mode 100644 index 00000000000..e2b3f51c81f --- /dev/null +++ b/net/netfilter/nft_cmp.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_cmp_expr { +	struct nft_data		data; +	enum nft_registers	sreg:8; +	u8			len; +	enum nft_cmp_ops	op:8; +}; + +static void nft_cmp_eval(const struct nft_expr *expr, +			 struct nft_data data[NFT_REG_MAX + 1], +			 const struct nft_pktinfo *pkt) +{ +	const struct nft_cmp_expr *priv = nft_expr_priv(expr); +	int d; + +	d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len); +	switch (priv->op) { +	case NFT_CMP_EQ: +		if (d != 0) +			goto mismatch; +		break; +	case NFT_CMP_NEQ: +		if (d == 0) +			goto mismatch; +		break; +	case NFT_CMP_LT: +		if (d == 0) +			goto mismatch; +	case NFT_CMP_LTE: +		if (d > 0) +			goto mismatch; +		break; +	case NFT_CMP_GT: +		if (d == 0) +			goto mismatch; +	case NFT_CMP_GTE: +		if (d < 0) +			goto mismatch; +		break; +	} +	return; + +mismatch: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = { +	[NFTA_CMP_SREG]		= { .type = NLA_U32 }, +	[NFTA_CMP_OP]		= { .type = NLA_U32 }, +	[NFTA_CMP_DATA]		= { .type = NLA_NESTED }, +}; + +static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +			const struct nlattr * const tb[]) +{ +	struct nft_cmp_expr *priv = nft_expr_priv(expr); +	struct nft_data_desc desc; +	int err; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); +	priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); + +	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); +	BUG_ON(err < 0); + +	priv->len = desc.len; +	return 0; +} + +static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_cmp_expr *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op))) +		goto nla_put_failure; + +	if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, +			  NFT_DATA_VALUE, priv->len) < 0) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_cmp_type; +static const struct nft_expr_ops nft_cmp_ops = { +	.type		= &nft_cmp_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), +	.eval		= nft_cmp_eval, +	.init		= nft_cmp_init, +	.dump		= nft_cmp_dump, +}; + +static int nft_cmp_fast_init(const struct nft_ctx *ctx, +			     const struct nft_expr *expr, +			     const struct nlattr * const tb[]) +{ +	struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); +	struct nft_data_desc desc; +	struct nft_data data; +	u32 mask; +	int err; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + +	err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); +	BUG_ON(err < 0); +	desc.len *= BITS_PER_BYTE; + +	mask = nft_cmp_fast_mask(desc.len); +	priv->data = data.data[0] & mask; +	priv->len  = desc.len; +	return 0; +} + +static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); +	struct nft_data data; + +	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ))) +		goto nla_put_failure; + +	data.data[0] = priv->data; +	if (nft_data_dump(skb, NFTA_CMP_DATA, &data, +			  
NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +const struct nft_expr_ops nft_cmp_fast_ops = { +	.type		= &nft_cmp_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)), +	.eval		= NULL,	/* inlined */ +	.init		= nft_cmp_fast_init, +	.dump		= nft_cmp_fast_dump, +}; + +static const struct nft_expr_ops * +nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) +{ +	struct nft_data_desc desc; +	struct nft_data data; +	enum nft_registers sreg; +	enum nft_cmp_ops op; +	int err; + +	if (tb[NFTA_CMP_SREG] == NULL || +	    tb[NFTA_CMP_OP] == NULL || +	    tb[NFTA_CMP_DATA] == NULL) +		return ERR_PTR(-EINVAL); + +	sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); +	err = nft_validate_input_register(sreg); +	if (err < 0) +		return ERR_PTR(err); + +	op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); +	switch (op) { +	case NFT_CMP_EQ: +	case NFT_CMP_NEQ: +	case NFT_CMP_LT: +	case NFT_CMP_LTE: +	case NFT_CMP_GT: +	case NFT_CMP_GTE: +		break; +	default: +		return ERR_PTR(-EINVAL); +	} + +	err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); +	if (err < 0) +		return ERR_PTR(err); + +	if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) +		return &nft_cmp_fast_ops; +	else +		return &nft_cmp_ops; +} + +static struct nft_expr_type nft_cmp_type __read_mostly = { +	.name		= "cmp", +	.select_ops	= nft_cmp_select_ops, +	.policy		= nft_cmp_policy, +	.maxattr	= NFTA_CMP_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_cmp_module_init(void) +{ +	return nft_register_expr(&nft_cmp_type); +} + +void nft_cmp_module_exit(void) +{ +	nft_unregister_expr(&nft_cmp_type); +} diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c new file mode 100644 index 00000000000..1840989092e --- /dev/null +++ b/net/netfilter/nft_compat.c @@ -0,0 +1,793 @@ +/* + * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This software has been sponsored by Sophos Astaro <http://www.sophos.com> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nf_tables_compat.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <asm/uaccess.h> /* for set_fs */ +#include <net/netfilter/nf_tables.h> + +union nft_entry { +	struct ipt_entry e4; +	struct ip6t_entry e6; +}; + +static inline void +nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +{ +	par->target	= xt; +	par->targinfo	= xt_info; +	par->hotdrop	= false; +} + +static void nft_target_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	void *info = nft_expr_priv(expr); +	struct xt_target *target = expr->ops->data; +	struct sk_buff *skb = pkt->skb; +	int ret; + +	nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + +	ret = target->target(skb, &pkt->xt); + +	if (pkt->xt.hotdrop) +		ret = NF_DROP; + +	switch(ret) { +	case XT_CONTINUE: +		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; +		break; +	default: +		data[NFT_REG_VERDICT].verdict = ret; +		break; +	} +	return; +} + +static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { +	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING }, +	[NFTA_TARGET_REV]	= { .type = NLA_U32 }, +	[NFTA_TARGET_INFO]	= { .type = NLA_BINARY }, +}; + +static void +nft_target_set_tgchk_param(struct xt_tgchk_param *par, +			   const struct nft_ctx *ctx, +			   struct xt_target *target, void *info, +			   union nft_entry *entry, u8 proto, bool inv) +{ +	par->net	= &init_net; +	par->table	= ctx->table->name; +	switch (ctx->afi->family) { +	case AF_INET: +		entry->e4.ip.proto = proto; +		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; +		break; +	case AF_INET6: +		entry->e6.ipv6.proto = proto; +		entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; +		break; +	} +	par->entryinfo	= entry; +	par->target	= target; +	par->targinfo	= info; +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		par->hook_mask = 1 << ops->hooknum; +	} +	par->family	= ctx->afi->family; +} + +static void target_compat_from_user(struct xt_target *t, void *in, void *out) +{ +#ifdef CONFIG_COMPAT +	if (t->compat_from_user) { +		int pad; + +		t->compat_from_user(out, in); +		pad = XT_ALIGN(t->targetsize) - t->targetsize; +		if (pad > 0) +			memset(out + t->targetsize, 0, pad); +	} else +#endif +		memcpy(out, in, XT_ALIGN(t->targetsize)); +} + +static inline int nft_compat_target_offset(struct xt_target *target) +{ +#ifdef CONFIG_COMPAT +	return xt_compat_target_offset(target); +#else +	return 0; +#endif +} + +static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { +	[NFTA_RULE_COMPAT_PROTO]	= { .type = NLA_U32 }, +	[NFTA_RULE_COMPAT_FLAGS]	= { .type = NLA_U32 }, +}; + +static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +{ +	struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; +	u32 flags; +	int err; + +	err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr, +			       nft_rule_compat_policy); +	if (err < 0) +		return err; + +	if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS]) +		return -EINVAL; + +	flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); +	if (flags & ~NFT_RULE_COMPAT_F_MASK) +		return -EINVAL; +	if (flags & NFT_RULE_COMPAT_F_INV) +		*inv = true; + +	*proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); +	return 0; +} + +static int +nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +		const struct nlattr * const tb[]) +{ +	void *info = nft_expr_priv(expr); +	struct xt_target *target = expr->ops->data; +	struct xt_tgchk_param par; +	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); +	u8 proto = 0; +	bool inv = false; +	union nft_entry e = {}; +	int ret; + +	target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); + +	if (ctx->nla[NFTA_RULE_COMPAT]) { +		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); +		if (ret < 0) +			goto err; +	} + +	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + +	ret = xt_check_target(&par, size, proto, inv); +	if (ret < 0) +		goto err; + +	/* The standard target cannot be used */ +	if (target->target == NULL) { +		ret = -EINVAL; +		goto err; +	} + +	return 0; +err: +	module_put(target->me); +	return ret; +} + +static void +nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ +	struct xt_target *target = expr->ops->data; +	void *info = nft_expr_priv(expr); +	struct xt_tgdtor_param par; + +	par.net = ctx->net; +	par.target = target; +	par.targinfo = info; +	par.family = ctx->afi->family; +	if (par.target->destroy != NULL) +		par.target->destroy(&par); + +	module_put(target->me); +} + +static int +target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in) +{ +	int ret; + +#ifdef CONFIG_COMPAT +	if (t->compat_to_user) { +		mm_segment_t old_fs; +		void *out; + +		out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC); +		if (out == NULL) +			return -ENOMEM; + +		/* We want to reuse existing compat_to_user */ +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		t->compat_to_user(out, in); +		set_fs(old_fs); +		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out); +		kfree(out); +	} else 
+#endif +		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in); + +	return ret; +} + +static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct xt_target *target = expr->ops->data; +	void *info = nft_expr_priv(expr); + +	if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || +	    nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || +	    target_dump_info(skb, target, info)) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_target_validate(const struct nft_ctx *ctx, +			       const struct nft_expr *expr, +			       const struct nft_data **data) +{ +	struct xt_target *target = expr->ops->data; +	unsigned int hook_mask = 0; + +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		hook_mask = 1 << ops->hooknum; +		if (hook_mask & target->hooks) +			return 0; + +		/* This target is being called from an invalid chain */ +		return -EINVAL; +	} +	return 0; +} + +static void nft_match_eval(const struct nft_expr *expr, +			   struct nft_data data[NFT_REG_MAX + 1], +			   const struct nft_pktinfo *pkt) +{ +	void *info = nft_expr_priv(expr); +	struct xt_match *match = expr->ops->data; +	struct sk_buff *skb = pkt->skb; +	bool ret; + +	nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + +	ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + +	if (pkt->xt.hotdrop) { +		data[NFT_REG_VERDICT].verdict = NF_DROP; +		return; +	} + +	switch(ret) { +	case true: +		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; +		break; +	case false: +		data[NFT_REG_VERDICT].verdict = NFT_BREAK; +		break; +	} +} + +static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { +	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING }, +	[NFTA_MATCH_REV]	= { .type = NLA_U32 }, +	[NFTA_MATCH_INFO]	= { .type = NLA_BINARY }, +}; + +/* struct xt_mtchk_param and xt_tgchk_param look very similar */ +static void +nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, +			  struct xt_match *match, void *info, +			  union nft_entry *entry, u8 proto, bool inv) +{ +	par->net	= &init_net; +	par->table	= ctx->table->name; +	switch (ctx->afi->family) { +	case AF_INET: +		entry->e4.ip.proto = proto; +		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; +		break; +	case AF_INET6: +		entry->e6.ipv6.proto = proto; +		entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; +		break; +	} +	par->entryinfo	= entry; +	par->match	= match; +	par->matchinfo	= info; +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		par->hook_mask = 1 << ops->hooknum; +	} +	par->family	= ctx->afi->family; +} + +static void match_compat_from_user(struct xt_match *m, void *in, void *out) +{ +#ifdef CONFIG_COMPAT +	if (m->compat_from_user) { +		int pad; + +		m->compat_from_user(out, in); +		pad = XT_ALIGN(m->matchsize) - m->matchsize; +		if (pad > 0) +			memset(out + m->matchsize, 0, pad); +	} else +#endif +		memcpy(out, in, XT_ALIGN(m->matchsize)); +} + +static int +nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +		const struct nlattr * const tb[]) +{ +	void *info = nft_expr_priv(expr); +	struct xt_match *match = expr->ops->data; +	struct xt_mtchk_param par; +	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); +	u8 proto = 0; +	bool inv = false; +	union nft_entry e = {}; +	int ret; + +	match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); + +	if (ctx->nla[NFTA_RULE_COMPAT]) { +		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); +		if (ret < 0) +			goto err; +	} + +	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + +	ret = xt_check_match(&par, size, proto, inv); +	if (ret < 0) +		goto err; + +	return 0; +err: +	module_put(match->me); +	return ret; +} + +static void +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ +	struct xt_match *match = expr->ops->data; +	void *info = nft_expr_priv(expr); +	struct xt_mtdtor_param par; + +	par.net = ctx->net; +	par.match = match; +	par.matchinfo = info; +	par.family = ctx->afi->family; +	if (par.match->destroy != NULL) +		par.match->destroy(&par); + +	module_put(match->me); +} + +static int +match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in) +{ +	int ret; + +#ifdef CONFIG_COMPAT +	if (m->compat_to_user) { +		mm_segment_t old_fs; +		void *out; + +		out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC); +		if (out == NULL) +			return -ENOMEM; + +		/* We want to reuse existing compat_to_user */ +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		m->compat_to_user(out, in); +		set_fs(old_fs); +		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out); +		kfree(out); +	} else +#endif +		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in); + +	return ret; +} + +static inline int nft_compat_match_offset(struct xt_match *match) +{ +#ifdef CONFIG_COMPAT +	return xt_compat_match_offset(match); +#else +	return 0; +#endif +} + +static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	void *info = nft_expr_priv(expr); +	struct xt_match *match = expr->ops->data; + +	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || +	    nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || +	    match_dump_info(skb, match, info)) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_match_validate(const struct nft_ctx *ctx, +			      const struct nft_expr *expr, +			      const struct nft_data **data) +{ +	struct xt_match *match = expr->ops->data; +	unsigned int hook_mask = 0; + +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		hook_mask = 1 << ops->hooknum; +		if (hook_mask & 
match->hooks) +			return 0; + +		/* This match is being called from an invalid chain */ +		return -EINVAL; +	} +	return 0; +} + +static int +nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +		      int event, u16 family, const char *name, +		      int rev, int target) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? NLM_F_MULTI : 0; + +	event |= NFNL_SUBSYS_NFT_COMPAT << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = family; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || +	    nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || +	    nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) +		goto nla_put_failure; + +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, +		const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	int ret = 0, target; +	struct nfgenmsg *nfmsg; +	const char *fmt; +	const char *name; +	u32 rev; +	struct sk_buff *skb2; + +	if (tb[NFTA_COMPAT_NAME] == NULL || +	    tb[NFTA_COMPAT_REV] == NULL || +	    tb[NFTA_COMPAT_TYPE] == NULL) +		return -EINVAL; + +	name = nla_data(tb[NFTA_COMPAT_NAME]); +	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); +	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); + +	nfmsg = nlmsg_data(nlh); + +	switch(nfmsg->nfgen_family) { +	case AF_INET: +		fmt = "ipt_%s"; +		break; +	case AF_INET6: +		fmt = "ip6t_%s"; +		break; +	default: +		pr_err("nft_compat: unsupported protocol %d\n", +			nfmsg->nfgen_family); +		return -EINVAL; +	} + +	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, +						 rev, target, &ret), +						 fmt, name); + +	if (ret < 0) +		return ret; + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	/* include the best revision for this extension in the message */ +	if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, +				  nlh->nlmsg_seq, +				  NFNL_MSG_TYPE(nlh->nlmsg_type), +				  NFNL_MSG_COMPAT_GET, +				  nfmsg->nfgen_family, +				  name, ret, target) <= 0) { +		kfree_skb(skb2); +		return -ENOSPC; +	} + +	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, +				MSG_DONTWAIT); +	if (ret > 0) +		ret = 0; + +	return ret == -EAGAIN ? 
-ENOBUFS : ret; +} + +static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { +	[NFTA_COMPAT_NAME]	= { .type = NLA_NUL_STRING, +				    .len = NFT_COMPAT_NAME_MAX-1 }, +	[NFTA_COMPAT_REV]	= { .type = NLA_U32 }, +	[NFTA_COMPAT_TYPE]	= { .type = NLA_U32 }, +}; + +static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { +	[NFNL_MSG_COMPAT_GET]		= { .call = nfnl_compat_get, +					    .attr_count = NFTA_COMPAT_MAX, +					    .policy = nfnl_compat_policy_get }, +}; + +static const struct nfnetlink_subsystem nfnl_compat_subsys = { +	.name		= "nft-compat", +	.subsys_id	= NFNL_SUBSYS_NFT_COMPAT, +	.cb_count	= NFNL_MSG_COMPAT_MAX, +	.cb		= nfnl_nft_compat_cb, +}; + +static LIST_HEAD(nft_match_list); + +struct nft_xt { +	struct list_head	head; +	struct nft_expr_ops	ops; +}; + +static struct nft_expr_type nft_match_type; + +static const struct nft_expr_ops * +nft_match_select_ops(const struct nft_ctx *ctx, +		     const struct nlattr * const tb[]) +{ +	struct nft_xt *nft_match; +	struct xt_match *match; +	char *mt_name; +	__u32 rev, family; + +	if (tb[NFTA_MATCH_NAME] == NULL || +	    tb[NFTA_MATCH_REV] == NULL || +	    tb[NFTA_MATCH_INFO] == NULL) +		return ERR_PTR(-EINVAL); + +	mt_name = nla_data(tb[NFTA_MATCH_NAME]); +	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); +	family = ctx->afi->family; + +	/* Re-use the existing match if it's already loaded. */ +	list_for_each_entry(nft_match, &nft_match_list, head) { +		struct xt_match *match = nft_match->ops.data; + +		if (strcmp(match->name, mt_name) == 0 && +		    match->revision == rev && match->family == family) +			return &nft_match->ops; +	} + +	match = xt_request_find_match(family, mt_name, rev); +	if (IS_ERR(match)) +		return ERR_PTR(-ENOENT); + +	/* This is the first time we use this match, allocate operations */ +	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); +	if (nft_match == NULL) +		return ERR_PTR(-ENOMEM); + +	nft_match->ops.type = &nft_match_type; +	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) + +					    nft_compat_match_offset(match)); +	nft_match->ops.eval = nft_match_eval; +	nft_match->ops.init = nft_match_init; +	nft_match->ops.destroy = nft_match_destroy; +	nft_match->ops.dump = nft_match_dump; +	nft_match->ops.validate = nft_match_validate; +	nft_match->ops.data = match; + +	list_add(&nft_match->head, &nft_match_list); + +	return &nft_match->ops; +} + +static void nft_match_release(void) +{ +	struct nft_xt *nft_match, *tmp; + +	list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head) +		kfree(nft_match); +} + +static struct nft_expr_type nft_match_type __read_mostly = { +	.name		= "match", +	.select_ops	= nft_match_select_ops, +	.policy		= nft_match_policy, +	.maxattr	= NFTA_MATCH_MAX, +	.owner		= THIS_MODULE, +}; + +static LIST_HEAD(nft_target_list); + +static struct nft_expr_type nft_target_type; + +static const struct nft_expr_ops * +nft_target_select_ops(const struct nft_ctx *ctx, +		      const struct nlattr * const tb[]) +{ +	struct nft_xt *nft_target; +	struct xt_target *target; +	char *tg_name; +	__u32 rev, family; + +	if (tb[NFTA_TARGET_NAME] == NULL || +	    tb[NFTA_TARGET_REV] == NULL || +	    tb[NFTA_TARGET_INFO] == NULL) +		return ERR_PTR(-EINVAL); + +	tg_name = nla_data(tb[NFTA_TARGET_NAME]); +	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); +	family = ctx->afi->family; + +	/* Re-use the existing target if it's already loaded. 
*/
+	list_for_each_entry(nft_target, &nft_target_list, head) {
+		struct xt_target *target = nft_target->ops.data;
+
+		if (strcmp(target->name, tg_name) == 0 &&
+		    target->revision == rev && target->family == family)
+			return &nft_target->ops;
+	}
+
+	target = xt_request_find_target(family, tg_name, rev);
+	if (IS_ERR(target))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this target, allocate operations */
+	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_target == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nft_target->ops.type = &nft_target_type;
+	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+					     nft_compat_target_offset(target));
+	nft_target->ops.eval = nft_target_eval;
+	nft_target->ops.init = nft_target_init;
+	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.dump = nft_target_dump;
+	nft_target->ops.validate = nft_target_validate;
+	nft_target->ops.data = target;
+
+	list_add(&nft_target->head, &nft_target_list);
+
+	return &nft_target->ops;
+}
+
+static void nft_target_release(void)
+{
+	struct nft_xt *nft_target, *tmp;
+
+	list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
+		kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = {
+	.name		= "target",
+	.select_ops	= nft_target_select_ops,
+	.policy		= nft_target_policy,
+	.maxattr	= NFTA_TARGET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_compat_module_init(void)
+{
+	int ret;
+
+	ret = nft_register_expr(&nft_match_type);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_register_expr(&nft_target_type);
+	if (ret < 0)
+		goto err_match;
+
+	ret = nfnetlink_subsys_register(&nfnl_compat_subsys);
+	if (ret < 0) {
+		pr_err("nft_compat: cannot register with nfnetlink.\n");
+		goto err_target;
+	}
+
+	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+	return ret;
+
+err_target:
+	nft_unregister_expr(&nft_target_type);
+err_match:
+	nft_unregister_expr(&nft_match_type);
+	return ret;
+}
+
+static void __exit nft_compat_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+	nft_unregister_expr(&nft_target_type);
+	nft_unregister_expr(&nft_match_type);
+	nft_match_release();
+	nft_target_release();
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 00000000000..c89ee486ce5
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter {
+	seqlock_t	lock;
+	u64		bytes;
+	u64		packets;
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	write_seqlock_bh(&priv->lock);
+	priv->bytes += pkt->skb->len;
+	priv->packets++;
+	write_sequnlock_bh(&priv->lock);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+	unsigned int seq;
+	u64 bytes;
+	u64 packets;
+
+	do {
+		seq = read_seqbegin(&priv->lock);
+		bytes	= priv->bytes;
+		packets	= priv->packets;
+	} while (read_seqretry(&priv->lock, seq));
+
+	if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+static int nft_counter_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_COUNTER_PACKETS])
+		priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	if (tb[NFTA_COUNTER_BYTES])
+		priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+
+	seqlock_init(&priv->lock);
+	return 0;
+}
+
+static struct nft_expr_type nft_counter_type;
+static const struct nft_expr_ops nft_counter_ops = {
+	.type		= &nft_counter_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+	.eval		= nft_counter_eval,
+	.init		= nft_counter_init,
+	.dump		= nft_counter_dump,
+};
+
+static struct nft_expr_type nft_counter_type __read_mostly = {
+	.name		= "counter",
+	.ops		= &nft_counter_ops,
+	.policy		= nft_counter_policy,
+	.maxattr	= NFTA_COUNTER_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_counter_module_init(void)
+{
+	return nft_register_expr(&nft_counter_type);
+}
+
+static void __exit nft_counter_module_exit(void)
+{
+	nft_unregister_expr(&nft_counter_type);
+}
+
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 00000000000..cc560301624
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +struct nft_ct { +	enum nft_ct_keys	key:8; +	enum ip_conntrack_dir	dir:8; +	union { +		enum nft_registers	dreg:8; +		enum nft_registers	sreg:8; +	}; +}; + +static void nft_ct_get_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); +	struct nft_data *dest = &data[priv->dreg]; +	enum ip_conntrack_info ctinfo; +	const struct nf_conn *ct; +	const struct nf_conn_help *help; +	const struct nf_conntrack_tuple *tuple; +	const struct nf_conntrack_helper *helper; +	long diff; +	unsigned int state; + +	ct = nf_ct_get(pkt->skb, &ctinfo); + +	switch (priv->key) { +	case NFT_CT_STATE: +		if (ct == NULL) +			state = NF_CT_STATE_INVALID_BIT; +		else if (nf_ct_is_untracked(ct)) +			state = NF_CT_STATE_UNTRACKED_BIT; +		else +			state = NF_CT_STATE_BIT(ctinfo); +		dest->data[0] = state; +		return; +	} + +	if (ct == NULL) +		goto err; + +	switch (priv->key) { +	case NFT_CT_DIRECTION: +		dest->data[0] = CTINFO2DIR(ctinfo); +		return; +	case NFT_CT_STATUS: +		dest->data[0] = ct->status; +		return; +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +		dest->data[0] = ct->mark; +		return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK +	case NFT_CT_SECMARK: +		dest->data[0] = ct->secmark; +		return; +#endif +	case NFT_CT_EXPIRATION: +		diff = (long)jiffies - (long)ct->timeout.expires; +		if (diff < 0) +			diff = 0; +		dest->data[0] = jiffies_to_msecs(diff); +		return; +	case NFT_CT_HELPER: +		if (ct->master == NULL) +			goto err; +		help = nfct_help(ct->master); +		if (help == NULL) +			goto err; +		helper = rcu_dereference(help->helper); +		if (helper == NULL) +			goto err; +		if (strlen(helper->name) >= sizeof(dest->data)) +			goto err; +		strncpy((char *)dest->data, helper->name, sizeof(dest->data)); +		return; +#ifdef CONFIG_NF_CONNTRACK_LABELS +	case NFT_CT_LABELS: { +		struct nf_conn_labels *labels = nf_ct_labels_find(ct); +		unsigned int size; + +		if (!labels) { +			memset(dest->data, 0, sizeof(dest->data)); +			return; +		} + +		BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); +		size = labels->words * sizeof(long); + +		memcpy(dest->data, labels->bits, size); +		if (size < sizeof(dest->data)) +			memset(((char *) dest->data) + size, 0, +			       sizeof(dest->data) - size); +		return; +	} +#endif +	} + +	tuple = &ct->tuplehash[priv->dir].tuple; +	switch (priv->key) { +	case NFT_CT_L3PROTOCOL: +		dest->data[0] = nf_ct_l3num(ct); +		return; +	case NFT_CT_SRC: +		memcpy(dest->data, tuple->src.u3.all, +		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); +		return; +	case NFT_CT_DST: +		memcpy(dest->data, tuple->dst.u3.all, +		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
4 : 16); +		return; +	case NFT_CT_PROTOCOL: +		dest->data[0] = nf_ct_protonum(ct); +		return; +	case NFT_CT_PROTO_SRC: +		dest->data[0] = (__force __u16)tuple->src.u.all; +		return; +	case NFT_CT_PROTO_DST: +		dest->data[0] = (__force __u16)tuple->dst.u.all; +		return; +	} +	return; +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static void nft_ct_set_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); +	struct sk_buff *skb = pkt->skb; +#ifdef CONFIG_NF_CONNTRACK_MARK +	u32 value = data[priv->sreg].data[0]; +#endif +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	if (ct == NULL) +		return; + +	switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +		if (ct->mark != value) { +			ct->mark = value; +			nf_conntrack_event_cache(IPCT_MARK, ct); +		} +		break; +#endif +	} +} + +static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { +	[NFTA_CT_DREG]		= { .type = NLA_U32 }, +	[NFTA_CT_KEY]		= { .type = NLA_U32 }, +	[NFTA_CT_DIRECTION]	= { .type = NLA_U8 }, +	[NFTA_CT_SREG]		= { .type = NLA_U32 }, +}; + +static int nft_ct_l3proto_try_module_get(uint8_t family) +{ +	int err; + +	if (family == NFPROTO_INET) { +		err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); +		if (err < 0) +			goto err1; +		err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); +		if (err < 0) +			goto err2; +	} else { +		err = nf_ct_l3proto_try_module_get(family); +		if (err < 0) +			goto err1; +	} +	return 0; + +err2: +	nf_ct_l3proto_module_put(NFPROTO_IPV4); +err1: +	return err; +} + +static void nft_ct_l3proto_module_put(uint8_t family) +{ +	if (family == NFPROTO_INET) { +		nf_ct_l3proto_module_put(NFPROTO_IPV4); +		nf_ct_l3proto_module_put(NFPROTO_IPV6); +	} else +		nf_ct_l3proto_module_put(family); +} + +static int nft_ct_get_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_ct *priv = nft_expr_priv(expr); +	int err; + +	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); +	switch (priv->key) { +	case NFT_CT_STATE: +	case NFT_CT_DIRECTION: +	case NFT_CT_STATUS: +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK +	case NFT_CT_SECMARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS +	case NFT_CT_LABELS: +#endif +	case NFT_CT_EXPIRATION: +	case NFT_CT_HELPER: +		if (tb[NFTA_CT_DIRECTION] != NULL) +			return -EINVAL; +		break; +	case NFT_CT_L3PROTOCOL: +	case NFT_CT_PROTOCOL: +	case NFT_CT_SRC: +	case NFT_CT_DST: +	case NFT_CT_PROTO_SRC: +	case NFT_CT_PROTO_DST: +		if (tb[NFTA_CT_DIRECTION] == NULL) +			return -EINVAL; +		break; +	default: +		return -EOPNOTSUPP; +	} + +	if (tb[NFTA_CT_DIRECTION] != NULL) { +		priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); +		switch (priv->dir) { +		case IP_CT_DIR_ORIGINAL: +		case IP_CT_DIR_REPLY: +			break; +		default: +			return -EINVAL; +		} +	} + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; + +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	err = nft_ct_l3proto_try_module_get(ctx->afi->family); +	if (err < 0) +		return err; + +	return 0; +} + +static int nft_ct_set_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_ct *priv = nft_expr_priv(expr); +	int err; + +	
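/* Only the conntrack mark can be set from a rule; every other
+	 * NFT_CT_* key is read-only via nft_ct_get_eval() above.
+	 */
+	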
priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); +	switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +		break; +#endif +	default: +		return -EOPNOTSUPP; +	} + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	err = nft_ct_l3proto_try_module_get(ctx->afi->family); +	if (err < 0) +		return err; + +	return 0; +} + +static void nft_ct_destroy(const struct nft_ctx *ctx, +			   const struct nft_expr *expr) +{ +	nft_ct_l3proto_module_put(ctx->afi->family); +} + +static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) +		goto nla_put_failure; + +	switch (priv->key) { +	case NFT_CT_PROTOCOL: +	case NFT_CT_SRC: +	case NFT_CT_DST: +	case NFT_CT_PROTO_SRC: +	case NFT_CT_PROTO_DST: +		if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) +			goto nla_put_failure; +	default: +		break; +	} + +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_ct_type; +static const struct nft_expr_ops nft_ct_get_ops = { +	.type		= &nft_ct_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)), +	.eval		= nft_ct_get_eval, +	.init		= nft_ct_get_init, +	.destroy	= nft_ct_destroy, +	.dump		= nft_ct_get_dump, +}; + +static const struct nft_expr_ops nft_ct_set_ops = { +	.type		= &nft_ct_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)), +	.eval		= nft_ct_set_eval, +	.init		= nft_ct_set_init, +	.destroy	= nft_ct_destroy, +	.dump		= nft_ct_set_dump, +}; + +static const struct nft_expr_ops * +nft_ct_select_ops(const struct nft_ctx *ctx, +		    const struct nlattr * const tb[]) +{ +	if (tb[NFTA_CT_KEY] == NULL) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_CT_DREG]) +		return &nft_ct_get_ops; + +	if (tb[NFTA_CT_SREG]) +		return &nft_ct_set_ops; + +	return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_ct_type __read_mostly = { +	.name		= "ct", +	.select_ops	= &nft_ct_select_ops, +	.policy		= nft_ct_policy, +	.maxattr	= NFTA_CT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_ct_module_init(void) +{ +	return nft_register_expr(&nft_ct_type); +} + +static void __exit nft_ct_module_exit(void) +{ +	nft_unregister_expr(&nft_ct_type); +} + +module_init(nft_ct_module_init); +module_exit(nft_ct_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("ct"); diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c new file mode 100644 index 00000000000..b6eed4d5a09 --- /dev/null +++ b/net/netfilter/nft_expr_template.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_template {
+	u32			field;
+};
+
+static void nft_template_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
+	[NFTA_TEMPLATE_ATTR]		= { .type = NLA_U32 },
+};
+
+static int nft_template_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+	return 0;
+}
+
+static void nft_template_destroy(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_template *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_TEMPLATE_ATTR, htonl(priv->field)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_template_type;
+static const struct nft_expr_ops nft_template_ops = {
+	.type		= &nft_template_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_template)),
+	.eval		= nft_template_eval,
+	.init		= nft_template_init,
+	.destroy	= nft_template_destroy,
+	.dump		= nft_template_dump,
+};
+
+static struct nft_expr_type nft_template_type __read_mostly = {
+	.name		= "template",
+	.ops		= &nft_template_ops,
+	.policy		= nft_template_policy,
+	.maxattr	= NFTA_TEMPLATE_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_template_module_init(void)
+{
+	return nft_register_expr(&nft_template_type);
+}
+
+static void __exit nft_template_module_exit(void)
+{
+	nft_unregister_expr(&nft_template_type);
+}
+
+module_init(nft_template_module_init);
+module_exit(nft_template_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("template");
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644
index 00000000000..55c939f5371
--- /dev/null
+++ b/net/netfilter/nft_exthdr.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +// FIXME: +#include <net/ipv6.h> + +struct nft_exthdr { +	u8			type; +	u8			offset; +	u8			len; +	enum nft_registers	dreg:8; +}; + +static void nft_exthdr_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	struct nft_exthdr *priv = nft_expr_priv(expr); +	struct nft_data *dest = &data[priv->dreg]; +	unsigned int offset = 0; +	int err; + +	err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); +	if (err < 0) +		goto err; +	offset += priv->offset; + +	if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0) +		goto err; +	return; +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { +	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 }, +	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 }, +	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 }, +	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 }, +}; + +static int nft_exthdr_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_exthdr *priv = nft_expr_priv(expr); +	int err; + +	if (tb[NFTA_EXTHDR_DREG] == NULL || +	    tb[NFTA_EXTHDR_TYPE] == NULL || +	    tb[NFTA_EXTHDR_OFFSET] == NULL || +	    tb[NFTA_EXTHDR_LEN] == NULL) +		return -EINVAL; + +	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); +	priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); +	priv->len    = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); +	if (priv->len == 0 || +	    priv->len > FIELD_SIZEOF(struct nft_data, data)) +		return -EINVAL; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_exthdr *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_exthdr_type; +static const struct nft_expr_ops nft_exthdr_ops = { +	.type		= &nft_exthdr_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), +	.eval		= nft_exthdr_eval, +	.init		= nft_exthdr_init, +	.dump		= nft_exthdr_dump, +}; + +static struct nft_expr_type nft_exthdr_type __read_mostly = { +	.name		= "exthdr", +	.ops		= &nft_exthdr_ops, +	.policy		= nft_exthdr_policy, +	.maxattr	= NFTA_EXTHDR_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_exthdr_module_init(void) +{ +	return nft_register_expr(&nft_exthdr_type); +} + +static void __exit nft_exthdr_module_exit(void) +{ +	nft_unregister_expr(&nft_exthdr_type); +} + +module_init(nft_exthdr_module_init); +module_exit(nft_exthdr_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("exthdr"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c 
new file mode 100644 index 00000000000..4080ed6a072 --- /dev/null +++ b/net/netfilter/nft_hash.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/log2.h> +#include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/vmalloc.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +#define NFT_HASH_MIN_SIZE	4UL + +struct nft_hash { +	struct nft_hash_table __rcu	*tbl; +}; + +struct nft_hash_table { +	unsigned int			size; +	struct nft_hash_elem __rcu	*buckets[]; +}; + +struct nft_hash_elem { +	struct nft_hash_elem __rcu	*next; +	struct nft_data			key; +	struct nft_data			data[]; +}; + +#define nft_hash_for_each_entry(i, head) \ +	for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) +#define nft_hash_for_each_entry_rcu(i, head) \ +	for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) + +static u32 nft_hash_rnd __read_mostly; +static bool nft_hash_rnd_initted __read_mostly; + +static unsigned int nft_hash_data(const struct nft_data *data, +				  unsigned int hsize, unsigned int len) +{ +	unsigned int h; + +	h = jhash(data->data, len, nft_hash_rnd); +	return h & (hsize - 1); +} + +static bool nft_hash_lookup(const struct nft_set *set, +			    const struct nft_data *key, +			    struct nft_data *data) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); +	const struct nft_hash_elem *he; +	unsigned int h; + +	h = nft_hash_data(key, tbl->size, set->klen); +	nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { +		if (nft_data_cmp(&he->key, key, set->klen)) +			continue; +		if (set->flags & NFT_SET_MAP) +			nft_data_copy(data, he->data); +		return true; +	} +	return false; +} + +static void nft_hash_tbl_free(const struct nft_hash_table *tbl) +{ +	kvfree(tbl); +} + +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ +	return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + +static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) +{ +	struct nft_hash_table *tbl; +	size_t size; + +	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); +	tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); +	if (tbl == NULL) +		tbl = vzalloc(size); +	if (tbl == NULL) +		return NULL; +	tbl->size = nbuckets; + +	return tbl; +} + +static void nft_hash_chain_unzip(const struct nft_set *set, +				 const struct nft_hash_table *ntbl, +				 struct nft_hash_table *tbl, unsigned int n) +{ +	struct nft_hash_elem *he, *last, *next; +	unsigned int h; + +	he = nft_dereference(tbl->buckets[n]); +	if (he == NULL) +		return; +	h = nft_hash_data(&he->key, ntbl->size, set->klen); + +	/* Find last element of first chain hashing to bucket h */ +	last = he; +	nft_hash_for_each_entry(he, he->next) { +		if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) +			break; +		last = he; +	} + +	/* Unlink first chain from the old table */ +	RCU_INIT_POINTER(tbl->buckets[n], last->next); + +	/* If end of chain reached, done */ +	if (he == NULL) +		return; + +	/* Find first element of second chain 
hashing to bucket h */ +	next = NULL; +	nft_hash_for_each_entry(he, he->next) { +		if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) +			continue; +		next = he; +		break; +	} + +	/* Link the two chains */ +	RCU_INIT_POINTER(last->next, next); +} + +static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) +{ +	struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; +	struct nft_hash_elem *he; +	unsigned int i, h; +	bool complete; + +	ntbl = nft_hash_tbl_alloc(tbl->size * 2); +	if (ntbl == NULL) +		return -ENOMEM; + +	/* Link new table's buckets to first element in the old table +	 * hashing to the new bucket. +	 */ +	for (i = 0; i < ntbl->size; i++) { +		h = i < tbl->size ? i : i - tbl->size; +		nft_hash_for_each_entry(he, tbl->buckets[h]) { +			if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) +				continue; +			RCU_INIT_POINTER(ntbl->buckets[i], he); +			break; +		} +	} + +	/* Publish new table */ +	rcu_assign_pointer(priv->tbl, ntbl); + +	/* Unzip interleaved hash chains */ +	do { +		/* Wait for readers to use new table/unzipped chains */ +		synchronize_rcu(); + +		complete = true; +		for (i = 0; i < tbl->size; i++) { +			nft_hash_chain_unzip(set, ntbl, tbl, i); +			if (tbl->buckets[i] != NULL) +				complete = false; +		} +	} while (!complete); + +	nft_hash_tbl_free(tbl); +	return 0; +} + +static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) +{ +	struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; +	struct nft_hash_elem __rcu **pprev; +	unsigned int i; + +	ntbl = nft_hash_tbl_alloc(tbl->size / 2); +	if (ntbl == NULL) +		return -ENOMEM; + +	for (i = 0; i < ntbl->size; i++) { +		ntbl->buckets[i] = tbl->buckets[i]; + +		for (pprev = &ntbl->buckets[i]; *pprev != NULL; +		     pprev = &nft_dereference(*pprev)->next) +			; +		RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); +	} + +	/* Publish new table */ +	rcu_assign_pointer(priv->tbl, ntbl); +	synchronize_rcu(); + +	nft_hash_tbl_free(tbl); +	return 0; +} + +static int nft_hash_insert(const struct nft_set *set, +			   const struct nft_set_elem *elem) +{ +	struct nft_hash *priv = nft_set_priv(set); +	struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem *he; +	unsigned int size, h; + +	if (elem->flags != 0) +		return -EINVAL; + +	size = sizeof(*he); +	if (set->flags & NFT_SET_MAP) +		size += sizeof(he->data[0]); + +	he = kzalloc(size, GFP_KERNEL); +	if (he == NULL) +		return -ENOMEM; + +	nft_data_copy(&he->key, &elem->key); +	if (set->flags & NFT_SET_MAP) +		nft_data_copy(he->data, &elem->data); + +	h = nft_hash_data(&he->key, tbl->size, set->klen); +	RCU_INIT_POINTER(he->next, tbl->buckets[h]); +	rcu_assign_pointer(tbl->buckets[h], he); + +	/* Expand table when exceeding 75% load */ +	if (set->nelems + 1 > tbl->size / 4 * 3) +		nft_hash_tbl_expand(set, priv); + +	return 0; +} + +static void nft_hash_elem_destroy(const struct nft_set *set, +				  struct nft_hash_elem *he) +{ +	nft_data_uninit(&he->key, NFT_DATA_VALUE); +	if (set->flags & NFT_SET_MAP) +		nft_data_uninit(he->data, set->dtype); +	kfree(he); +} + +static void nft_hash_remove(const struct nft_set *set, +			    const struct nft_set_elem *elem) +{ +	struct nft_hash *priv = nft_set_priv(set); +	struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem *he, __rcu **pprev; + +	pprev = elem->cookie; +	he = nft_dereference((*pprev)); + +	RCU_INIT_POINTER(*pprev, he->next); +	synchronize_rcu(); +	kfree(he); + +	/* Shrink table beneath 30% load */ +	if 
(set->nelems - 1 < tbl->size * 3 / 10 && +	    tbl->size > NFT_HASH_MIN_SIZE) +		nft_hash_tbl_shrink(set, priv); +} + +static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem __rcu * const *pprev; +	struct nft_hash_elem *he; +	unsigned int h; + +	h = nft_hash_data(&elem->key, tbl->size, set->klen); +	pprev = &tbl->buckets[h]; +	nft_hash_for_each_entry(he, tbl->buckets[h]) { +		if (nft_data_cmp(&he->key, &elem->key, set->klen)) { +			pprev = &he->next; +			continue; +		} + +		elem->cookie = (void *)pprev; +		elem->flags = 0; +		if (set->flags & NFT_SET_MAP) +			nft_data_copy(&elem->data, he->data); +		return 0; +	} +	return -ENOENT; +} + +static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, +			  struct nft_set_iter *iter) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	const struct nft_hash_elem *he; +	struct nft_set_elem elem; +	unsigned int i; + +	for (i = 0; i < tbl->size; i++) { +		nft_hash_for_each_entry(he, tbl->buckets[i]) { +			if (iter->count < iter->skip) +				goto cont; + +			memcpy(&elem.key, &he->key, sizeof(elem.key)); +			if (set->flags & NFT_SET_MAP) +				memcpy(&elem.data, he->data, sizeof(elem.data)); +			elem.flags = 0; + +			iter->err = iter->fn(ctx, set, iter, &elem); +			if (iter->err < 0) +				return; +cont: +			iter->count++; +		} +	} +} + +static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +{ +	return sizeof(struct nft_hash); +} + +static int nft_hash_init(const struct nft_set *set, +			 const struct nft_set_desc *desc, +			 const struct nlattr * const tb[]) +{ +	struct nft_hash *priv = nft_set_priv(set); +	struct nft_hash_table *tbl; +	unsigned int size; + +	if (unlikely(!nft_hash_rnd_initted)) { +		get_random_bytes(&nft_hash_rnd, 4); +		nft_hash_rnd_initted = true; +	} + +	size = NFT_HASH_MIN_SIZE; +	if (desc->size) +		size = nft_hash_tbl_size(desc->size); + +	tbl = nft_hash_tbl_alloc(size); +	if (tbl == NULL) +		return -ENOMEM; +	RCU_INIT_POINTER(priv->tbl, tbl); +	return 0; +} + +static void nft_hash_destroy(const struct nft_set *set) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem *he, *next; +	unsigned int i; + +	for (i = 0; i < tbl->size; i++) { +		for (he = nft_dereference(tbl->buckets[i]); he != NULL; +		     he = next) { +			next = nft_dereference(he->next); +			nft_hash_elem_destroy(set, he); +		} +	} +	kfree(tbl); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, +			      struct nft_set_estimate *est) +{ +	unsigned int esize; + +	esize = sizeof(struct nft_hash_elem); +	if (features & NFT_SET_MAP) +		esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + +	if (desc->size) { +		est->size = sizeof(struct nft_hash) + +			    nft_hash_tbl_size(desc->size) * +			    sizeof(struct nft_hash_elem *) + +			    desc->size * esize; +	} else { +		/* Resizing happens when the load drops below 30% or goes +		 * above 75%. The average of 52.5% load (approximated by 50%) +		 * is used for the size estimation of the hash buckets, +		 * meaning we calculate two buckets per element. 
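+		 * The per-element cost used below is therefore the element
+		 * size (esize) plus two bucket pointers.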
+		 */ +		est->size = esize + 2 * sizeof(struct nft_hash_elem *); +	} + +	est->class = NFT_SET_CLASS_O_1; + +	return true; +} + +static struct nft_set_ops nft_hash_ops __read_mostly = { +	.privsize       = nft_hash_privsize, +	.estimate	= nft_hash_estimate, +	.init		= nft_hash_init, +	.destroy	= nft_hash_destroy, +	.get		= nft_hash_get, +	.insert		= nft_hash_insert, +	.remove		= nft_hash_remove, +	.lookup		= nft_hash_lookup, +	.walk		= nft_hash_walk, +	.features	= NFT_SET_MAP, +	.owner		= THIS_MODULE, +}; + +static int __init nft_hash_module_init(void) +{ +	return nft_register_set(&nft_hash_ops); +} + +static void __exit nft_hash_module_exit(void) +{ +	nft_unregister_set(&nft_hash_ops); +} + +module_init(nft_hash_module_init); +module_exit(nft_hash_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c new file mode 100644 index 00000000000..810385eb724 --- /dev/null +++ b/net/netfilter/nft_immediate.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_immediate_expr { +	struct nft_data		data; +	enum nft_registers	dreg:8; +	u8			dlen; +}; + +static void nft_immediate_eval(const struct nft_expr *expr, +			       struct nft_data data[NFT_REG_MAX + 1], +			       const struct nft_pktinfo *pkt) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); + +	nft_data_copy(&data[priv->dreg], &priv->data); +} + +static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { +	[NFTA_IMMEDIATE_DREG]	= { .type = NLA_U32 }, +	[NFTA_IMMEDIATE_DATA]	= { .type = NLA_NESTED }, +}; + +static int nft_immediate_init(const struct nft_ctx *ctx, +			      const struct nft_expr *expr, +			      const struct nlattr * const tb[]) +{ +	struct nft_immediate_expr *priv = nft_expr_priv(expr); +	struct nft_data_desc desc; +	int err; + +	if (tb[NFTA_IMMEDIATE_DREG] == NULL || +	    tb[NFTA_IMMEDIATE_DATA] == NULL) +		return -EINVAL; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; + +	err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); +	if (err < 0) +		return err; +	priv->dlen = desc.len; + +	err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type); +	if (err < 0) +		goto err1; + +	return 0; + +err1: +	nft_data_uninit(&priv->data, desc.type); +	return err; +} + +static void nft_immediate_destroy(const struct nft_ctx *ctx, +				  const struct nft_expr *expr) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); +	return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); +} + +static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg))) +		goto nla_put_failure; + +	return nft_data_dump(skb, 
NFTA_IMMEDIATE_DATA, &priv->data, +			     nft_dreg_to_type(priv->dreg), priv->dlen); + +nla_put_failure: +	return -1; +} + +static int nft_immediate_validate(const struct nft_ctx *ctx, +				  const struct nft_expr *expr, +				  const struct nft_data **data) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); + +	if (priv->dreg == NFT_REG_VERDICT) +		*data = &priv->data; + +	return 0; +} + +static struct nft_expr_type nft_imm_type; +static const struct nft_expr_ops nft_imm_ops = { +	.type		= &nft_imm_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), +	.eval		= nft_immediate_eval, +	.init		= nft_immediate_init, +	.destroy	= nft_immediate_destroy, +	.dump		= nft_immediate_dump, +	.validate	= nft_immediate_validate, +}; + +static struct nft_expr_type nft_imm_type __read_mostly = { +	.name		= "immediate", +	.ops		= &nft_imm_ops, +	.policy		= nft_immediate_policy, +	.maxattr	= NFTA_IMMEDIATE_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_immediate_module_init(void) +{ +	return nft_register_expr(&nft_imm_type); +} + +void nft_immediate_module_exit(void) +{ +	nft_unregister_expr(&nft_imm_type); +} diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c new file mode 100644 index 00000000000..85da5bd02f6 --- /dev/null +++ b/net/netfilter/nft_limit.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(limit_lock); + +struct nft_limit { +	u64		tokens; +	u64		rate; +	u64		unit; +	unsigned long	stamp; +}; + +static void nft_limit_eval(const struct nft_expr *expr, +			   struct nft_data data[NFT_REG_MAX + 1], +			   const struct nft_pktinfo *pkt) +{ +	struct nft_limit *priv = nft_expr_priv(expr); + +	spin_lock_bh(&limit_lock); +	if (time_after_eq(jiffies, priv->stamp)) { +		priv->tokens = priv->rate; +		priv->stamp = jiffies + priv->unit * HZ; +	} + +	if (priv->tokens >= 1) { +		priv->tokens--; +		spin_unlock_bh(&limit_lock); +		return; +	} +	spin_unlock_bh(&limit_lock); + +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { +	[NFTA_LIMIT_RATE]	= { .type = NLA_U64 }, +	[NFTA_LIMIT_UNIT]	= { .type = NLA_U64 }, +}; + +static int nft_limit_init(const struct nft_ctx *ctx, +			  const struct nft_expr *expr, +			  const struct nlattr * const tb[]) +{ +	struct nft_limit *priv = nft_expr_priv(expr); + +	if (tb[NFTA_LIMIT_RATE] == NULL || +	    tb[NFTA_LIMIT_UNIT] == NULL) +		return -EINVAL; + +	priv->rate   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); +	priv->unit   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); +	priv->stamp  = jiffies + priv->unit * HZ; +	priv->tokens = priv->rate; +	return 0; +} + +static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_limit *priv = nft_expr_priv(expr); + +	if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) +		goto nla_put_failure; +	if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) +		goto nla_put_failure; +	
return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_limit_type; +static const struct nft_expr_ops nft_limit_ops = { +	.type		= &nft_limit_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_limit)), +	.eval		= nft_limit_eval, +	.init		= nft_limit_init, +	.dump		= nft_limit_dump, +}; + +static struct nft_expr_type nft_limit_type __read_mostly = { +	.name		= "limit", +	.ops		= &nft_limit_ops, +	.policy		= nft_limit_policy, +	.maxattr	= NFTA_LIMIT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_limit_module_init(void) +{ +	return nft_register_expr(&nft_limit_type); +} + +static void __exit nft_limit_module_exit(void) +{ +	nft_unregister_expr(&nft_limit_type); +} + +module_init(nft_limit_module_init); +module_exit(nft_limit_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("limit"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c new file mode 100644 index 00000000000..10cfb156cdf --- /dev/null +++ b/net/netfilter/nft_log.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netdevice.h> + +static const char *nft_log_null_prefix = ""; + +struct nft_log { +	struct nf_loginfo	loginfo; +	char			*prefix; +}; + +static void nft_log_eval(const struct nft_expr *expr, +			 struct nft_data data[NFT_REG_MAX + 1], +			 const struct nft_pktinfo *pkt) +{ +	const struct nft_log *priv = nft_expr_priv(expr); +	struct net *net = dev_net(pkt->in ? 
pkt->in : pkt->out); + +	nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, +		      pkt->out, &priv->loginfo, "%s", priv->prefix); +} + +static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { +	[NFTA_LOG_GROUP]	= { .type = NLA_U16 }, +	[NFTA_LOG_PREFIX]	= { .type = NLA_STRING }, +	[NFTA_LOG_SNAPLEN]	= { .type = NLA_U32 }, +	[NFTA_LOG_QTHRESHOLD]	= { .type = NLA_U16 }, +}; + +static int nft_log_init(const struct nft_ctx *ctx, +			const struct nft_expr *expr, +			const struct nlattr * const tb[]) +{ +	struct nft_log *priv = nft_expr_priv(expr); +	struct nf_loginfo *li = &priv->loginfo; +	const struct nlattr *nla; + +	nla = tb[NFTA_LOG_PREFIX]; +	if (nla != NULL) { +		priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); +		if (priv->prefix == NULL) +			return -ENOMEM; +		nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); +	} else +		priv->prefix = (char *)nft_log_null_prefix; + +	li->type = NF_LOG_TYPE_ULOG; +	if (tb[NFTA_LOG_GROUP] != NULL) +		li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); + +	if (tb[NFTA_LOG_SNAPLEN] != NULL) +		li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); +	if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { +		li->u.ulog.qthreshold = +			ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); +	} + +	return 0; +} + +static void nft_log_destroy(const struct nft_ctx *ctx, +			    const struct nft_expr *expr) +{ +	struct nft_log *priv = nft_expr_priv(expr); + +	if (priv->prefix != nft_log_null_prefix) +		kfree(priv->prefix); +} + +static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_log *priv = nft_expr_priv(expr); +	const struct nf_loginfo *li = &priv->loginfo; + +	if (priv->prefix != nft_log_null_prefix) +		if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) +			goto nla_put_failure; +	if (li->u.ulog.group) +		if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) +			goto nla_put_failure; +	if (li->u.ulog.copy_len) +		if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, +				 htonl(li->u.ulog.copy_len))) +			goto nla_put_failure; +	if (li->u.ulog.qthreshold) +		if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, +				 htons(li->u.ulog.qthreshold))) +			goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_log_type; +static const struct nft_expr_ops nft_log_ops = { +	.type		= &nft_log_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_log)), +	.eval		= nft_log_eval, +	.init		= nft_log_init, +	.destroy	= nft_log_destroy, +	.dump		= nft_log_dump, +}; + +static struct nft_expr_type nft_log_type __read_mostly = { +	.name		= "log", +	.ops		= &nft_log_ops, +	.policy		= nft_log_policy, +	.maxattr	= NFTA_LOG_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_log_module_init(void) +{ +	return nft_register_expr(&nft_log_type); +} + +static void __exit nft_log_module_exit(void) +{ +	nft_unregister_expr(&nft_log_type); +} + +module_init(nft_log_module_init); +module_exit(nft_log_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("log"); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c new file mode 100644 index 00000000000..6404a726d17 --- /dev/null +++ b/net/netfilter/nft_lookup.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> + +struct nft_lookup { +	struct nft_set			*set; +	enum nft_registers		sreg:8; +	enum nft_registers		dreg:8; +	struct nft_set_binding		binding; +}; + +static void nft_lookup_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	const struct nft_lookup *priv = nft_expr_priv(expr); +	const struct nft_set *set = priv->set; + +	if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg])) +		return; +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { +	[NFTA_LOOKUP_SET]	= { .type = NLA_STRING }, +	[NFTA_LOOKUP_SREG]	= { .type = NLA_U32 }, +	[NFTA_LOOKUP_DREG]	= { .type = NLA_U32 }, +}; + +static int nft_lookup_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_lookup *priv = nft_expr_priv(expr); +	struct nft_set *set; +	int err; + +	if (tb[NFTA_LOOKUP_SET] == NULL || +	    tb[NFTA_LOOKUP_SREG] == NULL) +		return -EINVAL; + +	set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); +	if (IS_ERR(set)) { +		if (tb[NFTA_LOOKUP_SET_ID]) { +			set = nf_tables_set_lookup_byid(ctx->net, +							tb[NFTA_LOOKUP_SET_ID]); +		} +		if (IS_ERR(set)) +			return PTR_ERR(set); +	} + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	if (tb[NFTA_LOOKUP_DREG] != NULL) { +		if (!(set->flags & NFT_SET_MAP)) +			return -EINVAL; + +		priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG])); +		err = nft_validate_output_register(priv->dreg); +		if (err < 0) +			return err; + +		if (priv->dreg == NFT_REG_VERDICT) { +			if (set->dtype != NFT_DATA_VERDICT) +				return -EINVAL; +		} else if (set->dtype == NFT_DATA_VERDICT) +			return -EINVAL; +	} else if (set->flags & NFT_SET_MAP) +		return -EINVAL; + +	err = nf_tables_bind_set(ctx, set, &priv->binding); +	if (err < 0) +		return err; + +	priv->set = set; +	return 0; +} + +static void nft_lookup_destroy(const struct nft_ctx *ctx, +			       const struct nft_expr *expr) +{ +	struct nft_lookup *priv = nft_expr_priv(expr); + +	nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_lookup *priv = nft_expr_priv(expr); + +	if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (priv->set->flags & NFT_SET_MAP) +		if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg))) +			goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_lookup_type; +static const struct nft_expr_ops nft_lookup_ops = { +	.type		= &nft_lookup_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)), +	.eval		= nft_lookup_eval, +	.init		= nft_lookup_init, +	.destroy	= nft_lookup_destroy, +	.dump		= nft_lookup_dump, +}; + +static struct nft_expr_type nft_lookup_type __read_mostly = { +	.name		= "lookup", +	.ops		= &nft_lookup_ops, +	.policy		= nft_lookup_policy, +	
.maxattr	= NFTA_LOOKUP_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_lookup_module_init(void) +{ +	return nft_register_expr(&nft_lookup_type); +} + +void nft_lookup_module_exit(void) +{ +	nft_unregister_expr(&nft_lookup_type); +} diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c new file mode 100644 index 00000000000..852b178c6ae --- /dev/null +++ b/net/netfilter/nft_meta.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/tcp_states.h> /* for TCP_TIME_WAIT */ +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> + +void nft_meta_get_eval(const struct nft_expr *expr, +		       struct nft_data data[NFT_REG_MAX + 1], +		       const struct nft_pktinfo *pkt) +{ +	const struct nft_meta *priv = nft_expr_priv(expr); +	const struct sk_buff *skb = pkt->skb; +	const struct net_device *in = pkt->in, *out = pkt->out; +	struct nft_data *dest = &data[priv->dreg]; + +	switch (priv->key) { +	case NFT_META_LEN: +		dest->data[0] = skb->len; +		break; +	case NFT_META_PROTOCOL: +		*(__be16 *)dest->data = skb->protocol; +		break; +	case NFT_META_NFPROTO: +		dest->data[0] = pkt->ops->pf; +		break; +	case NFT_META_L4PROTO: +		dest->data[0] = pkt->tprot; +		break; +	case NFT_META_PRIORITY: +		dest->data[0] = skb->priority; +		break; +	case NFT_META_MARK: +		dest->data[0] = skb->mark; +		break; +	case NFT_META_IIF: +		if (in == NULL) +			goto err; +		dest->data[0] = in->ifindex; +		break; +	case NFT_META_OIF: +		if (out == NULL) +			goto err; +		dest->data[0] = out->ifindex; +		break; +	case NFT_META_IIFNAME: +		if (in == NULL) +			goto err; +		strncpy((char *)dest->data, in->name, sizeof(dest->data)); +		break; +	case NFT_META_OIFNAME: +		if (out == NULL) +			goto err; +		strncpy((char *)dest->data, out->name, sizeof(dest->data)); +		break; +	case NFT_META_IIFTYPE: +		if (in == NULL) +			goto err; +		*(u16 *)dest->data = in->type; +		break; +	case NFT_META_OIFTYPE: +		if (out == NULL) +			goto err; +		*(u16 *)dest->data = out->type; +		break; +	case NFT_META_SKUID: +		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) +			goto err; + +		read_lock_bh(&skb->sk->sk_callback_lock); +		if (skb->sk->sk_socket == NULL || +		    skb->sk->sk_socket->file == NULL) { +			read_unlock_bh(&skb->sk->sk_callback_lock); +			goto err; +		} + +		dest->data[0] = +			from_kuid_munged(&init_user_ns, +				skb->sk->sk_socket->file->f_cred->fsuid); +		read_unlock_bh(&skb->sk->sk_callback_lock); +		break; +	case NFT_META_SKGID: +		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) +			goto err; + +		read_lock_bh(&skb->sk->sk_callback_lock); +		if (skb->sk->sk_socket == NULL || +		    skb->sk->sk_socket->file == NULL) { +			read_unlock_bh(&skb->sk->sk_callback_lock); +			goto err; +		} +		dest->data[0] = +			from_kgid_munged(&init_user_ns, +				 skb->sk->sk_socket->file->f_cred->fsgid); +		read_unlock_bh(&skb->sk->sk_callback_lock); +		break; +#ifdef CONFIG_IP_ROUTE_CLASSID +	case NFT_META_RTCLASSID: { +		const struct dst_entry *dst = 
skb_dst(skb); + +		if (dst == NULL) +			goto err; +		dest->data[0] = dst->tclassid; +		break; +	} +#endif +#ifdef CONFIG_NETWORK_SECMARK +	case NFT_META_SECMARK: +		dest->data[0] = skb->secmark; +		break; +#endif +	default: +		WARN_ON(1); +		goto err; +	} +	return; + +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_get_eval); + +void nft_meta_set_eval(const struct nft_expr *expr, +		       struct nft_data data[NFT_REG_MAX + 1], +		       const struct nft_pktinfo *pkt) +{ +	const struct nft_meta *meta = nft_expr_priv(expr); +	struct sk_buff *skb = pkt->skb; +	u32 value = data[meta->sreg].data[0]; + +	switch (meta->key) { +	case NFT_META_MARK: +		skb->mark = value; +		break; +	case NFT_META_PRIORITY: +		skb->priority = value; +		break; +	case NFT_META_NFTRACE: +		skb->nf_trace = 1; +		break; +	default: +		WARN_ON(1); +	} +} +EXPORT_SYMBOL_GPL(nft_meta_set_eval); + +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +	[NFTA_META_DREG]	= { .type = NLA_U32 }, +	[NFTA_META_KEY]		= { .type = NLA_U32 }, +	[NFTA_META_SREG]	= { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_meta_policy); + +int nft_meta_get_init(const struct nft_ctx *ctx, +		      const struct nft_expr *expr, +		      const struct nlattr * const tb[]) +{ +	struct nft_meta *priv = nft_expr_priv(expr); +	int err; + +	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); +	switch (priv->key) { +	case NFT_META_LEN: +	case NFT_META_PROTOCOL: +	case NFT_META_NFPROTO: +	case NFT_META_L4PROTO: +	case NFT_META_PRIORITY: +	case NFT_META_MARK: +	case NFT_META_IIF: +	case NFT_META_OIF: +	case NFT_META_IIFNAME: +	case NFT_META_OIFNAME: +	case NFT_META_IIFTYPE: +	case NFT_META_OIFTYPE: +	case NFT_META_SKUID: +	case NFT_META_SKGID: +#ifdef CONFIG_IP_ROUTE_CLASSID +	case NFT_META_RTCLASSID: +#endif +#ifdef CONFIG_NETWORK_SECMARK +	case NFT_META_SECMARK: +#endif +		break; +	default: +		return -EOPNOTSUPP; +	} + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; + +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_get_init); + +int nft_meta_set_init(const struct nft_ctx *ctx, +		      const struct nft_expr *expr, +		      const struct nlattr * const tb[]) +{ +	struct nft_meta *priv = nft_expr_priv(expr); +	int err; + +	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); +	switch (priv->key) { +	case NFT_META_MARK: +	case NFT_META_PRIORITY: +	case NFT_META_NFTRACE: +		break; +	default: +		return -EOPNOTSUPP; +	} + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_set_init); + +int nft_meta_get_dump(struct sk_buff *skb, +		      const struct nft_expr *expr) +{ +	const struct nft_meta *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_get_dump); + +int nft_meta_set_dump(struct sk_buff *skb, +		      const struct nft_expr *expr) +{ +	const struct nft_meta *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg))) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	
return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_set_dump); + +static struct nft_expr_type nft_meta_type; +static const struct nft_expr_ops nft_meta_get_ops = { +	.type		= &nft_meta_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)), +	.eval		= nft_meta_get_eval, +	.init		= nft_meta_get_init, +	.dump		= nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_set_ops = { +	.type		= &nft_meta_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)), +	.eval		= nft_meta_set_eval, +	.init		= nft_meta_set_init, +	.dump		= nft_meta_set_dump, +}; + +static const struct nft_expr_ops * +nft_meta_select_ops(const struct nft_ctx *ctx, +		    const struct nlattr * const tb[]) +{ +	if (tb[NFTA_META_KEY] == NULL) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_META_DREG]) +		return &nft_meta_get_ops; + +	if (tb[NFTA_META_SREG]) +		return &nft_meta_set_ops; + +	return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_type __read_mostly = { +	.name		= "meta", +	.select_ops	= &nft_meta_select_ops, +	.policy		= nft_meta_policy, +	.maxattr	= NFTA_META_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_meta_module_init(void) +{ +	return nft_register_expr(&nft_meta_type); +} + +static void __exit nft_meta_module_exit(void) +{ +	nft_unregister_expr(&nft_meta_type); +} + +module_init(nft_meta_module_init); +module_exit(nft_meta_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c new file mode 100644 index 00000000000..79ff58cd36d --- /dev/null +++ b/net/netfilter/nft_nat.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/string.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +struct nft_nat { +	enum nft_registers      sreg_addr_min:8; +	enum nft_registers      sreg_addr_max:8; +	enum nft_registers      sreg_proto_min:8; +	enum nft_registers      sreg_proto_max:8; +	enum nf_nat_manip_type  type:8; +	u8			family; +}; + +static void nft_nat_eval(const struct nft_expr *expr, +			 struct nft_data data[NFT_REG_MAX + 1], +			 const struct nft_pktinfo *pkt) +{ +	const struct nft_nat *priv = nft_expr_priv(expr); +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); +	struct nf_nat_range range; + +	memset(&range, 0, sizeof(range)); +	if (priv->sreg_addr_min) { +		if (priv->family == AF_INET) { +			range.min_addr.ip = (__force __be32) +					data[priv->sreg_addr_min].data[0]; +			range.max_addr.ip = (__force __be32) +					data[priv->sreg_addr_max].data[0]; + +		} else { +			memcpy(range.min_addr.ip6, +			       data[priv->sreg_addr_min].data, +			       sizeof(struct nft_data)); +			memcpy(range.max_addr.ip6, +			       data[priv->sreg_addr_max].data, +			       sizeof(struct nft_data)); +		} +		range.flags |= NF_NAT_RANGE_MAP_IPS; +	} + +	if (priv->sreg_proto_min) { +		range.min_proto.all = (__force __be16) +					data[priv->sreg_proto_min].data[0]; +		range.max_proto.all = (__force __be16) +					data[priv->sreg_proto_max].data[0]; +		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +	} + +	data[NFT_REG_VERDICT].verdict = +		nf_nat_setup_info(ct, &range, priv->type); +} + +static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { +	[NFTA_NAT_TYPE]		 = { .type = NLA_U32 }, +	[NFTA_NAT_FAMILY]	 = { .type = NLA_U32 }, +	[NFTA_NAT_REG_ADDR_MIN]	 = { .type = NLA_U32 }, +	[NFTA_NAT_REG_ADDR_MAX]	 = { .type = NLA_U32 }, +	[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, +	[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, +}; + +static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +			const struct nlattr * const tb[]) +{ +	struct nft_nat *priv = nft_expr_priv(expr); +	u32 family; +	int err; + +	if (tb[NFTA_NAT_TYPE] == NULL) +		return -EINVAL; + +	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { +	case NFT_NAT_SNAT: +		priv->type = NF_NAT_MANIP_SRC; +		break; +	case NFT_NAT_DNAT: +		priv->type = NF_NAT_MANIP_DST; +		break; +	default: +		return -EINVAL; +	} + +	if (tb[NFTA_NAT_FAMILY] == NULL) +		return -EINVAL; + +	family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); +	if (family != AF_INET && family != AF_INET6) +		return -EAFNOSUPPORT; +	if (family != ctx->afi->family) +		return -EOPNOTSUPP; +	priv->family = family; + +	if (tb[NFTA_NAT_REG_ADDR_MIN]) { +		priv->sreg_addr_min = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_ADDR_MIN])); +		err = nft_validate_input_register(priv->sreg_addr_min); +		if (err < 0) +			return err; +	} + +	if (tb[NFTA_NAT_REG_ADDR_MAX]) { +		priv->sreg_addr_max = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_ADDR_MAX])); +		err = nft_validate_input_register(priv->sreg_addr_max); +		if (err < 0) +			return err; +	} else +		priv->sreg_addr_max = priv->sreg_addr_min; + +	if 
(tb[NFTA_NAT_REG_PROTO_MIN]) { +		priv->sreg_proto_min = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_PROTO_MIN])); +		err = nft_validate_input_register(priv->sreg_proto_min); +		if (err < 0) +			return err; +	} + +	if (tb[NFTA_NAT_REG_PROTO_MAX]) { +		priv->sreg_proto_max = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_PROTO_MAX])); +		err = nft_validate_input_register(priv->sreg_proto_max); +		if (err < 0) +			return err; +	} else +		priv->sreg_proto_max = priv->sreg_proto_min; + +	return 0; +} + +static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_nat *priv = nft_expr_priv(expr); + +	switch (priv->type) { +	case NF_NAT_MANIP_SRC: +		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) +			goto nla_put_failure; +		break; +	case NF_NAT_MANIP_DST: +		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) +			goto nla_put_failure; +		break; +	} + +	if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) +		goto nla_put_failure; +	if (nla_put_be32(skb, +			 NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min))) +		goto nla_put_failure; +	if (nla_put_be32(skb, +			 NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) +		goto nla_put_failure; +	if (priv->sreg_proto_min) { +		if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, +				 htonl(priv->sreg_proto_min))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, +				 htonl(priv->sreg_proto_max))) +			goto nla_put_failure; +	} +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_nat_type; +static const struct nft_expr_ops nft_nat_ops = { +	.type           = &nft_nat_type, +	.size           = NFT_EXPR_SIZE(sizeof(struct nft_nat)), +	.eval           = nft_nat_eval, +	.init           = nft_nat_init, +	.dump           = nft_nat_dump, +}; + +static struct nft_expr_type nft_nat_type __read_mostly = { +	.name           = "nat", +	.ops            = &nft_nat_ops, +	.policy         = nft_nat_policy, +	.maxattr        = NFTA_NAT_MAX, +	.owner          = THIS_MODULE, +}; + +static int __init nft_nat_module_init(void) +{ +	return nft_register_expr(&nft_nat_type); +} + +static void __exit nft_nat_module_exit(void) +{ +	nft_unregister_expr(&nft_nat_type); +} + +module_init(nft_nat_module_init); +module_exit(nft_nat_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_EXPR("nat"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c new file mode 100644 index 00000000000..85daa84bfdf --- /dev/null +++ b/net/netfilter/nft_payload.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +static void nft_payload_eval(const struct nft_expr *expr, +			     struct nft_data data[NFT_REG_MAX + 1], +			     const struct nft_pktinfo *pkt) +{ +	const struct nft_payload *priv = nft_expr_priv(expr); +	const struct sk_buff *skb = pkt->skb; +	struct nft_data *dest = &data[priv->dreg]; +	int offset; + +	switch (priv->base) { +	case NFT_PAYLOAD_LL_HEADER: +		if (!skb_mac_header_was_set(skb)) +			goto err; +		offset = skb_mac_header(skb) - skb->data; +		break; +	case NFT_PAYLOAD_NETWORK_HEADER: +		offset = skb_network_offset(skb); +		break; +	case NFT_PAYLOAD_TRANSPORT_HEADER: +		offset = pkt->xt.thoff; +		break; +	default: +		BUG(); +	} +	offset += priv->offset; + +	if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0) +		goto err; +	return; +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { +	[NFTA_PAYLOAD_DREG]	= { .type = NLA_U32 }, +	[NFTA_PAYLOAD_BASE]	= { .type = NLA_U32 }, +	[NFTA_PAYLOAD_OFFSET]	= { .type = NLA_U32 }, +	[NFTA_PAYLOAD_LEN]	= { .type = NLA_U32 }, +}; + +static int nft_payload_init(const struct nft_ctx *ctx, +			    const struct nft_expr *expr, +			    const struct nlattr * const tb[]) +{ +	struct nft_payload *priv = nft_expr_priv(expr); +	int err; + +	priv->base   = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); +	priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); +	priv->len    = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_payload *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) || +	    nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || +	    nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || +	    nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_payload_type; +static const struct nft_expr_ops nft_payload_ops = { +	.type		= &nft_payload_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)), +	.eval		= nft_payload_eval, +	.init		= nft_payload_init, +	.dump		= nft_payload_dump, +}; + +const struct nft_expr_ops nft_payload_fast_ops = { +	.type		= &nft_payload_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)), +	.eval		= nft_payload_eval, +	.init		= nft_payload_init, +	.dump		= nft_payload_dump, +}; + +static const struct nft_expr_ops * +nft_payload_select_ops(const struct nft_ctx *ctx, +		       const struct nlattr * const tb[]) +{ +	enum nft_payload_bases base; +	unsigned int offset, len; + +	if (tb[NFTA_PAYLOAD_DREG] == NULL || +	    tb[NFTA_PAYLOAD_BASE] == NULL || +	    tb[NFTA_PAYLOAD_OFFSET] == NULL || +	    tb[NFTA_PAYLOAD_LEN] == NULL) +		return ERR_PTR(-EINVAL); + +	base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); +	switch (base) { +	case NFT_PAYLOAD_LL_HEADER: +	case NFT_PAYLOAD_NETWORK_HEADER: +	case 
NFT_PAYLOAD_TRANSPORT_HEADER: +		break; +	default: +		return ERR_PTR(-EOPNOTSUPP); +	} + +	offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); +	len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); +	if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) +		return ERR_PTR(-EINVAL); + +	if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && +	    base != NFT_PAYLOAD_LL_HEADER) +		return &nft_payload_fast_ops; +	else +		return &nft_payload_ops; +} + +static struct nft_expr_type nft_payload_type __read_mostly = { +	.name		= "payload", +	.select_ops	= nft_payload_select_ops, +	.policy		= nft_payload_policy, +	.maxattr	= NFTA_PAYLOAD_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_payload_module_init(void) +{ +	return nft_register_expr(&nft_payload_type); +} + +void nft_payload_module_exit(void) +{ +	nft_unregister_expr(&nft_payload_type); +} diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c new file mode 100644 index 00000000000..e8ae2f6bf23 --- /dev/null +++ b/net/netfilter/nft_queue.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code partly funded by OISF + * (http://www.openinfosecfoundation.org/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/jhash.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_queue.h> + +static u32 jhash_initval __read_mostly; + +struct nft_queue { +	u16	queuenum; +	u16	queues_total; +	u16	flags; +}; + +static void nft_queue_eval(const struct nft_expr *expr, +			   struct nft_data data[NFT_REG_MAX + 1], +			   const struct nft_pktinfo *pkt) +{ +	struct nft_queue *priv = nft_expr_priv(expr); +	u32 queue = priv->queuenum; +	u32 ret; + +	if (priv->queues_total > 1) { +		if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { +			int cpu = smp_processor_id(); + +			queue = priv->queuenum + cpu % priv->queues_total; +		} else { +			queue = nfqueue_hash(pkt->skb, queue, +					     priv->queues_total, pkt->ops->pf, +					     jhash_initval); +		} +	} + +	ret = NF_QUEUE_NR(queue); +	if (priv->flags & NFT_QUEUE_FLAG_BYPASS) +		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + +	data[NFT_REG_VERDICT].verdict = ret; +} + +static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { +	[NFTA_QUEUE_NUM]	= { .type = NLA_U16 }, +	[NFTA_QUEUE_TOTAL]	= { .type = NLA_U16 }, +	[NFTA_QUEUE_FLAGS]	= { .type = NLA_U16 }, +}; + +static int nft_queue_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_queue *priv = nft_expr_priv(expr); + +	if (tb[NFTA_QUEUE_NUM] == NULL) +		return -EINVAL; + +	init_hashrandom(&jhash_initval); +	priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); + +	if (tb[NFTA_QUEUE_TOTAL] != NULL) +		priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); +	if (tb[NFTA_QUEUE_FLAGS] != NULL) { +		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); +		if (priv->flags & ~NFT_QUEUE_FLAG_MASK) +			return -EINVAL; +	} +	return 0; +} + +static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_queue *priv = nft_expr_priv(expr); + +	if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || +	    
nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || +	    nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_queue_type; +static const struct nft_expr_ops nft_queue_ops = { +	.type		= &nft_queue_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_queue)), +	.eval		= nft_queue_eval, +	.init		= nft_queue_init, +	.dump		= nft_queue_dump, +}; + +static struct nft_expr_type nft_queue_type __read_mostly = { +	.name		= "queue", +	.ops		= &nft_queue_ops, +	.policy		= nft_queue_policy, +	.maxattr	= NFTA_QUEUE_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_queue_module_init(void) +{ +	return nft_register_expr(&nft_queue_type); +} + +static void __exit nft_queue_module_exit(void) +{ +	nft_unregister_expr(&nft_queue_type); +} + +module_init(nft_queue_module_init); +module_exit(nft_queue_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); +MODULE_ALIAS_NFT_EXPR("queue"); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c new file mode 100644 index 00000000000..e1836ff8819 --- /dev/null +++ b/net/netfilter/nft_rbtree.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(nft_rbtree_lock); + +struct nft_rbtree { +	struct rb_root		root; +}; + +struct nft_rbtree_elem { +	struct rb_node		node; +	u16			flags; +	struct nft_data		key; +	struct nft_data		data[]; +}; + +static bool nft_rbtree_lookup(const struct nft_set *set, +			      const struct nft_data *key, +			      struct nft_data *data) +{ +	const struct nft_rbtree *priv = nft_set_priv(set); +	const struct nft_rbtree_elem *rbe, *interval = NULL; +	const struct rb_node *parent = priv->root.rb_node; +	int d; + +	spin_lock_bh(&nft_rbtree_lock); +	while (parent != NULL) { +		rbe = rb_entry(parent, struct nft_rbtree_elem, node); + +		d = nft_data_cmp(&rbe->key, key, set->klen); +		if (d < 0) { +			parent = parent->rb_left; +			interval = rbe; +		} else if (d > 0) +			parent = parent->rb_right; +		else { +found: +			if (rbe->flags & NFT_SET_ELEM_INTERVAL_END) +				goto out; +			if (set->flags & NFT_SET_MAP) +				nft_data_copy(data, rbe->data); + +			spin_unlock_bh(&nft_rbtree_lock); +			return true; +		} +	} + +	if (set->flags & NFT_SET_INTERVAL && interval != NULL) { +		rbe = interval; +		goto found; +	} +out: +	spin_unlock_bh(&nft_rbtree_lock); +	return false; +} + +static void nft_rbtree_elem_destroy(const struct nft_set *set, +				    struct nft_rbtree_elem *rbe) +{ +	nft_data_uninit(&rbe->key, NFT_DATA_VALUE); +	if (set->flags & NFT_SET_MAP && +	    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +		nft_data_uninit(rbe->data, set->dtype); + +	kfree(rbe); +} + +static int __nft_rbtree_insert(const struct nft_set *set, +			       struct nft_rbtree_elem *new) +{ +	struct nft_rbtree *priv = nft_set_priv(set); +	struct nft_rbtree_elem *rbe; +	struct rb_node *parent, **p; +	int d; + 
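+	/* Standard rbtree descent: compare the new key against each node
+	 * and remember the parent so rb_link_node() can splice the new
+	 * element in; an equal key is rejected with -EEXIST.
+	 */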
+	parent = NULL; +	p = &priv->root.rb_node; +	while (*p != NULL) { +		parent = *p; +		rbe = rb_entry(parent, struct nft_rbtree_elem, node); +		d = nft_data_cmp(&rbe->key, &new->key, set->klen); +		if (d < 0) +			p = &parent->rb_left; +		else if (d > 0) +			p = &parent->rb_right; +		else +			return -EEXIST; +	} +	rb_link_node(&new->node, parent, p); +	rb_insert_color(&new->node, &priv->root); +	return 0; +} + +static int nft_rbtree_insert(const struct nft_set *set, +			     const struct nft_set_elem *elem) +{ +	struct nft_rbtree_elem *rbe; +	unsigned int size; +	int err; + +	size = sizeof(*rbe); +	if (set->flags & NFT_SET_MAP && +	    !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) +		size += sizeof(rbe->data[0]); + +	rbe = kzalloc(size, GFP_KERNEL); +	if (rbe == NULL) +		return -ENOMEM; + +	rbe->flags = elem->flags; +	nft_data_copy(&rbe->key, &elem->key); +	if (set->flags & NFT_SET_MAP && +	    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +		nft_data_copy(rbe->data, &elem->data); + +	spin_lock_bh(&nft_rbtree_lock); +	err = __nft_rbtree_insert(set, rbe); +	if (err < 0) +		kfree(rbe); + +	spin_unlock_bh(&nft_rbtree_lock); +	return err; +} + +static void nft_rbtree_remove(const struct nft_set *set, +			      const struct nft_set_elem *elem) +{ +	struct nft_rbtree *priv = nft_set_priv(set); +	struct nft_rbtree_elem *rbe = elem->cookie; + +	spin_lock_bh(&nft_rbtree_lock); +	rb_erase(&rbe->node, &priv->root); +	spin_unlock_bh(&nft_rbtree_lock); +	kfree(rbe); +} + +static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) +{ +	const struct nft_rbtree *priv = nft_set_priv(set); +	const struct rb_node *parent = priv->root.rb_node; +	struct nft_rbtree_elem *rbe; +	int d; + +	spin_lock_bh(&nft_rbtree_lock); +	while (parent != NULL) { +		rbe = rb_entry(parent, struct nft_rbtree_elem, node); + +		d = nft_data_cmp(&rbe->key, &elem->key, set->klen); +		if (d < 0) +			parent = parent->rb_left; +		else if (d > 0) +			parent = parent->rb_right; +		else { +			elem->cookie = rbe; +			if (set->flags & NFT_SET_MAP && +			    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +				nft_data_copy(&elem->data, rbe->data); +			elem->flags = rbe->flags; +			spin_unlock_bh(&nft_rbtree_lock); +			return 0; +		} +	} +	spin_unlock_bh(&nft_rbtree_lock); +	return -ENOENT; +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, +			    const struct nft_set *set, +			    struct nft_set_iter *iter) +{ +	const struct nft_rbtree *priv = nft_set_priv(set); +	const struct nft_rbtree_elem *rbe; +	struct nft_set_elem elem; +	struct rb_node *node; + +	spin_lock_bh(&nft_rbtree_lock); +	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { +		if (iter->count < iter->skip) +			goto cont; + +		rbe = rb_entry(node, struct nft_rbtree_elem, node); +		nft_data_copy(&elem.key, &rbe->key); +		if (set->flags & NFT_SET_MAP && +		    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +			nft_data_copy(&elem.data, rbe->data); +		elem.flags = rbe->flags; + +		iter->err = iter->fn(ctx, set, iter, &elem); +		if (iter->err < 0) { +			spin_unlock_bh(&nft_rbtree_lock); +			return; +		} +cont: +		iter->count++; +	} +	spin_unlock_bh(&nft_rbtree_lock); +} + +static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) +{ +	return sizeof(struct nft_rbtree); +} + +static int nft_rbtree_init(const struct nft_set *set, +			   const struct nft_set_desc *desc, +			   const struct nlattr * const nla[]) +{ +	struct nft_rbtree *priv = nft_set_priv(set); + +	priv->root = RB_ROOT; +	return 0; +} + +static void 
nft_rbtree_destroy(const struct nft_set *set) +{ +	struct nft_rbtree *priv = nft_set_priv(set); +	struct nft_rbtree_elem *rbe; +	struct rb_node *node; + +	spin_lock_bh(&nft_rbtree_lock); +	while ((node = priv->root.rb_node) != NULL) { +		rb_erase(node, &priv->root); +		rbe = rb_entry(node, struct nft_rbtree_elem, node); +		nft_rbtree_elem_destroy(set, rbe); +	} +	spin_unlock_bh(&nft_rbtree_lock); +} + +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, +				struct nft_set_estimate *est) +{ +	unsigned int nsize; + +	nsize = sizeof(struct nft_rbtree_elem); +	if (features & NFT_SET_MAP) +		nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + +	if (desc->size) +		est->size = sizeof(struct nft_rbtree) + desc->size * nsize; +	else +		est->size = nsize; + +	est->class = NFT_SET_CLASS_O_LOG_N; + +	return true; +} + +static struct nft_set_ops nft_rbtree_ops __read_mostly = { +	.privsize	= nft_rbtree_privsize, +	.estimate	= nft_rbtree_estimate, +	.init		= nft_rbtree_init, +	.destroy	= nft_rbtree_destroy, +	.insert		= nft_rbtree_insert, +	.remove		= nft_rbtree_remove, +	.get		= nft_rbtree_get, +	.lookup		= nft_rbtree_lookup, +	.walk		= nft_rbtree_walk, +	.features	= NFT_SET_INTERVAL | NFT_SET_MAP, +	.owner		= THIS_MODULE, +}; + +static int __init nft_rbtree_module_init(void) +{ +	return nft_register_set(&nft_rbtree_ops); +} + +static void __exit nft_rbtree_module_exit(void) +{ +	nft_unregister_set(&nft_rbtree_ops); +} + +module_init(nft_rbtree_module_init); +module_exit(nft_rbtree_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c new file mode 100644 index 00000000000..f3448c29644 --- /dev/null +++ b/net/netfilter/nft_reject.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { +	[NFTA_REJECT_TYPE]		= { .type = NLA_U32 }, +	[NFTA_REJECT_ICMP_CODE]		= { .type = NLA_U8 }, +}; +EXPORT_SYMBOL_GPL(nft_reject_policy); + +int nft_reject_init(const struct nft_ctx *ctx, +		    const struct nft_expr *expr, +		    const struct nlattr * const tb[]) +{ +	struct nft_reject *priv = nft_expr_priv(expr); + +	if (tb[NFTA_REJECT_TYPE] == NULL) +		return -EINVAL; + +	priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); +	switch (priv->type) { +	case NFT_REJECT_ICMP_UNREACH: +		if (tb[NFTA_REJECT_ICMP_CODE] == NULL) +			return -EINVAL; +		priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); +	case NFT_REJECT_TCP_RST: +		break; +	default: +		return -EINVAL; +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(nft_reject_init); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_reject *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) +		goto nla_put_failure; + +	switch (priv->type) { +	case NFT_REJECT_ICMP_UNREACH: +		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) +			goto nla_put_failure; +		break; +	} + +	return 0; + +nla_put_failure: +	return -1; +} +EXPORT_SYMBOL_GPL(nft_reject_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 00000000000..b718a52a465 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, +				 struct nft_data data[NFT_REG_MAX + 1], +				 const struct nft_pktinfo *pkt) +{ +	switch (pkt->ops->pf) { +	case NFPROTO_IPV4: +		return nft_reject_ipv4_eval(expr, data, pkt); +	case NFPROTO_IPV6: +		return nft_reject_ipv6_eval(expr, data, pkt); +	} +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { +	.type		= &nft_reject_inet_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)), +	.eval		= nft_reject_inet_eval, +	.init		= nft_reject_init, +	.dump		= nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { +	.family		= NFPROTO_INET, +	.name		= "reject", +	.ops		= &nft_reject_inet_ops, +	.policy		= nft_reject_policy, +	.maxattr	= NFTA_REJECT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ +	return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ +	nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8b03028cca6..227aa11e840 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -845,8 +845,13 @@ xt_replace_table(struct xt_table *table,  		return NULL;  	} -	table->private = newinfo;  	newinfo->initial_entries = private->initial_entries; +	/* +	 * Ensure contents of newinfo are visible before assigning to +	 * private. +	 */ +	smp_wmb(); +	table->private = newinfo;  	/*  	 * Even though table entries have now been swapped, other CPU's diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 3228d7f24eb..4973cbddc44 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -146,11 +146,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)  		if (par->family == NFPROTO_BRIDGE) {  			switch (eth_hdr(skb)->h_proto) { -			case __constant_htons(ETH_P_IP): +			case htons(ETH_P_IP):  				audit_ip4(ab, skb);  				break; -			case __constant_htons(ETH_P_IPV6): +			case htons(ETH_P_IPV6):  				audit_ip6(ab, skb);  				break;  			} diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index da35ac06a97..75747aecdeb 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -211,8 +211,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,  	ret = 0;  	if ((info->ct_events || info->exp_events) &&  	    !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, -				  GFP_KERNEL)) +				  GFP_KERNEL)) { +		ret = -EINVAL;  		goto err3; +	}  	if (info->helper[0]) {  		ret = xt_ct_set_helper(ct, info->helper, par); @@ -226,12 +228,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,  			goto err3;  	} -	__set_bit(IPS_TEMPLATE_BIT, &ct->status); -	__set_bit(IPS_CONFIRMED_BIT, &ct->status); - -	/* Overload tuple linked list to put us in template list. 
*/ -	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, -				 &par->net->ct.tmpl); +	nf_conntrack_tmpl_insert(par->net, ct);  out:  	info->ct = ct;  	return 0; diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 1e2fae32f81..8f1779ff7e3 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -11,15 +11,13 @@  #include <linux/module.h>  #include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> -  #include <linux/netfilter.h>  #include <linux/netfilter_arp.h>  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_NFQUEUE.h> +#include <net/netfilter/nf_queue.h> +  MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");  MODULE_DESCRIPTION("Xtables: packet forwarding to netlink");  MODULE_LICENSE("GPL"); @@ -28,7 +26,6 @@ MODULE_ALIAS("ip6t_NFQUEUE");  MODULE_ALIAS("arpt_NFQUEUE");  static u32 jhash_initval __read_mostly; -static bool rnd_inited __read_mostly;  static unsigned int  nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -38,69 +35,16 @@ nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par)  	return NF_QUEUE_NR(tinfo->queuenum);  } -static u32 hash_v4(const struct sk_buff *skb) -{ -	const struct iphdr *iph = ip_hdr(skb); - -	/* packets in either direction go into same queue */ -	if ((__force u32)iph->saddr < (__force u32)iph->daddr) -		return jhash_3words((__force u32)iph->saddr, -			(__force u32)iph->daddr, iph->protocol, jhash_initval); - -	return jhash_3words((__force u32)iph->daddr, -			(__force u32)iph->saddr, iph->protocol, jhash_initval); -} - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -static u32 hash_v6(const struct sk_buff *skb) -{ -	const struct ipv6hdr *ip6h = ipv6_hdr(skb); -	u32 a, b, c; - -	if ((__force u32)ip6h->saddr.s6_addr32[3] < -	    (__force u32)ip6h->daddr.s6_addr32[3]) { -		a = (__force u32) ip6h->saddr.s6_addr32[3]; -		b = (__force u32) ip6h->daddr.s6_addr32[3]; -	} else { -		b = (__force u32) ip6h->saddr.s6_addr32[3]; -		a = (__force u32) ip6h->daddr.s6_addr32[3]; -	} - -	if ((__force u32)ip6h->saddr.s6_addr32[1] < -	    (__force u32)ip6h->daddr.s6_addr32[1]) -		c = (__force u32) ip6h->saddr.s6_addr32[1]; -	else -		c = (__force u32) ip6h->daddr.s6_addr32[1]; - -	return jhash_3words(a, b, c, jhash_initval); -} -#endif - -static u32 -nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par) -{ -	const struct xt_NFQ_info_v1 *info = par->targinfo; -	u32 queue = info->queuenum; - -	if (par->family == NFPROTO_IPV4) -		queue += ((u64) hash_v4(skb) * info->queues_total) >> 32; -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -	else if (par->family == NFPROTO_IPV6) -		queue += ((u64) hash_v6(skb) * info->queues_total) >> 32; -#endif - -	return queue; -} -  static unsigned int  nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)  {  	const struct xt_NFQ_info_v1 *info = par->targinfo;  	u32 queue = info->queuenum; -	if (info->queues_total > 1) -		queue = nfqueue_hash(skb, par); - +	if (info->queues_total > 1) { +		queue = nfqueue_hash(skb, queue, info->queues_total, +				     par->family, jhash_initval); +	}  	return NF_QUEUE_NR(queue);  } @@ -120,10 +64,8 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par)  	const struct xt_NFQ_info_v3 *info = par->targinfo;  	u32 maxid; -	if (unlikely(!rnd_inited)) { -		get_random_bytes(&jhash_initval, sizeof(jhash_initval)); -		rnd_inited = true; -	} +	init_hashrandom(&jhash_initval); +  	if (info->queues_total == 0) {  		pr_err("NFQUEUE: number of 
total queues is 0\n");  		return -EINVAL; @@ -147,17 +89,24 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)  {  	const struct xt_NFQ_info_v3 *info = par->targinfo;  	u32 queue = info->queuenum; +	int ret;  	if (info->queues_total > 1) {  		if (info->flags & NFQ_FLAG_CPU_FANOUT) {  			int cpu = smp_processor_id();  			queue = info->queuenum + cpu % info->queues_total; -		} else -			queue = nfqueue_hash(skb, par); +		} else { +			queue = nfqueue_hash(skb, queue, info->queues_total, +					     par->family, jhash_initval); +		}  	} -	return NF_QUEUE_NR(queue); +	ret = NF_QUEUE_NR(queue); +	if (info->flags & NFQ_FLAG_BYPASS) +		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + +	return ret;  }  static struct xt_target nfqueue_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index cd24290f3b2..e762de5ee89 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -43,10 +43,42 @@ optlen(const u_int8_t *opt, unsigned int offset)  		return opt[offset+1];  } +static u_int32_t tcpmss_reverse_mtu(struct net *net, +				    const struct sk_buff *skb, +				    unsigned int family) +{ +	struct flowi fl; +	const struct nf_afinfo *ai; +	struct rtable *rt = NULL; +	u_int32_t mtu     = ~0U; + +	if (family == PF_INET) { +		struct flowi4 *fl4 = &fl.u.ip4; +		memset(fl4, 0, sizeof(*fl4)); +		fl4->daddr = ip_hdr(skb)->saddr; +	} else { +		struct flowi6 *fl6 = &fl.u.ip6; + +		memset(fl6, 0, sizeof(*fl6)); +		fl6->daddr = ipv6_hdr(skb)->saddr; +	} +	rcu_read_lock(); +	ai = nf_get_afinfo(family); +	if (ai != NULL) +		ai->route(net, (struct dst_entry **)&rt, &fl, false); +	rcu_read_unlock(); + +	if (rt != NULL) { +		mtu = dst_mtu(&rt->dst); +		dst_release(&rt->dst); +	} +	return mtu; +} +  static int  tcpmss_mangle_packet(struct sk_buff *skb,  		     const struct xt_action_param *par, -		     unsigned int in_mtu, +		     unsigned int family,  		     unsigned int tcphoff,  		     unsigned int minlen)  { @@ -76,6 +108,9 @@ tcpmss_mangle_packet(struct sk_buff *skb,  		return -1;  	if (info->mss == XT_TCPMSS_CLAMP_PMTU) { +		struct net *net = dev_net(par->in ? 
par->in : par->out); +		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); +  		if (dst_mtu(skb_dst(skb)) <= minlen) {  			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",  					    dst_mtu(skb_dst(skb))); @@ -165,37 +200,6 @@ tcpmss_mangle_packet(struct sk_buff *skb,  	return TCPOLEN_MSS;  } -static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, -				    unsigned int family) -{ -	struct flowi fl; -	const struct nf_afinfo *ai; -	struct rtable *rt = NULL; -	u_int32_t mtu     = ~0U; - -	if (family == PF_INET) { -		struct flowi4 *fl4 = &fl.u.ip4; -		memset(fl4, 0, sizeof(*fl4)); -		fl4->daddr = ip_hdr(skb)->saddr; -	} else { -		struct flowi6 *fl6 = &fl.u.ip6; - -		memset(fl6, 0, sizeof(*fl6)); -		fl6->daddr = ipv6_hdr(skb)->saddr; -	} -	rcu_read_lock(); -	ai = nf_get_afinfo(family); -	if (ai != NULL) -		ai->route(&init_net, (struct dst_entry **)&rt, &fl, false); -	rcu_read_unlock(); - -	if (rt != NULL) { -		mtu = dst_mtu(&rt->dst); -		dst_release(&rt->dst); -	} -	return mtu; -} -  static unsigned int  tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)  { @@ -204,7 +208,7 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)  	int ret;  	ret = tcpmss_mangle_packet(skb, par, -				   tcpmss_reverse_mtu(skb, PF_INET), +				   PF_INET,  				   iph->ihl * 4,  				   sizeof(*iph) + sizeof(struct tcphdr));  	if (ret < 0) @@ -233,7 +237,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)  	if (tcphoff < 0)  		return NF_DROP;  	ret = tcpmss_mangle_packet(skb, par, -				   tcpmss_reverse_mtu(skb, PF_INET6), +				   PF_INET6,  				   tcphoff,  				   sizeof(*ipv6h) + sizeof(struct tcphdr));  	if (ret < 0) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 5d8a3a3cd5a..ef8a926752a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -200,7 +200,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,  				     in->ifindex);  		if (sk) {  			int connected = (sk->sk_state == TCP_ESTABLISHED); -			int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr); +			int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);  			/* NOTE: we return listeners even if bound to  			 * 0.0.0.0, those are filtered out in diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 12d4da8e6c7..bbffdbdaf60 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -23,10 +23,11 @@ MODULE_ALIAS("ip6t_bpf");  static int bpf_mt_check(const struct xt_mtchk_param *par)  {  	struct xt_bpf_info *info = par->matchinfo; -	struct sock_fprog program; +	struct sock_fprog_kern program;  	program.len = info->bpf_program_num_elem; -	program.filter = (struct sock_filter __user *) info->bpf_program; +	program.filter = info->bpf_program; +  	if (sk_unattached_filter_create(&info->filter, &program)) {  		pr_info("bpf: check failed: parse error\n");  		return -EINVAL; diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c new file mode 100644 index 00000000000..f4e83300532 --- /dev/null +++ b/net/netfilter/xt_cgroup.c @@ -0,0 +1,72 @@ +/* + * Xtables module to match the process control group. + * + * Might be used to implement individual "per-application" firewall + * policies in contrast to global policies based on control groups. + * Matching is based upon processes tagged to net_cls' classid marker. 
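The new xt_cgroup match is deliberately small: it only makes sense for locally generated or locally delivered packets (hence the LOCAL_OUT/LOCAL_IN/POST_ROUTING hook mask below), and the core test is a single classid comparison with an invert bit. A standalone sketch of that test, with plain structs standing in for sk_buff and xt_cgroup_info:

#include <stdbool.h>
#include <stdio.h>

struct fake_sock { unsigned int classid; };
struct fake_skb  { struct fake_sock *sk; };
struct cgroup_info { unsigned int id; unsigned int invert; };

static bool cgroup_match(const struct fake_skb *skb, const struct cgroup_info *info)
{
	if (skb->sk == NULL)
		return false;	/* no local socket, nothing to classify */
	return (info->id == skb->sk->classid) ^ info->invert;
}

int main(void)
{
	struct fake_sock sk = { .classid = 0x100001 };
	struct fake_skb skb = { .sk = &sk };
	struct cgroup_info want = { .id = 0x100001, .invert = 0 };
	struct cgroup_info not_want = { .id = 0x100001, .invert = 1 };

	printf("match: %d, inverted: %d\n",
	       cgroup_match(&skb, &want), cgroup_match(&skb, &not_want));
	return 0;
}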
+ * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_cgroup.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("Xtables: process control group matching"); +MODULE_ALIAS("ipt_cgroup"); +MODULE_ALIAS("ip6t_cgroup"); + +static int cgroup_mt_check(const struct xt_mtchk_param *par) +{ +	struct xt_cgroup_info *info = par->matchinfo; + +	if (info->invert & ~1) +		return -EINVAL; + +	return info->id ? 0 : -EINVAL; +} + +static bool +cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_cgroup_info *info = par->matchinfo; + +	if (skb->sk == NULL) +		return false; + +	return (info->id == skb->sk->sk_classid) ^ info->invert; +} + +static struct xt_match cgroup_mt_reg __read_mostly = { +	.name       = "cgroup", +	.revision   = 0, +	.family     = NFPROTO_UNSPEC, +	.checkentry = cgroup_mt_check, +	.match      = cgroup_mt, +	.matchsize  = sizeof(struct xt_cgroup_info), +	.me         = THIS_MODULE, +	.hooks      = (1 << NF_INET_LOCAL_OUT) | +		      (1 << NF_INET_POST_ROUTING) | +		      (1 << NF_INET_LOCAL_IN), +}; + +static int __init cgroup_mt_init(void) +{ +	return xt_register_match(&cgroup_mt_reg); +} + +static void __exit cgroup_mt_exit(void) +{ +	xt_unregister_match(&cgroup_mt_reg); +} + +module_init(cgroup_mt_init); +module_exit(cgroup_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index e595e07a759..1e634615ab9 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -26,16 +26,18 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)  	u_int64_t what = 0;	/* initialize to make gcc happy */  	u_int64_t bytes = 0;  	u_int64_t pkts = 0; +	const struct nf_conn_acct *acct;  	const struct nf_conn_counter *counters;  	ct = nf_ct_get(skb, &ctinfo);  	if (!ct)  		return false; -	counters = nf_conn_acct_find(ct); -	if (!counters) +	acct = nf_conn_acct_find(ct); +	if (!acct)  		return false; +	counters = acct->counter;  	switch (sinfo->what) {  	case XT_CONNBYTES_PKTS:  		switch (sinfo->direction) { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index c40b2695633..fbc66bb250d 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -19,6 +19,7 @@  #include <linux/jhash.h>  #include <linux/slab.h>  #include <linux/list.h> +#include <linux/rbtree.h>  #include <linux/module.h>  #include <linux/random.h>  #include <linux/skbuff.h> @@ -31,6 +32,16 @@  #include <net/netfilter/nf_conntrack_tuple.h>  #include <net/netfilter/nf_conntrack_zones.h> +#define CONNLIMIT_SLOTS		256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS	8U +#else +#define CONNLIMIT_LOCK_SLOTS	256U +#endif + +#define CONNLIMIT_GC_MAX_NODES	8 +  /* we will save the tuples of all connections we care about */  struct xt_connlimit_conn {  	struct hlist_node		node; @@ -38,16 +49,27 @@ struct xt_connlimit_conn {  	union nf_inet_addr		addr;  }; +struct xt_connlimit_rb { +	struct rb_node node; +	struct hlist_head hhead; /* connections/hosts in same subnet */ +	union nf_inet_addr addr; /* search key */ +}; + +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + 
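With 256 buckets but (under lockdep) only 8 spinlocks, the reworked connlimit stripes its locks: a bucket's lock index is the bucket index reduced modulo the lock count, which is why the init path asserts that CONNLIMIT_SLOTS is a multiple of CONNLIMIT_LOCK_SLOTS. A pthread sketch of the same striping (bucket contents elided):

#include <pthread.h>
#include <stdio.h>

#define SLOTS      256U
#define LOCK_SLOTS 8U	/* must divide SLOTS evenly */

static pthread_mutex_t locks[LOCK_SLOTS];

static void with_bucket(unsigned int hash)
{
	unsigned int slot = hash % SLOTS;
	pthread_mutex_t *lock = &locks[slot % LOCK_SLOTS];

	pthread_mutex_lock(lock);
	/* ... insert/count/GC entries in bucket 'slot' ... */
	pthread_mutex_unlock(lock);
}

int main(void)
{
	unsigned int i, hash = 0xdeadbeef;

	for (i = 0; i < LOCK_SLOTS; i++)
		pthread_mutex_init(&locks[i], NULL);
	with_bucket(hash);
	printf("bucket %u guarded by lock %u\n",
	       hash % SLOTS, (hash % SLOTS) % LOCK_SLOTS);
	return 0;
}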
 struct xt_connlimit_data { -	struct hlist_head	iphash[256]; -	spinlock_t		lock; +	struct rb_root climit_root4[CONNLIMIT_SLOTS]; +	struct rb_root climit_root6[CONNLIMIT_SLOTS];  };  static u_int32_t connlimit_rnd __read_mostly; +static struct kmem_cache *connlimit_rb_cachep __read_mostly; +static struct kmem_cache *connlimit_conn_cachep __read_mostly;  static inline unsigned int connlimit_iphash(__be32 addr)  { -	return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; +	return jhash_1word((__force __u32)addr, +			    connlimit_rnd) % CONNLIMIT_SLOTS;  }  static inline unsigned int @@ -60,7 +82,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,  	for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)  		res.ip6[i] = addr->ip6[i] & mask->ip6[i]; -	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; +	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), +		       connlimit_rnd) % CONNLIMIT_SLOTS;  }  static inline bool already_closed(const struct nf_conn *conn) @@ -72,13 +95,14 @@ static inline bool already_closed(const struct nf_conn *conn)  		return 0;  } -static inline unsigned int +static int  same_source_net(const union nf_inet_addr *addr,  		const union nf_inet_addr *mask,  		const union nf_inet_addr *u3, u_int8_t family)  {  	if (family == NFPROTO_IPV4) { -		return (addr->ip & mask->ip) == (u3->ip & mask->ip); +		return ntohl(addr->ip & mask->ip) - +		       ntohl(u3->ip & mask->ip);  	} else {  		union nf_inet_addr lh, rh;  		unsigned int i; @@ -88,89 +112,205 @@ same_source_net(const union nf_inet_addr *addr,  			rh.ip6[i] = u3->ip6[i] & mask->ip6[i];  		} -		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; +		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6));  	}  } -static int count_them(struct net *net, -		      struct xt_connlimit_data *data, +static bool add_hlist(struct hlist_head *head,  		      const struct nf_conntrack_tuple *tuple, -		      const union nf_inet_addr *addr, -		      const union nf_inet_addr *mask, -		      u_int8_t family) +		      const union nf_inet_addr *addr) +{ +	struct xt_connlimit_conn *conn; + +	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); +	if (conn == NULL) +		return false; +	conn->tuple = *tuple; +	conn->addr = *addr; +	hlist_add_head(&conn->node, head); +	return true; +} + +static unsigned int check_hlist(struct net *net, +				struct hlist_head *head, +				const struct nf_conntrack_tuple *tuple, +				bool *addit)  {  	const struct nf_conntrack_tuple_hash *found;  	struct xt_connlimit_conn *conn;  	struct hlist_node *n;  	struct nf_conn *found_ct; -	struct hlist_head *hash; -	bool addit = true; -	int matches = 0; - -	if (family == NFPROTO_IPV6) -		hash = &data->iphash[connlimit_iphash6(addr, mask)]; -	else -		hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; +	unsigned int length = 0; +	*addit = true;  	rcu_read_lock();  	/* check the saved connections */ -	hlist_for_each_entry_safe(conn, n, hash, node) { +	hlist_for_each_entry_safe(conn, n, head, node) {  		found    = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,  						 &conn->tuple); -		found_ct = NULL; +		if (found == NULL) { +			hlist_del(&conn->node); +			kmem_cache_free(connlimit_conn_cachep, conn); +			continue; +		} -		if (found != NULL) -			found_ct = nf_ct_tuplehash_to_ctrack(found); +		found_ct = nf_ct_tuplehash_to_ctrack(found); -		if (found_ct != NULL && -		    nf_ct_tuple_equal(&conn->tuple, tuple) && -		    !already_closed(found_ct)) +		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {  			/*  			 * Just to be sure we have it only once 
in the list.  			 * We should not see tuples twice unless someone hooks  			 * this into a table without "-p tcp --syn".  			 */ -			addit = false; - -		if (found == NULL) { -			/* this one is gone */ -			hlist_del(&conn->node); -			kfree(conn); -			continue; -		} - -		if (already_closed(found_ct)) { +			*addit = false; +		} else if (already_closed(found_ct)) {  			/*  			 * we do not care about connections which are  			 * closed already -> ditch it  			 */  			nf_ct_put(found_ct);  			hlist_del(&conn->node); -			kfree(conn); +			kmem_cache_free(connlimit_conn_cachep, conn);  			continue;  		} -		if (same_source_net(addr, mask, &conn->addr, family)) -			/* same source network -> be counted! */ -			++matches;  		nf_ct_put(found_ct); +		length++;  	}  	rcu_read_unlock(); -	if (addit) { -		/* save the new connection in our list */ -		conn = kmalloc(sizeof(*conn), GFP_ATOMIC); -		if (conn == NULL) -			return -ENOMEM; -		conn->tuple = *tuple; -		conn->addr = *addr; -		hlist_add_head(&conn->node, hash); -		++matches; +	return length; +} + +static void tree_nodes_free(struct rb_root *root, +			    struct xt_connlimit_rb *gc_nodes[], +			    unsigned int gc_count) +{ +	struct xt_connlimit_rb *rbconn; + +	while (gc_count) { +		rbconn = gc_nodes[--gc_count]; +		rb_erase(&rbconn->node, root); +		kmem_cache_free(connlimit_rb_cachep, rbconn); +	} +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, +	   const struct nf_conntrack_tuple *tuple, +	   const union nf_inet_addr *addr, const union nf_inet_addr *mask, +	   u8 family) +{ +	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; +	struct rb_node **rbnode, *parent; +	struct xt_connlimit_rb *rbconn; +	struct xt_connlimit_conn *conn; +	unsigned int gc_count; +	bool no_gc = false; + + restart: +	gc_count = 0; +	parent = NULL; +	rbnode = &(root->rb_node); +	while (*rbnode) { +		int diff; +		bool addit; + +		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + +		parent = *rbnode; +		diff = same_source_net(addr, mask, &rbconn->addr, family); +		if (diff < 0) { +			rbnode = &((*rbnode)->rb_left); +		} else if (diff > 0) { +			rbnode = &((*rbnode)->rb_right); +		} else { +			/* same source network -> be counted! */ +			unsigned int count; +			count = check_hlist(net, &rbconn->hhead, tuple, &addit); + +			tree_nodes_free(root, gc_nodes, gc_count); +			if (!addit) +				return count; + +			if (!add_hlist(&rbconn->hhead, tuple, addr)) +				return 0; /* hotdrop */ + +			return count + 1; +		} + +		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) +			continue; + +		/* only used for GC on hhead, retval and 'addit' ignored */ +		check_hlist(net, &rbconn->hhead, tuple, &addit); +		if (hlist_empty(&rbconn->hhead)) +			gc_nodes[gc_count++] = rbconn; +	} + +	if (gc_count) { +		no_gc = true; +		tree_nodes_free(root, gc_nodes, gc_count); +		/* tree_node_free before new allocation permits +		 * allocator to re-use newly free'd object. +		 * +		 * This is a rare event; in most cases we will find +		 * existing node to re-use. (or gc_count is 0). 
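same_source_net() now returns a memcmp()-style three-way result instead of a boolean, because count_tree() uses it to steer the red-black tree descent: negative goes left, positive goes right, zero means the masked source network matched. A minimal comparator sketch over masked IPv4 addresses (host byte order for brevity; the kernel compares ntohl()ed values):

#include <stdint.h>
#include <stdio.h>

static int cmp_masked(uint32_t a, uint32_t b, uint32_t mask)
{
	uint32_t la = a & mask, lb = b & mask;

	/* three-way compare without signed-overflow pitfalls */
	return (la > lb) - (la < lb);
}

int main(void)
{
	uint32_t mask = 0xffffff00;	/* /24 */
	uint32_t a = 0xc0a80105;	/* 192.168.1.5   */
	uint32_t b = 0xc0a801fe;	/* 192.168.1.254: same /24 -> 0  */
	uint32_t c = 0xc0a80201;	/* 192.168.2.1:   differs  -> <0 */

	printf("%d %d\n", cmp_masked(a, b, mask), cmp_masked(a, c, mask));
	return 0;
}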
+		 */ +		goto restart; +	} + +	/* no match, need to insert new node */ +	rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); +	if (rbconn == NULL) +		return 0; + +	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); +	if (conn == NULL) { +		kmem_cache_free(connlimit_rb_cachep, rbconn); +		return 0; +	} + +	conn->tuple = *tuple; +	conn->addr = *addr; +	rbconn->addr = *addr; + +	INIT_HLIST_HEAD(&rbconn->hhead); +	hlist_add_head(&conn->node, &rbconn->hhead); + +	rb_link_node(&rbconn->node, parent, rbnode); +	rb_insert_color(&rbconn->node, root); +	return 1; +} + +static int count_them(struct net *net, +		      struct xt_connlimit_data *data, +		      const struct nf_conntrack_tuple *tuple, +		      const union nf_inet_addr *addr, +		      const union nf_inet_addr *mask, +		      u_int8_t family) +{ +	struct rb_root *root; +	int count; +	u32 hash; + +	if (family == NFPROTO_IPV6) { +		hash = connlimit_iphash6(addr, mask); +		root = &data->climit_root6[hash]; +	} else { +		hash = connlimit_iphash(addr->ip & mask->ip); +		root = &data->climit_root4[hash];  	} -	return matches; +	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + +	count = count_tree(net, root, tuple, addr, mask, family); + +	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + +	return count;  }  static bool @@ -183,7 +323,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)  	const struct nf_conntrack_tuple *tuple_ptr = &tuple;  	enum ip_conntrack_info ctinfo;  	const struct nf_conn *ct; -	int connections; +	unsigned int connections;  	ct = nf_ct_get(skb, &ctinfo);  	if (ct != NULL) @@ -202,12 +342,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)  			  iph->daddr : iph->saddr;  	} -	spin_lock_bh(&info->data->lock);  	connections = count_them(net, info->data, tuple_ptr, &addr,  	                         &info->mask, par->family); -	spin_unlock_bh(&info->data->lock); - -	if (connections < 0) +	if (connections == 0)  		/* kmalloc failed, drop it entirely */  		goto hotdrop; @@ -247,29 +384,44 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)  		return -ENOMEM;  	} -	spin_lock_init(&info->data->lock); -	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) -		INIT_HLIST_HEAD(&info->data->iphash[i]); +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) +		info->data->climit_root4[i] = RB_ROOT; +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) +		info->data->climit_root6[i] = RB_ROOT;  	return 0;  } -static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +static void destroy_tree(struct rb_root *r)  { -	const struct xt_connlimit_info *info = par->matchinfo;  	struct xt_connlimit_conn *conn; +	struct xt_connlimit_rb *rbconn;  	struct hlist_node *n; -	struct hlist_head *hash = info->data->iphash; +	struct rb_node *node; + +	while ((node = rb_first(r)) != NULL) { +		rbconn = container_of(node, struct xt_connlimit_rb, node); + +		rb_erase(node, r); + +		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) +			kmem_cache_free(connlimit_conn_cachep, conn); + +		kmem_cache_free(connlimit_rb_cachep, rbconn); +	} +} + +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ +	const struct xt_connlimit_info *info = par->matchinfo;  	unsigned int i;  	nf_ct_l3proto_module_put(par->family); -	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { -		hlist_for_each_entry_safe(conn, n, &hash[i], node) { -			hlist_del(&conn->node); -			kfree(conn); -		} -	} +	for (i = 0; i < 
ARRAY_SIZE(info->data->climit_root4); ++i) +		destroy_tree(&info->data->climit_root4[i]); +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) +		destroy_tree(&info->data->climit_root6[i]);  	kfree(info->data);  } @@ -287,12 +439,40 @@ static struct xt_match connlimit_mt_reg __read_mostly = {  static int __init connlimit_mt_init(void)  { -	return xt_register_match(&connlimit_mt_reg); +	int ret, i; + +	BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); +	BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + +	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) +		spin_lock_init(&xt_connlimit_locks[i]); + +	connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", +					   sizeof(struct xt_connlimit_conn), +					   0, 0, NULL); +	if (!connlimit_conn_cachep) +		return -ENOMEM; + +	connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", +					   sizeof(struct xt_connlimit_rb), +					   0, 0, NULL); +	if (!connlimit_rb_cachep) { +		kmem_cache_destroy(connlimit_conn_cachep); +		return -ENOMEM; +	} +	ret = xt_register_match(&connlimit_mt_reg); +	if (ret != 0) { +		kmem_cache_destroy(connlimit_conn_cachep); +		kmem_cache_destroy(connlimit_rb_cachep); +	} +	return ret;  }  static void __exit connlimit_mt_exit(void)  {  	xt_unregister_match(&connlimit_mt_reg); +	kmem_cache_destroy(connlimit_conn_cachep); +	kmem_cache_destroy(connlimit_rb_cachep);  }  module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 7278145e6a6..69f78e96fdb 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -17,8 +17,7 @@   * GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #include <linux/module.h> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9ff035c7140..a3910fc2122 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -325,21 +325,24 @@ static void htable_gc(unsigned long htlong)  	add_timer(&ht->timer);  } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) +static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)  {  	struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net);  	struct proc_dir_entry *parent; -	del_timer_sync(&hinfo->timer); -  	if (hinfo->family == NFPROTO_IPV4)  		parent = hashlimit_net->ipt_hashlimit;  	else  		parent = hashlimit_net->ip6t_hashlimit; -	if(parent != NULL) +	if (parent != NULL)  		remove_proc_entry(hinfo->name, parent); +} +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ +	del_timer_sync(&hinfo->timer); +	htable_remove_proc_entry(hinfo);  	htable_selective_cleanup(hinfo, select_all);  	kfree(hinfo->name);  	vfree(hinfo); @@ -883,21 +886,15 @@ static int __net_init hashlimit_proc_net_init(struct net *net)  static void __net_exit hashlimit_proc_net_exit(struct net *net)  {  	struct xt_hashlimit_htable *hinfo; -	struct proc_dir_entry *pde;  	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); -	/* recent_net_exit() is called before recent_mt_destroy(). Make sure -	 * that the parent xt_recent proc entry is is empty before trying to -	 * remove it. +	/* hashlimit_net_exit() is called before hashlimit_mt_destroy(). 
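connlimit_mt_init() above follows the usual unwind discipline: each later step that fails must release everything the earlier steps set up, in reverse order, so a failed xt_register_match() leaves no kmem cache behind. A trimmed sketch of that shape (malloc/free stand in for kmem_cache_create/destroy, and fake_register for xt_register_match):

#include <stdio.h>
#include <stdlib.h>

static void *conn_cache, *rb_cache;

static int fake_register(void) { return 0; }	/* stand-in, always succeeds */

static int match_init(void)
{
	int ret;

	conn_cache = malloc(64);
	if (conn_cache == NULL)
		return -1;

	rb_cache = malloc(64);
	if (rb_cache == NULL) {
		free(conn_cache);
		return -1;
	}

	ret = fake_register();
	if (ret != 0) {		/* unwind both caches on failure */
		free(conn_cache);
		free(rb_cache);
	}
	return ret;
}

int main(void)
{
	printf("init: %d\n", match_init());
	free(conn_cache);
	free(rb_cache);
	return 0;
}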
+	 * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc +	 * entries is empty before trying to remove it.  	 */  	mutex_lock(&hashlimit_mutex); -	pde = hashlimit_net->ipt_hashlimit; -	if (pde == NULL) -		pde = hashlimit_net->ip6t_hashlimit; -  	hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) -		remove_proc_entry(hinfo->name, pde); - +		htable_remove_proc_entry(hinfo);  	hashlimit_net->ipt_hashlimit = NULL;  	hashlimit_net->ip6t_hashlimit = NULL;  	mutex_unlock(&hashlimit_mutex); diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c new file mode 100644 index 00000000000..89d53104c6b --- /dev/null +++ b/net/netfilter/xt_ipcomp.c @@ -0,0 +1,111 @@ +/*  Kernel module to match IPComp parameters for IPv4 and IPv6 + * + *  Copyright (C) 2013 WindRiver + * + *  Author: + *  Fan Du <fan.du@windriver.com> + * + *  Based on: + *  net/netfilter/xt_esp.c + * + *  This program is free software; you can redistribute it and/or + *  modify it under the terms of the GNU General Public License + *  as published by the Free Software Foundation; either version + *  2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter/xt_ipcomp.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fan Du <fan.du@windriver.com>"); +MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ +	bool r; +	pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", +		 invert ? '!' : ' ', min, spi, max); +	r = (spi >= min && spi <= max) ^ invert; +	pr_debug(" result %s\n", r ? "PASS" : "FAILED"); +	return r; +} + +static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	struct ip_comp_hdr _comphdr; +	const struct ip_comp_hdr *chdr; +	const struct xt_ipcomp *compinfo = par->matchinfo; + +	/* Must not be a fragment. */ +	if (par->fragoff != 0) +		return false; + +	chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr); +	if (chdr == NULL) { +		/* We've been asked to examine this packet, and we +		 * can't.  Hence, no choice but to drop. 
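spi_match() in the new xt_ipcomp module is nothing more than an inclusive range test on the 16-bit CPI with an optional invert, the same shape xt_esp uses for SPIs. Standalone, with a few probes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool spi_match(uint32_t min, uint32_t max, uint32_t spi, bool invert)
{
	return (spi >= min && spi <= max) ^ invert;
}

int main(void)
{
	printf("%d\n", spi_match(100, 200, 150, false));	/* 1: in range */
	printf("%d\n", spi_match(100, 200, 150, true));		/* 0: in range, inverted */
	printf("%d\n", spi_match(100, 200, 250, true));		/* 1: outside, inverted */
	return 0;
}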
+		 */ +		pr_debug("Dropping evil IPComp tinygram.\n"); +		par->hotdrop = true; +		return 0; +	} + +	return spi_match(compinfo->spis[0], compinfo->spis[1], +			 ntohs(chdr->cpi), +			 !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); +} + +static int comp_mt_check(const struct xt_mtchk_param *par) +{ +	const struct xt_ipcomp *compinfo = par->matchinfo; + +	/* Must specify no unknown invflags */ +	if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { +		pr_err("unknown flags %X\n", compinfo->invflags); +		return -EINVAL; +	} +	return 0; +} + +static struct xt_match comp_mt_reg[] __read_mostly = { +	{ +		.name		= "ipcomp", +		.family		= NFPROTO_IPV4, +		.match		= comp_mt, +		.matchsize	= sizeof(struct xt_ipcomp), +		.proto		= IPPROTO_COMP, +		.checkentry	= comp_mt_check, +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "ipcomp", +		.family		= NFPROTO_IPV6, +		.match		= comp_mt, +		.matchsize	= sizeof(struct xt_ipcomp), +		.proto		= IPPROTO_COMP, +		.checkentry	= comp_mt_check, +		.me		= THIS_MODULE, +	}, +}; + +static int __init comp_mt_init(void) +{ +	return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +static void __exit comp_mt_exit(void) +{ +	xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +module_init(comp_mt_init); +module_exit(comp_mt_exit); diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c new file mode 100644 index 00000000000..8aee572771f --- /dev/null +++ b/net/netfilter/xt_l2tp.c @@ -0,0 +1,354 @@ +/* Kernel module to match L2TP header parameters. */ + +/* (C) 2013      James Chapman <jchapman@katalix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <linux/l2tp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_l2tp.h> + +/* L2TP header masks */ +#define L2TP_HDR_T_BIT	0x8000 +#define L2TP_HDR_L_BIT	0x4000 +#define L2TP_HDR_VER	0x000f + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); +MODULE_DESCRIPTION("Xtables: L2TP header match"); +MODULE_ALIAS("ipt_l2tp"); +MODULE_ALIAS("ip6t_l2tp"); + +/* The L2TP fields that can be matched */ +struct l2tp_data { +	u32 tid; +	u32 sid; +	u8 type; +	u8 version; +}; + +union l2tp_val { +	__be16 val16[2]; +	__be32 val32; +}; + +static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) +{ +	if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) +		return false; + +	if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) +		return false; + +	/* Check tid only for L2TPv3 control or any L2TPv2 packets */ +	if ((info->flags & XT_L2TP_TID) && +	    ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && +	    (info->tid != data->tid)) +		return false; + +	/* Check sid only for L2TP data packets */ +	if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && +	    (info->sid != data->sid)) +		return false; + +	return true; +} + +/* Parse L2TP header fields when UDP encapsulation is used. Handles + * L2TPv2 and L2TPv3. 
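Everything l2tp_udp_mt() needs to locate the remaining fields lives in the first 16 bits of the L2TP header: the T bit picks control vs. data, the L bit says whether a length field follows, and the low nibble is the protocol version. A sketch of pulling those apart with the masks defined above:

#include <stdint.h>
#include <stdio.h>

#define L2TP_HDR_T_BIT	0x8000
#define L2TP_HDR_L_BIT	0x4000
#define L2TP_HDR_VER	0x000f

int main(void)
{
	uint16_t flags = 0xc802;	/* T=1, L=1, version 2: a v2 control header */

	printf("type:    %s\n", (flags & L2TP_HDR_T_BIT) ? "control" : "data");
	printf("has len: %s\n", (flags & L2TP_HDR_L_BIT) ? "yes" : "no");
	printf("version: %u\n", flags & L2TP_HDR_VER);
	return 0;
}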
Note the L2TPv3 control and data packets have a + * different format. See + * RFC2661, Section 3.1, L2TPv2 Header Format + * RFC3931, Section 3.2.1, L2TPv3 Control Message Header + * RFC3931, Section 3.2.2, L2TPv3 Data Message Header + * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP + */ +static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	int uhlen = sizeof(struct udphdr); +	int offs = thoff + uhlen; +	union l2tp_val *lh; +	union l2tp_val lhbuf; +	u16 flags; +	struct l2tp_data data = { 0, }; + +	if (par->fragoff != 0) +		return false; + +	/* Extract L2TP header fields. The flags in the first 16 bits +	 * tell us where the other fields are. +	 */ +	lh = skb_header_pointer(skb, offs, 2, &lhbuf); +	if (lh == NULL) +		return false; + +	flags = ntohs(lh->val16[0]); +	if (flags & L2TP_HDR_T_BIT) +		data.type = XT_L2TP_TYPE_CONTROL; +	else +		data.type = XT_L2TP_TYPE_DATA; +	data.version = (u8) flags & L2TP_HDR_VER; + +	/* Now extract the L2TP tid/sid. These are in different places +	 * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we +	 * must also check to see if the length field is present, +	 * since this affects the offsets into the packet of the +	 * tid/sid fields. +	 */ +	if (data.version == 3) { +		lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); +		if (lh == NULL) +			return false; +		if (data.type == XT_L2TP_TYPE_CONTROL) +			data.tid = ntohl(lh->val32); +		else +			data.sid = ntohl(lh->val32); +	} else if (data.version == 2) { +		if (flags & L2TP_HDR_L_BIT) +			offs += 2; +		lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); +		if (lh == NULL) +			return false; +		data.tid = (u32) ntohs(lh->val16[0]); +		data.sid = (u32) ntohs(lh->val16[1]); +	} else +		return false; + +	return l2tp_match(info, &data); +} + +/* Parse L2TP header fields for IP encapsulation (no UDP header). + * L2TPv3 data packets have a different form with IP encap. See + * RC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. + * RC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. + */ +static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	union l2tp_val *lh; +	union l2tp_val lhbuf; +	struct l2tp_data data = { 0, }; + +	/* For IP encap, the L2TP sid is the first 32-bits. */ +	lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); +	if (lh == NULL) +		return false; +	if (lh->val32 == 0) { +		/* Must be a control packet. The L2TP tid is further +		 * into the packet. 
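For L2TPv2 the tunnel and session ids shift two bytes further into the header whenever the optional length field is present, which is exactly why l2tp_udp_mt() bumps its offset before re-reading. A sketch of that offset computation, relative to the start of the L2TP header (v2_tid_offset is an illustrative helper, not a kernel function):

#include <stdint.h>
#include <stdio.h>

#define L2TP_HDR_L_BIT 0x4000

static unsigned int v2_tid_offset(uint16_t flags)
{
	unsigned int offs = 2;		/* skip the flags/version word */

	if (flags & L2TP_HDR_L_BIT)
		offs += 2;		/* skip the optional length field */
	return offs;			/* tid lives here, sid at offs + 2 */
}

int main(void)
{
	printf("no length field: tid at offset %u\n", v2_tid_offset(0x8002));
	printf("length present:  tid at offset %u\n", v2_tid_offset(0xc002));
	return 0;
}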
+		 */ +		data.type = XT_L2TP_TYPE_CONTROL; +		lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), +					&lhbuf); +		if (lh == NULL) +			return false; +		data.tid = ntohl(lh->val32); +	} else { +		data.sid = ntohl(lh->val32); +		data.type = XT_L2TP_TYPE_DATA; +	} + +	data.version = 3; + +	return l2tp_match(info, &data); +} + +static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ +	struct iphdr *iph = ip_hdr(skb); +	u8 ipproto = iph->protocol; + +	/* l2tp_mt_check4 already restricts the transport protocol */ +	switch (ipproto) { +	case IPPROTO_UDP: +		return l2tp_udp_mt(skb, par, par->thoff); +	case IPPROTO_L2TP: +		return l2tp_ip_mt(skb, par, par->thoff); +	} + +	return false; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ +	unsigned int thoff = 0; +	unsigned short fragoff = 0; +	int ipproto; + +	ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); +	if (fragoff != 0) +		return false; + +	/* l2tp_mt_check6 already restricts the transport protocol */ +	switch (ipproto) { +	case IPPROTO_UDP: +		return l2tp_udp_mt(skb, par, thoff); +	case IPPROTO_L2TP: +		return l2tp_ip_mt(skb, par, thoff); +	} + +	return false; +} +#endif + +static int l2tp_mt_check(const struct xt_mtchk_param *par) +{ +	const struct xt_l2tp_info *info = par->matchinfo; + +	/* Check for invalid flags */ +	if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | +			    XT_L2TP_TYPE)) { +		pr_info("unknown flags: %x\n", info->flags); +		return -EINVAL; +	} + +	/* At least one of tid, sid or type=control must be specified */ +	if ((!(info->flags & XT_L2TP_TID)) && +	    (!(info->flags & XT_L2TP_SID)) && +	    ((!(info->flags & XT_L2TP_TYPE)) || +	     (info->type != XT_L2TP_TYPE_CONTROL))) { +		pr_info("invalid flags combination: %x\n", info->flags); +		return -EINVAL; +	} + +	/* If version 2 is specified, check that incompatible params +	 * are not supplied +	 */ +	if (info->flags & XT_L2TP_VERSION) { +		if ((info->version < 2) || (info->version > 3)) { +			pr_info("wrong L2TP version: %u\n", info->version); +			return -EINVAL; +		} + +		if (info->version == 2) { +			if ((info->flags & XT_L2TP_TID) && +			    (info->tid > 0xffff)) { +				pr_info("v2 tid > 0xffff: %u\n", info->tid); +				return -EINVAL; +			} +			if ((info->flags & XT_L2TP_SID) && +			    (info->sid > 0xffff)) { +				pr_info("v2 sid > 0xffff: %u\n", info->sid); +				return -EINVAL; +			} +		} +	} + +	return 0; +} + +static int l2tp_mt_check4(const struct xt_mtchk_param *par) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	const struct ipt_entry *e = par->entryinfo; +	const struct ipt_ip *ip = &e->ip; +	int ret; + +	ret = l2tp_mt_check(par); +	if (ret != 0) +		return ret; + +	if ((ip->proto != IPPROTO_UDP) && +	    (ip->proto != IPPROTO_L2TP)) { +		pr_info("missing protocol rule (udp|l2tpip)\n"); +		return -EINVAL; +	} + +	if ((ip->proto == IPPROTO_L2TP) && +	    (info->version == 2)) { +		pr_info("v2 doesn't support IP mode\n"); +		return -EINVAL; +	} + +	return 0; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int l2tp_mt_check6(const struct xt_mtchk_param *par) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	const struct ip6t_entry *e = par->entryinfo; +	const struct ip6t_ip6 *ip = &e->ipv6; +	int ret; + +	ret = l2tp_mt_check(par); +	if (ret != 0) +		return ret; + +	if ((ip->proto != IPPROTO_UDP) && +	    (ip->proto != IPPROTO_L2TP)) { +		pr_info("missing protocol rule (udp|l2tpip)\n"); +		return -EINVAL; +	} + +	
if ((ip->proto == IPPROTO_L2TP) && +	    (info->version == 2)) { +		pr_info("v2 doesn't support IP mode\n"); +		return -EINVAL; +	} + +	return 0; +} +#endif + +static struct xt_match l2tp_mt_reg[] __read_mostly = { +	{ +		.name      = "l2tp", +		.revision  = 0, +		.family    = NFPROTO_IPV4, +		.match     = l2tp_mt4, +		.matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), +		.checkentry = l2tp_mt_check4, +		.hooks     = ((1 << NF_INET_PRE_ROUTING) | +			      (1 << NF_INET_LOCAL_IN) | +			      (1 << NF_INET_LOCAL_OUT) | +			      (1 << NF_INET_FORWARD)), +		.me        = THIS_MODULE, +	}, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	{ +		.name      = "l2tp", +		.revision  = 0, +		.family    = NFPROTO_IPV6, +		.match     = l2tp_mt6, +		.matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), +		.checkentry = l2tp_mt_check6, +		.hooks     = ((1 << NF_INET_PRE_ROUTING) | +			      (1 << NF_INET_LOCAL_IN) | +			      (1 << NF_INET_LOCAL_OUT) | +			      (1 << NF_INET_FORWARD)), +		.me        = THIS_MODULE, +	}, +#endif +}; + +static int __init l2tp_mt_init(void) +{ +	return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +static void __exit l2tp_mt_exit(void) +{ +	xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +module_init(l2tp_mt_init); +module_exit(l2tp_mt_exit); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index b3be0ef21f1..8c646ed9c92 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -21,11 +21,14 @@ MODULE_ALIAS("ip6t_nfacct");  static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)  { +	int overquota;  	const struct xt_nfacct_match_info *info = par->targinfo;  	nfnl_acct_update(skb, info->nfacct); -	return true; +	overquota = nfnl_acct_overquota(skb, info->nfacct); + +	return overquota == NFACCT_UNDERQUOTA ? false : true;  }  static int diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 647d989a01e..c529161cdbf 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -13,8 +13,7 @@   * GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/module.h> @@ -423,4 +422,6 @@ module_exit(xt_osf_fini);  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");  MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf");  MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 1e657cf715c..a9faae89f95 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -313,10 +313,7 @@ out:  static void recent_table_free(void *addr)  { -	if (is_vmalloc_addr(addr)) -		vfree(addr); -	else -		kfree(addr); +	kvfree(addr);  }  static int recent_mt_check(const struct xt_mtchk_param *par, diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 6efe4e5a81c..8fd324116e6 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -5,23 +5,35 @@   * they serve as the hanging-off data accessed through repl.data[].   
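The xt_repldata.h rework below exists because a variable-length array inside a struct type is a GNU extension, not C99; the replacement keeps a flexible array member and places the terminating error entry by hand, rounding the end of the array up to the trailing struct's alignment. A userspace sketch of that layout trick (the struct names are stand-ins, and the offset arithmetic mirrors the macro's offsetof computation):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { char data[13]; };	/* deliberately odd-sized */
struct term  { long marker; };

struct blob {
	int nentries;
	struct entry entries[];		/* C99 flexible array member */
};

int main(void)
{
	unsigned int nhooks = 3;
	size_t end = offsetof(struct blob, entries) + nhooks * sizeof(struct entry);
	size_t term_offset = (end + _Alignof(struct term) - 1) &
			     ~(_Alignof(struct term) - 1);
	struct blob *tbl = calloc(1, term_offset + sizeof(struct term));
	struct term *term;

	if (tbl == NULL)
		return 1;
	term = (struct term *)((char *)tbl + term_offset);
	term->marker = 0x1234;
	printf("entries end at %zu, term placed at %zu\n", end, term_offset);
	free(tbl);
	return 0;
}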
*/ +/* tbl has the following structure equivalent, but is C99 compliant: + * struct { + *	struct type##_replace repl; + *	struct type##_standard entries[nhooks]; + *	struct type##_error term; + * } *tbl; + */ +  #define xt_alloc_initial_table(type, typ2) ({ \  	unsigned int hook_mask = info->valid_hooks; \  	unsigned int nhooks = hweight32(hook_mask); \  	unsigned int bytes = 0, hooknum = 0, i = 0; \  	struct { \  		struct type##_replace repl; \ -		struct type##_standard entries[nhooks]; \ -		struct type##_error term; \ -	} *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ +		struct type##_standard entries[]; \ +	} *tbl; \ +	struct type##_error *term; \ +	size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ +		__alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ +	tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \  	if (tbl == NULL) \  		return NULL; \ +	term = (struct type##_error *)&(((char *)tbl)[term_offset]); \  	strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ -	tbl->term = (struct type##_error)typ2##_ERROR_INIT;  \ +	*term = (struct type##_error)typ2##_ERROR_INIT;  \  	tbl->repl.valid_hooks = hook_mask; \  	tbl->repl.num_entries = nhooks + 1; \  	tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ -	                 sizeof(struct type##_error); \ +			 sizeof(struct type##_error); \  	for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \  		if (!(hook_mask & 1)) \  			continue; \ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 31790e789e2..80c2e2d603e 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -81,17 +81,17 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)  	struct xt_set_info_match_v0 *info = par->matchinfo;  	ip_set_id_t index; -	index = ip_set_nfnl_get_byindex(info->match_set.index); +	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);  	if (index == IPSET_INVALID_ID) { -		pr_warning("Cannot find set indentified by id %u to match\n", +		pr_warning("Cannot find set identified by id %u to match\n",  			   info->match_set.index);  		return -ENOENT;  	}  	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {  		pr_warning("Protocol error: set match dimension "  			   "is over the limit!\n"); -		ip_set_nfnl_put(info->match_set.index); +		ip_set_nfnl_put(par->net, info->match_set.index);  		return -ERANGE;  	} @@ -106,9 +106,104 @@ set_match_v0_destroy(const struct xt_mtdtor_param *par)  {  	struct xt_set_info_match_v0 *info = par->matchinfo; -	ip_set_nfnl_put(info->match_set.index); +	ip_set_nfnl_put(par->net, info->match_set.index);  } +/* Revision 1 match */ + +static bool +set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_set_info_match_v1 *info = par->matchinfo; +	ADT_OPT(opt, par->family, info->match_set.dim, +		info->match_set.flags, 0, UINT_MAX); + +	if (opt.flags & IPSET_RETURN_NOMATCH) +		opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; + +	return match_set(info->match_set.index, skb, par, &opt, +			 info->match_set.flags & IPSET_INV_MATCH); +} + +static int +set_match_v1_checkentry(const struct xt_mtchk_param *par) +{ +	struct xt_set_info_match_v1 *info = par->matchinfo; +	ip_set_id_t index; + +	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); + +	if (index == IPSET_INVALID_ID) { +		pr_warning("Cannot find set identified by id %u to match\n", +			   info->match_set.index); +		return -ENOENT; +	} +	if (info->match_set.dim > IPSET_DIM_MAX) { +		pr_warning("Protocol error: set match dimension " +			   "is over the 
limit!\n"); +		ip_set_nfnl_put(par->net, info->match_set.index); +		return -ERANGE; +	} + +	return 0; +} + +static void +set_match_v1_destroy(const struct xt_mtdtor_param *par) +{ +	struct xt_set_info_match_v1 *info = par->matchinfo; + +	ip_set_nfnl_put(par->net, info->match_set.index); +} + +/* Revision 3 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ +	switch (info->op) { +	case IPSET_COUNTER_NONE: +		return true; +	case IPSET_COUNTER_EQ: +		return counter == info->value; +	case IPSET_COUNTER_NE: +		return counter != info->value; +	case IPSET_COUNTER_LT: +		return counter < info->value; +	case IPSET_COUNTER_GT: +		return counter > info->value; +	} +	return false; +} + +static bool +set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_set_info_match_v3 *info = par->matchinfo; +	ADT_OPT(opt, par->family, info->match_set.dim, +		info->match_set.flags, info->flags, UINT_MAX); +	int ret; + +	if (info->packets.op != IPSET_COUNTER_NONE || +	    info->bytes.op != IPSET_COUNTER_NONE) +		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + +	ret = match_set(info->match_set.index, skb, par, &opt, +			info->match_set.flags & IPSET_INV_MATCH); + +	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) +		return ret; + +	if (!match_counter(opt.ext.packets, &info->packets)) +		return 0; +	return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v3_checkentry	set_match_v1_checkentry +#define set_match_v3_destroy	set_match_v1_destroy + +/* Revision 0 interface: backward compatible with netfilter/iptables */ +  static unsigned int  set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)  { @@ -133,7 +228,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)  	ip_set_id_t index;  	if (info->add_set.index != IPSET_INVALID_ID) { -		index = ip_set_nfnl_get_byindex(info->add_set.index); +		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);  		if (index == IPSET_INVALID_ID) {  			pr_warning("Cannot find add_set index %u as target\n",  				   info->add_set.index); @@ -142,12 +237,12 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)  	}  	if (info->del_set.index != IPSET_INVALID_ID) { -		index = ip_set_nfnl_get_byindex(info->del_set.index); +		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);  		if (index == IPSET_INVALID_ID) {  			pr_warning("Cannot find del_set index %u as target\n",  				   info->del_set.index);  			if (info->add_set.index != IPSET_INVALID_ID) -				ip_set_nfnl_put(info->add_set.index); +				ip_set_nfnl_put(par->net, info->add_set.index);  			return -ENOENT;  		}  	} @@ -156,9 +251,9 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)  		pr_warning("Protocol error: SET target dimension "  			   "is over the limit!\n");  		if (info->add_set.index != IPSET_INVALID_ID) -			ip_set_nfnl_put(info->add_set.index); +			ip_set_nfnl_put(par->net, info->add_set.index);  		if (info->del_set.index != IPSET_INVALID_ID) -			ip_set_nfnl_put(info->del_set.index); +			ip_set_nfnl_put(par->net, info->del_set.index);  		return -ERANGE;  	} @@ -175,57 +270,12 @@ set_target_v0_destroy(const struct xt_tgdtor_param *par)  	const struct xt_set_info_target_v0 *info = par->targinfo;  	if (info->add_set.index != IPSET_INVALID_ID) -		ip_set_nfnl_put(info->add_set.index); +		ip_set_nfnl_put(par->net, info->add_set.index);  	if (info->del_set.index != IPSET_INVALID_ID) -		ip_set_nfnl_put(info->del_set.index); -} - -/* Revision 1 match and target */ - -static 
bool -set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) -{ -	const struct xt_set_info_match_v1 *info = par->matchinfo; -	ADT_OPT(opt, par->family, info->match_set.dim, -		info->match_set.flags, 0, UINT_MAX); - -	if (opt.flags & IPSET_RETURN_NOMATCH) -		opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; - -	return match_set(info->match_set.index, skb, par, &opt, -			 info->match_set.flags & IPSET_INV_MATCH); -} - -static int -set_match_v1_checkentry(const struct xt_mtchk_param *par) -{ -	struct xt_set_info_match_v1 *info = par->matchinfo; -	ip_set_id_t index; - -	index = ip_set_nfnl_get_byindex(info->match_set.index); - -	if (index == IPSET_INVALID_ID) { -		pr_warning("Cannot find set indentified by id %u to match\n", -			   info->match_set.index); -		return -ENOENT; -	} -	if (info->match_set.dim > IPSET_DIM_MAX) { -		pr_warning("Protocol error: set match dimension " -			   "is over the limit!\n"); -		ip_set_nfnl_put(info->match_set.index); -		return -ERANGE; -	} - -	return 0; +		ip_set_nfnl_put(par->net, info->del_set.index);  } -static void -set_match_v1_destroy(const struct xt_mtdtor_param *par) -{ -	struct xt_set_info_match_v1 *info = par->matchinfo; - -	ip_set_nfnl_put(info->match_set.index); -} +/* Revision 1 target */  static unsigned int  set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) @@ -251,7 +301,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)  	ip_set_id_t index;  	if (info->add_set.index != IPSET_INVALID_ID) { -		index = ip_set_nfnl_get_byindex(info->add_set.index); +		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);  		if (index == IPSET_INVALID_ID) {  			pr_warning("Cannot find add_set index %u as target\n",  				   info->add_set.index); @@ -260,12 +310,12 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)  	}  	if (info->del_set.index != IPSET_INVALID_ID) { -		index = ip_set_nfnl_get_byindex(info->del_set.index); +		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);  		if (index == IPSET_INVALID_ID) {  			pr_warning("Cannot find del_set index %u as target\n",  				   info->del_set.index);  			if (info->add_set.index != IPSET_INVALID_ID) -				ip_set_nfnl_put(info->add_set.index); +				ip_set_nfnl_put(par->net, info->add_set.index);  			return -ENOENT;  		}  	} @@ -274,9 +324,9 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)  		pr_warning("Protocol error: SET target dimension "  			   "is over the limit!\n");  		if (info->add_set.index != IPSET_INVALID_ID) -			ip_set_nfnl_put(info->add_set.index); +			ip_set_nfnl_put(par->net, info->add_set.index);  		if (info->del_set.index != IPSET_INVALID_ID) -			ip_set_nfnl_put(info->del_set.index); +			ip_set_nfnl_put(par->net, info->del_set.index);  		return -ERANGE;  	} @@ -289,9 +339,9 @@ set_target_v1_destroy(const struct xt_tgdtor_param *par)  	const struct xt_set_info_target_v1 *info = par->targinfo;  	if (info->add_set.index != IPSET_INVALID_ID) -		ip_set_nfnl_put(info->add_set.index); +		ip_set_nfnl_put(par->net, info->add_set.index);  	if (info->del_set.index != IPSET_INVALID_ID) -		ip_set_nfnl_put(info->del_set.index); +		ip_set_nfnl_put(par->net, info->del_set.index);  }  /* Revision 2 target */ @@ -320,52 +370,6 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)  #define set_target_v2_checkentry	set_target_v1_checkentry  #define set_target_v2_destroy		set_target_v1_destroy -/* Revision 3 match */ - -static bool -match_counter(u64 counter, const struct ip_set_counter_match *info) -{ -	switch (info->op) { 
-	case IPSET_COUNTER_NONE: -		return true; -	case IPSET_COUNTER_EQ: -		return counter == info->value; -	case IPSET_COUNTER_NE: -		return counter != info->value; -	case IPSET_COUNTER_LT: -		return counter < info->value; -	case IPSET_COUNTER_GT: -		return counter > info->value; -	} -	return false; -} - -static bool -set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) -{ -	const struct xt_set_info_match_v3 *info = par->matchinfo; -	ADT_OPT(opt, par->family, info->match_set.dim, -		info->match_set.flags, info->flags, UINT_MAX); -	int ret; - -	if (info->packets.op != IPSET_COUNTER_NONE || -	    info->bytes.op != IPSET_COUNTER_NONE) -		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; - -	ret = match_set(info->match_set.index, skb, par, &opt, -			info->match_set.flags & IPSET_INV_MATCH); - -	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) -		return ret; - -	if (!match_counter(opt.ext.packets, &info->packets)) -		return 0; -	return match_counter(opt.ext.bytes, &info->bytes); -} - -#define set_match_v3_checkentry	set_match_v1_checkentry -#define set_match_v3_destroy	set_match_v1_destroy -  static struct xt_match set_matches[] __read_mostly = {  	{  		.name		= "set", diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 06df2b9110f..1ba67931eb1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -35,15 +35,6 @@  #include <net/netfilter/nf_conntrack.h>  #endif -static void -xt_socket_put_sk(struct sock *sk) -{ -	if (sk->sk_state == TCP_TIME_WAIT) -		inet_twsk_put(inet_twsk(sk)); -	else -		sock_put(sk); -} -  static int  extract_icmp4_fields(const struct sk_buff *skb,  		    u8 *protocol, @@ -216,7 +207,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,  					inet_twsk(sk)->tw_transparent));  		if (sk != skb->sk) -			xt_socket_put_sk(sk); +			sock_gen_put(sk);  		if (wildcard || !transparent)  			sk = NULL; @@ -370,7 +361,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)  		 */  		wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&  			    sk->sk_state != TCP_TIME_WAIT && -			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); +			    ipv6_addr_any(&sk->sk_v6_rcv_saddr));  		/* Ignore non-transparent sockets,  		   if XT_SOCKET_TRANSPARENT is used */ @@ -381,7 +372,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)  					inet_twsk(sk)->tw_transparent));  		if (sk != skb->sk) -			xt_socket_put_sk(sk); +			sock_gen_put(sk);  		if (wildcard || !transparent)  			sk = NULL; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 4fe4fb4276d..11de55e7a86 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -37,7 +37,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)  	switch (info->mode) {  	case XT_STATISTIC_MODE_RANDOM: -		if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) +		if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)  			ret = !ret;  		break;  	case XT_STATISTIC_MODE_NTH:  | 
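xt_statistic's random mode, finally, keeps only the low 31 bits of the PRNG output and matches when they fall below a probability pre-scaled to 2^31, so swapping net_random() for prandom_u32() changes the generator but not the distribution. A userspace estimate of the resulting match rate (rand() is only a stand-in for prandom_u32()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	uint32_t probability = 0x40000000;	/* 0.5 scaled to 2^31 */
	unsigned int i, hits = 0, trials = 1000000;

	srand(1);
	for (i = 0; i < trials; i++) {
		uint32_t r = ((uint32_t)rand() << 16) ^ (uint32_t)rand();

		if ((r & 0x7FFFFFFF) < probability)
			hits++;
	}
	printf("matched %.3f of %u packets\n", (double)hits / trials, trials);
	return 0;
}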
