Diffstat (limited to 'net/netfilter')
185 files changed, 44616 insertions, 8661 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 1534f2b44ca..e9410d17619 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -4,6 +4,14 @@ menu "Core Netfilter Configuration"  config NETFILTER_NETLINK  	tristate +config NETFILTER_NETLINK_ACCT +	tristate "Netfilter NFACCT over NFNETLINK interface" +	depends on NETFILTER_ADVANCED +	select NETFILTER_NETLINK +	help +	  If this option is enabled, the kernel will include support +	  for extended accounting via NFNETLINK. +  config NETFILTER_NETLINK_QUEUE  	tristate "Netfilter NFQUEUE over NFNETLINK interface"  	depends on NETFILTER_ADVANCED @@ -75,6 +83,16 @@ config NF_CONNTRACK_ZONES  	  If unsure, say `N'. +config NF_CONNTRACK_PROCFS +	bool "Supply CT list in procfs (OBSOLETE)" +	default y +	depends on PROC_FS +	---help--- +	This option enables the list of known conntrack entries +	to be shown in procfs under net/netfilter/nf_conntrack. This +	is considered obsolete in favor of using the conntrack(8) +	tool which uses Netlink. +  config NF_CONNTRACK_EVENTS  	bool "Connection tracking events"  	depends on NETFILTER_ADVANCED @@ -85,9 +103,35 @@ config NF_CONNTRACK_EVENTS  	  If unsure, say `N'. +config NF_CONNTRACK_TIMEOUT +	bool 'Connection tracking timeout' +	depends on NETFILTER_ADVANCED +	help +	  This option enables support for the connection tracking timeout +	  extension. This allows you to attach timeout policies to flows +	  via the CT target. + +	  If unsure, say `N'. + +config NF_CONNTRACK_TIMESTAMP +	bool 'Connection tracking timestamping' +	depends on NETFILTER_ADVANCED +	help +	  This option enables support for connection tracking timestamping. +	  This allows you to store the flow start-time and to obtain +	  the flow-stop time (once it has been destroyed) via Connection +	  tracking events. + +	  If unsure, say `N'. + +config NF_CONNTRACK_LABELS +	bool +	help +	  This option enables support for assigning user-defined flag bits +	  to connection tracking entries.  It is selected by the connlabel match. +  config NF_CT_PROTO_DCCP -	tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' -	depends on EXPERIMENTAL +	tristate 'DCCP protocol connection tracking support'  	depends on NETFILTER_ADVANCED  	default IP_DCCP  	help @@ -100,8 +144,7 @@ config NF_CT_PROTO_GRE  	tristate  config NF_CT_PROTO_SCTP -	tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' -	depends on EXPERIMENTAL +	tristate 'SCTP protocol connection tracking support'  	depends on NETFILTER_ADVANCED  	default IP_SCTP  	help @@ -185,9 +228,12 @@ config NF_CONNTRACK_IRC  	  To compile it as a module, choose M here.  If unsure, say N. +config NF_CONNTRACK_BROADCAST +	tristate +  config NF_CONNTRACK_NETBIOS_NS  	tristate "NetBIOS name service protocol support" -	depends on NETFILTER_ADVANCED +	select NF_CONNTRACK_BROADCAST  	help  	  NetBIOS name service requests are sent as broadcast messages from an  	  unprivileged port and responded to with unicast messages to the @@ -204,6 +250,21 @@ config NF_CONNTRACK_NETBIOS_NS  	  To compile it as a module, choose M here.  If unsure, say N. +config NF_CONNTRACK_SNMP +	tristate "SNMP service protocol support" +	depends on NETFILTER_ADVANCED +	select NF_CONNTRACK_BROADCAST +	help +	  SNMP service requests are sent as broadcast messages from an +	  unprivileged port and responded to with unicast messages to the +	  same port. This makes them hard to firewall properly because connection +	  tracking doesn't deal with broadcasts.
This helper tracks locally +	  originating SNMP service requests and the corresponding +	  responses. It relies on correct IP address configuration, specifically +	  netmask and broadcast address. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config NF_CONNTRACK_PPTP  	tristate "PPtP protocol support"  	depends on NETFILTER_ADVANCED @@ -224,8 +285,7 @@ config NF_CONNTRACK_PPTP  	  To compile it as a module, choose M here.  If unsure, say N.  config NF_CONNTRACK_SANE -	tristate "SANE protocol support (EXPERIMENTAL)" -	depends on EXPERIMENTAL +	tristate "SANE protocol support"  	depends on NETFILTER_ADVANCED  	help  	  SANE is a protocol for remote access to scanners as implemented @@ -267,22 +327,212 @@ config NF_CT_NETLINK  	help  	  This option enables support for a netlink-based userspace interface -endif # NF_CONNTRACK +config NF_CT_NETLINK_TIMEOUT +	tristate 'Connection tracking timeout tuning via Netlink' +	select NETFILTER_NETLINK +	depends on NETFILTER_ADVANCED +	help +	  This option enables support for fine-grained connection tracking +	  timeout tuning. This allows you to attach specific timeout +	  policies to flows, instead of using the global timeout policy. -# transparent proxy support -config NETFILTER_TPROXY -	tristate "Transparent proxying support (EXPERIMENTAL)" -	depends on EXPERIMENTAL -	depends on IP_NF_MANGLE +	  If unsure, say `N'. + +config NF_CT_NETLINK_HELPER +	tristate 'Connection tracking helpers in user-space via Netlink' +	select NETFILTER_NETLINK +	depends on NF_CT_NETLINK +	depends on NETFILTER_NETLINK_QUEUE +	depends on NETFILTER_NETLINK_QUEUE_CT  	depends on NETFILTER_ADVANCED  	help -	  This option enables transparent proxying support, that is, -	  support for handling non-locally bound IPv4 TCP and UDP sockets. -	  For it to work you will have to configure certain iptables rules -	  and use policy routing. For more information on how to set it up -	  see Documentation/networking/tproxy.txt. +	  This option enables the user-space connection tracking helpers +	  infrastructure. -	  To compile it as a module, choose M here.  If unsure, say N. +	  If unsure, say `N'. + +config NETFILTER_NETLINK_QUEUE_CT +	bool "NFQUEUE integration with Connection Tracking" +	default n +	depends on NETFILTER_NETLINK_QUEUE +	help +	  If this option is enabled, NFQUEUE can include Connection Tracking +	  information together with the packet that is enqueued via NFNETLINK.
+ +config NF_NAT +	tristate + +config NF_NAT_NEEDED +	bool +	depends on NF_NAT +	default y + +config NF_NAT_PROTO_DCCP +	tristate +	depends on NF_NAT && NF_CT_PROTO_DCCP +	default NF_NAT && NF_CT_PROTO_DCCP + +config NF_NAT_PROTO_UDPLITE +	tristate +	depends on NF_NAT && NF_CT_PROTO_UDPLITE +	default NF_NAT && NF_CT_PROTO_UDPLITE + +config NF_NAT_PROTO_SCTP +	tristate +	default NF_NAT && NF_CT_PROTO_SCTP +	depends on NF_NAT && NF_CT_PROTO_SCTP +	select LIBCRC32C + +config NF_NAT_AMANDA +	tristate +	depends on NF_CONNTRACK && NF_NAT +	default NF_NAT && NF_CONNTRACK_AMANDA + +config NF_NAT_FTP +	tristate +	depends on NF_CONNTRACK && NF_NAT +	default NF_NAT && NF_CONNTRACK_FTP + +config NF_NAT_IRC +	tristate +	depends on NF_CONNTRACK && NF_NAT +	default NF_NAT && NF_CONNTRACK_IRC + +config NF_NAT_SIP +	tristate +	depends on NF_CONNTRACK && NF_NAT +	default NF_NAT && NF_CONNTRACK_SIP + +config NF_NAT_TFTP +	tristate +	depends on NF_CONNTRACK && NF_NAT +	default NF_NAT && NF_CONNTRACK_TFTP + +config NETFILTER_SYNPROXY +	tristate + +endif # NF_CONNTRACK + +config NF_TABLES +	select NETFILTER_NETLINK +	tristate "Netfilter nf_tables support" +	help +	  nftables is the new packet classification framework that intends to +	  replace the existing {ip,ip6,arp,eb}_tables infrastructure. It +	  provides a pseudo-state machine with an extensible instruction-set +	  (also known as expressions) that the userspace 'nft' utility +	  (http://www.netfilter.org/projects/nftables) uses to build the +	  rule-set. It also comes with the generic set infrastructure that +	  allows you to construct mappings between matchings and actions +	  for performance lookups. + +	  To compile it as a module, choose M here. + +config NF_TABLES_INET +	depends on NF_TABLES && IPV6 +	select NF_TABLES_IPV4 +	select NF_TABLES_IPV6 +	tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" +	help +	  This option enables support for a mixed IPv4/IPv6 "inet" table. + +config NFT_EXTHDR +	depends on NF_TABLES +	tristate "Netfilter nf_tables IPv6 exthdr module" +	help +	  This option adds the "exthdr" expression that you can use to match +	  IPv6 extension headers. + +config NFT_META +	depends on NF_TABLES +	tristate "Netfilter nf_tables meta module" +	help +	  This option adds the "meta" expression that you can use to match and +	  to set packet metainformation such as the packet mark. + +config NFT_CT +	depends on NF_TABLES +	depends on NF_CONNTRACK +	tristate "Netfilter nf_tables conntrack module" +	help +	  This option adds the "ct" expression that you can use to match +	  connection tracking information such as the flow state. + +config NFT_RBTREE +	depends on NF_TABLES +	tristate "Netfilter nf_tables rbtree set module" +	help +	  This option adds the "rbtree" set type (Red Black tree) that is used +	  to build interval-based sets. + +config NFT_HASH +	depends on NF_TABLES +	tristate "Netfilter nf_tables hash set module" +	help +	  This option adds the "hash" set type that is used to build one-way +	  mappings between matchings and actions. + +config NFT_COUNTER +	depends on NF_TABLES +	tristate "Netfilter nf_tables counter module" +	help +	  This option adds the "counter" expression that you can use to +	  include packet and byte counters in a rule. + +config NFT_LOG +	depends on NF_TABLES +	tristate "Netfilter nf_tables log module" +	help +	  This option adds the "log" expression that you can use to log +	  packets matching some criteria.
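All of the nf_tables expression options above (exthdr, meta, ct, rbtree, hash, counter, log) follow one pattern: each is a small module that registers a struct nft_expr_type with the nf_tables core. The sketch below shows the shape of such a module; the "foo" names are hypothetical and the signatures are approximated from the expression modules this series adds, so treat it as an illustration rather than a verbatim copy of any one of them.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>

/* Hypothetical per-expression state, embedded in the rule. */
struct nft_foo {
	u32	val;
};

/* Runtime evaluation: expressions communicate through the data[]
 * registers and can break the rule chain via data[NFT_REG_VERDICT]. */
static void nft_foo_eval(const struct nft_expr *expr,
			 struct nft_data data[NFT_REG_MAX + 1],
			 const struct nft_pktinfo *pkt)
{
}

/* Parse the netlink attributes into nft_expr_priv(expr). */
static int nft_foo_init(const struct nft_ctx *ctx,
			const struct nft_expr *expr,
			const struct nlattr * const tb[])
{
	return 0;
}

static struct nft_expr_type nft_foo_type;
static const struct nft_expr_ops nft_foo_ops = {
	.type	= &nft_foo_type,
	.size	= NFT_EXPR_SIZE(sizeof(struct nft_foo)),
	.eval	= nft_foo_eval,
	.init	= nft_foo_init,
};

static struct nft_expr_type nft_foo_type __read_mostly = {
	.name	= "foo",
	.ops	= &nft_foo_ops,
	.owner	= THIS_MODULE,
};

static int __init nft_foo_module_init(void)
{
	return nft_register_expr(&nft_foo_type);
}

static void __exit nft_foo_module_exit(void)
{
	nft_unregister_expr(&nft_foo_type);
}

module_init(nft_foo_module_init);
module_exit(nft_foo_module_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NFT_EXPR("foo");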
+ +config NFT_LIMIT +	depends on NF_TABLES +	tristate "Netfilter nf_tables limit module" +	help +	  This option adds the "limit" expression that you can use to +	  ratelimit rule matchings. + +config NFT_NAT +	depends on NF_TABLES +	depends on NF_CONNTRACK +	depends on NF_NAT +	tristate "Netfilter nf_tables nat module" +	help +	  This option adds the "nat" expression that you can use to perform +	  typical Network Address Translation (NAT) packet transformations. + +config NFT_QUEUE +	depends on NF_TABLES +	depends on NETFILTER_XTABLES +	depends on NETFILTER_NETLINK_QUEUE +	tristate "Netfilter nf_tables queue module" +	help +	  This is required if you intend to use the userspace queueing +	  infrastructure (also known as NFQUEUE) from nftables. + +config NFT_REJECT +	depends on NF_TABLES +	default m if NETFILTER_ADVANCED=n +	tristate "Netfilter nf_tables reject support" +	help +	  This option adds the "reject" expression that you can use to +	  explicitly deny disallowed traffic and notify the sender via +	  TCP reset/ICMP informational errors. + +config NFT_REJECT_INET +	depends on NF_TABLES_INET +	default NFT_REJECT +	tristate + +config NFT_COMPAT +	depends on NF_TABLES +	depends on NETFILTER_XTABLES +	tristate "Netfilter x_tables over nf_tables module" +	help +	  This is required if you intend to use any of the existing +	  x_tables match/target extensions over the nf_tables +	  framework.  config NETFILTER_XTABLES  	tristate "Netfilter Xtables support (required for ip_tables)" @@ -322,10 +572,32 @@ config NETFILTER_XT_CONNMARK  	ctmark), similarly to the packet mark (nfmark). Using this  	target and match, you can set and match on this mark. +config NETFILTER_XT_SET +	tristate 'set target and match support' +	depends on IP_SET +	depends on NETFILTER_ADVANCED +	help +	  This option adds the "SET" target and "set" match. + +	  Using this target and match, you can add/delete and match +	  elements in the sets created by ipset(8). + +	  To compile it as a module, choose M here.  If unsure, say N. +  # alphabetically ordered list of targets  comment "Xtables targets" +config NETFILTER_XT_TARGET_AUDIT +	tristate "AUDIT target support" +	depends on AUDIT +	depends on NETFILTER_ADVANCED +	---help--- +	  This option adds an 'AUDIT' target, which can be used to create +	  audit records for packets dropped/accepted. + +	  To compile it as a module, choose M here. If unsure, say N. +  config NETFILTER_XT_TARGET_CHECKSUM  	tristate "CHECKSUM target support"  	depends on IP_NF_MANGLE || IP6_NF_MANGLE @@ -419,6 +691,21 @@ config NETFILTER_XT_TARGET_HL  	since you can easily create immortal packets that loop  	forever on the network. +config NETFILTER_XT_TARGET_HMARK +	tristate '"HMARK" target support' +	depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) +	depends on NETFILTER_ADVANCED +	---help--- +	This option adds the "HMARK" target. + +	The target allows you to create rules in the "raw" and "mangle" tables +	which set the skbuff mark by means of hash calculation within a given +	range. The nfmark can influence the routing method (see "Use netfilter +	MARK value as routing key") and can also be used by other subsystems to +	change their behaviour. + +	To compile it as a module, choose M here. If unsure, say N.
+  config NETFILTER_XT_TARGET_IDLETIMER  	tristate  "IDLETIMER target support"  	depends on NETFILTER_ADVANCED @@ -453,7 +740,16 @@ config NETFILTER_XT_TARGET_LED  	    echo netfilter-ssh > /sys/class/leds/<ledname>/trigger  	  For more information on the LEDs available on your system, see -	  Documentation/leds-class.txt +	  Documentation/leds/leds-class.txt + +config NETFILTER_XT_TARGET_LOG +	tristate "LOG target support" +	default m if NETFILTER_ADVANCED=n +	help +	  This option adds a `LOG' target, which allows you to create rules in +	  any iptables table that record the packet header to the syslog. + +	  To compile it as a module, choose M here.  If unsure, say N.  config NETFILTER_XT_TARGET_MARK  	tristate '"MARK" target support' @@ -464,6 +760,16 @@ config NETFILTER_XT_TARGET_MARK  	(e.g. when running oldconfig). It selects  	CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). +config NETFILTER_XT_TARGET_NETMAP +	tristate '"NETMAP" target support' +	depends on NF_NAT +	---help--- +	NETMAP is an implementation of static 1:1 NAT mapping of network +	addresses. It maps the network address part, while keeping the host +	address part intact. + +	To compile it as a module, choose M here. If unsure, say N. +  config NETFILTER_XT_TARGET_NFLOG  	tristate '"NFLOG" target support'  	default m if NETFILTER_ADVANCED=n @@ -477,6 +783,7 @@ config NETFILTER_XT_TARGET_NFLOG  config NETFILTER_XT_TARGET_NFQUEUE  	tristate '"NFQUEUE" target Support'  	depends on NETFILTER_ADVANCED +	select NETFILTER_NETLINK_QUEUE  	help  	  This target replaced the old obsolete QUEUE target. @@ -486,18 +793,11 @@ config NETFILTER_XT_TARGET_NFQUEUE  	  To compile it as a module, choose M here.  If unsure, say N.  config NETFILTER_XT_TARGET_NOTRACK -	tristate  '"NOTRACK" target support' -	depends on IP_NF_RAW || IP6_NF_RAW +	tristate  '"NOTRACK" target support (DEPRECATED)'  	depends on NF_CONNTRACK +	depends on IP_NF_RAW || IP6_NF_RAW  	depends on NETFILTER_ADVANCED -	help -	  The NOTRACK target allows a select rule to specify -	  which packets *not* to enter the conntrack/NAT -	  subsystem with all the consequences (no ICMP error tracking, -	  no protocol helpers for the selected packets). - -	  If you want to compile it as a module, say M here and read -	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'. +	select NETFILTER_XT_TARGET_CT  config NETFILTER_XT_TARGET_RATEEST  	tristate '"RATEEST" target support' @@ -509,6 +809,17 @@ config NETFILTER_XT_TARGET_RATEEST  	  To compile it as a module, choose M here.  If unsure, say N. +config NETFILTER_XT_TARGET_REDIRECT +	tristate "REDIRECT target support" +	depends on NF_NAT +	---help--- +	REDIRECT is a special case of NAT: all incoming connections are +	mapped onto the incoming interface's address, causing the packets to +	come to the local machine instead of passing through. This is +	useful for transparent proxies. + +	To compile it as a module, choose M here. If unsure, say N. +  config NETFILTER_XT_TARGET_TEE  	tristate '"TEE" - packet cloning to alternate destination'  	depends on NETFILTER_ADVANCED @@ -519,11 +830,10 @@ config NETFILTER_XT_TARGET_TEE  	this clone be rerouted to another nexthop.  
config NETFILTER_XT_TARGET_TPROXY -	tristate '"TPROXY" target support (EXPERIMENTAL)' -	depends on EXPERIMENTAL -	depends on NETFILTER_TPROXY +	tristate '"TPROXY" target transparent proxying support'  	depends on NETFILTER_XTABLES  	depends on NETFILTER_ADVANCED +	depends on IP_NF_MANGLE  	select NF_DEFRAG_IPV4  	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES  	help @@ -531,6 +841,9 @@ config NETFILTER_XT_TARGET_TPROXY  	  REDIRECT.  It can only be used in the mangle table and is useful  	  to redirect traffic to a transparent proxy.  It does _not_ depend  	  on Netfilter connection tracking and NAT, unlike REDIRECT. +	  For it to work you will have to configure certain iptables rules +	  and use policy routing. For more information on how to set it up +	  see Documentation/networking/tproxy.txt.  	  To compile it as a module, choose M here.  If unsure, say N. @@ -584,8 +897,7 @@ config NETFILTER_XT_TARGET_TCPMSS  	  To compile it as a module, choose M here.  If unsure, say N.  config NETFILTER_XT_TARGET_TCPOPTSTRIP -	tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' -	depends on EXPERIMENTAL +	tristate '"TCPOPTSTRIP" target support'  	depends on IP_NF_MANGLE || IP6_NF_MANGLE  	depends on NETFILTER_ADVANCED  	help @@ -596,6 +908,35 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP  comment "Xtables matches" +config NETFILTER_XT_MATCH_ADDRTYPE +	tristate '"addrtype" address type match support' +	depends on NETFILTER_ADVANCED +	---help--- +	  This option allows you to match what routing thinks of an address, +	  e.g. UNICAST, LOCAL, BROADCAST, ... + +	  If you want to compile it as a module, say M here and read +	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'. + +config NETFILTER_XT_MATCH_BPF +	tristate '"bpf" match support' +	depends on NETFILTER_ADVANCED +	help +	  BPF matching applies a Linux socket filter to each packet and +	  accepts those for which the filter returns non-zero. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config NETFILTER_XT_MATCH_CGROUP +	tristate '"control group" match support' +	depends on NETFILTER_ADVANCED +	depends on CGROUPS +	select CGROUP_NET_CLASSID +	---help--- +	Socket/process control group matching allows you to match locally +	generated packets based on which net_cls control group the +	processes belong to. +  config NETFILTER_XT_MATCH_CLUSTER  	tristate '"cluster" match support'  	depends on NF_CONNTRACK @@ -633,8 +974,21 @@ config NETFILTER_XT_MATCH_CONNBYTES  	  If you want to compile it as a module, say M here and read  	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'. +config NETFILTER_XT_MATCH_CONNLABEL +	tristate '"connlabel" match support' +	select NF_CONNTRACK_LABELS +	depends on NF_CONNTRACK +	depends on NETFILTER_ADVANCED +	---help--- +	  This match allows you to test and assign userspace-defined label names +	  to a connection.  The kernel only stores bit values - mapping +	  names to bits is done by userspace. + +	  Unlike connmark, more than 32 flag bits may be assigned to a +	  connection simultaneously. +  config NETFILTER_XT_MATCH_CONNLIMIT -	tristate '"connlimit" match support"' +	tristate '"connlimit" match support'  	depends on NF_CONNTRACK  	depends on NETFILTER_ADVANCED  	---help--- @@ -685,6 +1039,15 @@ config NETFILTER_XT_MATCH_DCCP  	  If you want to compile it as a module, say M here and read  	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'. 
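A note on the new "bpf" match above: it takes a classic BPF program and accepts packets for which the program returns non-zero, running it over the packet starting at the network header. A minimal userspace sketch of such a filter using the long-standing linux/filter.h macros (matching UDP is an arbitrary example, not anything the module mandates):

#include <stdint.h>
#include <linux/filter.h>	/* struct sock_filter, BPF_STMT, BPF_JUMP */
#include <netinet/in.h>		/* IPPROTO_UDP */

/* xt_bpf runs the program over the packet starting at the network
 * header, so offset 9 is the IPv4 protocol field.  Returning
 * non-zero means "match", zero means "no match". */
static struct sock_filter match_udp[] = {
	BPF_STMT(BPF_LD  | BPF_B   | BPF_ABS, 9),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IPPROTO_UDP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* match */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* no match */
};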
+config NETFILTER_XT_MATCH_DEVGROUP +	tristate '"devgroup" match support' +	depends on NETFILTER_ADVANCED +	help +	  This option adds a `devgroup' match, which allows you to match on +	  the device group a network device is assigned to. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config NETFILTER_XT_MATCH_DSCP  	tristate '"dscp" and "tos" match support'  	depends on NETFILTER_ADVANCED @@ -700,6 +1063,15 @@ config NETFILTER_XT_MATCH_DSCP  	  To compile it as a module, choose M here.  If unsure, say N. +config NETFILTER_XT_MATCH_ECN +	tristate '"ecn" match support' +	depends on NETFILTER_ADVANCED +	---help--- +	This option adds an "ECN" match, which allows you to match against +	the IPv4 and TCP header ECN fields. + +	To compile it as a module, choose M here. If unsure, say N. +  config NETFILTER_XT_MATCH_ESP  	tristate '"esp" match support'  	depends on NETFILTER_ADVANCED @@ -742,6 +1114,15 @@ config NETFILTER_XT_MATCH_HL  	in the IPv6 header, or the time-to-live field in the IPv4  	header of the packet. +config NETFILTER_XT_MATCH_IPCOMP +	tristate '"ipcomp" match support' +	depends on NETFILTER_ADVANCED +	help +	  This match extension allows you to match a range of CPIs (16 bits) +	  inside the IPComp header of IPsec packets. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config NETFILTER_XT_MATCH_IPRANGE  	tristate '"iprange" address range match support'  	depends on NETFILTER_ADVANCED @@ -762,6 +1143,16 @@ config NETFILTER_XT_MATCH_IPVS  	  If unsure, say N. +config NETFILTER_XT_MATCH_L2TP +	tristate '"l2tp" match support' +	depends on NETFILTER_ADVANCED +	default L2TP +	---help--- +	This option adds an "L2TP" match, which allows you to match against +	L2TP protocol header fields. + +	To compile it as a module, choose M here. If unsure, say N. +  config NETFILTER_XT_MATCH_LENGTH  	tristate '"length" match support'  	depends on NETFILTER_ADVANCED @@ -809,6 +1200,16 @@ config NETFILTER_XT_MATCH_MULTIPORT  	  To compile it as a module, choose M here.  If unsure, say N. +config NETFILTER_XT_MATCH_NFACCT +	tristate '"nfacct" match support' +	depends on NETFILTER_ADVANCED +	select NETFILTER_NETLINK_ACCT +	help +	  This option allows you to use the extended accounting through +	  nfnetlink_acct. + +	  To compile it as a module, choose M here.  If unsure, say N. +  config NETFILTER_XT_MATCH_OSF  	tristate '"osf" Passive OS fingerprint match'  	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK @@ -886,7 +1287,7 @@ config NETFILTER_XT_MATCH_RATEEST  config NETFILTER_XT_MATCH_REALM  	tristate  '"realm" match support'  	depends on NETFILTER_ADVANCED -	select NET_CLS_ROUTE +	select IP_ROUTE_CLASSID  	help  	  This option adds a `realm' match, which allows you to use the realm  	  key from the routing subsystem inside iptables. @@ -908,8 +1309,7 @@ config NETFILTER_XT_MATCH_RECENT  	Official Website: <http://snowman.net/projects/ipt_recent/>  config NETFILTER_XT_MATCH_SCTP -	tristate  '"sctp" protocol match support (EXPERIMENTAL)' -	depends on EXPERIMENTAL +	tristate  '"sctp" protocol match support'  	depends on NETFILTER_ADVANCED  	default IP_SCTP  	help @@ -921,12 +1321,11 @@ config NETFILTER_XT_MATCH_SCTP  	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.  
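For the "ecn" match introduced above, the fields being tested are fixed by RFC 3168: the ECN codepoint is the two low-order bits of the IPv4 TOS/DSCP byte, and TCP signals ECN through its ECE and CWR flags. A minimal sketch of the bit extraction (the constant names here are illustrative, not the xt_ecn ones):

#include <stdint.h>

#define IP_ECN_MASK	0x03	/* low two bits of the TOS/DSCP byte */
#define TCP_FLAG_ECE	0x40	/* ECN-Echo, in the TCP flags byte */
#define TCP_FLAG_CWR	0x80	/* Congestion Window Reduced */

/* 0 = Not-ECT, 1 = ECT(1), 2 = ECT(0), 3 = CE */
static inline uint8_t ipv4_ecn_codepoint(uint8_t tos)
{
	return tos & IP_ECN_MASK;
}

static inline int tcp_ecn_flags_set(uint8_t tcp_flags)
{
	return tcp_flags & (TCP_FLAG_ECE | TCP_FLAG_CWR);
}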
config NETFILTER_XT_MATCH_SOCKET -	tristate '"socket" match support (EXPERIMENTAL)' -	depends on EXPERIMENTAL -	depends on NETFILTER_TPROXY +	tristate '"socket" match support'  	depends on NETFILTER_XTABLES  	depends on NETFILTER_ADVANCED  	depends on !NF_CONNTRACK || NF_CONNTRACK +	depends on (IPV6 || IPV6=n)  	select NF_DEFRAG_IPV4  	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES  	help @@ -1011,4 +1410,6 @@ endif # NETFILTER_XTABLES  endmenu +source "net/netfilter/ipset/Kconfig" +  source "net/netfilter/ipvs/Kconfig" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 441050f3111..bffdad774da 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,11 +1,17 @@  netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o  nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o  obj-$(CONFIG_NETFILTER) = netfilter.o  obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o +nfnetlink_queue-y := nfnetlink_queue_core.o +nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o  obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o  obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o @@ -20,6 +26,8 @@ obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o  # netlink interface for nf_conntrack  obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o +obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o +obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o  # connection tracking helpers  nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o @@ -28,14 +36,54 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o  obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o  obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o  obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o +obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o  obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o +obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o  obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o  obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o  obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o  obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o -# transparent proxy support -obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o +nf_nat-y	:= nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ +		   nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o + +obj-$(CONFIG_NF_NAT) += nf_nat.o + +# NAT protocols (nf_nat) +obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o +obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o +obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o + +# NAT helpers 
+obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o +obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o +obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o +obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o +obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o + +# SYNPROXY +obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o + +# nf_tables +nf_tables-objs += nf_tables_core.o nf_tables_api.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o + +obj-$(CONFIG_NF_TABLES)		+= nf_tables.o +obj-$(CONFIG_NF_TABLES_INET)	+= nf_tables_inet.o +obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o +obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o +obj-$(CONFIG_NFT_META)		+= nft_meta.o +obj-$(CONFIG_NFT_CT)		+= nft_ct.o +obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o +obj-$(CONFIG_NFT_NAT)		+= nft_nat.o +obj-$(CONFIG_NFT_QUEUE)		+= nft_queue.o +obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o +obj-$(CONFIG_NFT_RBTREE)	+= nft_rbtree.o +obj-$(CONFIG_NFT_HASH)		+= nft_hash.o +obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o +obj-$(CONFIG_NFT_LOG)		+= nft_log.o  # generic X tables   obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o @@ -43,19 +91,25 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o  # combos  obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o  obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o +obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o +obj-$(CONFIG_NF_NAT) += xt_nat.o  # targets +obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o  obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o  obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o  obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o  obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o  obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o  obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o  obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o  obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o  obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o -obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o  obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o +obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o  obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o  obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o  obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o @@ -65,26 +119,35 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o  obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o  # matches +obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o  obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o  obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o  obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o  obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o  obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o  obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o  obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o  obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_ECN) += xt_ecn.o  obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o  obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o  
obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o  obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o  obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o  obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o  obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o  obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o  obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o  obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o +obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o  obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o  obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o  obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o  obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o  obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o @@ -101,5 +164,8 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o  obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o  obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o +# ipset +obj-$(CONFIG_IP_SET) += ipset/ +  # IPVS  obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 32fcbe290c0..1fbab0cdd30 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -5,6 +5,7 @@   * way.   *   * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012   */  #include <linux/kernel.h>  #include <linux/netfilter.h> @@ -29,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex);  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;  EXPORT_SYMBOL(nf_afinfo); +const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_ipv6_ops);  int nf_register_afinfo(const struct nf_afinfo *afinfo)  { @@ -37,7 +40,7 @@ int nf_register_afinfo(const struct nf_afinfo *afinfo)  	err = mutex_lock_interruptible(&afinfo_mutex);  	if (err < 0)  		return err; -	rcu_assign_pointer(nf_afinfo[afinfo->family], afinfo); +	RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);  	mutex_unlock(&afinfo_mutex);  	return 0;  } @@ -46,7 +49,7 @@ EXPORT_SYMBOL_GPL(nf_register_afinfo);  void nf_unregister_afinfo(const struct nf_afinfo *afinfo)  {  	mutex_lock(&afinfo_mutex); -	rcu_assign_pointer(nf_afinfo[afinfo->family], NULL); +	RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL);  	mutex_unlock(&afinfo_mutex);  	synchronize_rcu();  } @@ -54,6 +57,12 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo);  struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;  EXPORT_SYMBOL(nf_hooks); + +#if defined(CONFIG_JUMP_LABEL) +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +EXPORT_SYMBOL(nf_hooks_needed); +#endif +  static DEFINE_MUTEX(nf_hook_mutex);  int nf_register_hook(struct nf_hook_ops *reg) @@ -70,6 +79,9 @@ int nf_register_hook(struct nf_hook_ops *reg)  	}  	list_add_rcu(®->list, elem->list.prev);  	mutex_unlock(&nf_hook_mutex); +#if defined(CONFIG_JUMP_LABEL) +	static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif  	return 0;  }  EXPORT_SYMBOL(nf_register_hook); @@ -79,7 +91,9 @@ void nf_unregister_hook(struct nf_hook_ops *reg)  	mutex_lock(&nf_hook_mutex);  	list_del_rcu(®->list);  	mutex_unlock(&nf_hook_mutex); - +#if defined(CONFIG_JUMP_LABEL) +	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif  	synchronize_net();  }  EXPORT_SYMBOL(nf_unregister_hook); @@ -115,7 +129,7 @@ unsigned int nf_iterate(struct list_head *head,  			unsigned int hook,  			const struct 
net_device *indev,  			const struct net_device *outdev, -			struct list_head **i, +			struct nf_hook_ops **elemp,  			int (*okfn)(struct sk_buff *),  			int hook_thresh)  { @@ -125,27 +139,26 @@ unsigned int nf_iterate(struct list_head *head,  	 * The caller must not block between calls to this  	 * function because of risk of continuing from deleted element.  	 */ -	list_for_each_continue_rcu(*i, head) { -		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; - -		if (hook_thresh > elem->priority) +	list_for_each_entry_continue_rcu((*elemp), head, list) { +		if (hook_thresh > (*elemp)->priority)  			continue;  		/* Optimization: we don't need to hold module  		   reference here, since function can't sleep. --RR */ -		verdict = elem->hook(hook, skb, indev, outdev, okfn); +repeat: +		verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn);  		if (verdict != NF_ACCEPT) {  #ifdef CONFIG_NETFILTER_DEBUG  			if (unlikely((verdict & NF_VERDICT_MASK)  							> NF_MAX_VERDICT)) {  				NFDEBUG("Evil return from %p(%u).\n", -					elem->hook, hook); +					(*elemp)->hook, hook);  				continue;  			}  #endif  			if (verdict != NF_REPEAT)  				return verdict; -			*i = (*i)->prev; +			goto repeat;  		}  	}  	return NF_ACCEPT; @@ -160,14 +173,14 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,  		 int (*okfn)(struct sk_buff *),  		 int hook_thresh)  { -	struct list_head *elem; +	struct nf_hook_ops *elem;  	unsigned int verdict;  	int ret = 0;  	/* We may already have this, but read-locks nest anyway */  	rcu_read_lock(); -	elem = &nf_hooks[pf][hook]; +	elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list);  next_hook:  	verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,  			     outdev, &elem, okfn, hook_thresh); @@ -175,13 +188,20 @@ next_hook:  		ret = 1;  	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {  		kfree_skb(skb); -		ret = -(verdict >> NF_VERDICT_BITS); +		ret = NF_DROP_GETERR(verdict);  		if (ret == 0)  			ret = -EPERM;  	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { -		if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, -			      verdict >> NF_VERDICT_BITS)) -			goto next_hook; +		int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, +						verdict >> NF_VERDICT_QBITS); +		if (err < 0) { +			if (err == -ECANCELED) +				goto next_hook; +			if (err == -ESRCH && +			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) +				goto next_hook; +			kfree_skb(skb); +		}  	}  	rcu_read_unlock();  	return ret; @@ -210,16 +230,17 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)  }  EXPORT_SYMBOL(skb_make_writable); -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  /* This does not belong here, but locally generated errors need it if connection     tracking in use: without this, connection may not be in hash table, and hence     manufactured ICMP or RST packets will not be associated with it. 
*/ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) +		__rcu __read_mostly;  EXPORT_SYMBOL(ip_ct_attach); -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)  { -	void (*attach)(struct sk_buff *, struct sk_buff *); +	void (*attach)(struct sk_buff *, const struct sk_buff *);  	if (skb->nfct) {  		rcu_read_lock(); @@ -231,7 +252,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)  }  EXPORT_SYMBOL(nf_ct_attach); -void (*nf_ct_destroy)(struct nf_conntrack *); +void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;  EXPORT_SYMBOL(nf_ct_destroy);  void nf_conntrack_destroy(struct nf_conntrack *nfct) @@ -245,38 +266,65 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)  	rcu_read_unlock();  }  EXPORT_SYMBOL(nf_conntrack_destroy); + +struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfq_ct_hook); + +struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); +  #endif /* CONFIG_NF_CONNTRACK */ +#ifdef CONFIG_NF_NAT_NEEDED +void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); +EXPORT_SYMBOL(nf_nat_decode_session_hook); +#endif + +static int __net_init netfilter_net_init(struct net *net) +{  #ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_net_netfilter; -EXPORT_SYMBOL(proc_net_netfilter); +	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", +						net->proc_net); +	if (!net->nf.proc_netfilter) { +		if (!net_eq(net, &init_net)) +			pr_err("cannot create netfilter proc entry"); + +		return -ENOMEM; +	}  #endif +	return 0; +} + +static void __net_exit netfilter_net_exit(struct net *net) +{ +	remove_proc_entry("netfilter", net->proc_net); +} + +static struct pernet_operations netfilter_net_ops = { +	.init = netfilter_net_init, +	.exit = netfilter_net_exit, +}; -void __init netfilter_init(void) +int __init netfilter_init(void)  { -	int i, h; +	int i, h, ret; +  	for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {  		for (h = 0; h < NF_MAX_HOOKS; h++)  			INIT_LIST_HEAD(&nf_hooks[i][h]);  	} -#ifdef CONFIG_PROC_FS -	proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); -	if (!proc_net_netfilter) -		panic("cannot create netfilter proc entry"); -#endif +	ret = register_pernet_subsys(&netfilter_net_ops); +	if (ret < 0) +		goto err; -	if (netfilter_queue_init() < 0) -		panic("cannot initialize nf_queue"); -	if (netfilter_log_init() < 0) -		panic("cannot initialize nf_log"); -} +	ret = netfilter_log_init(); +	if (ret < 0) +		goto err_pernet; -#ifdef CONFIG_SYSCTL -struct ctl_path nf_net_netfilter_sysctl_path[] = { -	{ .procname = "net", }, -	{ .procname = "netfilter", }, -	{ } -}; -EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); -#endif /* CONFIG_SYSCTL */ +	return 0; +err_pernet: +	unregister_pernet_subsys(&netfilter_net_ops); +err: +	return ret; +} diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig new file mode 100644 index 00000000000..2f7f5c32c6f --- /dev/null +++ b/net/netfilter/ipset/Kconfig @@ -0,0 +1,159 @@ +menuconfig IP_SET +	tristate "IP set support" +	depends on INET && NETFILTER +	select NETFILTER_NETLINK +	help +	  This option adds IP set support to the kernel. +	  In order to define and use the sets, you need the userspace utility +	  ipset(8). You can use the sets in netfilter via the "set" match +	  and "SET" target. + +	  To compile it as a module, choose M here.  If unsure, say N. 
+ +if IP_SET + +config IP_SET_MAX +	int "Maximum number of IP sets" +	default 256 +	range 2 65534 +	depends on IP_SET +	help +	  You can define here the default value of the maximum number +	  of IP sets for the kernel. + +	  The value can be overridden by the 'max_sets' module +	  parameter of the 'ip_set' module. + +config IP_SET_BITMAP_IP +	tristate "bitmap:ip set support" +	depends on IP_SET +	help +	  This option adds the bitmap:ip set type support, by which one +	  can store IPv4 addresses (or network addresses) from a range. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_BITMAP_IPMAC +	tristate "bitmap:ip,mac set support" +	depends on IP_SET +	help +	  This option adds the bitmap:ip,mac set type support, by which one +	  can store IPv4 address and (source) MAC address pairs from a range. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_BITMAP_PORT +	tristate "bitmap:port set support" +	depends on IP_SET +	help +	  This option adds the bitmap:port set type support, by which one +	  can store TCP/UDP port numbers from a range. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_IP +	tristate "hash:ip set support" +	depends on IP_SET +	help +	  This option adds the hash:ip set type support, by which one +	  can store arbitrary IPv4 or IPv6 addresses (or network addresses) +	  in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_IPMARK +	tristate "hash:ip,mark set support" +	depends on IP_SET +	help +	  This option adds the hash:ip,mark set type support, by which one +	  can store IPv4/IPv6 address and mark pairs. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_IPPORT +	tristate "hash:ip,port set support" +	depends on IP_SET +	help +	  This option adds the hash:ip,port set type support, by which one +	  can store IPv4/IPv6 address and protocol/port pairs. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_IPPORTIP +	tristate "hash:ip,port,ip set support" +	depends on IP_SET +	help +	  This option adds the hash:ip,port,ip set type support, by which +	  one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 +	  address triples in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_IPPORTNET +	tristate "hash:ip,port,net set support" +	depends on IP_SET +	help +	  This option adds the hash:ip,port,net set type support, by which +	  one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 +	  network address/prefix triples in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_NETPORTNET +	tristate "hash:net,port,net set support" +	depends on IP_SET +	help +	  This option adds the hash:net,port,net set type support, by which +	  one can store two IPv4/IPv6 subnets, and a protocol/port in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_NET +	tristate "hash:net set support" +	depends on IP_SET +	help +	  This option adds the hash:net set type support, by which +	  one can store IPv4/IPv6 network address/prefix elements in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_NETNET +	tristate "hash:net,net set support" +	depends on IP_SET +	help +	  This option adds the hash:net,net set type support, by which +	  one can store IPv4/IPv6 network address/prefix pairs in a set.
+ +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_NETPORT +	tristate "hash:net,port set support" +	depends on IP_SET +	help +	  This option adds the hash:net,port set type support, by which +	  one can store IPv4/IPv6 network address/prefix and +	  protocol/port pairs as elements in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_HASH_NETIFACE +	tristate "hash:net,iface set support" +	depends on IP_SET +	help +	  This option adds the hash:net,iface set type support, by which +	  one can store IPv4/IPv6 network address/prefix and +	  interface name pairs as elements in a set. + +	  To compile it as a module, choose M here.  If unsure, say N. + +config IP_SET_LIST_SET +	tristate "list:set set support" +	depends on IP_SET +	help +	  This option adds the list:set set type support. In this +	  kind of set one can store the names of other sets, and it forms +	  an ordered union of the member sets. + +	  To compile it as a module, choose M here.  If unsure, say N. + +endif # IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile new file mode 100644 index 00000000000..231f10196cb --- /dev/null +++ b/net/netfilter/ipset/Makefile @@ -0,0 +1,28 @@ +# +# Makefile for the ipset modules +# + +ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o + +# ipset core +obj-$(CONFIG_IP_SET) += ip_set.o + +# bitmap types +obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o +obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o +obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o + +# hash types +obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o +obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o +obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o +obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o +obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o +obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o +obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o +obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o + +# list types +obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h new file mode 100644 index 00000000000..f2c7d83dc23 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -0,0 +1,289 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __IP_SET_BITMAP_IP_GEN_H +#define __IP_SET_BITMAP_IP_GEN_H + +#define mtype_do_test		IPSET_TOKEN(MTYPE, _do_test) +#define mtype_gc_test		IPSET_TOKEN(MTYPE, _gc_test) +#define mtype_is_filled		IPSET_TOKEN(MTYPE, _is_filled) +#define mtype_do_add		IPSET_TOKEN(MTYPE, _do_add) +#define mtype_ext_cleanup	IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_do_del		IPSET_TOKEN(MTYPE, _do_del) +#define mtype_do_list		IPSET_TOKEN(MTYPE, _do_list) +#define mtype_do_head		IPSET_TOKEN(MTYPE, _do_head) +#define mtype_adt_elem		IPSET_TOKEN(MTYPE, _adt_elem) +#define mtype_add_timeout	IPSET_TOKEN(MTYPE, _add_timeout) +#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_kadt		IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt		IPSET_TOKEN(MTYPE, _uadt) +#define mtype_destroy		IPSET_TOKEN(MTYPE, _destroy) +#define mtype_flush		IPSET_TOKEN(MTYPE, _flush) +#define mtype_head		IPSET_TOKEN(MTYPE, _head) +#define mtype_same_set		IPSET_TOKEN(MTYPE, _same_set) +#define mtype_elem		IPSET_TOKEN(MTYPE, _elem) +#define mtype_test		IPSET_TOKEN(MTYPE, _test) +#define mtype_add		IPSET_TOKEN(MTYPE, _add) +#define mtype_del		IPSET_TOKEN(MTYPE, _del) +#define mtype_list		IPSET_TOKEN(MTYPE, _list) +#define mtype_gc		IPSET_TOKEN(MTYPE, _gc) +#define mtype			MTYPE + +#define get_ext(set, map, id)	((map)->extensions + (set)->dsize * (id)) + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ +	struct mtype *map = set->data; + +	init_timer(&map->gc); +	map->gc.data = (unsigned long) set; +	map->gc.function = gc; +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +	add_timer(&map->gc); +} + +static void +mtype_ext_cleanup(struct ip_set *set) +{ +	struct mtype *map = set->data; +	u32 id; + +	for (id = 0; id < map->elements; id++) +		if (test_bit(id, map->members)) +			ip_set_ext_destroy(set, get_ext(set, map, id)); +} + +static void +mtype_destroy(struct ip_set *set) +{ +	struct mtype *map = set->data; + +	if (SET_WITH_TIMEOUT(set)) +		del_timer_sync(&map->gc); + +	ip_set_free(map->members); +	if (set->dsize) { +		if (set->extensions & IPSET_EXT_DESTROY) +			mtype_ext_cleanup(set); +		ip_set_free(map->extensions); +	} +	kfree(map); + +	set->data = NULL; +} + +static void +mtype_flush(struct ip_set *set) +{ +	struct mtype *map = set->data; + +	if (set->extensions & IPSET_EXT_DESTROY) +		mtype_ext_cleanup(set); +	memset(map->members, 0, map->memsize); +} + +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ +	const struct mtype *map = set->data; +	struct nlattr *nested; + +	nested = ipset_nest_start(skb, IPSET_ATTR_DATA); +	if (!nested) +		goto nla_put_failure; +	if (mtype_do_head(skb, map) || +	    nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || +	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, +			  htonl(sizeof(*map) + +				map->memsize + +				set->dsize * map->elements))) +		goto nla_put_failure; +	if (unlikely(ip_set_put_flags(skb, set))) +		goto nla_put_failure; +	ipset_nest_end(skb, nested); + +	return 0; +nla_put_failure: +	return -EMSGSIZE; +} + +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	   struct ip_set_ext *mext, u32 flags) +{ +	struct mtype *map = set->data; +	const struct mtype_adt_elem *e = value; +	void *x = get_ext(set, map, e->id); +	int ret = mtype_do_test(e, map, set->dsize); + +	if (ret <= 0) +		return ret; +	if (SET_WITH_TIMEOUT(set) && +	    ip_set_timeout_expired(ext_timeout(x, set))) +		return 0; +	if (SET_WITH_COUNTER(set)) +		ip_set_update_counter(ext_counter(x, 
set), ext, mext, flags); +	return 1; +} + +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	  struct ip_set_ext *mext, u32 flags) +{ +	struct mtype *map = set->data; +	const struct mtype_adt_elem *e = value; +	void *x = get_ext(set, map, e->id); +	int ret = mtype_do_add(e, map, flags, set->dsize); + +	if (ret == IPSET_ADD_FAILED) { +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(x, set))) +			ret = 0; +		else if (!(flags & IPSET_FLAG_EXIST)) +			return -IPSET_ERR_EXIST; +		/* Element is re-added, cleanup extensions */ +		ip_set_ext_destroy(set, x); +	} + +	if (SET_WITH_TIMEOUT(set)) +#ifdef IP_SET_BITMAP_STORED_TIMEOUT +		mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); +#else +		ip_set_timeout_set(ext_timeout(x, set), ext->timeout); +#endif + +	if (SET_WITH_COUNTER(set)) +		ip_set_init_counter(ext_counter(x, set), ext); +	if (SET_WITH_COMMENT(set)) +		ip_set_init_comment(ext_comment(x, set), ext); +	return 0; +} + +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	  struct ip_set_ext *mext, u32 flags) +{ +	struct mtype *map = set->data; +	const struct mtype_adt_elem *e = value; +	void *x = get_ext(set, map, e->id); + +	if (mtype_do_del(e, map)) +		return -IPSET_ERR_EXIST; + +	ip_set_ext_destroy(set, x); +	if (SET_WITH_TIMEOUT(set) && +	    ip_set_timeout_expired(ext_timeout(x, set))) +		return -IPSET_ERR_EXIST; + +	return 0; +} + +#ifndef IP_SET_BITMAP_STORED_TIMEOUT +static inline bool +mtype_is_filled(const struct mtype_elem *x) +{ +	return true; +} +#endif + +static int +mtype_list(const struct ip_set *set, +	   struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct mtype *map = set->data; +	struct nlattr *adt, *nested; +	void *x; +	u32 id, first = cb->args[IPSET_CB_ARG0]; + +	adt = ipset_nest_start(skb, IPSET_ATTR_ADT); +	if (!adt) +		return -EMSGSIZE; +	for (; cb->args[IPSET_CB_ARG0] < map->elements; +	     cb->args[IPSET_CB_ARG0]++) { +		id = cb->args[IPSET_CB_ARG0]; +		x = get_ext(set, map, id); +		if (!test_bit(id, map->members) || +		    (SET_WITH_TIMEOUT(set) && +#ifdef IP_SET_BITMAP_STORED_TIMEOUT +		     mtype_is_filled((const struct mtype_elem *) x) && +#endif +		     ip_set_timeout_expired(ext_timeout(x, set)))) +			continue; +		nested = ipset_nest_start(skb, IPSET_ATTR_DATA); +		if (!nested) { +			if (id == first) { +				nla_nest_cancel(skb, adt); +				return -EMSGSIZE; +			} else +				goto nla_put_failure; +		} +		if (mtype_do_list(skb, map, id, set->dsize)) +			goto nla_put_failure; +		if (ip_set_put_extensions(skb, set, x, +		    mtype_is_filled((const struct mtype_elem *) x))) +			goto nla_put_failure; +		ipset_nest_end(skb, nested); +	} +	ipset_nest_end(skb, adt); + +	/* Set listing finished */ +	cb->args[IPSET_CB_ARG0] = 0; + +	return 0; + +nla_put_failure: +	nla_nest_cancel(skb, nested); +	if (unlikely(id == first)) { +		cb->args[IPSET_CB_ARG0] = 0; +		return -EMSGSIZE; +	} +	ipset_nest_end(skb, adt); +	return 0; +} + +static void +mtype_gc(unsigned long ul_set) +{ +	struct ip_set *set = (struct ip_set *) ul_set; +	struct mtype *map = set->data; +	void *x; +	u32 id; + +	/* We run parallel with other readers (test element) +	 * but adding/deleting new entries is locked out */ +	read_lock_bh(&set->lock); +	for (id = 0; id < map->elements; id++) +		if (mtype_gc_test(id, map, set->dsize)) { +			x = get_ext(set, map, id); +			if (ip_set_timeout_expired(ext_timeout(x, set))) { +				clear_bit(id, map->members); +				ip_set_ext_destroy(set, x); +			} +		} 
+	read_unlock_bh(&set->lock); + +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +	add_timer(&map->gc); +} + +static const struct ip_set_type_variant mtype = { +	.kadt	= mtype_kadt, +	.uadt	= mtype_uadt, +	.adt	= { +		[IPSET_ADD] = mtype_add, +		[IPSET_DEL] = mtype_del, +		[IPSET_TEST] = mtype_test, +	}, +	.destroy = mtype_destroy, +	.flush	= mtype_flush, +	.head	= mtype_head, +	.list	= mtype_list, +	.same_set = mtype_same_set, +}; + +#endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c new file mode 100644 index 00000000000..6f1f9f49480 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -0,0 +1,377 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + *                         Patrick Schaaf <bof@bof.de> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the bitmap:ip type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counter support added */ +#define IPSET_TYPE_REV_MAX	2	/* Comment support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_bitmap:ip"); + +#define MTYPE		bitmap_ip + +/* Type structure */ +struct bitmap_ip { +	void *members;		/* the set members */ +	void *extensions;	/* data extensions */ +	u32 first_ip;		/* host byte order, included in range */ +	u32 last_ip;		/* host byte order, included in range */ +	u32 elements;		/* number of max elements in the set */ +	u32 hosts;		/* number of hosts in a subnet */ +	size_t memsize;		/* members size */ +	u8 netmask;		/* subnet netmask */ +	struct timer_list gc;	/* garbage collection */ +}; + +/* ADT structure for generic function args */ +struct bitmap_ip_adt_elem { +	u16 id; +}; + +static inline u32 +ip_to_id(const struct bitmap_ip *m, u32 ip) +{ +	return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; +} + +/* Common functions */ + +static inline int +bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, +		  struct bitmap_ip *map, size_t dsize) +{ +	return !!test_bit(e->id, map->members); +} + +static inline int +bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) +{ +	return !!test_bit(id, map->members); +} + +static inline int +bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, +		 u32 flags, size_t dsize) +{ +	return !!test_and_set_bit(e->id, map->members); +} + +static inline int +bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) +{ +	return !test_and_clear_bit(e->id, map->members); +} + +static inline int +bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, +		  size_t dsize) +{ +	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, +			htonl(map->first_ip + id * map->hosts)); +} + +static 
inline int +bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) +{ +	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || +	       nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || +	       (map->netmask != 32 && +		nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)); +} + +static int +bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	struct bitmap_ip *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct bitmap_ip_adt_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); +	u32 ip; + +	ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); +	if (ip < map->first_ip || ip > map->last_ip) +		return -IPSET_ERR_BITMAP_RANGE; + +	e.id = ip_to_id(map, ip); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	struct bitmap_ip *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	u32 ip = 0, ip_to = 0; +	struct bitmap_ip_adt_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret = 0; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (ip < map->first_ip || ip > map->last_ip) +		return -IPSET_ERR_BITMAP_RANGE; + +	if (adt == IPSET_TEST) { +		e.id = ip_to_id(map, ip); +		return adtfn(set, &e, &ext, &ext, flags); +	} + +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) { +			swap(ip, ip_to); +			if (ip < map->first_ip) +				return -IPSET_ERR_BITMAP_RANGE; +		} +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} else +		ip_to = ip; + +	if (ip_to > map->last_ip) +		return -IPSET_ERR_BITMAP_RANGE; + +	for (; !before(ip_to, ip); ip += map->hosts) { +		e.id = ip_to_id(map, ip); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static bool +bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) +{ +	const struct bitmap_ip *x = a->data; +	const struct bitmap_ip *y = b->data; + +	return x->first_ip == y->first_ip && +	       x->last_ip == y->last_ip && +	       x->netmask == y->netmask && +	       a->timeout == b->timeout && +	       a->extensions == b->extensions; +} + +/* Plain variant */ + +struct bitmap_ip_elem { +}; + +#include "ip_set_bitmap_gen.h" + +/* Create bitmap:ip type of sets */ + +static bool +init_map_ip(struct ip_set *set, struct bitmap_ip *map, +	    u32 first_ip, u32 last_ip, +	    u32 elements, u32 hosts, u8 netmask) +{ +	map->members = ip_set_alloc(map->memsize); +	if (!map->members) +		return false; +	if (set->dsize) { +		map->extensions = ip_set_alloc(set->dsize * elements); +		if (!map->extensions) { +			kfree(map->members); +			return false; +		} +	} +	map->first_ip = 
first_ip; +	map->last_ip = last_ip; +	map->elements = elements; +	map->hosts = hosts; +	map->netmask = netmask; +	set->timeout = IPSET_NO_TIMEOUT; + +	set->data = map; +	set->family = NFPROTO_IPV4; + +	return true; +} + +static int +bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], +		 u32 flags) +{ +	struct bitmap_ip *map; +	u32 first_ip = 0, last_ip = 0, hosts; +	u64 elements; +	u8 netmask = 32; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) +		return -IPSET_ERR_PROTOCOL; + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); +		if (ret) +			return ret; +		if (first_ip > last_ip) { +			u32 tmp = first_ip; + +			first_ip = last_ip; +			last_ip = tmp; +		} +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (cidr >= 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(first_ip, last_ip, cidr); +	} else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_NETMASK]) { +		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + +		if (netmask > 32) +			return -IPSET_ERR_INVALID_NETMASK; + +		first_ip &= ip_set_hostmask(netmask); +		last_ip |= ~ip_set_hostmask(netmask); +	} + +	if (netmask == 32) { +		hosts = 1; +		elements = (u64)last_ip - first_ip + 1; +	} else { +		u8 mask_bits; +		u32 mask; + +		mask = range_to_mask(first_ip, last_ip, &mask_bits); + +		if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) || +		    netmask <= mask_bits) +			return -IPSET_ERR_BITMAP_RANGE; + +		pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); +		hosts = 2 << (32 - netmask - 1); +		elements = 2 << (netmask - mask_bits - 1); +	} +	if (elements > IPSET_BITMAP_MAX_RANGE + 1) +		return -IPSET_ERR_BITMAP_RANGE_SIZE; + +	pr_debug("hosts %u, elements %llu\n", +		 hosts, (unsigned long long)elements); + +	map = kzalloc(sizeof(*map), GFP_KERNEL); +	if (!map) +		return -ENOMEM; + +	map->memsize = bitmap_bytes(0, elements - 1); +	set->variant = &bitmap_ip; +	set->dsize = ip_set_elem_len(set, tb, 0); +	if (!init_map_ip(set, map, first_ip, last_ip, +			 elements, hosts, netmask)) { +		kfree(map); +		return -ENOMEM; +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +		bitmap_ip_gc_init(set, bitmap_ip_gc); +	} +	return 0; +} + +static struct ip_set_type bitmap_ip_type __read_mostly = { +	.name		= "bitmap:ip", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP, +	.dimension	= IPSET_DIM_ONE, +	.family		= NFPROTO_IPV4, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= bitmap_ip_create, +	.create_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_NETMASK]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + 
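The netmask branch of bitmap_ip_create() above packs a range of /netmask blocks into the bitmap. A minimal standalone sketch of that sizing arithmetic, assuming example values (a 192.168.0.0/22 range, for which range_to_mask() reports mask_bits = 22, and a requested netmask of 26):

	#include <stdio.h>

	int main(void)
	{
		/* Same formulas as the netmask != 32 branch above */
		unsigned int netmask = 26, mask_bits = 22;
		unsigned int hosts = 2 << (32 - netmask - 1);
		unsigned long long elements = 2 << (netmask - mask_bits - 1);

		/* Prints "hosts 64, elements 16": 16 blocks of 64 addresses
		 * cover the whole /22 (1024 IPs), so each set bit stands
		 * for one /26 block. */
		printf("hosts %u, elements %llu\n", hosts, elements);
		return 0;
	}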
+static int __init +bitmap_ip_init(void) +{ +	return ip_set_type_register(&bitmap_ip_type); +} + +static void __exit +bitmap_ip_fini(void) +{ +	ip_set_type_unregister(&bitmap_ip_type); +} + +module_init(bitmap_ip_init); +module_exit(bitmap_ip_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c new file mode 100644 index 00000000000..740eabededd --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -0,0 +1,414 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + *                         Patrick Schaaf <bof@bof.de> + *			   Martin Josefsson <gandalf@wlug.westbo.se> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the bitmap:ip,mac type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/if_ether.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> + +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counter support added */ +#define IPSET_TYPE_REV_MAX	2	/* Comment support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_bitmap:ip,mac"); + +#define MTYPE		bitmap_ipmac +#define IP_SET_BITMAP_STORED_TIMEOUT + +enum { +	MAC_UNSET,		/* element is set, without MAC */ +	MAC_FILLED,		/* element is set with MAC */ +}; + +/* Type structure */ +struct bitmap_ipmac { +	void *members;		/* the set members */ +	void *extensions;	/* MAC + data extensions */ +	u32 first_ip;		/* host byte order, included in range */ +	u32 last_ip;		/* host byte order, included in range */ +	u32 elements;		/* number of max elements in the set */ +	size_t memsize;		/* members size */ +	struct timer_list gc;	/* garbage collector */ +}; + +/* ADT structure for generic function args */ +struct bitmap_ipmac_adt_elem { +	u16 id; +	unsigned char *ether; +}; + +struct bitmap_ipmac_elem { +	unsigned char ether[ETH_ALEN]; +	unsigned char filled; +} __attribute__ ((aligned)); + +static inline u32 +ip_to_id(const struct bitmap_ipmac *m, u32 ip) +{ +	return ip - m->first_ip; +} + +static inline struct bitmap_ipmac_elem * +get_elem(void *extensions, u16 id, size_t dsize) +{ +	return (struct bitmap_ipmac_elem *)(extensions + id * dsize); +} + +/* Common functions */ + +static inline int +bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, +		     const struct bitmap_ipmac *map, size_t dsize) +{ +	const struct bitmap_ipmac_elem *elem; + +	if (!test_bit(e->id, map->members)) +		return 0; +	elem = get_elem(map->extensions, e->id, dsize); +	if (elem->filled == MAC_FILLED) +		return e->ether == NULL || +		       ether_addr_equal(e->ether, elem->ether); +	/* Trigger kernel to fill out the ethernet address */ +	return -EAGAIN; +} + +static inline int +bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) +{ +	const struct bitmap_ipmac_elem *elem; + +	if (!test_bit(id, map->members)) +		return 0; +	elem = get_elem(map->extensions, id, dsize); +	
/* Timer not started for the incomplete elements */ +	return elem->filled == MAC_FILLED; +} + +static inline int +bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) +{ +	return elem->filled == MAC_FILLED; +} + +static inline int +bitmap_ipmac_add_timeout(unsigned long *timeout, +			 const struct bitmap_ipmac_adt_elem *e, +			 const struct ip_set_ext *ext, struct ip_set *set, +			 struct bitmap_ipmac *map, int mode) +{ +	u32 t = ext->timeout; + +	if (mode == IPSET_ADD_START_STORED_TIMEOUT) { +		if (t == set->timeout) +			/* Timeout was not specified, get stored one */ +			t = *timeout; +		ip_set_timeout_set(timeout, t); +	} else { +		/* If MAC is unset yet, we store plain timeout value +		 * because the timer is not activated yet +		 * and we can reuse it later when MAC is filled out, +		 * possibly by the kernel */ +		if (e->ether) +			ip_set_timeout_set(timeout, t); +		else +			*timeout = t; +	} +	return 0; +} + +static inline int +bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, +		    struct bitmap_ipmac *map, u32 flags, size_t dsize) +{ +	struct bitmap_ipmac_elem *elem; + +	elem = get_elem(map->extensions, e->id, dsize); +	if (test_and_set_bit(e->id, map->members)) { +		if (elem->filled == MAC_FILLED) { +			if (e->ether && (flags & IPSET_FLAG_EXIST)) +				memcpy(elem->ether, e->ether, ETH_ALEN); +			return IPSET_ADD_FAILED; +		} else if (!e->ether) +			/* Already added without ethernet address */ +			return IPSET_ADD_FAILED; +		/* Fill the MAC address and trigger the timer activation */ +		memcpy(elem->ether, e->ether, ETH_ALEN); +		elem->filled = MAC_FILLED; +		return IPSET_ADD_START_STORED_TIMEOUT; +	} else if (e->ether) { +		/* We can store MAC too */ +		memcpy(elem->ether, e->ether, ETH_ALEN); +		elem->filled = MAC_FILLED; +		return 0; +	} else { +		elem->filled = MAC_UNSET; +		/* MAC is not stored yet, don't start timer */ +		return IPSET_ADD_STORE_PLAIN_TIMEOUT; +	} +} + +static inline int +bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, +		    struct bitmap_ipmac *map) +{ +	return !test_and_clear_bit(e->id, map->members); +} + +static inline int +bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, +		     u32 id, size_t dsize) +{ +	const struct bitmap_ipmac_elem *elem = +		get_elem(map->extensions, id, dsize); + +	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, +			       htonl(map->first_ip + id)) || +	       (elem->filled == MAC_FILLED && +		nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); +} + +static inline int +bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) +{ +	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || +	       nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); +} + +static int +bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	struct bitmap_ipmac *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct bitmap_ipmac_adt_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); +	u32 ip; + +	/* MAC can be src only */ +	if (!(opt->flags & IPSET_DIM_TWO_SRC)) +		return 0; + +	ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); +	if (ip < map->first_ip || ip > map->last_ip) +		return -IPSET_ERR_BITMAP_RANGE; + +	/* Backward compatibility: we don't check the second flag */ +	if (skb_mac_header(skb) < skb->head || +	    (skb_mac_header(skb) + ETH_HLEN) > skb->data) +		return -EINVAL; + +	e.id = ip_to_id(map, ip); +	
e.ether = eth_hdr(skb)->h_source; + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct bitmap_ipmac *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct bitmap_ipmac_adt_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0; +	int ret = 0; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (ip < map->first_ip || ip > map->last_ip) +		return -IPSET_ERR_BITMAP_RANGE; + +	e.id = ip_to_id(map, ip); +	if (tb[IPSET_ATTR_ETHER]) +		e.ether = nla_data(tb[IPSET_ATTR_ETHER]); +	else +		e.ether = NULL; + +	ret = adtfn(set, &e, &ext, &ext, flags); + +	return ip_set_eexist(ret, flags) ? 0 : ret; +} + +static bool +bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) +{ +	const struct bitmap_ipmac *x = a->data; +	const struct bitmap_ipmac *y = b->data; + +	return x->first_ip == y->first_ip && +	       x->last_ip == y->last_ip && +	       a->timeout == b->timeout && +	       a->extensions == b->extensions; +} + +/* Plain variant */ + +#include "ip_set_bitmap_gen.h" + +/* Create bitmap:ip,mac type of sets */ + +static bool +init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, +	       u32 first_ip, u32 last_ip, u32 elements) +{ +	map->members = ip_set_alloc(map->memsize); +	if (!map->members) +		return false; +	if (set->dsize) { +		map->extensions = ip_set_alloc(set->dsize * elements); +		if (!map->extensions) { +			kfree(map->members); +			return false; +		} +	} +	map->first_ip = first_ip; +	map->last_ip = last_ip; +	map->elements = elements; +	set->timeout = IPSET_NO_TIMEOUT; + +	set->data = map; +	set->family = NFPROTO_IPV4; + +	return true; +} + +static int +bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], +		    u32 flags) +{ +	u32 first_ip = 0, last_ip = 0; +	u64 elements; +	struct bitmap_ipmac *map; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) +		return -IPSET_ERR_PROTOCOL; + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); +		if (ret) +			return ret; +		if (first_ip > last_ip) { +			u32 tmp = first_ip; + +			first_ip = last_ip; +			last_ip = tmp; +		} +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (cidr >= 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(first_ip, last_ip, cidr); +	} else +		return -IPSET_ERR_PROTOCOL; + +	elements = (u64)last_ip - first_ip + 1; + +	if (elements > IPSET_BITMAP_MAX_RANGE + 1) +		return -IPSET_ERR_BITMAP_RANGE_SIZE; + +	map = kzalloc(sizeof(*map), GFP_KERNEL); +	if (!map) +		return -ENOMEM; + +	map->memsize = bitmap_bytes(0, elements - 1); +	set->variant = &bitmap_ipmac; +	set->dsize = ip_set_elem_len(set, tb, +				     sizeof(struct bitmap_ipmac_elem)); +	if (!init_map_ipmac(set, map, 
first_ip, last_ip, elements)) { +		kfree(map); +		return -ENOMEM; +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +		bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); +	} +	return 0; +} + +static struct ip_set_type bitmap_ipmac_type = { +	.name		= "bitmap:ip,mac", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_MAC, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_IPV4, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= bitmap_ipmac_create, +	.create_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_ETHER]	= { .type = NLA_BINARY, +					    .len  = ETH_ALEN }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +bitmap_ipmac_init(void) +{ +	return ip_set_type_register(&bitmap_ipmac_type); +} + +static void __exit +bitmap_ipmac_fini(void) +{ +	ip_set_type_unregister(&bitmap_ipmac_type); +} + +module_init(bitmap_ipmac_init); +module_exit(bitmap_ipmac_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c new file mode 100644 index 00000000000..cf99676e69f --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -0,0 +1,311 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the bitmap:port type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> + +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> +#include <linux/netfilter/ipset/ip_set_getport.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counter support added */ +#define IPSET_TYPE_REV_MAX	2	/* Comment support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_bitmap:port"); + +#define MTYPE		bitmap_port + +/* Type structure */ +struct bitmap_port { +	void *members;		/* the set members */ +	void *extensions;	/* data extensions */ +	u16 first_port;		/* host byte order, included in range */ +	u16 last_port;		/* host byte order, included in range */ +	u32 elements;		/* number of max elements in the set */ +	size_t memsize;		/* members size */ +	struct timer_list gc;	/* garbage collection */ +}; + +/* ADT structure for generic function args */ +struct bitmap_port_adt_elem { +	u16 id; +}; + +static inline u16 +port_to_id(const struct bitmap_port *m, u16 port) +{ +	return port - m->first_port; +} + +/* Common functions */ + +static inline int +bitmap_port_do_test(const struct bitmap_port_adt_elem *e, +		    const struct bitmap_port *map, size_t dsize) +{ +	return !!test_bit(e->id, map->members); +} + +static inline int +bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) +{ +	return !!test_bit(id, map->members); +} + +static inline int +bitmap_port_do_add(const struct bitmap_port_adt_elem *e, +		   struct bitmap_port *map, u32 flags, size_t dsize) +{ +	return !!test_and_set_bit(e->id, map->members); +} + +static inline int +bitmap_port_do_del(const struct bitmap_port_adt_elem *e, +		   struct bitmap_port *map) +{ +	return !test_and_clear_bit(e->id, map->members); +} + +static inline int +bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, +		    size_t dsize) +{ +	return nla_put_net16(skb, IPSET_ATTR_PORT, +			     htons(map->first_port + id)); +} + +static inline int +bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) +{ +	return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || +	       nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); +} + +static int +bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, +		 const struct xt_action_param *par, +		 enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	struct bitmap_port *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct bitmap_port_adt_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); +	__be16 __port; +	u16 port = 0; + +	if (!ip_set_get_ip_port(skb, opt->family, +				opt->flags & IPSET_DIM_ONE_SRC, &__port)) +		return -EINVAL; + +	port = ntohs(__port); + +	if (port < map->first_port || port > map->last_port) +		return -IPSET_ERR_BITMAP_RANGE; + +	e.id = port_to_id(map, port); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], +		 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	struct bitmap_port *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct bitmap_port_adt_elem e = {}; +	struct ip_set_ext ext 
= IP_SET_INIT_UEXT(set);
+	u32 port;	/* wraparound */
+	u16 port_to;
+	int ret = 0;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	if (port < map->first_port || port > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (adt == IPSET_TEST) {
+		e.id = port_to_id(map, port);
+		return adtfn(set, &e, &ext, &ext, flags);
+	}
+
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to) {
+			swap(port, port_to);
+			if (port < map->first_port)
+				return -IPSET_ERR_BITMAP_RANGE;
+		}
+	} else
+		port_to = port;
+
+	if (port_to > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	for (; port <= port_to; port++) {
+		e.id = port_to_id(map, port);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_port *x = a->data;
+	const struct bitmap_port *y = b->data;
+
+	return x->first_port == y->first_port &&
+	       x->last_port == y->last_port &&
+	       a->timeout == b->timeout &&
+	       a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+struct bitmap_port_elem {
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:port type of sets */
+
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+	      u16 first_port, u16 last_port)
+{
+	map->members = ip_set_alloc(map->memsize);
+	if (!map->members)
+		return false;
+	if (set->dsize) {
+		map->extensions = ip_set_alloc(set->dsize * map->elements);
+		if (!map->extensions) {
+			kfree(map->members);
+			return false;
+		}
+	}
+	map->first_port = first_port;
+	map->last_port = last_port;
+	set->timeout = IPSET_NO_TIMEOUT;
+
+	set->data = map;
+	set->family = NFPROTO_UNSPEC;
+
+	return true;
+}
+
+static int
+bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		   u32 flags)
+{
+	struct bitmap_port *map;
+	u16 first_port, last_port;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (first_port > last_port) {
+		u16 tmp = first_port;
+
+		first_port = last_port;
+		last_port = tmp;
+	}
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	map->elements = last_port - first_port + 1;
+	map->memsize = bitmap_bytes(0, map->elements);
+	set->variant = &bitmap_port;
+	set->dsize = ip_set_elem_len(set, tb, 0);
+	if (!init_map_port(set, map, first_port, last_port)) {
+		kfree(map);
+		return -ENOMEM;
+	}
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		bitmap_port_gc_init(set, bitmap_port_gc);
+	}
+	return 0;
+}
+
+static struct ip_set_type bitmap_port_type = {
+	
.name		= "bitmap:port", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_PORT, +	.dimension	= IPSET_DIM_ONE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= bitmap_port_create, +	.create_policy	= { +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +bitmap_port_init(void) +{ +	return ip_set_type_register(&bitmap_port_type); +} + +static void __exit +bitmap_port_fini(void) +{ +	ip_set_type_unregister(&bitmap_port_type); +} + +module_init(bitmap_port_init); +module_exit(bitmap_port_fini); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c new file mode 100644 index 00000000000..ec8114fae50 --- /dev/null +++ b/net/netfilter/ipset/ip_set_core.c @@ -0,0 +1,2011 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + *                         Patrick Schaaf <bof@bof.de> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module for IP set management */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/rculist.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/ipset/ip_set.h> + +static LIST_HEAD(ip_set_type_list);		/* all registered set types */ +static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */ +static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */ + +struct ip_set_net { +	struct ip_set * __rcu *ip_set_list;	/* all individual sets */ +	ip_set_id_t	ip_set_max;	/* max number of sets */ +	int		is_deleted;	/* deleted by ip_set_net_exit */ +}; +static int ip_set_net_id __read_mostly; + +static inline struct ip_set_net *ip_set_pernet(struct net *net) +{ +	return net_generic(net, ip_set_net_id); +} + +#define IP_SET_INC	64 +#define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0) + +static unsigned int max_sets; + +module_param(max_sets, int, 0600); +MODULE_PARM_DESC(max_sets, "maximal number of sets"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("core IP set support"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); + +/* When the nfnl mutex is held: */ +#define ip_set_dereference(p)		\ +	rcu_dereference_protected(p, 1) +#define ip_set(inst, id)		\ +	ip_set_dereference((inst)->ip_set_list)[id] + +/* + * The set types are implemented in modules and registered set types + * can be found in ip_set_type_list. Adding/deleting types is + * serialized by ip_set_type_mutex. 
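+ * A set type module pairs ip_set_type_register() in its module_init()
+ * with ip_set_type_unregister() in its module_exit(), as the
+ * bitmap:ip, bitmap:ip,mac and bitmap:port modules above do.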
+ */ + +static inline void +ip_set_type_lock(void) +{ +	mutex_lock(&ip_set_type_mutex); +} + +static inline void +ip_set_type_unlock(void) +{ +	mutex_unlock(&ip_set_type_mutex); +} + +/* Register and deregister settype */ + +static struct ip_set_type * +find_set_type(const char *name, u8 family, u8 revision) +{ +	struct ip_set_type *type; + +	list_for_each_entry_rcu(type, &ip_set_type_list, list) +		if (STREQ(type->name, name) && +		    (type->family == family || +		     type->family == NFPROTO_UNSPEC) && +		    revision >= type->revision_min && +		    revision <= type->revision_max) +			return type; +	return NULL; +} + +/* Unlock, try to load a set type module and lock again */ +static bool +load_settype(const char *name) +{ +	nfnl_unlock(NFNL_SUBSYS_IPSET); +	pr_debug("try to load ip_set_%s\n", name); +	if (request_module("ip_set_%s", name) < 0) { +		pr_warning("Can't find ip_set type %s\n", name); +		nfnl_lock(NFNL_SUBSYS_IPSET); +		return false; +	} +	nfnl_lock(NFNL_SUBSYS_IPSET); +	return true; +} + +/* Find a set type and reference it */ +#define find_set_type_get(name, family, revision, found)	\ +	__find_set_type_get(name, family, revision, found, false) + +static int +__find_set_type_get(const char *name, u8 family, u8 revision, +		    struct ip_set_type **found, bool retry) +{ +	struct ip_set_type *type; +	int err; + +	if (retry && !load_settype(name)) +		return -IPSET_ERR_FIND_TYPE; + +	rcu_read_lock(); +	*found = find_set_type(name, family, revision); +	if (*found) { +		err = !try_module_get((*found)->me) ? -EFAULT : 0; +		goto unlock; +	} +	/* Make sure the type is already loaded +	 * but we don't support the revision */ +	list_for_each_entry_rcu(type, &ip_set_type_list, list) +		if (STREQ(type->name, name)) { +			err = -IPSET_ERR_FIND_TYPE; +			goto unlock; +		} +	rcu_read_unlock(); + +	return retry ? -IPSET_ERR_FIND_TYPE : +		__find_set_type_get(name, family, revision, found, true); + +unlock: +	rcu_read_unlock(); +	return err; +} + +/* Find a given set type by name and family. + * If we succeeded, the supported minimal and maximum revisions are + * filled out. + */ +#define find_set_type_minmax(name, family, min, max) \ +	__find_set_type_minmax(name, family, min, max, false) + +static int +__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, +		       bool retry) +{ +	struct ip_set_type *type; +	bool found = false; + +	if (retry && !load_settype(name)) +		return -IPSET_ERR_FIND_TYPE; + +	*min = 255; *max = 0; +	rcu_read_lock(); +	list_for_each_entry_rcu(type, &ip_set_type_list, list) +		if (STREQ(type->name, name) && +		    (type->family == family || +		     type->family == NFPROTO_UNSPEC)) { +			found = true; +			if (type->revision_min < *min) +				*min = type->revision_min; +			if (type->revision_max > *max) +				*max = type->revision_max; +		} +	rcu_read_unlock(); +	if (found) +		return 0; + +	return retry ? -IPSET_ERR_FIND_TYPE : +		__find_set_type_minmax(name, family, min, max, true); +} + +#define family_name(f)	((f) == NFPROTO_IPV4 ? "inet" : \ +			 (f) == NFPROTO_IPV6 ? "inet6" : "any") + +/* Register a set type structure. The type is identified by + * the unique triple of name, family and revision. 
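+ * Registration is refused with -EINVAL if the protocol version does
+ * not match or if the same triple is already registered, as checked
+ * below.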
+ */ +int +ip_set_type_register(struct ip_set_type *type) +{ +	int ret = 0; + +	if (type->protocol != IPSET_PROTOCOL) { +		pr_warning("ip_set type %s, family %s, revision %u:%u uses " +			   "wrong protocol version %u (want %u)\n", +			   type->name, family_name(type->family), +			   type->revision_min, type->revision_max, +			   type->protocol, IPSET_PROTOCOL); +		return -EINVAL; +	} + +	ip_set_type_lock(); +	if (find_set_type(type->name, type->family, type->revision_min)) { +		/* Duplicate! */ +		pr_warning("ip_set type %s, family %s with revision min %u " +			   "already registered!\n", type->name, +			   family_name(type->family), type->revision_min); +		ret = -EINVAL; +		goto unlock; +	} +	list_add_rcu(&type->list, &ip_set_type_list); +	pr_debug("type %s, family %s, revision %u:%u registered.\n", +		 type->name, family_name(type->family), +		 type->revision_min, type->revision_max); +unlock: +	ip_set_type_unlock(); +	return ret; +} +EXPORT_SYMBOL_GPL(ip_set_type_register); + +/* Unregister a set type. There's a small race with ip_set_create */ +void +ip_set_type_unregister(struct ip_set_type *type) +{ +	ip_set_type_lock(); +	if (!find_set_type(type->name, type->family, type->revision_min)) { +		pr_warning("ip_set type %s, family %s with revision min %u " +			   "not registered\n", type->name, +			   family_name(type->family), type->revision_min); +		goto unlock; +	} +	list_del_rcu(&type->list); +	pr_debug("type %s, family %s with revision min %u unregistered.\n", +		 type->name, family_name(type->family), type->revision_min); +unlock: +	ip_set_type_unlock(); + +	synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ip_set_type_unregister); + +/* Utility functions */ +void * +ip_set_alloc(size_t size) +{ +	void *members = NULL; + +	if (size < KMALLOC_MAX_SIZE) +		members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + +	if (members) { +		pr_debug("%p: allocated with kmalloc\n", members); +		return members; +	} + +	members = vzalloc(size); +	if (!members) +		return NULL; +	pr_debug("%p: allocated with vmalloc\n", members); + +	return members; +} +EXPORT_SYMBOL_GPL(ip_set_alloc); + +void +ip_set_free(void *members) +{ +	pr_debug("%p: free with %s\n", members, +		 is_vmalloc_addr(members) ? 
"vfree" : "kfree"); +	kvfree(members); +} +EXPORT_SYMBOL_GPL(ip_set_free); + +static inline bool +flag_nested(const struct nlattr *nla) +{ +	return nla->nla_type & NLA_F_NESTED; +} + +static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { +	[IPSET_ATTR_IPADDR_IPV4]	= { .type = NLA_U32 }, +	[IPSET_ATTR_IPADDR_IPV6]	= { .type = NLA_BINARY, +					    .len = sizeof(struct in6_addr) }, +}; + +int +ip_set_get_ipaddr4(struct nlattr *nla,  __be32 *ipaddr) +{ +	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + +	if (unlikely(!flag_nested(nla))) +		return -IPSET_ERR_PROTOCOL; +	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) +		return -IPSET_ERR_PROTOCOL; + +	*ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); +	return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); + +int +ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) +{ +	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + +	if (unlikely(!flag_nested(nla))) +		return -IPSET_ERR_PROTOCOL; + +	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) +		return -IPSET_ERR_PROTOCOL; + +	memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), +		sizeof(struct in6_addr)); +	return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); + +typedef void (*destroyer)(void *); +/* ipset data extension types, in size order */ + +const struct ip_set_ext_type ip_set_extensions[] = { +	[IPSET_EXT_ID_COUNTER] = { +		.type	= IPSET_EXT_COUNTER, +		.flag	= IPSET_FLAG_WITH_COUNTERS, +		.len	= sizeof(struct ip_set_counter), +		.align	= __alignof__(struct ip_set_counter), +	}, +	[IPSET_EXT_ID_TIMEOUT] = { +		.type	= IPSET_EXT_TIMEOUT, +		.len	= sizeof(unsigned long), +		.align	= __alignof__(unsigned long), +	}, +	[IPSET_EXT_ID_COMMENT] = { +		.type	 = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, +		.flag	 = IPSET_FLAG_WITH_COMMENT, +		.len	 = sizeof(struct ip_set_comment), +		.align	 = __alignof__(struct ip_set_comment), +		.destroy = (destroyer) ip_set_comment_free, +	}, +}; +EXPORT_SYMBOL_GPL(ip_set_extensions); + +static inline bool +add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) +{ +	return ip_set_extensions[id].flag ? 
+		(flags & ip_set_extensions[id].flag) : +		!!tb[IPSET_ATTR_TIMEOUT]; +} + +size_t +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +{ +	enum ip_set_ext_id id; +	size_t offset = 0; +	u32 cadt_flags = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) +		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +	if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) +		set->flags |= IPSET_CREATE_FLAG_FORCEADD; +	for (id = 0; id < IPSET_EXT_ID_MAX; id++) { +		if (!add_extension(id, cadt_flags, tb)) +			continue; +		offset += ALIGN(len + offset, ip_set_extensions[id].align); +		set->offset[id] = offset; +		set->extensions |= ip_set_extensions[id].type; +		offset += ip_set_extensions[id].len; +	} +	return len + offset; +} +EXPORT_SYMBOL_GPL(ip_set_elem_len); + +int +ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], +		      struct ip_set_ext *ext) +{ +	if (tb[IPSET_ATTR_TIMEOUT]) { +		if (!(set->extensions & IPSET_EXT_TIMEOUT)) +			return -IPSET_ERR_TIMEOUT; +		ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +	} +	if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { +		if (!(set->extensions & IPSET_EXT_COUNTER)) +			return -IPSET_ERR_COUNTER; +		if (tb[IPSET_ATTR_BYTES]) +			ext->bytes = be64_to_cpu(nla_get_be64( +						 tb[IPSET_ATTR_BYTES])); +		if (tb[IPSET_ATTR_PACKETS]) +			ext->packets = be64_to_cpu(nla_get_be64( +						   tb[IPSET_ATTR_PACKETS])); +	} +	if (tb[IPSET_ATTR_COMMENT]) { +		if (!(set->extensions & IPSET_EXT_COMMENT)) +			return -IPSET_ERR_COMMENT; +		ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_extensions); + +/* + * Creating/destroying/renaming/swapping affect the existence and + * the properties of a set. All of these can be executed from userspace + * only and serialized by the nfnl mutex indirectly from nfnetlink. + * + * Sets are identified by their index in ip_set_list and the index + * is used by the external references (set/SET netfilter modules). + * + * The set behind an index may change by swapping only, from userspace. + */ + +static inline void +__ip_set_get(struct ip_set *set) +{ +	write_lock_bh(&ip_set_ref_lock); +	set->ref++; +	write_unlock_bh(&ip_set_ref_lock); +} + +static inline void +__ip_set_put(struct ip_set *set) +{ +	write_lock_bh(&ip_set_ref_lock); +	BUG_ON(set->ref == 0); +	set->ref--; +	write_unlock_bh(&ip_set_ref_lock); +} + +/* + * Add, del and test set entries from kernel. + * + * The set behind the index must exist and must be referenced + * so it can't be destroyed (or changed) under our foot. + */ + +static inline struct ip_set * +ip_set_rcu_get(struct net *net, ip_set_id_t index) +{ +	struct ip_set *set; +	struct ip_set_net *inst = ip_set_pernet(net); + +	rcu_read_lock(); +	/* ip_set_list itself needs to be protected */ +	set = rcu_dereference(inst->ip_set_list)[index]; +	rcu_read_unlock(); + +	return set; +} + +int +ip_set_test(ip_set_id_t index, const struct sk_buff *skb, +	    const struct xt_action_param *par, struct ip_set_adt_opt *opt) +{ +	struct ip_set *set = ip_set_rcu_get( +			dev_net(par->in ? 
par->in : par->out), index);
+	int ret = 0;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+		return 0;
+
+	read_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
+	read_unlock_bh(&set->lock);
+
+	if (ret == -EAGAIN) {
+		/* Type requests element to be completed */
+		pr_debug("element must be completed, ADD is triggered\n");
+		write_lock_bh(&set->lock);
+		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+		write_unlock_bh(&set->lock);
+		ret = 1;
+	} else {
+		/* --return-nomatch: invert matched element */
+		if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) &&
+		    (set->type->features & IPSET_TYPE_NOMATCH) &&
+		    (ret > 0 || ret == -ENOTEMPTY))
+			ret = -ret;
+	}
+
+	/* Convert error codes to nomatch */
+	return (ret < 0 ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
+	int ret;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+		return -IPSET_ERR_TYPE_MISMATCH;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
+	int ret = 0;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+		return -IPSET_ERR_TYPE_MISMATCH;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to does not go away under our feet.
+ *
+ */
+ip_set_id_t
+ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
+{
+	ip_set_id_t i, index = IPSET_INVALID_ID;
+	struct ip_set *s;
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	rcu_read_lock();
+	for (i = 0; i < inst->ip_set_max; i++) {
+		s = rcu_dereference(inst->ip_set_list)[i];
+		if (s != NULL && STREQ(s->name, name)) {
+			__ip_set_get(s);
+			index = i;
+			*set = s;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
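The get/put pair around here is the external reference API (used e.g. by the set/SET netfilter modules). A minimal sketch of the intended calling pattern, assuming a kernel module context with a valid struct net pointer; the set name "foo" and the helper name are illustrative, not from the patch:

	#include <linux/netfilter/ipset/ip_set.h>

	/* Sketch: look up a set by name, use it, then drop the reference. */
	static int example_use_set(struct net *net)
	{
		struct ip_set *set;
		ip_set_id_t index;

		index = ip_set_get_byname(net, "foo", &set); /* takes a reference */
		if (index == IPSET_INVALID_ID)
			return -ENOENT;

		/* While referenced, the set can be neither destroyed nor renamed */
		pr_debug("using set %s (index %u)\n", set->name, index);

		ip_set_put_byindex(net, index);	/* releases the reference */
		return 0;
	}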
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid after calling this function.
+ *
+ */
+
+static inline void
+__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
+{
+	struct ip_set *set;
+
+	rcu_read_lock();
+	set = rcu_dereference(inst->ip_set_list)[index];
+	if (set != NULL)
+		__ip_set_put(set);
+	rcu_read_unlock();
+}
+
+void
+ip_set_put_byindex(struct net *net, ip_set_id_t index)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	__ip_set_put_byindex(inst, index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ */
+const char *
+ip_set_name_byindex(struct net *net, ip_set_id_t index)
+{
+	const struct ip_set *set = ip_set_rcu_get(net, index);
+
+	BUG_ON(set == NULL);
+	BUG_ON(set->ref == 0);
+
+	/* Referenced, so it's safe */
+	return set->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines called by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
+{
+	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	if (index >= inst->ip_set_max)
+		return IPSET_INVALID_ID;
+
+	nfnl_lock(NFNL_SUBSYS_IPSET);
+	set = ip_set(inst, index);
+	if (set)
+		__ip_set_get(set);
+	else
+		index = IPSET_INVALID_ID;
+	nfnl_unlock(NFNL_SUBSYS_IPSET);
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid after calling this function.
+ *
+ * The nfnl mutex is used in the function.
+ */
+void
+ip_set_nfnl_put(struct net *net, ip_set_id_t index)
+{
+	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	nfnl_lock(NFNL_SUBSYS_IPSET);
+	if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
+		set = ip_set(inst, index);
+		if (set != NULL)
+			__ip_set_put(set);
+	}
+	nfnl_unlock(NFNL_SUBSYS_IPSET);
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * The commands are serialized by the nfnl mutex.
+ */
+
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+	return !tb[IPSET_ATTR_PROTOCOL] ||
+	       nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+}
+
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+	return nlh->nlmsg_flags & NLM_F_EXCL ? 
0 : IPSET_FLAG_EXIST; +} + +static struct nlmsghdr * +start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, +	  enum ipset_cmd cmd) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; + +	nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), +			sizeof(*nfmsg), flags); +	if (nlh == NULL) +		return NULL; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = NFPROTO_IPV4; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	return nlh; +} + +/* Create a set */ + +static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { +	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 }, +	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1 }, +	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1}, +	[IPSET_ATTR_REVISION]	= { .type = NLA_U8 }, +	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 }, +	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED }, +}; + +static struct ip_set * +find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) +{ +	struct ip_set *set = NULL; +	ip_set_id_t i; + +	*id = IPSET_INVALID_ID; +	for (i = 0; i < inst->ip_set_max; i++) { +		set = ip_set(inst, i); +		if (set != NULL && STREQ(set->name, name)) { +			*id = i; +			break; +		} +	} +	return (*id == IPSET_INVALID_ID ? NULL : set); +} + +static inline struct ip_set * +find_set(struct ip_set_net *inst, const char *name) +{ +	ip_set_id_t id; + +	return find_set_and_id(inst, name, &id); +} + +static int +find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, +	     struct ip_set **set) +{ +	struct ip_set *s; +	ip_set_id_t i; + +	*index = IPSET_INVALID_ID; +	for (i = 0;  i < inst->ip_set_max; i++) { +		s = ip_set(inst, i); +		if (s == NULL) { +			if (*index == IPSET_INVALID_ID) +				*index = i; +		} else if (STREQ(name, s->name)) { +			/* Name clash */ +			*set = s; +			return -EEXIST; +		} +	} +	if (*index == IPSET_INVALID_ID) +		/* No free slot remained */ +		return -IPSET_ERR_MAX_SETS; +	return 0; +} + +static int +ip_set_none(struct sock *ctnl, struct sk_buff *skb, +	    const struct nlmsghdr *nlh, +	    const struct nlattr * const attr[]) +{ +	return -EOPNOTSUPP; +} + +static int +ip_set_create(struct sock *ctnl, struct sk_buff *skb, +	      const struct nlmsghdr *nlh, +	      const struct nlattr * const attr[]) +{ +	struct net *net = sock_net(ctnl); +	struct ip_set_net *inst = ip_set_pernet(net); +	struct ip_set *set, *clash = NULL; +	ip_set_id_t index = IPSET_INVALID_ID; +	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; +	const char *name, *typename; +	u8 family, revision; +	u32 flags = flag_exist(nlh); +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL || +		     attr[IPSET_ATTR_TYPENAME] == NULL || +		     attr[IPSET_ATTR_REVISION] == NULL || +		     attr[IPSET_ATTR_FAMILY] == NULL || +		     (attr[IPSET_ATTR_DATA] != NULL && +		      !flag_nested(attr[IPSET_ATTR_DATA])))) +		return -IPSET_ERR_PROTOCOL; + +	name = nla_data(attr[IPSET_ATTR_SETNAME]); +	typename = nla_data(attr[IPSET_ATTR_TYPENAME]); +	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); +	revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); +	pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", +		 name, typename, family_name(family), revision); + +	/* +	 * First, and without any locks, allocate and initialize +	 * a normal base set structure. 
+	 */ +	set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); +	if (!set) +		return -ENOMEM; +	rwlock_init(&set->lock); +	strlcpy(set->name, name, IPSET_MAXNAMELEN); +	set->family = family; +	set->revision = revision; + +	/* +	 * Next, check that we know the type, and take +	 * a reference on the type, to make sure it stays available +	 * while constructing our new set. +	 * +	 * After referencing the type, we try to create the type +	 * specific part of the set without holding any locks. +	 */ +	ret = find_set_type_get(typename, family, revision, &(set->type)); +	if (ret) +		goto out; + +	/* +	 * Without holding any locks, create private part. +	 */ +	if (attr[IPSET_ATTR_DATA] && +	    nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], +			     set->type->create_policy)) { +		ret = -IPSET_ERR_PROTOCOL; +		goto put_out; +	} + +	ret = set->type->create(net, set, tb, flags); +	if (ret != 0) +		goto put_out; + +	/* BTW, ret==0 here. */ + +	/* +	 * Here, we have a valid, constructed set and we are protected +	 * by the nfnl mutex. Find the first free index in ip_set_list +	 * and check clashing. +	 */ +	ret = find_free_id(inst, set->name, &index, &clash); +	if (ret == -EEXIST) { +		/* If this is the same set and requested, ignore error */ +		if ((flags & IPSET_FLAG_EXIST) && +		    STREQ(set->type->name, clash->type->name) && +		    set->type->family == clash->type->family && +		    set->type->revision_min == clash->type->revision_min && +		    set->type->revision_max == clash->type->revision_max && +		    set->variant->same_set(set, clash)) +			ret = 0; +		goto cleanup; +	} else if (ret == -IPSET_ERR_MAX_SETS) { +		struct ip_set **list, **tmp; +		ip_set_id_t i = inst->ip_set_max + IP_SET_INC; + +		if (i < inst->ip_set_max || i == IPSET_INVALID_ID) +			/* Wraparound */ +			goto cleanup; + +		list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); +		if (!list) +			goto cleanup; +		/* nfnl mutex is held, both lists are valid */ +		tmp = ip_set_dereference(inst->ip_set_list); +		memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); +		rcu_assign_pointer(inst->ip_set_list, list); +		/* Make sure all current packets have passed through */ +		synchronize_net(); +		/* Use new list */ +		index = inst->ip_set_max; +		inst->ip_set_max = i; +		kfree(tmp); +		ret = 0; +	} else if (ret) +		goto cleanup; + +	/* +	 * Finally! Add our shiny new set to the list, and be done. 
+	 */ +	pr_debug("create: '%s' created with index %u!\n", set->name, index); +	ip_set(inst, index) = set; + +	return ret; + +cleanup: +	set->variant->destroy(set); +put_out: +	module_put(set->type->me); +out: +	kfree(set); +	return ret; +} + +/* Destroy sets */ + +static const struct nla_policy +ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { +	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 }, +	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1 }, +}; + +static void +ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) +{ +	struct ip_set *set = ip_set(inst, index); + +	pr_debug("set: %s\n",  set->name); +	ip_set(inst, index) = NULL; + +	/* Must call it without holding any lock */ +	set->variant->destroy(set); +	module_put(set->type->me); +	kfree(set); +} + +static int +ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, +	       const struct nlmsghdr *nlh, +	       const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	struct ip_set *s; +	ip_set_id_t i; +	int ret = 0; + +	if (unlikely(protocol_failed(attr))) +		return -IPSET_ERR_PROTOCOL; + +	/* Commands are serialized and references are +	 * protected by the ip_set_ref_lock. +	 * External systems (i.e. xt_set) must call +	 * ip_set_put|get_nfnl_* functions, that way we +	 * can safely check references here. +	 * +	 * list:set timer can only decrement the reference +	 * counter, so if it's already zero, we can proceed +	 * without holding the lock. +	 */ +	read_lock_bh(&ip_set_ref_lock); +	if (!attr[IPSET_ATTR_SETNAME]) { +		for (i = 0; i < inst->ip_set_max; i++) { +			s = ip_set(inst, i); +			if (s != NULL && s->ref) { +				ret = -IPSET_ERR_BUSY; +				goto out; +			} +		} +		read_unlock_bh(&ip_set_ref_lock); +		for (i = 0; i < inst->ip_set_max; i++) { +			s = ip_set(inst, i); +			if (s != NULL) +				ip_set_destroy_set(inst, i); +		} +	} else { +		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), +				    &i); +		if (s == NULL) { +			ret = -ENOENT; +			goto out; +		} else if (s->ref) { +			ret = -IPSET_ERR_BUSY; +			goto out; +		} +		read_unlock_bh(&ip_set_ref_lock); + +		ip_set_destroy_set(inst, i); +	} +	return 0; +out: +	read_unlock_bh(&ip_set_ref_lock); +	return ret; +} + +/* Flush sets */ + +static void +ip_set_flush_set(struct ip_set *set) +{ +	pr_debug("set: %s\n",  set->name); + +	write_lock_bh(&set->lock); +	set->variant->flush(set); +	write_unlock_bh(&set->lock); +} + +static int +ip_set_flush(struct sock *ctnl, struct sk_buff *skb, +	     const struct nlmsghdr *nlh, +	     const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	struct ip_set *s; +	ip_set_id_t i; + +	if (unlikely(protocol_failed(attr))) +		return -IPSET_ERR_PROTOCOL; + +	if (!attr[IPSET_ATTR_SETNAME]) { +		for (i = 0; i < inst->ip_set_max; i++) { +			s = ip_set(inst, i); +			if (s != NULL) +				ip_set_flush_set(s); +		} +	} else { +		s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); +		if (s == NULL) +			return -ENOENT; + +		ip_set_flush_set(s); +	} + +	return 0; +} + +/* Rename a set */ + +static const struct nla_policy +ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { +	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 }, +	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1 }, +	[IPSET_ATTR_SETNAME2]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1 }, +}; + +static int +ip_set_rename(struct sock *ctnl, struct sk_buff *skb, +	      const struct nlmsghdr *nlh, +	      
const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	struct ip_set *set, *s; +	const char *name2; +	ip_set_id_t i; +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL || +		     attr[IPSET_ATTR_SETNAME2] == NULL)) +		return -IPSET_ERR_PROTOCOL; + +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); +	if (set == NULL) +		return -ENOENT; + +	read_lock_bh(&ip_set_ref_lock); +	if (set->ref != 0) { +		ret = -IPSET_ERR_REFERENCED; +		goto out; +	} + +	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); +	for (i = 0; i < inst->ip_set_max; i++) { +		s = ip_set(inst, i); +		if (s != NULL && STREQ(s->name, name2)) { +			ret = -IPSET_ERR_EXIST_SETNAME2; +			goto out; +		} +	} +	strncpy(set->name, name2, IPSET_MAXNAMELEN); + +out: +	read_unlock_bh(&ip_set_ref_lock); +	return ret; +} + +/* Swap two sets so that name/index points to the other. + * References and set names are also swapped. + * + * The commands are serialized by the nfnl mutex and references are + * protected by the ip_set_ref_lock. The kernel interfaces + * do not hold the mutex but the pointer settings are atomic + * so the ip_set_list always contains valid pointers to the sets. + */ + +static int +ip_set_swap(struct sock *ctnl, struct sk_buff *skb, +	    const struct nlmsghdr *nlh, +	    const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	struct ip_set *from, *to; +	ip_set_id_t from_id, to_id; +	char from_name[IPSET_MAXNAMELEN]; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL || +		     attr[IPSET_ATTR_SETNAME2] == NULL)) +		return -IPSET_ERR_PROTOCOL; + +	from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), +			       &from_id); +	if (from == NULL) +		return -ENOENT; + +	to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), +			     &to_id); +	if (to == NULL) +		return -IPSET_ERR_EXIST_SETNAME2; + +	/* Features must not change. +	 * Not an artificial restriction anymore, as we must prevent +	 * possible loops created by swapping in setlist type of sets. 
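+	 * (E.g. a list:set swapped in place of one of its own members
+	 * could otherwise end up containing itself.)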
*/ +	if (!(from->type->features == to->type->features && +	      from->family == to->family)) +		return -IPSET_ERR_TYPE_MISMATCH; + +	strncpy(from_name, from->name, IPSET_MAXNAMELEN); +	strncpy(from->name, to->name, IPSET_MAXNAMELEN); +	strncpy(to->name, from_name, IPSET_MAXNAMELEN); + +	write_lock_bh(&ip_set_ref_lock); +	swap(from->ref, to->ref); +	ip_set(inst, from_id) = to; +	ip_set(inst, to_id) = from; +	write_unlock_bh(&ip_set_ref_lock); + +	return 0; +} + +/* List/save set data */ + +#define DUMP_INIT	0 +#define DUMP_ALL	1 +#define DUMP_ONE	2 +#define DUMP_LAST	3 + +#define DUMP_TYPE(arg)		(((u32)(arg)) & 0x0000FFFF) +#define DUMP_FLAGS(arg)		(((u32)(arg)) >> 16) + +static int +ip_set_dump_done(struct netlink_callback *cb) +{ +	struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; +	if (cb->args[IPSET_CB_ARG0]) { +		pr_debug("release set %s\n", +			 ip_set(inst, cb->args[IPSET_CB_INDEX])->name); +		__ip_set_put_byindex(inst, +			(ip_set_id_t) cb->args[IPSET_CB_INDEX]); +	} +	return 0; +} + +static inline void +dump_attrs(struct nlmsghdr *nlh) +{ +	const struct nlattr *attr; +	int rem; + +	pr_debug("dump nlmsg\n"); +	nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { +		pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); +	} +} + +static int +dump_init(struct netlink_callback *cb, struct ip_set_net *inst) +{ +	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); +	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); +	struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; +	struct nlattr *attr = (void *)nlh + min_len; +	u32 dump_type; +	ip_set_id_t index; + +	/* Second pass, so parser can't fail */ +	nla_parse(cda, IPSET_ATTR_CMD_MAX, +		  attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); + +	/* cb->args[IPSET_CB_NET]:	net namespace +	 *         [IPSET_CB_DUMP]:	dump single set/all sets +	 *         [IPSET_CB_INDEX]: 	set index +	 *         [IPSET_CB_ARG0]:	type specific +	 */ + +	if (cda[IPSET_ATTR_SETNAME]) { +		struct ip_set *set; + +		set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), +				      &index); +		if (set == NULL) +			return -ENOENT; + +		dump_type = DUMP_ONE; +		cb->args[IPSET_CB_INDEX] = index; +	} else +		dump_type = DUMP_ALL; + +	if (cda[IPSET_ATTR_FLAGS]) { +		u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); +		dump_type |= (f << 16); +	} +	cb->args[IPSET_CB_NET] = (unsigned long)inst; +	cb->args[IPSET_CB_DUMP] = dump_type; + +	return 0; +} + +static int +ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) +{ +	ip_set_id_t index = IPSET_INVALID_ID, max; +	struct ip_set *set = NULL; +	struct nlmsghdr *nlh = NULL; +	unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; +	struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); +	u32 dump_type, dump_flags; +	int ret = 0; + +	if (!cb->args[IPSET_CB_DUMP]) { +		ret = dump_init(cb, inst); +		if (ret < 0) { +			nlh = nlmsg_hdr(cb->skb); +			/* We have to create and send the error message +			 * manually :-( */ +			if (nlh->nlmsg_flags & NLM_F_ACK) +				netlink_ack(cb->skb, nlh, ret); +			return ret; +		} +	} + +	if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) +		goto out; + +	dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); +	dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); +	max = dump_type == DUMP_ONE ? 
cb->args[IPSET_CB_INDEX] + 1 +				    : inst->ip_set_max; +dump_last: +	pr_debug("dump type, flag: %u %u index: %ld\n", +		 dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); +	for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { +		index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; +		set = ip_set(inst, index); +		if (set == NULL) { +			if (dump_type == DUMP_ONE) { +				ret = -ENOENT; +				goto out; +			} +			continue; +		} +		/* When dumping all sets, we must dump "sorted" +		 * so that lists (unions of sets) are dumped last. +		 */ +		if (dump_type != DUMP_ONE && +		    ((dump_type == DUMP_ALL) == +		     !!(set->type->features & IPSET_DUMP_LAST))) +			continue; +		pr_debug("List set: %s\n", set->name); +		if (!cb->args[IPSET_CB_ARG0]) { +			/* Start listing: make sure set won't be destroyed */ +			pr_debug("reference set\n"); +			__ip_set_get(set); +		} +		nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, +				cb->nlh->nlmsg_seq, flags, +				IPSET_CMD_LIST); +		if (!nlh) { +			ret = -EMSGSIZE; +			goto release_refcount; +		} +		if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || +		    nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) +			goto nla_put_failure; +		if (dump_flags & IPSET_FLAG_LIST_SETNAME) +			goto next_set; +		switch (cb->args[IPSET_CB_ARG0]) { +		case 0: +			/* Core header data */ +			if (nla_put_string(skb, IPSET_ATTR_TYPENAME, +					   set->type->name) || +			    nla_put_u8(skb, IPSET_ATTR_FAMILY, +				       set->family) || +			    nla_put_u8(skb, IPSET_ATTR_REVISION, +				       set->revision)) +				goto nla_put_failure; +			ret = set->variant->head(set, skb); +			if (ret < 0) +				goto release_refcount; +			if (dump_flags & IPSET_FLAG_LIST_HEADER) +				goto next_set; +			/* Fall through and add elements */ +		default: +			read_lock_bh(&set->lock); +			ret = set->variant->list(set, skb, cb); +			read_unlock_bh(&set->lock); +			if (!cb->args[IPSET_CB_ARG0]) +				/* Set is done, proceed with next one */ +				goto next_set; +			goto release_refcount; +		} +	} +	/* If we dump all sets, continue with dumping last ones */ +	if (dump_type == DUMP_ALL) { +		dump_type = DUMP_LAST; +		cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); +		cb->args[IPSET_CB_INDEX] = 0; +		goto dump_last; +	} +	goto out; + +nla_put_failure: +	ret = -EFAULT; +next_set: +	if (dump_type == DUMP_ONE) +		cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID; +	else +		cb->args[IPSET_CB_INDEX]++; +release_refcount: +	/* If there was an error or set is done, release set */ +	if (ret || !cb->args[IPSET_CB_ARG0]) { +		pr_debug("release set %s\n", ip_set(inst, index)->name); +		__ip_set_put_byindex(inst, index); +		cb->args[IPSET_CB_ARG0] = 0; +	} +out: +	if (nlh) { +		nlmsg_end(skb, nlh); +		pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); +		dump_attrs(nlh); +	} + +	return ret < 0 ? 
ret : skb->len; +} + +static int +ip_set_dump(struct sock *ctnl, struct sk_buff *skb, +	    const struct nlmsghdr *nlh, +	    const struct nlattr * const attr[]) +{ +	if (unlikely(protocol_failed(attr))) +		return -IPSET_ERR_PROTOCOL; + +	{ +		struct netlink_dump_control c = { +			.dump = ip_set_dump_start, +			.done = ip_set_dump_done, +		}; +		return netlink_dump_start(ctnl, skb, nlh, &c); +	} +} + +/* Add, del and test */ + +static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { +	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 }, +	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1 }, +	[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED }, +	[IPSET_ATTR_ADT]	= { .type = NLA_NESTED }, +}; + +static int +call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, +	struct nlattr *tb[], enum ipset_adt adt, +	u32 flags, bool use_lineno) +{ +	int ret; +	u32 lineno = 0; +	bool eexist = flags & IPSET_FLAG_EXIST, retried = false; + +	do { +		write_lock_bh(&set->lock); +		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); +		write_unlock_bh(&set->lock); +		retried = true; +	} while (ret == -EAGAIN && +		 set->variant->resize && +		 (ret = set->variant->resize(set, retried)) == 0); + +	if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) +		return 0; +	if (lineno && use_lineno) { +		/* Error in restore/batch mode: send back lineno */ +		struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); +		struct sk_buff *skb2; +		struct nlmsgerr *errmsg; +		size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); +		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); +		struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; +		struct nlattr *cmdattr; +		u32 *errline; + +		skb2 = nlmsg_new(payload, GFP_KERNEL); +		if (skb2 == NULL) +			return -ENOMEM; +		rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, +				  nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); +		errmsg = nlmsg_data(rep); +		errmsg->error = ret; +		memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); +		cmdattr = (void *)&errmsg->msg + min_len; + +		nla_parse(cda, IPSET_ATTR_CMD_MAX, +			  cmdattr, nlh->nlmsg_len - min_len, +			  ip_set_adt_policy); + +		errline = nla_data(cda[IPSET_ATTR_LINENO]); + +		*errline = lineno; + +		netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +		/* Signal netlink not to send its ACK/errmsg.  
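+		 * netlink_rcv_skb() skips the automatic ack when a handler +		 * returns -EINTR, so only the NLMSG_ERROR message built above, +		 * carrying the offending IPSET_ATTR_LINENO, reaches userspace.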
*/ +	return -EINTR; +	} + +	return ret; +} + +static int +ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, +	    const struct nlmsghdr *nlh, +	    const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	struct ip_set *set; +	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; +	const struct nlattr *nla; +	u32 flags = flag_exist(nlh); +	bool use_lineno; +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL || +		     !((attr[IPSET_ATTR_DATA] != NULL) ^ +		       (attr[IPSET_ATTR_ADT] != NULL)) || +		     (attr[IPSET_ATTR_DATA] != NULL && +		      !flag_nested(attr[IPSET_ATTR_DATA])) || +		     (attr[IPSET_ATTR_ADT] != NULL && +		      (!flag_nested(attr[IPSET_ATTR_ADT]) || +		       attr[IPSET_ATTR_LINENO] == NULL)))) +		return -IPSET_ERR_PROTOCOL; + +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); +	if (set == NULL) +		return -ENOENT; + +	use_lineno = !!attr[IPSET_ATTR_LINENO]; +	if (attr[IPSET_ATTR_DATA]) { +		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, +				     attr[IPSET_ATTR_DATA], +				     set->type->adt_policy)) +			return -IPSET_ERR_PROTOCOL; +		ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, +			      use_lineno); +	} else { +		int nla_rem; + +		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { +			memset(tb, 0, sizeof(tb)); +			if (nla_type(nla) != IPSET_ATTR_DATA || +			    !flag_nested(nla) || +			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, +					     set->type->adt_policy)) +				return -IPSET_ERR_PROTOCOL; +			ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, +				      flags, use_lineno); +			if (ret < 0) +				return ret; +		} +	} +	return ret; +} + +static int +ip_set_udel(struct sock *ctnl, struct sk_buff *skb, +	    const struct nlmsghdr *nlh, +	    const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	struct ip_set *set; +	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; +	const struct nlattr *nla; +	u32 flags = flag_exist(nlh); +	bool use_lineno; +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL || +		     !((attr[IPSET_ATTR_DATA] != NULL) ^ +		       (attr[IPSET_ATTR_ADT] != NULL)) || +		     (attr[IPSET_ATTR_DATA] != NULL && +		      !flag_nested(attr[IPSET_ATTR_DATA])) || +		     (attr[IPSET_ATTR_ADT] != NULL && +		      (!flag_nested(attr[IPSET_ATTR_ADT]) || +		       attr[IPSET_ATTR_LINENO] == NULL)))) +		return -IPSET_ERR_PROTOCOL; + +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); +	if (set == NULL) +		return -ENOENT; + +	use_lineno = !!attr[IPSET_ATTR_LINENO]; +	if (attr[IPSET_ATTR_DATA]) { +		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, +				     attr[IPSET_ATTR_DATA], +				     set->type->adt_policy)) +			return -IPSET_ERR_PROTOCOL; +		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, +			      use_lineno); +	} else { +		int nla_rem; + +		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { +			memset(tb, 0, sizeof(tb)); +			if (nla_type(nla) != IPSET_ATTR_DATA || +			    !flag_nested(nla) || +			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, +					     set->type->adt_policy)) +				return -IPSET_ERR_PROTOCOL; +			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, +				      flags, use_lineno); +			if (ret < 0) +				return ret; +		} +	} +	return ret; +} + +static int +ip_set_utest(struct sock *ctnl, struct sk_buff *skb, +	     const struct nlmsghdr *nlh, +	     const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = 
ip_set_pernet(sock_net(ctnl)); +	struct ip_set *set; +	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL || +		     attr[IPSET_ATTR_DATA] == NULL || +		     !flag_nested(attr[IPSET_ATTR_DATA]))) +		return -IPSET_ERR_PROTOCOL; + +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); +	if (set == NULL) +		return -ENOENT; + +	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], +			     set->type->adt_policy)) +		return -IPSET_ERR_PROTOCOL; + +	read_lock_bh(&set->lock); +	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); +	read_unlock_bh(&set->lock); +	/* Userspace can't trigger element to be re-added */ +	if (ret == -EAGAIN) +		ret = 1; + +	return ret > 0 ? 0 : -IPSET_ERR_EXIST; +} + +/* Get the header data of a set */ + +static int +ip_set_header(struct sock *ctnl, struct sk_buff *skb, +	      const struct nlmsghdr *nlh, +	      const struct nlattr * const attr[]) +{ +	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); +	const struct ip_set *set; +	struct sk_buff *skb2; +	struct nlmsghdr *nlh2; +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_SETNAME] == NULL)) +		return -IPSET_ERR_PROTOCOL; + +	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); +	if (set == NULL) +		return -ENOENT; + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, +			 IPSET_CMD_HEADER); +	if (!nlh2) +		goto nlmsg_failure; +	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || +	    nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || +	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || +	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || +	    nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision)) +		goto nla_put_failure; +	nlmsg_end(skb2, nlh2); + +	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (ret < 0) +		return ret; + +	return 0; + +nla_put_failure: +	nlmsg_cancel(skb2, nlh2); +nlmsg_failure: +	kfree_skb(skb2); +	return -EMSGSIZE; +} + +/* Get type data */ + +static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { +	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 }, +	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING, +				    .len = IPSET_MAXNAMELEN - 1 }, +	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 }, +}; + +static int +ip_set_type(struct sock *ctnl, struct sk_buff *skb, +	    const struct nlmsghdr *nlh, +	    const struct nlattr * const attr[]) +{ +	struct sk_buff *skb2; +	struct nlmsghdr *nlh2; +	u8 family, min, max; +	const char *typename; +	int ret = 0; + +	if (unlikely(protocol_failed(attr) || +		     attr[IPSET_ATTR_TYPENAME] == NULL || +		     attr[IPSET_ATTR_FAMILY] == NULL)) +		return -IPSET_ERR_PROTOCOL; + +	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); +	typename = nla_data(attr[IPSET_ATTR_TYPENAME]); +	ret = find_set_type_minmax(typename, family, &min, &max); +	if (ret) +		return ret; + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, +			 IPSET_CMD_TYPE); +	if (!nlh2) +		goto nlmsg_failure; +	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || +	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || +	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || +	    nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || +	    nla_put_u8(skb2, 
IPSET_ATTR_REVISION_MIN, min)) +		goto nla_put_failure; +	nlmsg_end(skb2, nlh2); + +	pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); +	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (ret < 0) +		return ret; + +	return 0; + +nla_put_failure: +	nlmsg_cancel(skb2, nlh2); +nlmsg_failure: +	kfree_skb(skb2); +	return -EMSGSIZE; +} + +/* Get protocol version */ + +static const struct nla_policy +ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { +	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 }, +}; + +static int +ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, +		const struct nlmsghdr *nlh, +		const struct nlattr * const attr[]) +{ +	struct sk_buff *skb2; +	struct nlmsghdr *nlh2; +	int ret = 0; + +	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) +		return -IPSET_ERR_PROTOCOL; + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, +			 IPSET_CMD_PROTOCOL); +	if (!nlh2) +		goto nlmsg_failure; +	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) +		goto nla_put_failure; +	nlmsg_end(skb2, nlh2); + +	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (ret < 0) +		return ret; + +	return 0; + +nla_put_failure: +	nlmsg_cancel(skb2, nlh2); +nlmsg_failure: +	kfree_skb(skb2); +	return -EMSGSIZE; +} + +static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { +	[IPSET_CMD_NONE]	= { +		.call		= ip_set_none, +		.attr_count	= IPSET_ATTR_CMD_MAX, +	}, +	[IPSET_CMD_CREATE]	= { +		.call		= ip_set_create, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_create_policy, +	}, +	[IPSET_CMD_DESTROY]	= { +		.call		= ip_set_destroy, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname_policy, +	}, +	[IPSET_CMD_FLUSH]	= { +		.call		= ip_set_flush, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname_policy, +	}, +	[IPSET_CMD_RENAME]	= { +		.call		= ip_set_rename, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname2_policy, +	}, +	[IPSET_CMD_SWAP]	= { +		.call		= ip_set_swap, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname2_policy, +	}, +	[IPSET_CMD_LIST]	= { +		.call		= ip_set_dump, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname_policy, +	}, +	[IPSET_CMD_SAVE]	= { +		.call		= ip_set_dump, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname_policy, +	}, +	[IPSET_CMD_ADD]	= { +		.call		= ip_set_uadd, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_adt_policy, +	}, +	[IPSET_CMD_DEL]	= { +		.call		= ip_set_udel, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_adt_policy, +	}, +	[IPSET_CMD_TEST]	= { +		.call		= ip_set_utest, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_adt_policy, +	}, +	[IPSET_CMD_HEADER]	= { +		.call		= ip_set_header, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_setname_policy, +	}, +	[IPSET_CMD_TYPE]	= { +		.call		= ip_set_type, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_type_policy, +	}, +	[IPSET_CMD_PROTOCOL]	= { +		.call		= ip_set_protocol, +		.attr_count	= IPSET_ATTR_CMD_MAX, +		.policy		= ip_set_protocol_policy, +	}, +}; + +static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { +	.name		= "ip_set", +	.subsys_id	= NFNL_SUBSYS_IPSET, +	.cb_count	= IPSET_MSG_MAX, +	.cb		= ip_set_netlink_subsys_cb, +}; + +/* Interface to iptables/ip6tables */ + +static int +ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int 
*len) +{ +	unsigned int *op; +	void *data; +	int copylen = *len, ret = 0; +	struct net *net = sock_net(sk); +	struct ip_set_net *inst = ip_set_pernet(net); + +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +		return -EPERM; +	if (optval != SO_IP_SET) +		return -EBADF; +	if (*len < sizeof(unsigned int)) +		return -EINVAL; + +	data = vmalloc(*len); +	if (!data) +		return -ENOMEM; +	if (copy_from_user(data, user, *len) != 0) { +		ret = -EFAULT; +		goto done; +	} +	op = (unsigned int *) data; + +	if (*op < IP_SET_OP_VERSION) { +		/* Check the version at the beginning of operations */ +		struct ip_set_req_version *req_version = data; +		if (req_version->version != IPSET_PROTOCOL) { +			ret = -EPROTO; +			goto done; +		} +	} + +	switch (*op) { +	case IP_SET_OP_VERSION: { +		struct ip_set_req_version *req_version = data; + +		if (*len != sizeof(struct ip_set_req_version)) { +			ret = -EINVAL; +			goto done; +		} + +		req_version->version = IPSET_PROTOCOL; +		ret = copy_to_user(user, req_version, +				   sizeof(struct ip_set_req_version)); +		goto done; +	} +	case IP_SET_OP_GET_BYNAME: { +		struct ip_set_req_get_set *req_get = data; +		ip_set_id_t id; + +		if (*len != sizeof(struct ip_set_req_get_set)) { +			ret = -EINVAL; +			goto done; +		} +		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; +		nfnl_lock(NFNL_SUBSYS_IPSET); +		find_set_and_id(inst, req_get->set.name, &id); +		req_get->set.index = id; +		nfnl_unlock(NFNL_SUBSYS_IPSET); +		goto copy; +	} +	case IP_SET_OP_GET_FNAME: { +		struct ip_set_req_get_set_family *req_get = data; +		ip_set_id_t id; + +		if (*len != sizeof(struct ip_set_req_get_set_family)) { +			ret = -EINVAL; +			goto done; +		} +		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; +		nfnl_lock(NFNL_SUBSYS_IPSET); +		find_set_and_id(inst, req_get->set.name, &id); +		req_get->set.index = id; +		if (id != IPSET_INVALID_ID) +			req_get->family = ip_set(inst, id)->family; +		nfnl_unlock(NFNL_SUBSYS_IPSET); +		goto copy; +	} +	case IP_SET_OP_GET_BYINDEX: { +		struct ip_set_req_get_set *req_get = data; +		struct ip_set *set; + +		if (*len != sizeof(struct ip_set_req_get_set) || +		    req_get->set.index >= inst->ip_set_max) { +			ret = -EINVAL; +			goto done; +		} +		nfnl_lock(NFNL_SUBSYS_IPSET); +		set = ip_set(inst, req_get->set.index); +		strncpy(req_get->set.name, set ? set->name : "", +			IPSET_MAXNAMELEN); +		nfnl_unlock(NFNL_SUBSYS_IPSET); +		goto copy; +	} +	default: +		ret = -EBADMSG; +		goto done; +	}	/* end of switch(op) */ + +copy: +	ret = copy_to_user(user, data, copylen); + +done: +	vfree(data); +	if (ret > 0) +		ret = 0; +	return ret; +} + +static struct nf_sockopt_ops so_set __read_mostly = { +	.pf		= PF_INET, +	.get_optmin	= SO_IP_SET, +	.get_optmax	= SO_IP_SET + 1, +	.get		= &ip_set_sockfn_get, +	.owner		= THIS_MODULE, +}; + +static int __net_init +ip_set_net_init(struct net *net) +{ +	struct ip_set_net *inst = ip_set_pernet(net); +	struct ip_set **list; + +	inst->ip_set_max = max_sets ? 
max_sets : CONFIG_IP_SET_MAX; +	if (inst->ip_set_max >= IPSET_INVALID_ID) +		inst->ip_set_max = IPSET_INVALID_ID - 1; + +	list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); +	if (!list) +		return -ENOMEM; +	inst->is_deleted = 0; +	rcu_assign_pointer(inst->ip_set_list, list); +	return 0; +} + +static void __net_exit +ip_set_net_exit(struct net *net) +{ +	struct ip_set_net *inst = ip_set_pernet(net); + +	struct ip_set *set = NULL; +	ip_set_id_t i; + +	inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + +	for (i = 0; i < inst->ip_set_max; i++) { +		set = ip_set(inst, i); +		if (set != NULL) +			ip_set_destroy_set(inst, i); +	} +	kfree(rcu_dereference_protected(inst->ip_set_list, 1)); +} + +static struct pernet_operations ip_set_net_ops = { +	.init	= ip_set_net_init, +	.exit   = ip_set_net_exit, +	.id	= &ip_set_net_id, +	.size	= sizeof(struct ip_set_net) +}; + + +static int __init +ip_set_init(void) +{ +	int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); +	if (ret != 0) { +		pr_err("ip_set: cannot register with nfnetlink.\n"); +		return ret; +	} +	ret = nf_register_sockopt(&so_set); +	if (ret != 0) { +		pr_err("SO_SET registry failed: %d\n", ret); +		nfnetlink_subsys_unregister(&ip_set_netlink_subsys); +		return ret; +	} +	ret = register_pernet_subsys(&ip_set_net_ops); +	if (ret) { +		pr_err("ip_set: cannot register pernet_subsys.\n"); +		nf_unregister_sockopt(&so_set); +		nfnetlink_subsys_unregister(&ip_set_netlink_subsys); +		return ret; +	} +	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); +	return 0; +} + +static void __exit +ip_set_fini(void) +{ +	unregister_pernet_subsys(&ip_set_net_ops); +	nf_unregister_sockopt(&so_set); +	nfnetlink_subsys_unregister(&ip_set_netlink_subsys); +	pr_debug("these are the famous last words\n"); +} + +module_init(ip_set_init); +module_exit(ip_set_fini); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c new file mode 100644 index 00000000000..29fb01ddff9 --- /dev/null +++ b/net/netfilter/ipset/ip_set_getport.c @@ -0,0 +1,174 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Get Layer-4 data from the packets */ + +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/sctp.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/ip.h> +#include <net/ipv6.h> + +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/export.h> + +/* We must handle non-linear skbs */ +static bool +get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, +	 bool src, __be16 *port, u8 *proto) +{ +	switch (protocol) { +	case IPPROTO_TCP: { +		struct tcphdr _tcph; +		const struct tcphdr *th; + +		th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); +		if (th == NULL) +			/* No choice either */ +			return false; + +		*port = src ? th->source : th->dest; +		break; +	} +	case IPPROTO_SCTP: { +		sctp_sctphdr_t _sh; +		const sctp_sctphdr_t *sh; + +		sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); +		if (sh == NULL) +			/* No choice either */ +			return false; + +		*port = src ? 
sh->source : sh->dest; +		break; +	} +	case IPPROTO_UDP: +	case IPPROTO_UDPLITE: { +		struct udphdr _udph; +		const struct udphdr *uh; + +		uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); +		if (uh == NULL) +			/* No choice either */ +			return false; + +		*port = src ? uh->source : uh->dest; +		break; +	} +	case IPPROTO_ICMP: { +		struct icmphdr _ich; +		const struct icmphdr *ic; + +		ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); +		if (ic == NULL) +			return false; + +		*port = (__force __be16)htons((ic->type << 8) | ic->code); +		break; +	} +	case IPPROTO_ICMPV6: { +		struct icmp6hdr _ich; +		const struct icmp6hdr *ic; + +		ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); +		if (ic == NULL) +			return false; + +		*port = (__force __be16) +			htons((ic->icmp6_type << 8) | ic->icmp6_code); +		break; +	} +	default: +		break; +	} +	*proto = protocol; + +	return true; +} + +bool +ip_set_get_ip4_port(const struct sk_buff *skb, bool src, +		    __be16 *port, u8 *proto) +{ +	const struct iphdr *iph = ip_hdr(skb); +	unsigned int protooff = ip_hdrlen(skb); +	int protocol = iph->protocol; + +	/* See comments at tcp_match in ip_tables.c */ +	if (protocol <= 0) +		return false; + +	if (ntohs(iph->frag_off) & IP_OFFSET) +		switch (protocol) { +		case IPPROTO_TCP: +		case IPPROTO_SCTP: +		case IPPROTO_UDP: +		case IPPROTO_UDPLITE: +		case IPPROTO_ICMP: +			/* Port info not available for fragment offset > 0 */ +			return false; +		default: +			/* Other protocols don't have ports, +			   so we can match fragments */ +			*proto = protocol; +			return true; +		} + +	return get_port(skb, protocol, protooff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip4_port); + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +bool +ip_set_get_ip6_port(const struct sk_buff *skb, bool src, +		    __be16 *port, u8 *proto) +{ +	int protoff; +	u8 nexthdr; +	__be16 frag_off = 0; + +	nexthdr = ipv6_hdr(skb)->nexthdr; +	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, +				   &frag_off); +	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) +		return false; + +	return get_port(skb, nexthdr, protoff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); +#endif + +bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ +	bool ret; +	u8 proto; + +	switch (pf) { +	case NFPROTO_IPV4: +		ret = ip_set_get_ip4_port(skb, src, port, &proto); +		break; +	case NFPROTO_IPV6: +		ret = ip_set_get_ip6_port(skb, src, port, &proto); +		break; +	default: +		return false; +	} +	if (!ret) +		return ret; +	switch (proto) { +	case IPPROTO_TCP: +	case IPPROTO_UDP: +		return true; +	default: +		return false; +	} +} +EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h new file mode 100644 index 00000000000..61c7fb05280 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -0,0 +1,1160 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _IP_SET_HASH_GEN_H +#define _IP_SET_HASH_GEN_H + +#include <linux/rcupdate.h> +#include <linux/jhash.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#ifndef rcu_dereference_bh +#define rcu_dereference_bh(p)	rcu_dereference(p) +#endif + +#define rcu_dereference_bh_nfnl(p)	rcu_dereference_bh_check(p, 1) + +/* Hashing which uses arrays to resolve clashing. The hash table is resized + * (doubled) when searching becomes too long. + * Internally jhash is used with the assumption that the size of the + * stored data is a multiple of sizeof(u32). If storage supports timeout, + * the timeout field must be the last one in the data structure - that field + * is ignored when computing the hash key. + * + * Readers and resizing + * + * Resizing can be triggered by userspace command only, and those + * are serialized by the nfnl mutex. During resizing the set is + * read-locked, so the only possible concurrent operations are + * the kernel side readers. Those must be protected by proper RCU locking. + */ + +/* Number of elements to store in an initial array block */ +#define AHASH_INIT_SIZE			4 +/* Max number of elements to store in an array block */ +#define AHASH_MAX_SIZE			(3*AHASH_INIT_SIZE) + +/* Max number of elements can be tuned */ +#ifdef IP_SET_HASH_WITH_MULTI +#define AHASH_MAX(h)			((h)->ahash_max) + +static inline u8 +tune_ahash_max(u8 curr, u32 multi) +{ +	u32 n; + +	if (multi < curr) +		return curr; + +	n = curr + AHASH_INIT_SIZE; +	/* Currently, at listing one hash bucket must fit into a message. +	 * Therefore we have a hard limit here. +	 */ +	return n > curr && n <= 64 ? n : curr; +} +#define TUNE_AHASH_MAX(h, multi)	\ +	((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#else +#define AHASH_MAX(h)			AHASH_MAX_SIZE +#define TUNE_AHASH_MAX(h, multi) +#endif + +/* A hash bucket */ +struct hbucket { +	void *value;		/* the array of the values */ +	u8 size;		/* size of the array */ +	u8 pos;			/* position of the first free entry */ +}; + +/* The hash table: the table size stored here in order to make resizing easy */ +struct htable { +	u8 htable_bits;		/* size of hash table == 2^htable_bits */ +	struct hbucket bucket[0]; /* hashtable buckets */ +}; + +#define hbucket(h, i)		(&((h)->bucket[i])) + +#ifndef IPSET_NET_COUNT +#define IPSET_NET_COUNT		1 +#endif + +/* Book-keeping of the prefixes added to the set */ +struct net_prefixes { +	u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ +	u8 cidr[IPSET_NET_COUNT];  /* the different cidr values in the set */ +}; + +/* Compute the hash table size */ +static size_t +htable_size(u8 hbits) +{ +	size_t hsize; + +	/* We must fit both into u32 in jhash and size_t */ +	if (hbits > 31) +		return 0; +	hsize = jhash_size(hbits); +	if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) +	    < hsize) +		return 0; + +	return hsize * sizeof(struct hbucket) + sizeof(struct htable); +} + +/* Compute htable_bits from the user input parameter hashsize */ +static u8 +htable_bits(u32 hashsize) +{ +	/* Assume that hashsize == 2^htable_bits */ +	u8 bits = fls(hashsize - 1); +	if (jhash_size(bits) != hashsize) +		/* Round up to the first 2^n value */ +		bits = fls(hashsize); + +	return bits; +} + +static int +hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) +{ +	if (n->pos >= n->size) { +		void *tmp; + +		if (n->size >= ahash_max) +			/* Trigger rehashing */ +			return -EAGAIN; + +		tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, +			      GFP_ATOMIC); +		if (!tmp) +			return -ENOMEM; +		if 
(n->size) { +			memcpy(tmp, n->value, n->size * dsize); +			kfree(n->value); +		} +		n->value = tmp; +		n->size += AHASH_INIT_SIZE; +	} +	return 0; +} + +#ifdef IP_SET_HASH_WITH_NETS +#if IPSET_NET_COUNT > 1 +#define __CIDR(cidr, i)		(cidr[i]) +#else +#define __CIDR(cidr, i)		(cidr) +#endif +#ifdef IP_SET_HASH_WITH_NETS_PACKED +/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */ +#define CIDR(cidr, i)		(__CIDR(cidr, i) + 1) +#else +#define CIDR(cidr, i)		(__CIDR(cidr, i)) +#endif + +#define SET_HOST_MASK(family)	(family == AF_INET ? 32 : 128) + +#ifdef IP_SET_HASH_WITH_MULTI +#define NLEN(family)		(SET_HOST_MASK(family) + 1) +#else +#define NLEN(family)		SET_HOST_MASK(family) +#endif + +#else +#define NLEN(family)		0 +#endif /* IP_SET_HASH_WITH_NETS */ + +#endif /* _IP_SET_HASH_GEN_H */ + +/* Family dependent templates */ + +#undef ahash_data +#undef mtype_data_equal +#undef mtype_do_data_match +#undef mtype_data_set_flags +#undef mtype_data_reset_flags +#undef mtype_data_netmask +#undef mtype_data_list +#undef mtype_data_next +#undef mtype_elem + +#undef mtype_ahash_destroy +#undef mtype_ext_cleanup +#undef mtype_add_cidr +#undef mtype_del_cidr +#undef mtype_ahash_memsize +#undef mtype_flush +#undef mtype_destroy +#undef mtype_gc_init +#undef mtype_same_set +#undef mtype_kadt +#undef mtype_uadt +#undef mtype + +#undef mtype_add +#undef mtype_del +#undef mtype_test_cidrs +#undef mtype_test +#undef mtype_expire +#undef mtype_resize +#undef mtype_head +#undef mtype_list +#undef mtype_gc +#undef mtype_gc_init +#undef mtype_variant +#undef mtype_data_match + +#undef HKEY + +#define mtype_data_equal	IPSET_TOKEN(MTYPE, _data_equal) +#ifdef IP_SET_HASH_WITH_NETS +#define mtype_do_data_match	IPSET_TOKEN(MTYPE, _do_data_match) +#else +#define mtype_do_data_match(d)	1 +#endif +#define mtype_data_set_flags	IPSET_TOKEN(MTYPE, _data_set_flags) +#define mtype_data_reset_elem	IPSET_TOKEN(MTYPE, _data_reset_elem) +#define mtype_data_reset_flags	IPSET_TOKEN(MTYPE, _data_reset_flags) +#define mtype_data_netmask	IPSET_TOKEN(MTYPE, _data_netmask) +#define mtype_data_list		IPSET_TOKEN(MTYPE, _data_list) +#define mtype_data_next		IPSET_TOKEN(MTYPE, _data_next) +#define mtype_elem		IPSET_TOKEN(MTYPE, _elem) +#define mtype_ahash_destroy	IPSET_TOKEN(MTYPE, _ahash_destroy) +#define mtype_ext_cleanup	IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_add_cidr		IPSET_TOKEN(MTYPE, _add_cidr) +#define mtype_del_cidr		IPSET_TOKEN(MTYPE, _del_cidr) +#define mtype_ahash_memsize	IPSET_TOKEN(MTYPE, _ahash_memsize) +#define mtype_flush		IPSET_TOKEN(MTYPE, _flush) +#define mtype_destroy		IPSET_TOKEN(MTYPE, _destroy) +#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_same_set		IPSET_TOKEN(MTYPE, _same_set) +#define mtype_kadt		IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt		IPSET_TOKEN(MTYPE, _uadt) +#define mtype			MTYPE + +#define mtype_add		IPSET_TOKEN(MTYPE, _add) +#define mtype_del		IPSET_TOKEN(MTYPE, _del) +#define mtype_test_cidrs	IPSET_TOKEN(MTYPE, _test_cidrs) +#define mtype_test		IPSET_TOKEN(MTYPE, _test) +#define mtype_expire		IPSET_TOKEN(MTYPE, _expire) +#define mtype_resize		IPSET_TOKEN(MTYPE, _resize) +#define mtype_head		IPSET_TOKEN(MTYPE, _head) +#define mtype_list		IPSET_TOKEN(MTYPE, _list) +#define mtype_gc		IPSET_TOKEN(MTYPE, _gc) +#define mtype_variant		IPSET_TOKEN(MTYPE, _variant) +#define mtype_data_match	IPSET_TOKEN(MTYPE, _data_match) + +#ifndef HKEY_DATALEN +#define HKEY_DATALEN		sizeof(struct mtype_elem) +#endif + +#define HKEY(data, initval, htable_bits)			\ 
+(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval)	\ +	& jhash_mask(htable_bits)) + +#ifndef htype +#define htype			HTYPE + +/* The generic hash structure */ +struct htype { +	struct htable __rcu *table; /* the hash table */ +	u32 maxelem;		/* max elements in the hash */ +	u32 elements;		/* current element (vs timeout) */ +	u32 initval;		/* random jhash init value */ +#ifdef IP_SET_HASH_WITH_MARKMASK +	u32 markmask;		/* markmask value for mark mask to store */ +#endif +	struct timer_list gc;	/* garbage collection when timeout enabled */ +	struct mtype_elem next; /* temporary storage for uadd */ +#ifdef IP_SET_HASH_WITH_MULTI +	u8 ahash_max;		/* max elements in an array block */ +#endif +#ifdef IP_SET_HASH_WITH_NETMASK +	u8 netmask;		/* netmask value for subnets to store */ +#endif +#ifdef IP_SET_HASH_WITH_RBTREE +	struct rb_root rbtree; +#endif +#ifdef IP_SET_HASH_WITH_NETS +	struct net_prefixes nets[0]; /* book-keeping of prefixes */ +#endif +}; +#endif + +#ifdef IP_SET_HASH_WITH_NETS +/* Network cidr size book keeping when the hash stores different + * sized networks */ +static void +mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) +{ +	int i, j; + +	/* Add in increasing prefix order, so larger cidr first */ +	for (i = 0, j = -1; i < nets_length && h->nets[i].nets[n]; i++) { +		if (j != -1) +			continue; +		else if (h->nets[i].cidr[n] < cidr) +			j = i; +		else if (h->nets[i].cidr[n] == cidr) { +			h->nets[i].nets[n]++; +			return; +		} +	} +	if (j != -1) { +		for (; i > j; i--) { +			h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; +			h->nets[i].nets[n] = h->nets[i - 1].nets[n]; +		} +	} +	h->nets[i].cidr[n] = cidr; +	h->nets[i].nets[n] = 1; +} + +static void +mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) +{ +	u8 i, j, net_end = nets_length - 1; + +	for (i = 0; i < nets_length; i++) { +	        if (h->nets[i].cidr[n] != cidr) +	                continue; +                if (h->nets[i].nets[n] > 1 || i == net_end || +                    h->nets[i + 1].nets[n] == 0) { +                        h->nets[i].nets[n]--; +                        return; +                } +                for (j = i; j < net_end && h->nets[j].nets[n]; j++) { +		        h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; +		        h->nets[j].nets[n] = h->nets[j + 1].nets[n]; +                } +                h->nets[j].nets[n] = 0; +                return; +	} +} +#endif + +/* Calculate the actual memory size of the set data */ +static size_t +mtype_ahash_memsize(const struct htype *h, const struct htable *t, +		    u8 nets_length, size_t dsize) +{ +	u32 i; +	size_t memsize = sizeof(*h) +			 + sizeof(*t) +#ifdef IP_SET_HASH_WITH_NETS +			 + sizeof(struct net_prefixes) * nets_length +#endif +			 + jhash_size(t->htable_bits) * sizeof(struct hbucket); + +	for (i = 0; i < jhash_size(t->htable_bits); i++) +		memsize += t->bucket[i].size * dsize; + +	return memsize; +} + +/* Get the ith element from the array block n */ +#define ahash_data(n, i, dsize)	\ +	((struct mtype_elem *)((n)->value + ((i) * (dsize)))) + +static void +mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) +{ +	int i; + +	for (i = 0; i < n->pos; i++) +		ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); +} + +/* Flush a hash type of set: destroy all elements */ +static void +mtype_flush(struct ip_set *set) +{ +	struct htype *h = set->data; +	struct htable *t; +	struct hbucket *n; +	u32 i; + +	t = rcu_dereference_bh_nfnl(h->table); +	for (i = 0; i < jhash_size(t->htable_bits); i++) { +		n = hbucket(t, i); +		
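/* Only buckets with an allocated element array need cleanup: run +		 * the extension destructors when the set has any, then free the +		 * array. +		 */ +		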
if (n->size) { +			if (set->extensions & IPSET_EXT_DESTROY) +				mtype_ext_cleanup(set, n); +			n->size = n->pos = 0; +			/* FIXME: use slab cache */ +			kfree(n->value); +		} +	} +#ifdef IP_SET_HASH_WITH_NETS +	memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); +#endif +	h->elements = 0; +} + +/* Destroy the hashtable part of the set */ +static void +mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) +{ +	struct hbucket *n; +	u32 i; + +	for (i = 0; i < jhash_size(t->htable_bits); i++) { +		n = hbucket(t, i); +		if (n->size) { +			if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) +				mtype_ext_cleanup(set, n); +			/* FIXME: use slab cache */ +			kfree(n->value); +		} +	} + +	ip_set_free(t); +} + +/* Destroy a hash type of set */ +static void +mtype_destroy(struct ip_set *set) +{ +	struct htype *h = set->data; + +	if (set->extensions & IPSET_EXT_TIMEOUT) +		del_timer_sync(&h->gc); + +	mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); +#ifdef IP_SET_HASH_WITH_RBTREE +	rbtree_destroy(&h->rbtree); +#endif +	kfree(h); + +	set->data = NULL; +} + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ +	struct htype *h = set->data; + +	init_timer(&h->gc); +	h->gc.data = (unsigned long) set; +	h->gc.function = gc; +	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +	add_timer(&h->gc); +	pr_debug("gc initialized, run in every %u\n", +		 IPSET_GC_PERIOD(set->timeout)); +} + +static bool +mtype_same_set(const struct ip_set *a, const struct ip_set *b) +{ +	const struct htype *x = a->data; +	const struct htype *y = b->data; + +	/* Resizing changes htable_bits, so we ignore it */ +	return x->maxelem == y->maxelem && +	       a->timeout == b->timeout && +#ifdef IP_SET_HASH_WITH_NETMASK +	       x->netmask == y->netmask && +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	       x->markmask == y->markmask && +#endif +	       a->extensions == b->extensions; +} + +/* Delete expired elements from the hashtable */ +static void +mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) +{ +	struct htable *t; +	struct hbucket *n; +	struct mtype_elem *data; +	u32 i; +	int j; +#ifdef IP_SET_HASH_WITH_NETS +	u8 k; +#endif + +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table); +	for (i = 0; i < jhash_size(t->htable_bits); i++) { +		n = hbucket(t, i); +		for (j = 0; j < n->pos; j++) { +			data = ahash_data(n, j, dsize); +			if (ip_set_timeout_expired(ext_timeout(data, set))) { +				pr_debug("expired %u/%u\n", i, j); +#ifdef IP_SET_HASH_WITH_NETS +				for (k = 0; k < IPSET_NET_COUNT; k++) +					mtype_del_cidr(h, CIDR(data->cidr, k), +						       nets_length, k); +#endif +				ip_set_ext_destroy(set, data); +				if (j != n->pos - 1) +					/* Not last one */ +					memcpy(data, +					       ahash_data(n, n->pos - 1, dsize), +					       dsize); +				n->pos--; +				h->elements--; +			} +		} +		if (n->pos + AHASH_INIT_SIZE < n->size) { +			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) +					    * dsize, +					    GFP_ATOMIC); +			if (!tmp) +				/* Still try to delete expired elements */ +				continue; +			n->size -= AHASH_INIT_SIZE; +			memcpy(tmp, n->value, n->size * dsize); +			kfree(n->value); +			n->value = tmp; +		} +	} +	rcu_read_unlock_bh(); +} + +static void +mtype_gc(unsigned long ul_set) +{ +	struct ip_set *set = (struct ip_set *) ul_set; +	struct htype *h = set->data; + +	pr_debug("called\n"); +	write_lock_bh(&set->lock); +	mtype_expire(set, h, NLEN(set->family), set->dsize); +	
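/* Sweep finished: drop the lock, then re-arm the timer below for +	 * the next gc period. +	 */ +	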
write_unlock_bh(&set->lock); + +	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +	add_timer(&h->gc); +} + +/* Resize a hash: create a new hash table with doubling the hashsize + * and inserting the elements to it. Repeat until we succeed or + * fail due to memory pressures. */ +static int +mtype_resize(struct ip_set *set, bool retried) +{ +	struct htype *h = set->data; +	struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); +	u8 htable_bits = orig->htable_bits; +#ifdef IP_SET_HASH_WITH_NETS +	u8 flags; +#endif +	struct mtype_elem *data; +	struct mtype_elem *d; +	struct hbucket *n, *m; +	u32 i, j; +	int ret; + +	/* Try to cleanup once */ +	if (SET_WITH_TIMEOUT(set) && !retried) { +		i = h->elements; +		write_lock_bh(&set->lock); +		mtype_expire(set, set->data, NLEN(set->family), set->dsize); +		write_unlock_bh(&set->lock); +		if (h->elements < i) +			return 0; +	} + +retry: +	ret = 0; +	htable_bits++; +	pr_debug("attempt to resize set %s from %u to %u, t %p\n", +		 set->name, orig->htable_bits, htable_bits, orig); +	if (!htable_bits) { +		/* In case we have plenty of memory :-) */ +		pr_warning("Cannot increase the hashsize of set %s further\n", +			   set->name); +		return -IPSET_ERR_HASH_FULL; +	} +	t = ip_set_alloc(sizeof(*t) +			 + jhash_size(htable_bits) * sizeof(struct hbucket)); +	if (!t) +		return -ENOMEM; +	t->htable_bits = htable_bits; + +	read_lock_bh(&set->lock); +	for (i = 0; i < jhash_size(orig->htable_bits); i++) { +		n = hbucket(orig, i); +		for (j = 0; j < n->pos; j++) { +			data = ahash_data(n, j, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS +			flags = 0; +			mtype_data_reset_flags(data, &flags); +#endif +			m = hbucket(t, HKEY(data, h->initval, htable_bits)); +			ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); +			if (ret < 0) { +#ifdef IP_SET_HASH_WITH_NETS +				mtype_data_reset_flags(data, &flags); +#endif +				read_unlock_bh(&set->lock); +				mtype_ahash_destroy(set, t, false); +				if (ret == -EAGAIN) +					goto retry; +				return ret; +			} +			d = ahash_data(m, m->pos++, set->dsize); +			memcpy(d, data, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS +			mtype_data_reset_flags(d, &flags); +#endif +		} +	} + +	rcu_assign_pointer(h->table, t); +	read_unlock_bh(&set->lock); + +	/* Give time to other readers of the set */ +	synchronize_rcu_bh(); + +	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, +		 orig->htable_bits, orig, t->htable_bits, t); +	mtype_ahash_destroy(set, orig, false); + +	return 0; +} + +/* Add an element to a hash and update the internal counters when succeeded, + * otherwise report the proper error code. 
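+ * If an equal element is already present, the add fails with + * -IPSET_ERR_EXIST unless the IPSET_FLAG_EXIST flag was given or the + * stored entry has already timed out; in both cases the existing slot + * is reused. When the set is full and was created with forceadd, the + * first entry of the hashed bucket is replaced instead of failing.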
*/ +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	  struct ip_set_ext *mext, u32 flags) +{ +	struct htype *h = set->data; +	struct htable *t; +	const struct mtype_elem *d = value; +	struct mtype_elem *data; +	struct hbucket *n; +	int i, ret = 0; +	int j = AHASH_MAX(h) + 1; +	bool flag_exist = flags & IPSET_FLAG_EXIST; +	u32 key, multi = 0; + +	if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) { +		rcu_read_lock_bh(); +		t = rcu_dereference_bh(h->table); +		key = HKEY(value, h->initval, t->htable_bits); +		n = hbucket(t,key); +		if (n->pos) { +			/* Choosing the first entry in the array to replace */ +			j = 0; +			goto reuse_slot; +		} +		rcu_read_unlock_bh(); +	} +	if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) +		/* FIXME: when set is full, we slow down here */ +		mtype_expire(set, h, NLEN(set->family), set->dsize); + +	if (h->elements >= h->maxelem) { +		if (net_ratelimit()) +			pr_warning("Set %s is full, maxelem %u reached\n", +				   set->name, h->maxelem); +		return -IPSET_ERR_HASH_FULL; +	} + +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table); +	key = HKEY(value, h->initval, t->htable_bits); +	n = hbucket(t, key); +	for (i = 0; i < n->pos; i++) { +		data = ahash_data(n, i, set->dsize); +		if (mtype_data_equal(data, d, &multi)) { +			if (flag_exist || +			    (SET_WITH_TIMEOUT(set) && +			     ip_set_timeout_expired(ext_timeout(data, set)))) { +				/* Just the extensions could be overwritten */ +				j = i; +				goto reuse_slot; +			} else { +				ret = -IPSET_ERR_EXIST; +				goto out; +			} +		} +		/* Reuse first timed out entry */ +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(data, set)) && +		    j != AHASH_MAX(h) + 1) +			j = i; +	} +reuse_slot: +	if (j != AHASH_MAX(h) + 1) { +		/* Fill out reused slot */ +		data = ahash_data(n, j, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS +		for (i = 0; i < IPSET_NET_COUNT; i++) { +			mtype_del_cidr(h, CIDR(data->cidr, i), +				       NLEN(set->family), i); +			mtype_add_cidr(h, CIDR(d->cidr, i), +				       NLEN(set->family), i); +		} +#endif +		ip_set_ext_destroy(set, data); +	} else { +		/* Use/create a new slot */ +		TUNE_AHASH_MAX(h, multi); +		ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); +		if (ret != 0) { +			if (ret == -EAGAIN) +				mtype_data_next(&h->next, d); +			goto out; +		} +		data = ahash_data(n, n->pos++, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS +		for (i = 0; i < IPSET_NET_COUNT; i++) +			mtype_add_cidr(h, CIDR(d->cidr, i), NLEN(set->family), +				       i); +#endif +		h->elements++; +	} +	memcpy(data, d, sizeof(struct mtype_elem)); +#ifdef IP_SET_HASH_WITH_NETS +	mtype_data_set_flags(data, flags); +#endif +	if (SET_WITH_TIMEOUT(set)) +		ip_set_timeout_set(ext_timeout(data, set), ext->timeout); +	if (SET_WITH_COUNTER(set)) +		ip_set_init_counter(ext_counter(data, set), ext); +	if (SET_WITH_COMMENT(set)) +		ip_set_init_comment(ext_comment(data, set), ext); + +out: +	rcu_read_unlock_bh(); +	return ret; +} + +/* Delete an element from the hash: swap it with the last element + * and free up space if possible. 
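+ * E.g. removing the second of five entries in a bucket copies the + * fifth entry over the second and decrements the position counter; + * once more than AHASH_INIT_SIZE slots sit unused, the element array + * is reallocated to a smaller size.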
+ */ +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	  struct ip_set_ext *mext, u32 flags) +{ +	struct htype *h = set->data; +	struct htable *t; +	const struct mtype_elem *d = value; +	struct mtype_elem *data; +	struct hbucket *n; +	int i, ret = -IPSET_ERR_EXIST; +#ifdef IP_SET_HASH_WITH_NETS +	u8 j; +#endif +	u32 key, multi = 0; + +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table); +	key = HKEY(value, h->initval, t->htable_bits); +	n = hbucket(t, key); +	for (i = 0; i < n->pos; i++) { +		data = ahash_data(n, i, set->dsize); +		if (!mtype_data_equal(data, d, &multi)) +			continue; +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(data, set))) +			goto out; +		if (i != n->pos - 1) +			/* Not last one */ +			memcpy(data, ahash_data(n, n->pos - 1, set->dsize), +			       set->dsize); + +		n->pos--; +		h->elements--; +#ifdef IP_SET_HASH_WITH_NETS +		for (j = 0; j < IPSET_NET_COUNT; j++) +			mtype_del_cidr(h, CIDR(d->cidr, j), NLEN(set->family), +				       j); +#endif +		ip_set_ext_destroy(set, data); +		if (n->pos + AHASH_INIT_SIZE < n->size) { +			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) +					    * set->dsize, +					    GFP_ATOMIC); +			if (!tmp) { +				ret = 0; +				goto out; +			} +			n->size -= AHASH_INIT_SIZE; +			memcpy(tmp, n->value, n->size * set->dsize); +			kfree(n->value); +			n->value = tmp; +		} +		ret = 0; +		goto out; +	} + +out: +	rcu_read_unlock_bh(); +	return ret; +} + +static inline int +mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, +		 struct ip_set_ext *mext, struct ip_set *set, u32 flags) +{ +	if (SET_WITH_COUNTER(set)) +		ip_set_update_counter(ext_counter(data, set), +				      ext, mext, flags); +	return mtype_do_data_match(data); +} + +#ifdef IP_SET_HASH_WITH_NETS +/* Special test function which takes into account the different network + * sizes added to the set */ +static int +mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, +		 const struct ip_set_ext *ext, +		 struct ip_set_ext *mext, u32 flags) +{ +	struct htype *h = set->data; +	struct htable *t = rcu_dereference_bh(h->table); +	struct hbucket *n; +	struct mtype_elem *data; +#if IPSET_NET_COUNT == 2 +	struct mtype_elem orig = *d; +	int i, j = 0, k; +#else +	int i, j = 0; +#endif +	u32 key, multi = 0; +	u8 nets_length = NLEN(set->family); + +	pr_debug("test by nets\n"); +	for (; j < nets_length && h->nets[j].nets[0] && !multi; j++) { +#if IPSET_NET_COUNT == 2 +		mtype_data_reset_elem(d, &orig); +		mtype_data_netmask(d, h->nets[j].cidr[0], false); +		for (k = 0; k < nets_length && h->nets[k].nets[1] && !multi; +		     k++) { +			mtype_data_netmask(d, h->nets[k].cidr[1], true); +#else +		mtype_data_netmask(d, h->nets[j].cidr[0]); +#endif +		key = HKEY(d, h->initval, t->htable_bits); +		n = hbucket(t, key); +		for (i = 0; i < n->pos; i++) { +			data = ahash_data(n, i, set->dsize); +			if (!mtype_data_equal(data, d, &multi)) +				continue; +			if (SET_WITH_TIMEOUT(set)) { +				if (!ip_set_timeout_expired( +						ext_timeout(data, set))) +					return mtype_data_match(data, ext, +								mext, set, +								flags); +#ifdef IP_SET_HASH_WITH_MULTI +				multi = 0; +#endif +			} else +				return mtype_data_match(data, ext, +							mext, set, flags); +		} +#if IPSET_NET_COUNT == 2 +		} +#endif +	} +	return 0; +} +#endif + +/* Test whether the element is added to the set */ +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	   struct ip_set_ext *mext, u32 flags) +{ +	struct 
htype *h = set->data; +	struct htable *t; +	struct mtype_elem *d = value; +	struct hbucket *n; +	struct mtype_elem *data; +	int i, ret = 0; +	u32 key, multi = 0; + +	rcu_read_lock_bh(); +	t = rcu_dereference_bh(h->table); +#ifdef IP_SET_HASH_WITH_NETS +	/* If we test an IP address and not a network address, +	 * try all possible network sizes */ +	for (i = 0; i < IPSET_NET_COUNT; i++) +		if (CIDR(d->cidr, i) != SET_HOST_MASK(set->family)) +			break; +	if (i == IPSET_NET_COUNT) { +		ret = mtype_test_cidrs(set, d, ext, mext, flags); +		goto out; +	} +#endif + +	key = HKEY(d, h->initval, t->htable_bits); +	n = hbucket(t, key); +	for (i = 0; i < n->pos; i++) { +		data = ahash_data(n, i, set->dsize); +		if (mtype_data_equal(data, d, &multi) && +		    !(SET_WITH_TIMEOUT(set) && +		      ip_set_timeout_expired(ext_timeout(data, set)))) { +			ret = mtype_data_match(data, ext, mext, set, flags); +			goto out; +		} +	} +out: +	rcu_read_unlock_bh(); +	return ret; +} + +/* Reply a HEADER request: fill out the header part of the set */ +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ +	const struct htype *h = set->data; +	const struct htable *t; +	struct nlattr *nested; +	size_t memsize; + +	t = rcu_dereference_bh_nfnl(h->table); +	memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + +	nested = ipset_nest_start(skb, IPSET_ATTR_DATA); +	if (!nested) +		goto nla_put_failure; +	if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, +			  htonl(jhash_size(t->htable_bits))) || +	    nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) +		goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_NETMASK +	if (h->netmask != HOST_MASK && +	    nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) +		goto nla_put_failure; +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) +		goto nla_put_failure; +#endif +	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || +	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) +		goto nla_put_failure; +	if (unlikely(ip_set_put_flags(skb, set))) +		goto nla_put_failure; +	ipset_nest_end(skb, nested); + +	return 0; +nla_put_failure: +	return -EMSGSIZE; +} + +/* Reply a LIST/SAVE request: dump the elements of the specified set */ +static int +mtype_list(const struct ip_set *set, +	   struct sk_buff *skb, struct netlink_callback *cb) +{ +	const struct htype *h = set->data; +	const struct htable *t = rcu_dereference_bh_nfnl(h->table); +	struct nlattr *atd, *nested; +	const struct hbucket *n; +	const struct mtype_elem *e; +	u32 first = cb->args[IPSET_CB_ARG0]; +	/* We assume that one hash bucket fills into one page */ +	void *incomplete; +	int i; + +	atd = ipset_nest_start(skb, IPSET_ATTR_ADT); +	if (!atd) +		return -EMSGSIZE; +	pr_debug("list hash set %s\n", set->name); +	for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); +	     cb->args[IPSET_CB_ARG0]++) { +		incomplete = skb_tail_pointer(skb); +		n = hbucket(t, cb->args[IPSET_CB_ARG0]); +		pr_debug("cb->arg bucket: %lu, t %p n %p\n", +			 cb->args[IPSET_CB_ARG0], t, n); +		for (i = 0; i < n->pos; i++) { +			e = ahash_data(n, i, set->dsize); +			if (SET_WITH_TIMEOUT(set) && +			    ip_set_timeout_expired(ext_timeout(e, set))) +				continue; +			pr_debug("list hash %lu hbucket %p i %u, data %p\n", +				 cb->args[IPSET_CB_ARG0], n, i, e); +			nested = ipset_nest_start(skb, IPSET_ATTR_DATA); +			if (!nested) { +				if (cb->args[IPSET_CB_ARG0] == first) { +					nla_nest_cancel(skb, atd); +					return -EMSGSIZE; +				} else +					goto 
nla_put_failure; +			} +			if (mtype_data_list(skb, e)) +				goto nla_put_failure; +			if (ip_set_put_extensions(skb, set, e, true)) +				goto nla_put_failure; +			ipset_nest_end(skb, nested); +		} +	} +	ipset_nest_end(skb, atd); +	/* Set listing finished */ +	cb->args[IPSET_CB_ARG0] = 0; + +	return 0; + +nla_put_failure: +	nlmsg_trim(skb, incomplete); +	if (unlikely(first == cb->args[IPSET_CB_ARG0])) { +		pr_warning("Can't list set %s: one bucket does not fit into " +			   "a message. Please report it!\n", set->name); +		cb->args[IPSET_CB_ARG0] = 0; +		return -EMSGSIZE; +	} +	ipset_nest_end(skb, atd); +	return 0; +} + +static int +IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, +	    const struct xt_action_param *par, +	    enum ipset_adt adt, struct ip_set_adt_opt *opt); + +static int +IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], +	    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + +static const struct ip_set_type_variant mtype_variant = { +	.kadt	= mtype_kadt, +	.uadt	= mtype_uadt, +	.adt	= { +		[IPSET_ADD] = mtype_add, +		[IPSET_DEL] = mtype_del, +		[IPSET_TEST] = mtype_test, +	}, +	.destroy = mtype_destroy, +	.flush	= mtype_flush, +	.head	= mtype_head, +	.list	= mtype_list, +	.resize	= mtype_resize, +	.same_set = mtype_same_set, +}; + +#ifdef IP_SET_EMIT_CREATE +static int +IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, +			    struct nlattr *tb[], u32 flags) +{ +	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; +#ifdef IP_SET_HASH_WITH_MARKMASK +	u32 markmask; +#endif +	u8 hbits; +#ifdef IP_SET_HASH_WITH_NETMASK +	u8 netmask; +#endif +	size_t hsize; +	struct HTYPE *h; +	struct htable *t; + +	if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) +		return -IPSET_ERR_INVALID_FAMILY; + +#ifdef IP_SET_HASH_WITH_MARKMASK +	markmask = 0xffffffff; +#endif +#ifdef IP_SET_HASH_WITH_NETMASK +	netmask = set->family == NFPROTO_IPV4 ? 32 : 128; +	pr_debug("Create set %s with family %s\n", +		 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); +#endif + +	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || +#ifdef IP_SET_HASH_WITH_MARKMASK +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || +#endif +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_HASHSIZE]) { +		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); +		if (hashsize < IPSET_MIMINAL_HASHSIZE) +			hashsize = IPSET_MIMINAL_HASHSIZE; +	} + +	if (tb[IPSET_ATTR_MAXELEM]) +		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + +#ifdef IP_SET_HASH_WITH_NETMASK +	if (tb[IPSET_ATTR_NETMASK]) { +		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + +		if ((set->family == NFPROTO_IPV4 && netmask > 32) || +		    (set->family == NFPROTO_IPV6 && netmask > 128) || +		    netmask == 0) +			return -IPSET_ERR_INVALID_NETMASK; +	} +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	if (tb[IPSET_ATTR_MARKMASK]) { +		markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + +		if ((markmask > 4294967295u) || markmask == 0) +			return -IPSET_ERR_INVALID_MARKMASK; +	} +#endif + +	hsize = sizeof(*h); +#ifdef IP_SET_HASH_WITH_NETS +	hsize += sizeof(struct net_prefixes) * +		(set->family == NFPROTO_IPV4 ? 
32 : 128); +#endif +	h = kzalloc(hsize, GFP_KERNEL); +	if (!h) +		return -ENOMEM; + +	h->maxelem = maxelem; +#ifdef IP_SET_HASH_WITH_NETMASK +	h->netmask = netmask; +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK +	h->markmask = markmask; +#endif +	get_random_bytes(&h->initval, sizeof(h->initval)); +	set->timeout = IPSET_NO_TIMEOUT; + +	hbits = htable_bits(hashsize); +	hsize = htable_size(hbits); +	if (hsize == 0) { +		kfree(h); +		return -ENOMEM; +	} +	t = ip_set_alloc(hsize); +	if (!t) { +		kfree(h); +		return -ENOMEM; +	} +	t->htable_bits = hbits; +	rcu_assign_pointer(h->table, t); + +	set->data = h; +	if (set->family == NFPROTO_IPV4) { +		set->variant = &IPSET_TOKEN(HTYPE, 4_variant); +		set->dsize = ip_set_elem_len(set, tb, +				sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); +	} else { +		set->variant = &IPSET_TOKEN(HTYPE, 6_variant); +		set->dsize = ip_set_elem_len(set, tb, +				sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); +	} +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +		if (set->family == NFPROTO_IPV4) +			IPSET_TOKEN(HTYPE, 4_gc_init)(set, +				IPSET_TOKEN(HTYPE, 4_gc)); +		else +			IPSET_TOKEN(HTYPE, 6_gc_init)(set, +				IPSET_TOKEN(HTYPE, 6_gc)); +	} + +	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", +		 set->name, jhash_size(t->htable_bits), +		 t->htable_bits, h->maxelem, set->data, t); + +	return 0; +} +#endif /* IP_SET_EMIT_CREATE */ diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c new file mode 100644 index 00000000000..dd40607f878 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -0,0 +1,315 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1	   Counters support */ +/*				2	   Comments support */ +#define IPSET_TYPE_REV_MAX	3	/* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip"); + +/* Type specific function prefix */ +#define HTYPE		hash_ip +#define IP_SET_HASH_WITH_NETMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ip4_elem { +	/* Zero valued IP addresses cannot be stored */ +	__be32 ip; +}; + +/* Common functions */ + +static inline bool +hash_ip4_data_equal(const struct hash_ip4_elem *e1, +		    const struct hash_ip4_elem *e2, +		    u32 *multi) +{ +	return e1->ip == e2->ip; +} + +static inline bool +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) +{ +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) +{ +	next->ip = e->ip; +} + +#define MTYPE		hash_ip4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, +	      const struct xt_action_param *par, +	      enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ip *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ip4_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); +	__be32 ip; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); +	ip &= ip_set_netmask(h->netmask); +	if (ip == 0) +		return -EINVAL; + +	e.ip = ip; +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], +	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ip *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ip4_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, hosts; +	int ret = 0; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	ip &= ip_set_hostmask(h->netmask); + +	if (adt == IPSET_TEST) { +		e.ip = htonl(ip); +		if (e.ip == 0) +			return -IPSET_ERR_HASH_ELEM; +		return adtfn(set, &e, &ext, &ext, flags); +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 
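			/* an ip/cidr pair is expanded below into the
			 * host-order range [ip, ip_to]; valid IPv4
			 * prefix lengths are 1..32 */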
32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} + +	hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + +	if (retried) +		ip = ntohl(h->next.ip); +	for (; !before(ip_to, ip); ip += hosts) { +		e.ip = htonl(ip); +		if (e.ip == 0) +			return -IPSET_ERR_HASH_ELEM; +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +/* IPv6 variant */ + +/* Member elements */ +struct hash_ip6_elem { +	union nf_inet_addr ip; +}; + +/* Common functions */ + +static inline bool +hash_ip6_data_equal(const struct hash_ip6_elem *ip1, +		    const struct hash_ip6_elem *ip2, +		    u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); +} + +static inline void +hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ +	ip6_netmask(ip, prefix); +} + +static bool +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) +{ +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE		hash_ip6 +#define PF		6 +#define HOST_MASK	128 + +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, +	      const struct xt_action_param *par, +	      enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ip *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ip6_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	hash_ip6_netmask(&e.ip, h->netmask); +	if (ipv6_addr_any(&e.ip.in6)) +		return -EINVAL; + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], +	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ip *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ip6_elem e = {}; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || +		     tb[IPSET_ATTR_IP_TO] || +		     tb[IPSET_ATTR_CIDR])) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	hash_ip6_netmask(&e.ip, h->netmask); +	if (ipv6_addr_any(&e.ip.in6)) +		return -IPSET_ERR_HASH_ELEM; + +	ret = adtfn(set, &e, &ext, &ext, flags); + +	return ip_set_eexist(ret, flags) ? 
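	/* a duplicate add is not an error when userspace passed
	 * ipset's "-exist" flag; -IPSET_ERR_EXIST is mapped to 0: */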
0 : ret; +} + +static struct ip_set_type hash_ip_type __read_mostly = { +	.name		= "hash:ip", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP, +	.dimension	= IPSET_DIM_ONE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_ip_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_NETMASK]	= { .type = NLA_U8  }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_ip_init(void) +{ +	return ip_set_type_register(&hash_ip_type); +} + +static void __exit +hash_ip_fini(void) +{ +	ip_set_type_unregister(&hash_ip_type); +} + +module_init(hash_ip_init); +module_exit(hash_ip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c new file mode 100644 index 00000000000..4eff0a29725 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -0,0 +1,321 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,mark type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +#define IPSET_TYPE_REV_MAX	1	/* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); +IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,mark"); + +/* Type specific function prefix */ +#define HTYPE		hash_ipmark +#define IP_SET_HASH_WITH_MARKMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipmark4_elem { +	__be32 ip; +	__u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, +			const struct hash_ipmark4_elem *ip2, +			u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->mark == ip2->mark; +} + +static bool +hash_ipmark4_data_list(struct sk_buff *skb, +		       const struct hash_ipmark4_elem *data) +{ +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipmark4_data_next(struct hash_ipmark4_elem *next, +		       const struct hash_ipmark4_elem *d) +{ +	next->ip = d->ip; +} + +#define MTYPE           hash_ipmark4 +#define PF              4 +#define HOST_MASK       32 +#define HKEY_DATALEN	sizeof(struct hash_ipmark4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.mark = skb->mark; +	e.mark &= h->markmask; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip, ip_to = 0; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); +	e.mark &= h->markmask; + +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 
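		/* only the bits selected by the create-time markmask
		 * (applied above, and likewise in the kadt path) take
		 * part in hashing and equality: */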
0 : ret; +	} + +	ip_to = ip = ntohl(e.ip); +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} + +	if (retried) +		ip = ntohl(h->next.ip); +	for (; !before(ip_to, ip); ip++) { +		e.ip = htonl(ip); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_ipmark6_elem { +	union nf_inet_addr ip; +	__u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, +			const struct hash_ipmark6_elem *ip2, +			u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ip1->mark == ip2->mark; +} + +static bool +hash_ipmark6_data_list(struct sk_buff *skb, +		       const struct hash_ipmark6_elem *data) +{ +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipmark6_data_next(struct hash_ipmark4_elem *next, +		       const struct hash_ipmark6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE		hash_ipmark6 +#define PF		6 +#define HOST_MASK	128 +#define HKEY_DATALEN	sizeof(struct hash_ipmark6_elem) +#define	IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + + +static int +hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.mark = skb->mark; +	e.mark &= h->markmask; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipmark *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipmark6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || +		     tb[IPSET_ATTR_IP_TO] || +		     tb[IPSET_ATTR_CIDR])) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); +	e.mark &= h->markmask; + +	if (adt == IPSET_TEST) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 
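	/* the IPv6 variant rejected IPSET_ATTR_IP_TO and _CIDR up
	 * front, so at most one element is added per request: */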
0 : ret; +	} + +	ret = adtfn(set, &e, &ext, &ext, flags); +	if (ret && !ip_set_eexist(ret, flags)) +		return ret; +	else +		ret = 0; + +	return ret; +} + +static struct ip_set_type hash_ipmark_type __read_mostly = { +	.name		= "hash:ip,mark", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_MARK, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_ipmark_create, +	.create_policy	= { +		[IPSET_ATTR_MARKMASK]	= { .type = NLA_U32 }, +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_MARK]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_ipmark_init(void) +{ +	return ip_set_type_register(&hash_ipmark_type); +} + +static void __exit +hash_ipmark_fini(void) +{ +	ip_set_type_unregister(&hash_ipmark_type); +} + +module_init(hash_ipmark_init); +module_exit(hash_ipmark_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c new file mode 100644 index 00000000000..7597b82a8b0 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -0,0 +1,390 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Counters support added */ +/*				3    Comments support added */ +#define IPSET_TYPE_REV_MAX	4 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,port"); + +/* Type specific function prefix */ +#define HTYPE		hash_ipport + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipport4_elem { +	__be32 ip; +	__be16 port; +	u8 proto; +	u8 padding; +}; + +/* Common functions */ + +static inline bool +hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, +			const struct hash_ipport4_elem *ip2, +			u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static bool +hash_ipport4_data_list(struct sk_buff *skb, +		       const struct hash_ipport4_elem *data) +{ +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipport4_data_next(struct hash_ipport4_elem *next, +		       const struct hash_ipport4_elem *d) +{ +	next->ip = d->ip; +	next->port = d->port; +} + +#define MTYPE           hash_ipport4 +#define PF              4 +#define HOST_MASK       32 +#define HKEY_DATALEN	sizeof(struct hash_ipport4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipport4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipport *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipport4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip, ip_to = 0, p = 0, port, port_to; +	bool with_ports = false; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = 
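	/* NB: the short-circuiting || collapses a negative error from
	 * either helper into plain 1, so the specific error code is
	 * lost at this point */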
ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMP)) +		e.port = 0; + +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || +	      tb[IPSET_ATTR_PORT_TO])) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ip_to = ip = ntohl(e.ip); +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} + +	port_to = port = ntohs(e.port); +	if (with_ports && tb[IPSET_ATTR_PORT_TO]) { +		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +		if (port > port_to) +			swap(port, port_to); +	} + +	if (retried) +		ip = ntohl(h->next.ip); +	for (; !before(ip_to, ip); ip++) { +		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) +						       : port; +		for (; p <= port_to; p++) { +			e.ip = htonl(ip); +			e.port = htons(p); +			ret = adtfn(set, &e, &ext, &ext, flags); + +			if (ret && !ip_set_eexist(ret, flags)) +				return ret; +			else +				ret = 0; +		} +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_ipport6_elem { +	union nf_inet_addr ip; +	__be16 port; +	u8 proto; +	u8 padding; +}; + +/* Common functions */ + +static inline bool +hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, +			const struct hash_ipport6_elem *ip2, +			u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static bool +hash_ipport6_data_list(struct sk_buff *skb, +		       const struct hash_ipport6_elem *data) +{ +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipport6_data_next(struct hash_ipport4_elem *next, +		       const struct hash_ipport6_elem *d) +{ +	next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE		hash_ipport6 +#define PF		6 +#define HOST_MASK	128 +#define HKEY_DATALEN	sizeof(struct hash_ipport6_elem) +#define	IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, +		  const struct xt_action_param *par, +		  enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipport6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], +		  enum ipset_adt adt, u32 
*lineno, u32 flags, bool retried) +{ +	const struct hash_ipport *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipport6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to; +	bool with_ports = false; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || +		     tb[IPSET_ATTR_IP_TO] || +		     tb[IPSET_ATTR_CIDR])) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMPV6)) +		e.port = 0; + +	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = ntohs(e.port); +	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +	if (port > port_to) +		swap(port, port_to); + +	if (retried) +		port = ntohs(h->next.port); +	for (; port <= port_to; port++) { +		e.port = htons(port); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static struct ip_set_type hash_ipport_type __read_mostly = { +	.name		= "hash:ip,port", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_ipport_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_ipport_init(void) +{ +	return ip_set_type_register(&hash_ipport_type); +} + +static void __exit +hash_ipport_fini(void) +{ +	ip_set_type_unregister(&hash_ipport_type); +} + +module_init(hash_ipport_init); +module_exit(hash_ipport_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c new file mode 100644 index 00000000000..672655ffd57 --- /dev/null +++ 
b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -0,0 +1,402 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:ip,port,ip type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Counters support added */ +/*				3    Comments support added */ +#define IPSET_TYPE_REV_MAX	4 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,port,ip"); + +/* Type specific function prefix */ +#define HTYPE		hash_ipportip + +/* IPv4 variant */ + +/* Member elements  */ +struct hash_ipportip4_elem { +	__be32 ip; +	__be32 ip2; +	__be16 port; +	u8 proto; +	u8 padding; +}; + +static inline bool +hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, +			  const struct hash_ipportip4_elem *ip2, +			  u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->ip2 == ip2->ip2 && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static bool +hash_ipportip4_data_list(struct sk_buff *skb, +		       const struct hash_ipportip4_elem *data) +{ +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipportip4_data_next(struct hash_ipportip4_elem *next, +			 const struct hash_ipportip4_elem *d) +{ +	next->ip = d->ip; +	next->port = d->port; +} + +/* Common functions */ +#define MTYPE		hash_ipportip4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, +		    const struct xt_action_param *par, +		    enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportip4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], +		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipportip *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportip4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip, ip_to = 0, p = 0, port, port_to; +	bool with_ports = false; +	int ret; + +	if 
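	/* both addresses and a port specification are mandatory for
	 * hash:ip,port,ip; numeric attributes must arrive in network
	 * byte order */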
(unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMP)) +		e.port = 0; + +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || +	      tb[IPSET_ATTR_PORT_TO])) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ip_to = ip = ntohl(e.ip); +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +	} else if (tb[IPSET_ATTR_CIDR]) { +		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} + +	port_to = port = ntohs(e.port); +	if (with_ports && tb[IPSET_ATTR_PORT_TO]) { +		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +		if (port > port_to) +			swap(port, port_to); +	} + +	if (retried) +		ip = ntohl(h->next.ip); +	for (; !before(ip_to, ip); ip++) { +		p = retried && ip == ntohl(h->next.ip) ? 
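		/* when a hash resize forced a retry, resume the inner
		 * port loop at h->next instead of from the start: */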
ntohs(h->next.port) +						       : port; +		for (; p <= port_to; p++) { +			e.ip = htonl(ip); +			e.port = htons(p); +			ret = adtfn(set, &e, &ext, &ext, flags); + +			if (ret && !ip_set_eexist(ret, flags)) +				return ret; +			else +				ret = 0; +		} +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_ipportip6_elem { +	union nf_inet_addr ip; +	union nf_inet_addr ip2; +	__be16 port; +	u8 proto; +	u8 padding; +}; + +/* Common functions */ + +static inline bool +hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, +			  const struct hash_ipportip6_elem *ip2, +			  u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static bool +hash_ipportip6_data_list(struct sk_buff *skb, +			 const struct hash_ipportip6_elem *data) +{ +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipportip6_data_next(struct hash_ipportip4_elem *next, +			 const struct hash_ipportip6_elem *d) +{ +	next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_ipportip6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, +		    const struct xt_action_param *par, +		    enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportip6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], +		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipportip *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportip6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to; +	bool with_ports = false; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || +		     tb[IPSET_ATTR_IP_TO] || +		     tb[IPSET_ATTR_CIDR])) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			
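			/* protocol number 0 (IPPROTO_IP) does not name
			 * a concrete transport, so it is rejected */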
return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMPV6)) +		e.port = 0; + +	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = ntohs(e.port); +	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +	if (port > port_to) +		swap(port, port_to); + +	if (retried) +		port = ntohs(h->next.port); +	for (; port <= port_to; port++) { +		e.port = htons(port); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static struct ip_set_type hash_ipportip_type __read_mostly = { +	.name		= "hash:ip,port,ip", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, +	.dimension	= IPSET_DIM_THREE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_ipportip_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_ipportip_init(void) +{ +	return ip_set_type_register(&hash_ipportip_type); +} + +static void __exit +hash_ipportip_fini(void) +{ +	ip_set_type_unregister(&hash_ipportip_type); +} + +module_init(hash_ipportip_init); +module_exit(hash_ipportip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c new file mode 100644 index 00000000000..7308d84f927 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -0,0 +1,561 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Range as input support for IPv4 added */ +/*				3    nomatch flag support added */ +/*				4    Counters support added */ +/*				5    Comments support added */ +#define IPSET_TYPE_REV_MAX	6 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,port,net"); + +/* Type specific function prefix */ +#define HTYPE		hash_ipportnet + +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. + */ +#define IP_SET_HASH_WITH_NETS_PACKED +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipportnet4_elem { +	__be32 ip; +	__be32 ip2; +	__be16 port; +	u8 cidr:7; +	u8 nomatch:1; +	u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, +			   const struct hash_ipportnet4_elem *ip2, +			   u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->ip2 == ip2->ip2 && +	       ip1->cidr == ip2->cidr && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static inline int +hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) +{ +	elem->ip2 &= ip_set_netmask(cidr); +	elem->cidr = cidr - 1; +} + +static bool +hash_ipportnet4_data_list(struct sk_buff *skb, +			  const struct hash_ipportnet4_elem *data) +{ +	u32 flags = data->nomatch ? 
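	/* dumping reverses the packing done at add time: the nomatch
	 * bit becomes a CADT flag again and the stored cidr is bumped
	 * back by one, e.g. an internal value of 23 is listed as /24: */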
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, +			  const struct hash_ipportnet4_elem *d) +{ +	next->ip = d->ip; +	next->port = d->port; +	next->ip2 = d->ip2; +} + +#define MTYPE		hash_ipportnet4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, +		     const struct xt_action_param *par, +		     enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ipportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportnet4_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK - 1; + +	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); +	e.ip2 &= ip_set_netmask(e.cidr + 1); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], +		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, p = 0, port, port_to; +	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; +	bool with_ports = false; +	u8 cidr; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR2]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr = cidr - 1; +	} + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMP)) +		e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & 
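		/* by internal convention the user-visible CADT flags are
		 * carried in the upper 16 bits of the flags word handed
		 * to the add/del/test functions (cf. set_flags above) */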
IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || +	      tb[IPSET_ATTR_IP2_TO])) { +		e.ip = htonl(ip); +		e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +	} else if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +		if (!cidr || cidr > 32) +			return -IPSET_ERR_INVALID_CIDR; +		ip_set_mask_from_to(ip, ip_to, cidr); +	} + +	port_to = port = ntohs(e.port); +	if (tb[IPSET_ATTR_PORT_TO]) { +		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +		if (port > port_to) +			swap(port, port_to); +	} + +	ip2_to = ip2_from; +	if (tb[IPSET_ATTR_IP2_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); +		if (ret) +			return ret; +		if (ip2_from > ip2_to) +			swap(ip2_from, ip2_to); +		if (ip2_from + UINT_MAX == ip2_to) +			return -IPSET_ERR_HASH_RANGE; +	} else +		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); + +	if (retried) +		ip = ntohl(h->next.ip); +	for (; !before(ip_to, ip); ip++) { +		e.ip = htonl(ip); +		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) +						       : port; +		for (; p <= port_to; p++) { +			e.port = htons(p); +			ip2 = retried && +			      ip == ntohl(h->next.ip) && +			      p == ntohs(h->next.port) +				? ntohl(h->next.ip2) : ip2_from; +			while (!after(ip2, ip2_to)) { +				e.ip2 = htonl(ip2); +				ip2_last = ip_set_range_to_cidr(ip2, ip2_to, +								&cidr); +				e.cidr = cidr - 1; +				ret = adtfn(set, &e, &ext, &ext, flags); + +				if (ret && !ip_set_eexist(ret, flags)) +					return ret; +				else +					ret = 0; +				ip2 = ip2_last + 1; +			} +		} +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_ipportnet6_elem { +	union nf_inet_addr ip; +	union nf_inet_addr ip2; +	__be16 port; +	u8 cidr:7; +	u8 nomatch:1; +	u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, +			   const struct hash_ipportnet6_elem *ip2, +			   u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && +	       ip1->cidr == ip2->cidr && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static inline int +hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) +{ +	ip6_netmask(&elem->ip2, cidr); +	elem->cidr = cidr - 1; +} + +static bool +hash_ipportnet6_data_list(struct sk_buff *skb, +			  const struct hash_ipportnet6_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, +			  const struct hash_ipportnet6_elem *d) +{ +	next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_ipportnet6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, +		     const struct xt_action_param *par, +		     enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_ipportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportnet6_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK - 1; + +	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); +	ip6_netmask(&e.ip2, e.cidr + 1); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], +		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_ipportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to; +	bool with_ports = false; +	u8 cidr; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || +		     tb[IPSET_ATTR_IP_TO] || +		     tb[IPSET_ATTR_CIDR])) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR2]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr = cidr - 1; +	} + +	ip6_netmask(&e.ip2, e.cidr + 1); + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMPV6)) +		
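		/* portless protocol: normalise the port to 0 so a single
		 * element covers every packet of that protocol */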
e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = ntohs(e.port); +	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +	if (port > port_to) +		swap(port, port_to); + +	if (retried) +		port = ntohs(h->next.port); +	for (; port <= port_to; port++) { +		e.port = htons(port); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static struct ip_set_type hash_ipportnet_type __read_mostly = { +	.name		= "hash:ip,port,net", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | +			  IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_THREE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_ipportnet_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CIDR2]	= { .type = NLA_U8 }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_ipportnet_init(void) +{ +	return ip_set_type_register(&hash_ipportnet_type); +} + +static void __exit +hash_ipportnet_fini(void) +{ +	ip_set_type_unregister(&hash_ipportnet_type); +} + +module_init(hash_ipportnet_init); +module_exit(hash_ipportnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c new file mode 100644 index 00000000000..4c7d495783a --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -0,0 +1,397 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    Range as input support for IPv4 added */ +/*				2    nomatch flag support added */ +/*				3    Counters support added */ +/*				4    Comments support added */ +#define IPSET_TYPE_REV_MAX	5 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net"); + +/* Type specific function prefix */ +#define HTYPE		hash_net +#define IP_SET_HASH_WITH_NETS + +/* IPv4 variant */ + +/* Member elements  */ +struct hash_net4_elem { +	__be32 ip; +	u16 padding0; +	u8 nomatch; +	u8 cidr; +}; + +/* Common functions */ + +static inline bool +hash_net4_data_equal(const struct hash_net4_elem *ip1, +		     const struct hash_net4_elem *ip2, +		     u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->cidr == ip2->cidr; +} + +static inline int +hash_net4_do_data_match(const struct hash_net4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) +{ +	elem->ip &= ip_set_netmask(cidr); +	elem->cidr = cidr; +} + +static bool +hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) +{ +	u32 flags = data->nomatch ? 
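	/* unlike hash:ip,port,net, hash:net keeps the full cidr value
	 * and a separate nomatch byte instead of the packed
	 * cidr:7/nomatch:1 layout, so no +/-1 adjustment is needed: */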
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_net4_data_next(struct hash_net4_elem *next, +		    const struct hash_net4_elem *d) +{ +	next->ip = d->ip; +} + +#define MTYPE		hash_net4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_net *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_net4_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (e.cidr == 0) +		return -EINVAL; +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	e.ip &= ip_set_netmask(e.cidr); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_net *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_net4_elem e = { .cidr = HOST_MASK }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, last; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!e.cidr || e.cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +	} + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { +		e.ip = htonl(ip & ip_set_hostmask(e.cidr)); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret: +		       ip_set_eexist(ret, flags) ? 
0 : ret; +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip_to < ip) +			swap(ip, ip_to); +		if (ip + UINT_MAX == ip_to) +			return -IPSET_ERR_HASH_RANGE; +	} +	if (retried) +		ip = ntohl(h->next.ip); +	while (!after(ip, ip_to)) { +		e.ip = htonl(ip); +		last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); +		ret = adtfn(set, &e, &ext, &ext, flags); +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +		ip = last + 1; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_net6_elem { +	union nf_inet_addr ip; +	u16 padding0; +	u8 nomatch; +	u8 cidr; +}; + +/* Common functions */ + +static inline bool +hash_net6_data_equal(const struct hash_net6_elem *ip1, +		     const struct hash_net6_elem *ip2, +		     u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ip1->cidr == ip2->cidr; +} + +static inline int +hash_net6_do_data_match(const struct hash_net6_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) +{ +	ip6_netmask(&elem->ip, cidr); +	elem->cidr = cidr; +} + +static bool +hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) +{ +	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_net6_data_next(struct hash_net4_elem *next, +		    const struct hash_net6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_net6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_net *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_net6_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (e.cidr == 0) +		return -EINVAL; +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	ip6_netmask(&e.ip, e.cidr); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_net6_elem e = { .cidr = HOST_MASK }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + 
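+	/* Note the asymmetry with hash_net4_uadt() above: expanding an
+	 * arbitrary 128-bit address range into CIDR blocks could produce
+	 * an enormous number of elements, so the IPv6 variant accepts a
+	 * single ip/cidr pair only (e.g. "ipset add foo 2001:db8::/64"
+	 * with the usual ipset(8) tool and a hypothetical set "foo");
+	 * a from-to range is rejected above via IPSET_ATTR_IP_TO.
+	 */
+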
+	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) +		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +	if (!e.cidr || e.cidr > HOST_MASK) +		return -IPSET_ERR_INVALID_CIDR; + +	ip6_netmask(&e.ip, e.cidr); + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	ret = adtfn(set, &e, &ext, &ext, flags); + +	return ip_set_enomatch(ret, flags, adt, set) ? -ret : +	       ip_set_eexist(ret, flags) ? 0 : ret; +} + +static struct ip_set_type hash_net_type __read_mostly = { +	.name		= "hash:net", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_ONE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_net_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_net_init(void) +{ +	return ip_set_type_register(&hash_net_type); +} + +static void __exit +hash_net_fini(void) +{ +	ip_set_type_unregister(&hash_net_type); +} + +module_init(hash_net_init); +module_exit(hash_net_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c new file mode 100644 index 00000000000..db2606805b3 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -0,0 +1,610 @@ +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net,iface type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <linux/rbtree.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    nomatch flag support added */ +/*				2    /0 support added */ +/*				3    Counters support added */ +/*				4    Comments support added */ +#define IPSET_TYPE_REV_MAX	5 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,iface"); + +/* Interface name rbtree */ + +struct iface_node { +	struct rb_node node; +	char iface[IFNAMSIZ]; +}; + +#define iface_data(n)	(rb_entry(n, struct iface_node, node)->iface) + +static void +rbtree_destroy(struct rb_root *root) +{ +	struct iface_node *node, *next; + +	rbtree_postorder_for_each_entry_safe(node, next, root, node) +		kfree(node); + +	*root = RB_ROOT; +} + +static int +iface_test(struct rb_root *root, const char **iface) +{ +	struct rb_node *n = root->rb_node; + +	while (n) { +		const char *d = iface_data(n); +		int res = strcmp(*iface, d); + +		if (res < 0) +			n = n->rb_left; +		else if (res > 0) +			n = n->rb_right; +		else { +			*iface = d; +			return 1; +		} +	} +	return 0; +} + +static int +iface_add(struct rb_root *root, const char **iface) +{ +	struct rb_node **n = &(root->rb_node), *p = NULL; +	struct iface_node *d; + +	while (*n) { +		char *ifname = iface_data(*n); +		int res = strcmp(*iface, ifname); + +		p = *n; +		if (res < 0) +			n = &((*n)->rb_left); +		else if (res > 0) +			n = &((*n)->rb_right); +		else { +			*iface = ifname; +			return 0; +		} +	} + +	d = kzalloc(sizeof(*d), GFP_ATOMIC); +	if (!d) +		return -ENOMEM; +	strcpy(d->iface, *iface); + +	rb_link_node(&d->node, p, n); +	rb_insert_color(&d->node, root); + +	*iface = d->iface; +	return 0; +} + +/* Type specific function prefix */ +#define HTYPE		hash_netiface +#define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_RBTREE +#define IP_SET_HASH_WITH_MULTI + +#define STREQ(a, b)	(strcmp(a, b) == 0) + +/* IPv4 variant */ + +struct hash_netiface4_elem_hashed { +	__be32 ip; +	u8 physdev; +	u8 cidr; +	u8 nomatch; +	u8 elem; +}; + +/* Member elements */ +struct hash_netiface4_elem { +	__be32 ip; +	u8 physdev; +	u8 cidr; +	u8 nomatch; +	u8 elem; +	const char *iface; +}; + +/* Common functions */ + +static inline bool +hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, +			  const struct hash_netiface4_elem *ip2, +			  u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->cidr == ip2->cidr && +	       (++*multi) && +	       ip1->physdev == ip2->physdev && +	       ip1->iface == ip2->iface; +} + +static inline int +hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) +{ +	return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) +{ +	elem->ip &= ip_set_netmask(cidr); +	elem->cidr = cidr; +} + +static bool +hash_netiface4_data_list(struct sk_buff *skb, +			 const struct hash_netiface4_elem *data) +{ +	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + +	if (data->nomatch) +		flags |= IPSET_FLAG_NOMATCH; +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || +	    nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netiface4_data_next(struct hash_netiface4_elem *next, +			 const struct hash_netiface4_elem *d) +{ +	next->ip = d->ip; +} + +#define MTYPE		hash_netiface4 +#define PF		4 +#define HOST_MASK	32 +#define HKEY_DATALEN	sizeof(struct hash_netiface4_elem_hashed) +#include "ip_set_hash_gen.h" + +static int +hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, +		    const struct xt_action_param *par, +		    enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	struct hash_netiface *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netiface4_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), +		.elem = 1, +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); +	int ret; + +	if (e.cidr == 0) +		return -EINVAL; +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	e.ip &= ip_set_netmask(e.cidr); + +#define IFACE(dir)	(par->dir ? par->dir->name : NULL) +#define PHYSDEV(dir)	(nf_bridge->dir ? nf_bridge->dir->name : NULL) +#define SRCDIR		(opt->flags & IPSET_DIM_TWO_SRC) + +	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { +#ifdef CONFIG_BRIDGE_NETFILTER +		const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + +		if (!nf_bridge) +			return -EINVAL; +		e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); +		e.physdev = 1; +#else +		e.iface = NULL; +#endif +	} else +		e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); + +	if (!e.iface) +		return -EINVAL; +	ret = iface_test(&h->rbtree, &e.iface); +	if (adt == IPSET_ADD) { +		if (!ret) { +			ret = iface_add(&h->rbtree, &e.iface); +			if (ret) +				return ret; +		} +	} else if (!ret) +		return ret; + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], +		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	struct hash_netiface *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, last; +	char iface[IFNAMSIZ]; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !tb[IPSET_ATTR_IFACE] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (e.cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +	} + +	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); +	e.iface = iface; +	ret = iface_test(&h->rbtree, &e.iface); +	if (adt == IPSET_ADD) { +		if (!ret) { +			ret = iface_add(&h->rbtree, &e.iface); +			if (ret) +				return ret; +		} +	} else if (!ret) +		return ret; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_PHYSDEV) +			e.physdev = 1; +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} +	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { +		e.ip = htonl(ip & ip_set_hostmask(e.cidr)); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip_to < ip) +			swap(ip, ip_to); +		if (ip + UINT_MAX == ip_to) +			return -IPSET_ERR_HASH_RANGE; +	} else +		ip_set_mask_from_to(ip, ip_to, e.cidr); + +	if (retried) +		ip = ntohl(h->next.ip); +	while (!after(ip, ip_to)) { +		e.ip = htonl(ip); +		last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +		ip = last + 1; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_netiface6_elem_hashed { +	union nf_inet_addr ip; +	u8 physdev; +	u8 cidr; +	u8 nomatch; +	u8 elem; +}; + +struct hash_netiface6_elem { +	union nf_inet_addr ip; +	u8 physdev; +	u8 cidr; +	u8 nomatch; +	u8 elem; +	const char *iface; +}; + +/* Common functions */ + +static inline bool +hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, +			  const struct hash_netiface6_elem *ip2, +			  u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ip1->cidr == ip2->cidr && +	       (++*multi) && +	       ip1->physdev == ip2->physdev && +	       ip1->iface == ip2->iface; +} + +static inline int +hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) +{ +	return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) +{ +	ip6_netmask(&elem->ip, cidr); +	elem->cidr = cidr; +} + +static bool +hash_netiface6_data_list(struct sk_buff *skb, +			 const struct hash_netiface6_elem *data) +{ +	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + +	if (data->nomatch) +		flags |= IPSET_FLAG_NOMATCH; +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || +	    nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netiface6_data_next(struct hash_netiface4_elem *next, +			 const struct hash_netiface6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE		hash_netiface6 +#define PF		6 +#define HOST_MASK	128 +#define HKEY_DATALEN	sizeof(struct hash_netiface6_elem_hashed) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, +		    const struct xt_action_param *par, +		    enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	struct hash_netiface *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netiface6_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), +		.elem = 1, +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); +	int ret; + +	if (e.cidr == 0) +		return -EINVAL; +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	ip6_netmask(&e.ip, e.cidr); + +	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { +#ifdef CONFIG_BRIDGE_NETFILTER +		const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + +		if (!nf_bridge) +			return -EINVAL; +		e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); +		e.physdev = 1; +#else +		e.iface = NULL; +#endif +	} else +		e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); + +	if (!e.iface) +		return -EINVAL; +	ret = iface_test(&h->rbtree, &e.iface); +	if (adt == IPSET_ADD) { +		if (!ret) { +			ret = iface_add(&h->rbtree, &e.iface); +			if (ret) +				return ret; +		} +	} else if (!ret) +		return ret; + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], +		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	struct hash_netiface *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	char iface[IFNAMSIZ]; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !tb[IPSET_ATTR_IFACE] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) +		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +	if (e.cidr > HOST_MASK) +		return -IPSET_ERR_INVALID_CIDR; +	ip6_netmask(&e.ip, e.cidr); + +	strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); +	e.iface = iface; +	ret = iface_test(&h->rbtree, &e.iface); +	if (adt == IPSET_ADD) { +		if (!ret) { +			ret = iface_add(&h->rbtree, &e.iface); +			if (ret) +				return ret; +		} +	} else if (!ret) +		return ret; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_PHYSDEV) +			e.physdev = 1; +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	ret = adtfn(set, &e, &ext, &ext, flags); + +	return ip_set_enomatch(ret, flags, adt, set) ? -ret : +	       ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static struct ip_set_type hash_netiface_type __read_mostly = { +	.name		= "hash:net,iface", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_IFACE | +			  IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_netiface_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IFACE]	= { .type = NLA_NUL_STRING, +					    .len  = IFNAMSIZ - 1 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_netiface_init(void) +{ +	return ip_set_type_register(&hash_netiface_type); +} + +static void __exit +hash_netiface_fini(void) +{ +	ip_set_type_unregister(&hash_netiface_type); +} + +module_init(hash_netiface_init); +module_exit(hash_netiface_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c new file mode 100644 index 00000000000..3e99987e4bf --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -0,0 +1,481 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +#define IPSET_TYPE_REV_MAX	1	/* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,net"); + +/* Type specific function prefix */ +#define HTYPE		hash_netnet +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variants */ + +/* Member elements  */ +struct hash_netnet4_elem { +	union { +		__be32 ip[2]; +		__be64 ipcmp; +	}; +	u8 nomatch; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +}; + +/* Common functions */ + +static inline bool +hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, +		     const struct hash_netnet4_elem *ip2, +		     u32 *multi) +{ +	return ip1->ipcmp == ip2->ipcmp && +	       ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, +			  struct hash_netnet4_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) +{ +	if (inner) { +		elem->ip[1] &= ip_set_netmask(cidr); +		elem->cidr[1] = cidr; +	} else { +		elem->ip[0] &= ip_set_netmask(cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netnet4_data_list(struct sk_buff *skb, +		    const struct hash_netnet4_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || +	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return false; + +nla_put_failure: +	return true; +} + +static inline void +hash_netnet4_data_next(struct hash_netnet4_elem *next, +		    const struct hash_netnet4_elem *d) +{ +	next->ipcmp = d->ipcmp; +} + +#define MTYPE		hash_netnet4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); +	ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); +	e.ip[0] &= ip_set_netmask(e.cidr[0]); +	e.ip[1] &= ip_set_netmask(e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, last; +	u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; +	u8 cidr, cidr2; +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[0] = cidr; +	} + +	if (tb[IPSET_ATTR_CIDR2]) { +		cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); +		if (!cidr2 || cidr2 > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[1] = cidr2; +	} + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] && +				   tb[IPSET_ATTR_IP2_TO])) { +		e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); +		e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 
0 : ret; +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip_to < ip) +			swap(ip, ip_to); +		if (ip + UINT_MAX == ip_to) +			return -IPSET_ERR_HASH_RANGE; +	} + +	ip2_to = ip2_from; +	if (tb[IPSET_ATTR_IP2_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); +		if (ret) +			return ret; +		if (ip2_to < ip2_from) +			swap(ip2_from, ip2_to); +		if (ip2_from + UINT_MAX == ip2_to) +			return -IPSET_ERR_HASH_RANGE; + +	} + +	if (retried) +		ip = ntohl(h->next.ip[0]); + +	while (!after(ip, ip_to)) { +		e.ip[0] = htonl(ip); +		last = ip_set_range_to_cidr(ip, ip_to, &cidr); +		e.cidr[0] = cidr; +		ip2 = (retried && +		       ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) +						   : ip2_from; +		while (!after(ip2, ip2_to)) { +			e.ip[1] = htonl(ip2); +			last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); +			e.cidr[1] = cidr2; +			ret = adtfn(set, &e, &ext, &ext, flags); +			if (ret && !ip_set_eexist(ret, flags)) +				return ret; +			else +				ret = 0; +			ip2 = last2 + 1; +		} +		ip = last + 1; +	} +	return ret; +} + +/* IPv6 variants */ + +struct hash_netnet6_elem { +	union nf_inet_addr ip[2]; +	u8 nomatch; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +}; + +/* Common functions */ + +static inline bool +hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, +		     const struct hash_netnet6_elem *ip2, +		     u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && +	       ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && +	       ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) +{ +	elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, +			  struct hash_netnet6_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) +{ +	if (inner) { +		ip6_netmask(&elem->ip[1], cidr); +		elem->cidr[1] = cidr; +	} else { +		ip6_netmask(&elem->ip[0], cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netnet6_data_list(struct sk_buff *skb, +		    const struct hash_netnet6_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || +	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return false; + +nla_put_failure: +	return true; +} + +static inline void +hash_netnet6_data_next(struct hash_netnet4_elem *next, +		    const struct hash_netnet6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_netnet6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); +	ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], +	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || +	      ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) +		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +	if (tb[IPSET_ATTR_CIDR2]) +		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + +	if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || +	    e.cidr[1] > HOST_MASK) +		return -IPSET_ERR_INVALID_CIDR; + +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	ret = adtfn(set, &e, &ext, &ext, flags); + +	return ip_set_enomatch(ret, flags, adt, set) ? -ret : +	       ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static struct ip_set_type hash_netnet_type __read_mostly = { +	.name		= "hash:net,net", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_netnet_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CIDR2]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_netnet_init(void) +{ +	return ip_set_type_register(&hash_netnet_type); +} + +static void __exit +hash_netnet_fini(void) +{ +	ip_set_type_unregister(&hash_netnet_type); +} + +module_init(hash_netnet_init); +module_exit(hash_netnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c new file mode 100644 index 00000000000..1c645fbd09c --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -0,0 +1,509 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net,port type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    SCTP and UDPLITE support added */ +/*				2    Range as input support for IPv4 added */ +/*				3    nomatch flag support added */ +/*				4    Counters support added */ +/*				5    Comments support added */ +#define IPSET_TYPE_REV_MAX	6 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,port"); + +/* Type specific function prefix */ +#define HTYPE		hash_netport +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. 
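+ *
+ * Illustration (IPv4 variant, HOST_MASK == 32): a /24 network is
+ * stored with cidr == 23 and a plain host address with cidr == 31;
+ * the data_netmask/data_list helpers below perform the -1/+1
+ * conversion when storing and when reporting back to userspace.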
+ */ +#define IP_SET_HASH_WITH_NETS_PACKED + +/* IPv4 variant */ + +/* Member elements */ +struct hash_netport4_elem { +	__be32 ip; +	__be16 port; +	u8 proto; +	u8 cidr:7; +	u8 nomatch:1; +}; + +/* Common functions */ + +static inline bool +hash_netport4_data_equal(const struct hash_netport4_elem *ip1, +			 const struct hash_netport4_elem *ip2, +			 u32 *multi) +{ +	return ip1->ip == ip2->ip && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto && +	       ip1->cidr == ip2->cidr; +} + +static inline int +hash_netport4_do_data_match(const struct hash_netport4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) +{ +	elem->ip &= ip_set_netmask(cidr); +	elem->cidr = cidr - 1; +} + +static bool +hash_netport4_data_list(struct sk_buff *skb, +			const struct hash_netport4_elem *data) +{ +	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netport4_data_next(struct hash_netport4_elem *next, +			const struct hash_netport4_elem *d) +{ +	next->ip = d->ip; +	next->port = d->port; +} + +#define MTYPE		hash_netport4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, +		   const struct xt_action_param *par, +		   enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netport *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netport4_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK - 1; + +	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); +	e.ip &= ip_set_netmask(e.cidr + 1); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], +		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netport *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to, p = 0, ip = 0, ip_to = 0, last; +	bool with_ports = false; +	u8 cidr; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if 
(tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr = cidr - 1; +	} + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMP)) +		e.port = 0; + +	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { +		e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = port_to = ntohs(e.port); +	if (tb[IPSET_ATTR_PORT_TO]) { +		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +		if (port_to < port) +			swap(port, port_to); +	} +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip_to < ip) +			swap(ip, ip_to); +		if (ip + UINT_MAX == ip_to) +			return -IPSET_ERR_HASH_RANGE; +	} else +		ip_set_mask_from_to(ip, ip_to, e.cidr + 1); + +	if (retried) +		ip = ntohl(h->next.ip); +	while (!after(ip, ip_to)) { +		e.ip = htonl(ip); +		last = ip_set_range_to_cidr(ip, ip_to, &cidr); +		e.cidr = cidr - 1; +		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) +						       : port; +		for (; p <= port_to; p++) { +			e.port = htons(p); +			ret = adtfn(set, &e, &ext, &ext, flags); + +			if (ret && !ip_set_eexist(ret, flags)) +				return ret; +			else +				ret = 0; +		} +		ip = last + 1; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_netport6_elem { +	union nf_inet_addr ip; +	__be16 port; +	u8 proto; +	u8 cidr:7; +	u8 nomatch:1; +}; + +/* Common functions */ + +static inline bool +hash_netport6_data_equal(const struct hash_netport6_elem *ip1, +			 const struct hash_netport6_elem *ip2, +			 u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto && +	       ip1->cidr == ip2->cidr; +} + +static inline int +hash_netport6_do_data_match(const struct hash_netport6_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) +{ +	ip6_netmask(&elem->ip, cidr); +	elem->cidr = cidr - 1; +} + +static bool +hash_netport6_data_list(struct sk_buff *skb, +			const struct hash_netport6_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netport6_data_next(struct hash_netport4_elem *next, +			const struct hash_netport6_elem *d) +{ +	next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_netport6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, +		   const struct xt_action_param *par, +		   enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netport *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netport6_elem e = { +		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, +	}; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	if (adt == IPSET_TEST) +		e.cidr = HOST_MASK - 1; + +	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); +	ip6_netmask(&e.ip, e.cidr + 1); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], +		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netport *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netport6_elem e = { .cidr = HOST_MASK  - 1 }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to; +	bool with_ports = false; +	u8 cidr; +	int ret; + +	if (unlikely(!tb[IPSET_ATTR_IP] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr = cidr - 1; +	} +	ip6_netmask(&e.ip, e.cidr + 1); + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMPV6)) +		e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return 
ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = ntohs(e.port); +	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +	if (port > port_to) +		swap(port, port_to); + +	if (retried) +		port = ntohs(h->next.port); +	for (; port <= port_to; port++) { +		e.port = htons(port); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static struct ip_set_type hash_netport_type __read_mostly = { +	.name		= "hash:net,port", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_TWO, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_netport_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_netport_init(void) +{ +	return ip_set_type_register(&hash_netport_type); +} + +static void __exit +hash_netport_fini(void) +{ +	ip_set_type_unregister(&hash_netport_type); +} + +module_init(hash_netport_init); +module_exit(hash_netport_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c new file mode 100644 index 00000000000..c0d2ba73f8b --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -0,0 +1,587 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				0    Comments support added */ +#define IPSET_TYPE_REV_MAX	1 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,port,net"); + +/* Type specific function prefix */ +#define HTYPE		hash_netportnet +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variant */ + +/* Member elements */ +struct hash_netportnet4_elem { +	union { +		__be32 ip[2]; +		__be64 ipcmp; +	}; +	__be16 port; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +	u8 nomatch:1; +	u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, +			   const struct hash_netportnet4_elem *ip2, +			   u32 *multi) +{ +	return ip1->ipcmp == ip2->ipcmp && +	       ip1->ccmp == ip2->ccmp && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) +{ +	return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, +				struct hash_netportnet4_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, +			      u8 cidr, bool inner) +{ +	if (inner) { +		elem->ip[1] &= ip_set_netmask(cidr); +		elem->cidr[1] = cidr; +	} else { +		elem->ip[0] &= ip_set_netmask(cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netportnet4_data_list(struct sk_buff *skb, +			  const struct hash_netportnet4_elem *data) +{ +	u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || +	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netportnet4_data_next(struct hash_netportnet4_elem *next, +			  const struct hash_netportnet4_elem *d) +{ +	next->ipcmp = d->ipcmp; +	next->port = d->port; +} + +#define MTYPE		hash_netportnet4 +#define PF		4 +#define HOST_MASK	32 +#include "ip_set_hash_gen.h" + +static int +hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, +		     const struct xt_action_param *par, +		     enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + +	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); +	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]); +	e.ip[0] &= ip_set_netmask(e.cidr[0]); +	e.ip[1] &= ip_set_netmask(e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], +		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet4_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; +	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; +	bool with_ports = false; +	u8 cidr, cidr2; +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || +	      ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[0] = cidr; +	} + +	if (tb[IPSET_ATTR_CIDR2]) { +		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); +		if (!cidr || cidr > HOST_MASK) +			return -IPSET_ERR_INVALID_CIDR; +		e.cidr[1] = cidr; +	} + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = 
nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMP)) +		e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; +	if (adt == IPSET_TEST || +	    !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { +		e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); +		e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	ip_to = ip; +	if (tb[IPSET_ATTR_IP_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); +		if (ret) +			return ret; +		if (ip > ip_to) +			swap(ip, ip_to); +		if (unlikely(ip + UINT_MAX == ip_to)) +			return -IPSET_ERR_HASH_RANGE; +	} + +	port_to = port = ntohs(e.port); +	if (tb[IPSET_ATTR_PORT_TO]) { +		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +		if (port > port_to) +			swap(port, port_to); +	} + +	ip2_to = ip2_from; +	if (tb[IPSET_ATTR_IP2_TO]) { +		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); +		if (ret) +			return ret; +		if (ip2_from > ip2_to) +			swap(ip2_from, ip2_to); +		if (unlikely(ip2_from + UINT_MAX == ip2_to)) +			return -IPSET_ERR_HASH_RANGE; +	} + +	if (retried) +		ip = ntohl(h->next.ip[0]); + +	while (!after(ip, ip_to)) { +		e.ip[0] = htonl(ip); +		ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); +		e.cidr[0] = cidr; +		p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) +							  : port; +		for (; p <= port_to; p++) { +			e.port = htons(p); +			ip2 = (retried && ip == ntohl(h->next.ip[0]) && +			       p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) +							 : ip2_from; +			while (!after(ip2, ip2_to)) { +				e.ip[1] = htonl(ip2); +				ip2_last = ip_set_range_to_cidr(ip2, ip2_to, +								&cidr2); +				e.cidr[1] = cidr2; +				ret = adtfn(set, &e, &ext, &ext, flags); +				if (ret && !ip_set_eexist(ret, flags)) +					return ret; +				else +					ret = 0; +				ip2 = ip2_last + 1; +			} +		} +		ip = ip_last + 1; +	} +	return ret; +} + +/* IPv6 variant */ + +struct hash_netportnet6_elem { +	union nf_inet_addr ip[2]; +	__be16 port; +	union { +		u8 cidr[2]; +		u16 ccmp; +	}; +	u8 nomatch:1; +	u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, +			   const struct hash_netportnet6_elem *ip2, +			   u32 *multi) +{ +	return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && +	       ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && +	       ip1->ccmp == ip2->ccmp && +	       ip1->port == ip2->port && +	       ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) +{ +	return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) +{ +	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) +{ +	swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, +				struct hash_netportnet6_elem *orig) +{ +	elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, +			      u8 cidr, bool inner) +{ +	if (inner) { +		ip6_netmask(&elem->ip[1], cidr); +		elem->cidr[1] = cidr; +	} else { +		ip6_netmask(&elem->ip[0], cidr); +		elem->cidr[0] = cidr; +	} +} + +static bool +hash_netportnet6_data_list(struct sk_buff *skb, +			  const struct hash_netportnet6_elem *data) +{ +	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + +	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || +	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || +	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || +	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || +	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || +	    (flags && +	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return 1; +} + +static inline void +hash_netportnet6_data_next(struct hash_netportnet4_elem *next, +			  const struct hash_netportnet6_elem *d) +{ +	next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE		hash_netportnet6 +#define PF		6 +#define HOST_MASK	128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, +		     const struct xt_action_param *par, +		     enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); +	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); +	if (adt == IPSET_TEST) +		e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; + +	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, +				 &e.port, &e.proto)) +		return -EINVAL; + +	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); +	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6); +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], +		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	const struct hash_netportnet *h = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct hash_netportnet6_elem e = { }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	u32 port, port_to; +	bool with_ports = false; +	int ret; + +	e.cidr[0] = e.cidr[1] = HOST_MASK; +	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || +		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     
!ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; +	if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) +		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || +	      ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || +	      ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; + +	if (tb[IPSET_ATTR_CIDR]) +		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + +	if (tb[IPSET_ATTR_CIDR2]) +		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + +	if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || +		     e.cidr[1] > HOST_MASK)) +		return -IPSET_ERR_INVALID_CIDR; + +	ip6_netmask(&e.ip[0], e.cidr[0]); +	ip6_netmask(&e.ip[1], e.cidr[1]); + +	if (tb[IPSET_ATTR_PORT]) +		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); +	else +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_PROTO]) { +		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); +		with_ports = ip_set_proto_with_ports(e.proto); + +		if (e.proto == 0) +			return -IPSET_ERR_INVALID_PROTO; +	} else +		return -IPSET_ERR_MISSING_PROTO; + +	if (!(with_ports || e.proto == IPPROTO_ICMPV6)) +		e.port = 0; + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		if (cadt_flags & IPSET_FLAG_NOMATCH) +			flags |= (IPSET_FLAG_NOMATCH << 16); +	} + +	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { +		ret = adtfn(set, &e, &ext, &ext, flags); +		return ip_set_enomatch(ret, flags, adt, set) ? -ret : +		       ip_set_eexist(ret, flags) ? 0 : ret; +	} + +	port = ntohs(e.port); +	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); +	if (port > port_to) +		swap(port, port_to); + +	if (retried) +		port = ntohs(h->next.port); +	for (; port <= port_to; port++) { +		e.port = htons(port); +		ret = adtfn(set, &e, &ext, &ext, flags); + +		if (ret && !ip_set_eexist(ret, flags)) +			return ret; +		else +			ret = 0; +	} +	return ret; +} + +static struct ip_set_type hash_netportnet_type __read_mostly = { +	.name		= "hash:net,port,net", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | +			  IPSET_TYPE_NOMATCH, +	.dimension	= IPSET_DIM_THREE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= hash_netportnet_create, +	.create_policy	= { +		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 }, +		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 }, +		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_IP]		= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_IP2_TO]	= { .type = NLA_NESTED }, +		[IPSET_ATTR_PORT]	= { .type = NLA_U16 }, +		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 }, +		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CIDR2]	= { .type = NLA_U8 }, +		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +hash_netportnet_init(void) +{ +	return 
ip_set_type_register(&hash_netportnet_type); +} + +static void __exit +hash_netportnet_fini(void) +{ +	ip_set_type_unregister(&hash_netportnet_type); +} + +module_init(hash_netportnet_init); +module_exit(hash_netportnet_fini); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c new file mode 100644 index 00000000000..3e2317f3cf6 --- /dev/null +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -0,0 +1,685 @@ +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the list:set type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> + +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_list.h> + +#define IPSET_TYPE_REV_MIN	0 +/*				1    Counters support added */ +#define IPSET_TYPE_REV_MAX	2 /* Comments support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_list:set"); + +/* Member elements  */ +struct set_elem { +	ip_set_id_t id; +}; + +struct set_adt_elem { +	ip_set_id_t id; +	ip_set_id_t refid; +	int before; +}; + +/* Type structure */ +struct list_set { +	u32 size;		/* size of set list array */ +	struct timer_list gc;	/* garbage collection */ +	struct net *net;	/* namespace */ +	struct set_elem members[0]; /* the set members */ +}; + +#define list_set_elem(set, map, id)	\ +	(struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) + +static int +list_set_ktest(struct ip_set *set, const struct sk_buff *skb, +	       const struct xt_action_param *par, +	       struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) +{ +	struct list_set *map = set->data; +	struct set_elem *e; +	u32 i, cmdflags = opt->cmdflags; +	int ret; + +	/* Don't lookup sub-counters at all */ +	opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; +	if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) +		opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; +	for (i = 0; i < map->size; i++) { +		e = list_set_elem(set, map, i); +		if (e->id == IPSET_INVALID_ID) +			return 0; +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(e, set))) +			continue; +		ret = ip_set_test(e->id, skb, par, opt); +		if (ret > 0) { +			if (SET_WITH_COUNTER(set)) +				ip_set_update_counter(ext_counter(e, set), +						      ext, &opt->ext, +						      cmdflags); +			return ret; +		} +	} +	return 0; +} + +static int +list_set_kadd(struct ip_set *set, const struct sk_buff *skb, +	      const struct xt_action_param *par, +	      struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) +{ +	struct list_set *map = set->data; +	struct set_elem *e; +	u32 i; +	int ret; + +	for (i = 0; i < map->size; i++) { +		e = list_set_elem(set, map, i); +		if (e->id == IPSET_INVALID_ID) +			return 0; +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(e, set))) +			continue; +		ret = ip_set_add(e->id, skb, par, opt); +		if (ret == 0) +			return ret; +	} +	return 0; +} + +static int +list_set_kdel(struct ip_set *set, const struct sk_buff *skb, +	      const struct xt_action_param *par, +	      struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) +{ +	struct list_set *map = set->data; +	struct 
set_elem *e; +	u32 i; +	int ret; + +	for (i = 0; i < map->size; i++) { +		e = list_set_elem(set, map, i); +		if (e->id == IPSET_INVALID_ID) +			return 0; +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(e, set))) +			continue; +		ret = ip_set_del(e->id, skb, par, opt); +		if (ret == 0) +			return ret; +	} +	return 0; +} + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, +	      const struct xt_action_param *par, +	      enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ +	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + +	switch (adt) { +	case IPSET_TEST: +		return list_set_ktest(set, skb, par, opt, &ext); +	case IPSET_ADD: +		return list_set_kadd(set, skb, par, opt, &ext); +	case IPSET_DEL: +		return list_set_kdel(set, skb, par, opt, &ext); +	default: +		break; +	} +	return -EINVAL; +} + +static bool +id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) +{ +	const struct list_set *map = set->data; +	const struct set_elem *e; + +	if (i >= map->size) +		return 0; + +	e = list_set_elem(set, map, i); +	return !!(e->id == id && +		 !(SET_WITH_TIMEOUT(set) && +		   ip_set_timeout_expired(ext_timeout(e, set)))); +} + +static int +list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, +	     const struct ip_set_ext *ext) +{ +	struct list_set *map = set->data; +	struct set_elem *e = list_set_elem(set, map, i); + +	if (e->id != IPSET_INVALID_ID) { +		if (i == map->size - 1) { +			/* Last element replaced: e.g. add new,before,last */ +			ip_set_put_byindex(map->net, e->id); +			ip_set_ext_destroy(set, e); +		} else { +			struct set_elem *x = list_set_elem(set, map, +							   map->size - 1); + +			/* Last element pushed off */ +			if (x->id != IPSET_INVALID_ID) { +				ip_set_put_byindex(map->net, x->id); +				ip_set_ext_destroy(set, x); +			} +			memmove(list_set_elem(set, map, i + 1), e, +				set->dsize * (map->size - (i + 1))); +			/* Extensions must be initialized to zero */ +			memset(e, 0, set->dsize); +		} +	} + +	e->id = d->id; +	if (SET_WITH_TIMEOUT(set)) +		ip_set_timeout_set(ext_timeout(e, set), ext->timeout); +	if (SET_WITH_COUNTER(set)) +		ip_set_init_counter(ext_counter(e, set), ext); +	if (SET_WITH_COMMENT(set)) +		ip_set_init_comment(ext_comment(e, set), ext); +	return 0; +} + +static int +list_set_del(struct ip_set *set, u32 i) +{ +	struct list_set *map = set->data; +	struct set_elem *e = list_set_elem(set, map, i); + +	ip_set_put_byindex(map->net, e->id); +	ip_set_ext_destroy(set, e); + +	if (i < map->size - 1) +		memmove(e, list_set_elem(set, map, i + 1), +			set->dsize * (map->size - (i + 1))); + +	/* Last element */ +	e = list_set_elem(set, map, map->size - 1); +	e->id = IPSET_INVALID_ID; +	return 0; +} + +static void +set_cleanup_entries(struct ip_set *set) +{ +	struct list_set *map = set->data; +	struct set_elem *e; +	u32 i = 0; + +	while (i < map->size) { +		e = list_set_elem(set, map, i); +		if (e->id != IPSET_INVALID_ID && +		    ip_set_timeout_expired(ext_timeout(e, set))) +			list_set_del(set, i); +			/* Check element moved to position i in next loop */ +		else +			i++; +	} +} + +static int +list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, +	       struct ip_set_ext *mext, u32 flags) +{ +	struct list_set *map = set->data; +	struct set_adt_elem *d = value; +	struct set_elem *e; +	u32 i; +	int ret; + +	for (i = 0; i < map->size; i++) { +		e = list_set_elem(set, map, i); +		if (e->id == IPSET_INVALID_ID) +			return 0; +		else if (SET_WITH_TIMEOUT(set) && +			 
ip_set_timeout_expired(ext_timeout(e, set)))
+			continue;
+		else if (e->id != d->id)
+			continue;
+
+		if (d->before == 0)
+			return 1;
+		else if (d->before > 0)
+			ret = id_eq(set, i + 1, d->refid);
+		else
+			ret = i > 0 && id_eq(set, i - 1, d->refid);
+		return ret;
+	}
+	return 0;
+}
+
+static int
+list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	      struct ip_set_ext *mext, u32 flags)
+{
+	struct list_set *map = set->data;
+	struct set_adt_elem *d = value;
+	struct set_elem *e;
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
+	u32 i, ret = 0;
+
+	if (SET_WITH_TIMEOUT(set))
+		set_cleanup_entries(set);
+
+	/* Check already added element */
+	for (i = 0; i < map->size; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			goto insert;
+		else if (e->id != d->id)
+			continue;
+
+		if ((d->before > 0 && !id_eq(set, i + 1, d->refid)) ||
+		    (d->before < 0 &&
+		     (i == 0 || !id_eq(set, i - 1, d->refid))))
+			/* Before/after doesn't match */
+			return -IPSET_ERR_REF_EXIST;
+		if (!flag_exist)
+			/* Can't re-add */
+			return -IPSET_ERR_EXIST;
+		/* Update extensions */
+		ip_set_ext_destroy(set, e);
+
+		if (SET_WITH_TIMEOUT(set))
+			ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+		if (SET_WITH_COUNTER(set))
+			ip_set_init_counter(ext_counter(e, set), ext);
+		if (SET_WITH_COMMENT(set))
+			ip_set_init_comment(ext_comment(e, set), ext);
+		/* Set is already added to the list */
+		ip_set_put_byindex(map->net, d->id);
+		return 0;
+	}
+insert:
+	ret = -IPSET_ERR_LIST_FULL;
+	for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
+				: list_set_add(set, i, d, ext);
+		else if (e->id != d->refid)
+			continue;
+		else if (d->before > 0)
+			ret = list_set_add(set, i, d, ext);
+		else if (i + 1 < map->size)
+			ret = list_set_add(set, i + 1, d, ext);
+	}
+
+	return ret;
+}
+
+static int
+list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	      struct ip_set_ext *mext, u32 flags)
+{
+	struct list_set *map = set->data;
+	struct set_adt_elem *d = value;
+	struct set_elem *e;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			return d->before != 0 ?
-IPSET_ERR_REF_EXIST +					      : -IPSET_ERR_EXIST; +		else if (SET_WITH_TIMEOUT(set) && +			 ip_set_timeout_expired(ext_timeout(e, set))) +			continue; +		else if (e->id != d->id) +			continue; + +		if (d->before == 0) +			return list_set_del(set, i); +		else if (d->before > 0) { +			if (!id_eq(set, i + 1, d->refid)) +				return -IPSET_ERR_REF_EXIST; +			return list_set_del(set, i); +		} else if (i == 0 || !id_eq(set, i - 1, d->refid)) +			return -IPSET_ERR_REF_EXIST; +		else +			return list_set_del(set, i); +	} +	return -IPSET_ERR_EXIST; +} + +static int +list_set_uadt(struct ip_set *set, struct nlattr *tb[], +	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ +	struct list_set *map = set->data; +	ipset_adtfn adtfn = set->variant->adt[adt]; +	struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; +	struct ip_set_ext ext = IP_SET_INIT_UEXT(set); +	struct ip_set *s; +	int ret = 0; + +	if (unlikely(!tb[IPSET_ATTR_NAME] || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_LINENO]) +		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + +	ret = ip_set_get_extensions(set, tb, &ext); +	if (ret) +		return ret; +	e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s); +	if (e.id == IPSET_INVALID_ID) +		return -IPSET_ERR_NAME; +	/* "Loop detection" */ +	if (s->type->features & IPSET_TYPE_NAME) { +		ret = -IPSET_ERR_LOOP; +		goto finish; +	} + +	if (tb[IPSET_ATTR_CADT_FLAGS]) { +		u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); +		e.before = f & IPSET_FLAG_BEFORE; +	} + +	if (e.before && !tb[IPSET_ATTR_NAMEREF]) { +		ret = -IPSET_ERR_BEFORE; +		goto finish; +	} + +	if (tb[IPSET_ATTR_NAMEREF]) { +		e.refid = ip_set_get_byname(map->net, +					    nla_data(tb[IPSET_ATTR_NAMEREF]), +					    &s); +		if (e.refid == IPSET_INVALID_ID) { +			ret = -IPSET_ERR_NAMEREF; +			goto finish; +		} +		if (!e.before) +			e.before = -1; +	} +	if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) +		set_cleanup_entries(set); + +	ret = adtfn(set, &e, &ext, &ext, flags); + +finish: +	if (e.refid != IPSET_INVALID_ID) +		ip_set_put_byindex(map->net, e.refid); +	if (adt != IPSET_ADD || ret) +		ip_set_put_byindex(map->net, e.id); + +	return ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static void +list_set_flush(struct ip_set *set) +{ +	struct list_set *map = set->data; +	struct set_elem *e; +	u32 i; + +	for (i = 0; i < map->size; i++) { +		e = list_set_elem(set, map, i); +		if (e->id != IPSET_INVALID_ID) { +			ip_set_put_byindex(map->net, e->id); +			ip_set_ext_destroy(set, e); +			e->id = IPSET_INVALID_ID; +		} +	} +} + +static void +list_set_destroy(struct ip_set *set) +{ +	struct list_set *map = set->data; + +	if (SET_WITH_TIMEOUT(set)) +		del_timer_sync(&map->gc); +	list_set_flush(set); +	kfree(map); + +	set->data = NULL; +} + +static int +list_set_head(struct ip_set *set, struct sk_buff *skb) +{ +	const struct list_set *map = set->data; +	struct nlattr *nested; + +	nested = ipset_nest_start(skb, IPSET_ATTR_DATA); +	if (!nested) +		goto nla_put_failure; +	if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || +	    nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || +	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, +			  htonl(sizeof(*map) + map->size * set->dsize))) +		goto nla_put_failure; +	if (unlikely(ip_set_put_flags(skb, set))) +		goto nla_put_failure; +	ipset_nest_end(skb, nested); + +	return 0; +nla_put_failure: +	return -EMSGSIZE; +} + +static int +list_set_list(const struct ip_set *set, +	      struct sk_buff *skb, struct netlink_callback *cb) +{ +	const struct list_set *map = set->data; +	struct nlattr *atd, *nested; +	u32 i, first = cb->args[IPSET_CB_ARG0]; +	const struct set_elem *e; + +	atd = ipset_nest_start(skb, IPSET_ATTR_ADT); +	if (!atd) +		return -EMSGSIZE; +	for (; cb->args[IPSET_CB_ARG0] < map->size; +	     cb->args[IPSET_CB_ARG0]++) { +		i = cb->args[IPSET_CB_ARG0]; +		e = list_set_elem(set, map, i); +		if (e->id == IPSET_INVALID_ID) +			goto finish; +		if (SET_WITH_TIMEOUT(set) && +		    ip_set_timeout_expired(ext_timeout(e, set))) +			continue; +		nested = ipset_nest_start(skb, IPSET_ATTR_DATA); +		if (!nested) { +			if (i == first) { +				nla_nest_cancel(skb, atd); +				return -EMSGSIZE; +			} else +				goto nla_put_failure; +		} +		if (nla_put_string(skb, IPSET_ATTR_NAME, +				   ip_set_name_byindex(map->net, e->id))) +			goto nla_put_failure; +		if (ip_set_put_extensions(skb, set, e, true)) +			goto nla_put_failure; +		ipset_nest_end(skb, nested); +	} +finish: +	ipset_nest_end(skb, atd); +	/* Set listing finished */ +	cb->args[IPSET_CB_ARG0] = 0; +	return 0; + +nla_put_failure: +	nla_nest_cancel(skb, nested); +	if (unlikely(i == first)) { +		cb->args[IPSET_CB_ARG0] = 0; +		return -EMSGSIZE; +	} +	ipset_nest_end(skb, atd); +	return 0; +} + +static bool +list_set_same_set(const struct ip_set *a, const struct ip_set *b) +{ +	const struct list_set *x = a->data; +	const struct list_set *y = b->data; + +	return x->size == y->size && +	       a->timeout == b->timeout && +	       a->extensions == b->extensions; +} + +static const struct ip_set_type_variant set_variant = { +	.kadt	= list_set_kadt, +	.uadt	= list_set_uadt, +	.adt	= { +		[IPSET_ADD] = list_set_uadd, +		[IPSET_DEL] = list_set_udel, +		[IPSET_TEST] = list_set_utest, +	}, +	.destroy = list_set_destroy, +	.flush	= list_set_flush, +	.head	= list_set_head, +	.list	= list_set_list, +	.same_set = list_set_same_set, +}; + +static void +list_set_gc(unsigned long ul_set) +{ +	struct ip_set *set = (struct ip_set *) ul_set; +	struct list_set *map = set->data; + +	write_lock_bh(&set->lock); +	set_cleanup_entries(set); +	write_unlock_bh(&set->lock); + +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +	add_timer(&map->gc); +} + +static void 
+list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ +	struct list_set *map = set->data; + +	init_timer(&map->gc); +	map->gc.data = (unsigned long) set; +	map->gc.function = gc; +	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +	add_timer(&map->gc); +} + +/* Create list:set type of sets */ + +static bool +init_list_set(struct net *net, struct ip_set *set, u32 size) +{ +	struct list_set *map; +	struct set_elem *e; +	u32 i; + +	map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); +	if (!map) +		return false; + +	map->size = size; +	map->net = net; +	set->data = map; + +	for (i = 0; i < size; i++) { +		e = list_set_elem(set, map, i); +		e->id = IPSET_INVALID_ID; +	} + +	return true; +} + +static int +list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], +		u32 flags) +{ +	u32 size = IP_SET_LIST_DEFAULT_SIZE; + +	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || +		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) +		return -IPSET_ERR_PROTOCOL; + +	if (tb[IPSET_ATTR_SIZE]) +		size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]); +	if (size < IP_SET_LIST_MIN_SIZE) +		size = IP_SET_LIST_MIN_SIZE; + +	set->variant = &set_variant; +	set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); +	if (!init_list_set(net, set, size)) +		return -ENOMEM; +	if (tb[IPSET_ATTR_TIMEOUT]) { +		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +		list_set_gc_init(set, list_set_gc); +	} +	return 0; +} + +static struct ip_set_type list_set_type __read_mostly = { +	.name		= "list:set", +	.protocol	= IPSET_PROTOCOL, +	.features	= IPSET_TYPE_NAME | IPSET_DUMP_LAST, +	.dimension	= IPSET_DIM_ONE, +	.family		= NFPROTO_UNSPEC, +	.revision_min	= IPSET_TYPE_REV_MIN, +	.revision_max	= IPSET_TYPE_REV_MAX, +	.create		= list_set_create, +	.create_policy	= { +		[IPSET_ATTR_SIZE]	= { .type = NLA_U32 }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +	}, +	.adt_policy	= { +		[IPSET_ATTR_NAME]	= { .type = NLA_STRING, +					    .len = IPSET_MAXNAMELEN }, +		[IPSET_ATTR_NAMEREF]	= { .type = NLA_STRING, +					    .len = IPSET_MAXNAMELEN }, +		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 }, +		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 }, +		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 }, +		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 }, +		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 }, +		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING }, +	}, +	.me		= THIS_MODULE, +}; + +static int __init +list_set_init(void) +{ +	return ip_set_type_register(&list_set_type); +} + +static void __exit +list_set_fini(void) +{ +	ip_set_type_unregister(&list_set_type); +} + +module_init(list_set_init); +module_exit(list_set_fini); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c new file mode 100644 index 00000000000..04d15fdc99e --- /dev/null +++ b/net/netfilter/ipset/pfxlen.c @@ -0,0 +1,313 @@ +#include <linux/export.h> +#include <linux/netfilter/ipset/pfxlen.h> + +/* + * Prefixlen maps for fast conversions, by Jan Engelhardt. + */ + +#define E(a, b, c, d) \ +	{.ip6 = { \ +		htonl(a), htonl(b), \ +		htonl(c), htonl(d), \ +	} } + +/* + * This table works for both IPv4 and IPv6; + * just use prefixlen_netmask_map[prefixlength].ip. 
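+ * (The array is exported below under the name ip_set_netmask_map[];
+ * a matching hostmask table follows it.) An illustrative lookup of
+ * ours, not part of this patch: for an IPv4 /24 network,
+ *
+ *	ip_set_netmask_map[24].ip == htonl(0xFFFFFF00), i.e. 255.255.255.0
+ *
+ * so converting a prefix length to a mask is a single table read,
+ * with no shifting and no special case for prefix length 0.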
+ */ +const union nf_inet_addr ip_set_netmask_map[] = { +	E(0x00000000, 0x00000000, 0x00000000, 0x00000000), +	E(0x80000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), 
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_netmask_map); + +#undef  E +#define E(a, b, c, d)						\ +	{.ip6 = { (__force __be32) a, (__force __be32) b,	\ +		  (__force __be32) c, (__force __be32) d,	\ +	} } + +/* + * This table 
works for both IPv4 and IPv6; + * just use prefixlen_hostmask_map[prefixlength].ip. + */ +const union nf_inet_addr ip_set_hostmask_map[] = { +	E(0x00000000, 0x00000000, 0x00000000, 0x00000000), +	E(0x80000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 
0xFFFFFFFF, 0x00000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), +	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_hostmask_map); + +/* Find the largest network which matches the range from left, in host order. 
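+ *
+ * This is the greedy step behind range-to-CIDR conversion in the hash
+ * set types above: pick the widest prefix that starts exactly at
+ * 'from' without running past 'to'. A worked example of ours, not
+ * part of this patch, for the range 192.168.0.0 - 192.168.2.255:
+ *
+ *	1st call: *cidr = 23, returns 192.168.1.255  (192.168.0.0/23)
+ *	2nd call: *cidr = 24, returns 192.168.2.255  (192.168.2.0/24)
+ *
+ * Each call returns the last address covered, so callers restart
+ * from "last + 1" until the whole range is decomposed.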
*/ +u32 +ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr) +{ +	u32 last; +	u8 i; + +	for (i = 1; i < 32; i++) { +		if ((from & ip_set_hostmask(i)) != from) +			continue; +		last = from | ~ip_set_hostmask(i); +		if (!after(last, to)) { +			*cidr = i; +			return last; +		} +	} +	*cidr = 32; +	return from; +} +EXPORT_SYMBOL_GPL(ip_set_range_to_cidr); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 70bd1d0774c..0c3b1670b0d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -28,12 +28,11 @@ if IP_VS  config	IP_VS_IPV6  	bool "IPv6 support for IPVS"  	depends on IPV6 = y || IP_VS = IPV6 +	select IP6_NF_IPTABLES  	---help--- -	  Add IPv6 support to IPVS. This is incomplete and might be dangerous. +	  Add IPv6 support to IPVS. -	  See http://www.mindbasket.com/ipvs for more information. - -	  Say N if unsure. +	  Say Y if unsure.  config	IP_VS_DEBUG  	bool "IP virtual server debugging" @@ -122,7 +121,6 @@ config	IP_VS_RR  config	IP_VS_WRR  	tristate "weighted round-robin scheduling" -	select GCD  	---help---  	  The weighted robin-robin scheduling algorithm directs network  	  connections to different real servers based on server weights @@ -232,11 +230,27 @@ config	IP_VS_NQ  	  If you want to compile it in kernel, say Y. To compile it as a  	  module, choose M here. If unsure, say N. +comment 'IPVS SH scheduler' + +config IP_VS_SH_TAB_BITS +	int "IPVS source hashing table size (the Nth power of 2)" +	range 4 20 +	default 8 +	---help--- +	  The source hashing scheduler maps source IPs to destinations +	  stored in a hash table. This table is tiled by each destination +	  until all slots in the table are filled. When using weights to +	  allow destinations to receive more connections, the table is +	  tiled an amount proportional to the weights specified. The table +	  needs to be large enough to effectively fit all the destinations +	  multiplied by their respective weights. +  comment 'IPVS application helper'  config	IP_VS_FTP    	tristate "FTP protocol helper" -        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT +	depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ +		NF_CONNTRACK_FTP  	select IP_VS_NFCT  	---help---  	  FTP is a protocol that transfers IP address and/or port number in diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index a475edee091..dfd7b65b3d2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -31,7 +31,6 @@  #include <net/net_namespace.h>  #include <net/protocol.h>  #include <net/tcp.h> -#include <asm/system.h>  #include <linux/stat.h>  #include <linux/proc_fs.h>  #include <linux/seq_file.h> @@ -43,11 +42,8 @@ EXPORT_SYMBOL(register_ip_vs_app);  EXPORT_SYMBOL(unregister_ip_vs_app);  EXPORT_SYMBOL(register_ip_vs_app_inc); -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list);  static DEFINE_MUTEX(__ip_vs_app_mutex); -  /*   *	Get an ip_vs_app object   */ @@ -62,12 +58,25 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)  	module_put(app->module);  } +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ +	kfree(inc->timeout_table); +	kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ +	struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + +	ip_vs_app_inc_destroy(inc); +}  /*   *	Allocate/initialize app incarnation and register it in proto apps.   
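 *
 *	An "incarnation" is the per-port instance of an application
 *	helper; e.g. the FTP helper registers one incarnation per
 *	configured control port. A hedged usage sketch against this
 *	patch's per-netns API (the ip_vs_ftp template name is only
 *	illustrative):
 *
 *		app = register_ip_vs_app(net, &ip_vs_ftp);
 *		if (!IS_ERR(app))
 *			ret = register_ip_vs_app_inc(net, app,
 *						     IPPROTO_TCP, 21);
 *
 *	register_ip_vs_app() kmemdup()s the template into the netns
 *	app list and returns the copy; incarnations then hang off
 *	that copy via incs_list.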
*/  static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, +		  __u16 port)  {  	struct ip_vs_protocol *pp;  	struct ip_vs_app *inc; @@ -98,7 +107,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)  		}  	} -	ret = pp->register_app(inc); +	ret = pp->register_app(net, inc);  	if (ret)  		goto out; @@ -109,8 +118,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)  	return 0;    out: -	kfree(inc->timeout_table); -	kfree(inc); +	ip_vs_app_inc_destroy(inc);  	return ret;  } @@ -119,7 +127,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)   *	Release app incarnation   */  static void -ip_vs_app_inc_release(struct ip_vs_app *inc) +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)  {  	struct ip_vs_protocol *pp; @@ -127,15 +135,14 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)  		return;  	if (pp->unregister_app) -		pp->unregister_app(inc); +		pp->unregister_app(net, inc);  	IP_VS_DBG(9, "%s App %s:%u unregistered\n",  		  pp->name, inc->name, ntohs(inc->port));  	list_del(&inc->a_list); -	kfree(inc->timeout_table); -	kfree(inc); +	call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);  } @@ -147,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)  {  	int result; -	atomic_inc(&inc->usecnt); -	if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) -		atomic_dec(&inc->usecnt); +	result = ip_vs_app_get(inc->app); +	if (result) +		atomic_inc(&inc->usecnt);  	return result;  } @@ -159,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)   */  void ip_vs_app_inc_put(struct ip_vs_app *inc)  { -	ip_vs_app_put(inc->app);  	atomic_dec(&inc->usecnt); +	ip_vs_app_put(inc->app);  } @@ -168,13 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)   *	Register an application incarnation in protocol applications   */  int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, +		       __u16 port)  {  	int result;  	mutex_lock(&__ip_vs_app_mutex); -	result = ip_vs_app_inc_new(app, proto, port); +	result = ip_vs_app_inc_new(net, app, proto, port);  	mutex_unlock(&__ip_vs_app_mutex); @@ -182,51 +190,79 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)  } -/* - *	ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) +/* Register application for netns */ +struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)  { -	/* increase the module use count */ -	ip_vs_use_count_inc(); +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_app *a; +	int err = 0; + +	if (!ipvs) +		return ERR_PTR(-ENOENT);  	mutex_lock(&__ip_vs_app_mutex); -	list_add(&app->a_list, &ip_vs_app_list); +	list_for_each_entry(a, &ipvs->app_list, a_list) { +		if (!strcmp(app->name, a->name)) { +			err = -EEXIST; +			goto out_unlock; +		} +	} +	a = kmemdup(app, sizeof(*app), GFP_KERNEL); +	if (!a) { +		err = -ENOMEM; +		goto out_unlock; +	} +	INIT_LIST_HEAD(&a->incs_list); +	list_add(&a->a_list, &ipvs->app_list); +	/* increase the module use count */ +	ip_vs_use_count_inc(); +out_unlock:  	mutex_unlock(&__ip_vs_app_mutex); -	return 0; +	return err ? 
ERR_PTR(err) : a;  }  /*   *	ip_vs_app unregistration routine   *	We are sure there are no app incarnations attached to services + *	Caller should use synchronize_rcu() or rcu_barrier()   */ -void unregister_ip_vs_app(struct ip_vs_app *app) +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)  { -	struct ip_vs_app *inc, *nxt; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_app *a, *anxt, *inc, *nxt; + +	if (!ipvs) +		return;  	mutex_lock(&__ip_vs_app_mutex); -	list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { -		ip_vs_app_inc_release(inc); -	} +	list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { +		if (app && strcmp(app->name, a->name)) +			continue; +		list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { +			ip_vs_app_inc_release(net, inc); +		} -	list_del(&app->a_list); +		list_del(&a->a_list); +		kfree(a); -	mutex_unlock(&__ip_vs_app_mutex); +		/* decrease the module use count */ +		ip_vs_use_count_dec(); +	} -	/* decrease the module use count */ -	ip_vs_use_count_dec(); +	mutex_unlock(&__ip_vs_app_mutex);  }  /*   *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)   */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +int ip_vs_bind_app(struct ip_vs_conn *cp, +		   struct ip_vs_protocol *pp)  {  	return pp->app_conn_bind(cp);  } @@ -313,17 +349,17 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)   *	Assumes already checked proto==IPPROTO_TCP and diff!=0.   */  static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, -				 unsigned flag, __u32 seq, int diff) +				 unsigned int flag, __u32 seq, int diff)  {  	/* spinlock is to keep updating cp->flags atomic */ -	spin_lock(&cp->lock); +	spin_lock_bh(&cp->lock);  	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {  		vseq->previous_delta = vseq->delta;  		vseq->delta += diff;  		vseq->init_seq = seq;  		cp->flags |= flag;  	} -	spin_unlock(&cp->lock); +	spin_unlock_bh(&cp->lock);  }  static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, @@ -481,11 +517,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)   *	/proc/net/ip_vs_app entry function   */ -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)  {  	struct ip_vs_app *app, *inc; -	list_for_each_entry(app, &ip_vs_app_list, a_list) { +	list_for_each_entry(app, &ipvs->app_list, a_list) {  		list_for_each_entry(inc, &app->incs_list, a_list) {  			if (pos-- == 0)  				return inc; @@ -497,19 +533,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)  static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)  { +	struct net *net = seq_file_net(seq); +	struct netns_ipvs *ipvs = net_ipvs(net); +  	mutex_lock(&__ip_vs_app_mutex); -	return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; +	return *pos ? 
ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;  }  static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)  {  	struct ip_vs_app *inc, *app;  	struct list_head *e; +	struct net *net = seq_file_net(seq); +	struct netns_ipvs *ipvs = net_ipvs(net);  	++*pos;  	if (v == SEQ_START_TOKEN) -		return ip_vs_app_idx(0); +		return ip_vs_app_idx(ipvs, 0);  	inc = v;  	app = inc->app; @@ -518,7 +559,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)  		return list_entry(e, struct ip_vs_app, a_list);  	/* go on to next application */ -	for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { +	for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {  		app = list_entry(e, struct ip_vs_app, a_list);  		list_for_each_entry(inc, &app->incs_list, a_list) {  			return inc; @@ -557,7 +598,8 @@ static const struct seq_operations ip_vs_app_seq_ops = {  static int ip_vs_app_open(struct inode *inode, struct file *file)  { -	return seq_open(file, &ip_vs_app_seq_ops); +	return seq_open_net(inode, file, &ip_vs_app_seq_ops, +			    sizeof(struct seq_net_private));  }  static const struct file_operations ip_vs_app_fops = { @@ -565,19 +607,21 @@ static const struct file_operations ip_vs_app_fops = {  	.open	 = ip_vs_app_open,  	.read	 = seq_read,  	.llseek  = seq_lseek, -	.release = seq_release, +	.release = seq_release_net,  };  #endif -int __init ip_vs_app_init(void) +int __net_init ip_vs_app_net_init(struct net *net)  { -	/* we will replace it with proc_net_ipvs_create() soon */ -	proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); +	struct netns_ipvs *ipvs = net_ipvs(net); + +	INIT_LIST_HEAD(&ipvs->app_list); +	proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);  	return 0;  } - -void ip_vs_app_cleanup(void) +void __net_exit ip_vs_app_net_cleanup(struct net *net)  { -	proc_net_remove(&init_net, "ip_vs_app"); +	unregister_ip_vs_app(net, NULL /* all */); +	remove_proc_entry("ip_vs_app", net->proc_net);  } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index e9adecdc8ca..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -48,104 +48,71 @@  /*   * Connection hash size. Default is what was selected at compile time.  
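 *
 * The value is a power-of-two exponent. A worked example of ours,
 * not part of this patch: conn_tab_bits=12 gives
 *
 *	ip_vs_conn_tab_size = 1 << 12 = 4096 buckets
 *	ip_vs_conn_tab_mask = 4096 - 1 = 4095
 *
 * which lets ip_vs_conn_hashkey() reduce the jhash value with a
 * cheap "& ip_vs_conn_tab_mask" instead of a modulo.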
*/ -int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;  module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);  MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");  /* size and mask values */ -int ip_vs_conn_tab_size; -int ip_vs_conn_tab_mask; +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly;  /*   *  Connection hash table: for input and output packets lookups of IPVS   */ -static struct list_head *ip_vs_conn_tab; +static struct hlist_head *ip_vs_conn_tab __read_mostly;  /*  SLAB cache for IPVS connections */  static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/*  counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); -  /*  counter for no client port connections */  static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);  /* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; +static unsigned int ip_vs_conn_rnd __read_mostly;  /*   *  Fine locking granularity for big connection hash table   */ -#define CT_LOCKARRAY_BITS  4 +#define CT_LOCKARRAY_BITS  5  #define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)  #define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)  struct ip_vs_aligned_lock  { -	rwlock_t	l; +	spinlock_t	l;  } __attribute__((__aligned__(SMP_CACHE_BYTES)));  /* lock array for conn table */  static struct ip_vs_aligned_lock  __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned key) -{ -	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock(unsigned key) -{ -	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned key) -{ -	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned key) -{ -	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned key) -{ -	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned key) -{ -	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock_bh(unsigned key) +static inline void ct_write_lock_bh(unsigned int key)  { -	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +	spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);  } -static inline void ct_write_unlock_bh(unsigned key) +static inline void ct_write_unlock_bh(unsigned int key)  { -	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +	spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);  }  /*   *	Returns hash value for IPVS connection entry   */ -static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,  				       const union nf_inet_addr *addr,  				       __be16 port)  {  #ifdef CONFIG_IP_VS_IPV6  	if (af == AF_INET6) -		return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), -				    (__force u32)port, proto, ip_vs_conn_rnd) -			& ip_vs_conn_tab_mask; +		return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), +				    (__force u32)port, proto, ip_vs_conn_rnd) ^ +			((size_t)net>>8)) & ip_vs_conn_tab_mask;  #endif -	return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, -			    ip_vs_conn_rnd) -		& ip_vs_conn_tab_mask; +	return (jhash_3words((__force 
u32)addr->ip, (__force u32)port, proto, +			    ip_vs_conn_rnd) ^ +		((size_t)net>>8)) & ip_vs_conn_tab_mask;  }  static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, @@ -166,18 +133,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,  		port = p->vport;  	} -	return ip_vs_conn_hashkey(p->af, p->protocol, addr, port); +	return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);  }  static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)  {  	struct ip_vs_conn_param p; -	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, -			      NULL, 0, &p); +	ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, +			      &cp->caddr, cp->cport, NULL, 0, &p); -	if (cp->dest && cp->dest->svc->pe) { -		p.pe = cp->dest->svc->pe; +	if (cp->pe) { +		p.pe = cp->pe;  		p.pe_data = cp->pe_data;  		p.pe_data_len = cp->pe_data_len;  	} @@ -186,12 +153,12 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)  }  /* - *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + *	Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.   *	returns bool success.   */  static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)  { -	unsigned hash; +	unsigned int hash;  	int ret;  	if (cp->flags & IP_VS_CONN_F_ONE_PACKET) @@ -200,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)  	/* Hash by protocol, client address and port */  	hash = ip_vs_conn_hashkey_conn(cp); -	ct_write_lock(hash); +	ct_write_lock_bh(hash);  	spin_lock(&cp->lock);  	if (!(cp->flags & IP_VS_CONN_F_HASHED)) { -		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);  		cp->flags |= IP_VS_CONN_F_HASHED;  		atomic_inc(&cp->refcnt); +		hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);  		ret = 1;  	} else {  		pr_err("%s(): request for already hashed, called from %pF\n", @@ -215,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)  	}  	spin_unlock(&cp->lock); -	ct_write_unlock(hash); +	ct_write_unlock_bh(hash);  	return ret;  } @@ -223,21 +190,21 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)  /*   *	UNhashes ip_vs_conn from ip_vs_conn_tab. - *	returns bool success. + *	returns bool success. Caller should hold conn reference.   */  static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)  { -	unsigned hash; +	unsigned int hash;  	int ret;  	/* unhash it and decrease its reference counter */  	hash = ip_vs_conn_hashkey_conn(cp); -	ct_write_lock(hash); +	ct_write_lock_bh(hash);  	spin_lock(&cp->lock);  	if (cp->flags & IP_VS_CONN_F_HASHED) { -		list_del(&cp->c_list); +		hlist_del_rcu(&cp->c_list);  		cp->flags &= ~IP_VS_CONN_F_HASHED;  		atomic_dec(&cp->refcnt);  		ret = 1; @@ -245,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)  		ret = 0;  	spin_unlock(&cp->lock); -	ct_write_unlock(hash); +	ct_write_unlock_bh(hash); + +	return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ +	unsigned int hash; +	bool ret; + +	hash = ip_vs_conn_hashkey_conn(cp); + +	ct_write_lock_bh(hash); +	spin_lock(&cp->lock); + +	if (cp->flags & IP_VS_CONN_F_HASHED) { +		ret = false; +		/* Decrease refcnt and unlink conn only if we are last user */ +		if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { +			hlist_del_rcu(&cp->c_list); +			cp->flags &= ~IP_VS_CONN_F_HASHED; +			ret = true; +		} +	} else +		ret = atomic_read(&cp->refcnt) ? 
false : true; + +	spin_unlock(&cp->lock); +	ct_write_unlock_bh(hash);  	return ret;  } @@ -260,28 +257,30 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)  static inline struct ip_vs_conn *  __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)  { -	unsigned hash; +	unsigned int hash;  	struct ip_vs_conn *cp;  	hash = ip_vs_conn_hashkey_param(p, false); -	ct_read_lock(hash); +	rcu_read_lock(); -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { -		if (cp->af == p->af && +	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { +		if (p->cport == cp->cport && p->vport == cp->vport && +		    cp->af == p->af &&  		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&  		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && -		    p->cport == cp->cport && p->vport == cp->vport &&  		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && -		    p->protocol == cp->protocol) { +		    p->protocol == cp->protocol && +		    ip_vs_conn_net_eq(cp, p->net)) { +			if (!__ip_vs_conn_get(cp)) +				continue;  			/* HIT */ -			atomic_inc(&cp->refcnt); -			ct_read_unlock(hash); +			rcu_read_unlock();  			return cp;  		}  	} -	ct_read_unlock(hash); +	rcu_read_unlock();  	return NULL;  } @@ -309,33 +308,31 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)  static int  ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,  			    const struct ip_vs_iphdr *iph, -			    unsigned int proto_off, int inverse, -			    struct ip_vs_conn_param *p) +			    int inverse, struct ip_vs_conn_param *p)  {  	__be16 _ports[2], *pptr; +	struct net *net = skb_net(skb); -	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); +	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);  	if (pptr == NULL)  		return 1;  	if (likely(!inverse)) -		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0], -				      &iph->daddr, pptr[1], p); +		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, +				      pptr[0], &iph->daddr, pptr[1], p);  	else -		ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1], -				      &iph->saddr, pptr[0], p); +		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, +				      pptr[1], &iph->saddr, pptr[0], p);  	return 0;  }  struct ip_vs_conn *  ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, -			struct ip_vs_protocol *pp, -			const struct ip_vs_iphdr *iph, -			unsigned int proto_off, int inverse) +			const struct ip_vs_iphdr *iph, int inverse)  {  	struct ip_vs_conn_param p; -	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) +	if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))  		return NULL;  	return ip_vs_conn_in_get(&p); @@ -345,17 +342,21 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);  /* Get reference to connection template */  struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)  { -	unsigned hash; +	unsigned int hash;  	struct ip_vs_conn *cp;  	hash = ip_vs_conn_hashkey_param(p, false); -	ct_read_lock(hash); +	rcu_read_lock(); -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { -		if (p->pe_data && p->pe->ct_match) { -			if (p->pe->ct_match(p, cp)) -				goto out; +	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { +		if (unlikely(p->pe_data && p->pe->ct_match)) { +			if (!ip_vs_conn_net_eq(cp, p->net)) +				continue; +			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { +				if (__ip_vs_conn_get(cp)) +					goto out; +			}  			continue;  		} @@ -365,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct 
ip_vs_conn_param *p)  		     * p->vaddr is a fwmark */  		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :  				     p->af, p->vaddr, &cp->vaddr) && -		    p->cport == cp->cport && p->vport == cp->vport && +		    p->vport == cp->vport && p->cport == cp->cport &&  		    cp->flags & IP_VS_CONN_F_TEMPLATE && -		    p->protocol == cp->protocol) -			goto out; +		    p->protocol == cp->protocol && +		    ip_vs_conn_net_eq(cp, p->net)) { +			if (__ip_vs_conn_get(cp)) +				goto out; +		}  	}  	cp = NULL;    out: -	if (cp) -		atomic_inc(&cp->refcnt); -	ct_read_unlock(hash); +	rcu_read_unlock();  	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",  		      ip_vs_proto_name(p->protocol), @@ -392,7 +394,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)   *	p->vaddr, p->vport: pkt dest address (foreign host) */  struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)  { -	unsigned hash; +	unsigned int hash;  	struct ip_vs_conn *cp, *ret=NULL;  	/* @@ -400,22 +402,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)  	 */  	hash = ip_vs_conn_hashkey_param(p, true); -	ct_read_lock(hash); +	rcu_read_lock(); -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { -		if (cp->af == p->af && +	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { +		if (p->vport == cp->cport && p->cport == cp->dport && +		    cp->af == p->af &&  		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&  		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && -		    p->vport == cp->cport && p->cport == cp->dport && -		    p->protocol == cp->protocol) { +		    p->protocol == cp->protocol && +		    ip_vs_conn_net_eq(cp, p->net)) { +			if (!__ip_vs_conn_get(cp)) +				continue;  			/* HIT */ -			atomic_inc(&cp->refcnt);  			ret = cp;  			break;  		}  	} -	ct_read_unlock(hash); +	rcu_read_unlock();  	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",  		      ip_vs_proto_name(p->protocol), @@ -428,13 +432,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)  struct ip_vs_conn *  ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, -			 struct ip_vs_protocol *pp, -			 const struct ip_vs_iphdr *iph, -			 unsigned int proto_off, int inverse) +			 const struct ip_vs_iphdr *iph, int inverse)  {  	struct ip_vs_conn_param p; -	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) +	if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))  		return NULL;  	return ip_vs_conn_out_get(&p); @@ -460,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)  void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)  {  	if (ip_vs_conn_unhash(cp)) { -		spin_lock(&cp->lock); +		spin_lock_bh(&cp->lock);  		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {  			atomic_dec(&ip_vs_conn_no_cport_cnt);  			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;  			cp->cport = cport;  		} -		spin_unlock(&cp->lock); +		spin_unlock_bh(&cp->lock);  		/* hash on new dport */  		ip_vs_conn_hash(cp); @@ -545,28 +547,31 @@ static inline void  ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)  {  	unsigned int conn_flags; +	__u32 flags;  	/* if dest is NULL, then return directly */  	if (!dest)  		return;  	/* Increase the refcnt counter of the dest */ -	atomic_inc(&dest->refcnt); +	ip_vs_dest_hold(dest);  	conn_flags = atomic_read(&dest->conn_flags);  	if (cp->protocol != IPPROTO_UDP)  		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; +	flags = cp->flags;  	/* Bind with the destination and its corresponding transmitter */ -	if 
(cp->flags & IP_VS_CONN_F_SYNC) { +	if (flags & IP_VS_CONN_F_SYNC) {  		/* if the connection is not template and is created  		 * by sync, preserve the activity flag.  		 */ -		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) +		if (!(flags & IP_VS_CONN_F_TEMPLATE))  			conn_flags &= ~IP_VS_CONN_F_INACTIVE;  		/* connections inherit forwarding method from dest */ -		cp->flags &= ~IP_VS_CONN_F_FWD_MASK; +		flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);  	} -	cp->flags |= conn_flags; +	flags |= conn_flags; +	cp->flags = flags;  	cp->dest = dest;  	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " @@ -581,18 +586,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)  		      atomic_read(&dest->refcnt));  	/* Update the connection counters */ -	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { -		/* It is a normal connection, so increase the inactive -		   connection counter because it is in TCP SYNRECV -		   state (inactive) or other protocol inacive state */ -		if ((cp->flags & IP_VS_CONN_F_SYNC) && -		    (!(cp->flags & IP_VS_CONN_F_INACTIVE))) +	if (!(flags & IP_VS_CONN_F_TEMPLATE)) { +		/* It is a normal connection, so modify the counters +		 * according to the flags, later the protocol can +		 * update them on state change +		 */ +		if (!(flags & IP_VS_CONN_F_INACTIVE))  			atomic_inc(&dest->activeconns);  		else  			atomic_inc(&dest->inactconns);  	} else {  		/* It is a persistent connection/template, so increase -		   the peristent connection counter */ +		   the persistent connection counter */  		atomic_inc(&dest->persistconns);  	} @@ -606,18 +611,46 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)   * Check if there is a destination for the connection, if so   * bind the connection to the destination.   */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +void ip_vs_try_bind_dest(struct ip_vs_conn *cp)  {  	struct ip_vs_dest *dest; -	if ((cp) && (!cp->dest)) { -		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, -				       &cp->vaddr, cp->vport, -				       cp->protocol); +	rcu_read_lock(); +	dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, +			       cp->dport, &cp->vaddr, cp->vport, +			       cp->protocol, cp->fwmark, cp->flags); +	if (dest) { +		struct ip_vs_proto_data *pd; + +		spin_lock_bh(&cp->lock); +		if (cp->dest) { +			spin_unlock_bh(&cp->lock); +			rcu_read_unlock(); +			return; +		} + +		/* Applications work depending on the forwarding method +		 * but better to reassign them always when binding dest */ +		if (cp->app) +			ip_vs_unbind_app(cp); +  		ip_vs_bind_dest(cp, dest); -		return dest; -	} else -		return NULL; +		spin_unlock_bh(&cp->lock); + +		/* Update its packet transmitter */ +		cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 +		if (cp->af == AF_INET6) +			ip_vs_bind_xmit_v6(cp); +		else +#endif +			ip_vs_bind_xmit(cp); + +		pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); +		if (pd && atomic_read(&pd->appcnt)) +			ip_vs_bind_app(cp, pd->pp); +	} +	rcu_read_unlock();  } @@ -654,7 +687,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)  		}  	} else {  		/* It is a persistent connection/template, so decrease -		   the peristent connection counter */ +		   the persistent connection counter */  		atomic_dec(&dest->persistconns);  	} @@ -669,14 +702,19 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)  			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;  	} -	/* -	 * Simply decrease the refcnt of the dest, because the -	 * dest will be either in service's destination 
list -	 * or in the trash. -	 */ -	atomic_dec(&dest->refcnt); +	ip_vs_dest_put(dest);  } +static int expire_quiescent_template(struct netns_ipvs *ipvs, +				     struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL +	return ipvs->sysctl_expire_quiescent_template && +		(atomic_read(&dest->weight) == 0); +#else +	return 0; +#endif +}  /*   *	Checking if the destination of a connection template is available. @@ -686,14 +724,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)  int ip_vs_check_template(struct ip_vs_conn *ct)  {  	struct ip_vs_dest *dest = ct->dest; +	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));  	/*  	 * Checking the dest server status.  	 */  	if ((dest == NULL) ||  	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) || -	    (sysctl_ip_vs_expire_quiescent_template && -	     (atomic_read(&dest->weight) == 0))) { +	    expire_quiescent_template(ipvs, dest)) {  		IP_VS_DBG_BUF(9, "check_template: dest not available for "  			      "protocol %s s:%s:%d v:%s:%d "  			      "-> d:%s:%d\n", @@ -721,22 +759,27 @@ int ip_vs_check_template(struct ip_vs_conn *ct)  		 * Simply decrease the refcnt of the template,  		 * don't restart its timer.  		 */ -		atomic_dec(&ct->refcnt); +		__ip_vs_conn_put(ct);  		return 0;  	}  	return 1;  } -static void ip_vs_conn_expire(unsigned long data) +static void ip_vs_conn_rcu_free(struct rcu_head *head)  { -	struct ip_vs_conn *cp = (struct ip_vs_conn *)data; +	struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, +					     rcu_head); -	cp->timeout = 60*HZ; +	ip_vs_pe_put(cp->pe); +	kfree(cp->pe_data); +	kmem_cache_free(ip_vs_conn_cachep, cp); +} -	/* -	 *	hey, I'm using it -	 */ -	atomic_inc(&cp->refcnt); +static void ip_vs_conn_expire(unsigned long data) +{ +	struct ip_vs_conn *cp = (struct ip_vs_conn *)data; +	struct net *net = ip_vs_conn_net(cp); +	struct netns_ipvs *ipvs = net_ipvs(net);  	/*  	 *	do I control anybody? @@ -744,55 +787,60 @@ static void ip_vs_conn_expire(unsigned long data)  	if (atomic_read(&cp->n_control))  		goto expire_later; -	/* -	 *	unhash it if it is hashed in the conn table -	 */ -	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) -		goto expire_later; - -	/* -	 *	refcnt==1 implies I'm the only one referrer -	 */ -	if (likely(atomic_read(&cp->refcnt) == 1)) { +	/* Unlink conn if not referenced anymore */ +	if (likely(ip_vs_conn_unlink(cp))) {  		/* delete the timer if it is activated by other users */ -		if (timer_pending(&cp->timer)) -			del_timer(&cp->timer); +		del_timer(&cp->timer);  		/* does anybody control me? */  		if (cp->control)  			ip_vs_control_del(cp); -		if (cp->flags & IP_VS_CONN_F_NFCT) -			ip_vs_conn_drop_conntrack(cp); +		if (cp->flags & IP_VS_CONN_F_NFCT) { +			/* Do not access conntracks during subsys cleanup +			 * because nf_conntrack_find_get can not be used after +			 * conntrack cleanup for the net. 
+			 */ +			smp_rmb(); +			if (ipvs->enable) +				ip_vs_conn_drop_conntrack(cp); +		} -		kfree(cp->pe_data);  		if (unlikely(cp->app != NULL))  			ip_vs_unbind_app(cp);  		ip_vs_unbind_dest(cp);  		if (cp->flags & IP_VS_CONN_F_NO_CPORT)  			atomic_dec(&ip_vs_conn_no_cport_cnt); -		atomic_dec(&ip_vs_conn_count); - -		kmem_cache_free(ip_vs_conn_cachep, cp); +		call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); +		atomic_dec(&ipvs->conn_count);  		return;  	} -	/* hash it back to the table */ -	ip_vs_conn_hash(cp); -    expire_later: -	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", -		  atomic_read(&cp->refcnt)-1, +	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", +		  atomic_read(&cp->refcnt),  		  atomic_read(&cp->n_control)); +	atomic_inc(&cp->refcnt); +	cp->timeout = 60*HZ; + +	if (ipvs->sync_state & IP_VS_STATE_MASTER) +		ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); +  	ip_vs_conn_put(cp);  } - +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + */  void ip_vs_conn_expire_now(struct ip_vs_conn *cp)  { -	if (del_timer(&cp->timer)) -		mod_timer(&cp->timer, jiffies); +	/* Using mod_timer_pending will ensure the timer is not +	 * modified after the final del_timer in ip_vs_conn_expire. +	 */ +	if (timer_pending(&cp->timer) && +	    time_after(cp->timer.expires, jiffies)) +		mod_timer_pending(&cp->timer, jiffies);  } @@ -801,34 +849,44 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)   */  struct ip_vs_conn *  ip_vs_conn_new(const struct ip_vs_conn_param *p, -	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags, -	       struct ip_vs_dest *dest) +	       const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, +	       struct ip_vs_dest *dest, __u32 fwmark)  {  	struct ip_vs_conn *cp; -	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); +	struct netns_ipvs *ipvs = net_ipvs(p->net); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, +							   p->protocol); -	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); +	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);  	if (cp == NULL) {  		IP_VS_ERR_RL("%s(): no memory\n", __func__);  		return NULL;  	} -	INIT_LIST_HEAD(&cp->c_list); +	INIT_HLIST_NODE(&cp->c_list);  	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); +	ip_vs_conn_net_set(cp, p->net);  	cp->af		   = p->af;  	cp->protocol	   = p->protocol; -	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); +	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);  	cp->cport	   = p->cport; -	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); +	/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ +	ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, +		       &cp->vaddr, p->vaddr);  	cp->vport	   = p->vport; -	/* proto should only be IPPROTO_IP if d_addr is a fwmark */ -	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, -			&cp->daddr, daddr); +	ip_vs_addr_set(p->af, &cp->daddr, daddr);  	cp->dport          = dport;  	cp->flags	   = flags; -	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) { +	cp->fwmark         = fwmark; +	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { +		ip_vs_pe_get(p->pe); +		cp->pe = p->pe;  		cp->pe_data = p->pe_data;  		cp->pe_data_len = p->pe_data_len; +	} else { +		cp->pe = NULL; +		cp->pe_data = NULL; +		cp->pe_data_len = 0;  	}  	spin_lock_init(&cp->lock); @@ -839,19 +897,30 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,  	 */  	atomic_set(&cp->refcnt, 1); +	cp->control = NULL;  	atomic_set(&cp->n_control, 0);  	atomic_set(&cp->in_pkts, 0); -	atomic_inc(&ip_vs_conn_count); +	cp->packet_xmit = NULL; +	cp->app = NULL; +	cp->app_data = NULL; +	/* reset struct ip_vs_seq */ +	cp->in_seq.delta = 0; +	cp->out_seq.delta = 0; + +	atomic_inc(&ipvs->conn_count);  	if (flags & IP_VS_CONN_F_NO_CPORT)  		atomic_inc(&ip_vs_conn_no_cport_cnt);  	/* Bind the connection with a destination server */ +	cp->dest = NULL;  	ip_vs_bind_dest(cp, dest);  	/* Set its state and timeout */  	cp->state = 0; +	cp->old_state = 0;  	cp->timeout = 3*HZ; +	cp->sync_endtime = jiffies & ~3UL;  	/* Bind its packet transmitter */  #ifdef CONFIG_IP_VS_IPV6 @@ -861,8 +930,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,  #endif  		ip_vs_bind_xmit(cp); -	if (unlikely(pp && atomic_read(&pp->appcnt))) -		ip_vs_bind_app(cp, pp); +	if (unlikely(pd && atomic_read(&pd->appcnt))) +		ip_vs_bind_app(cp, pd->pp);  	/*  	 * Allow conntrack to be preserved. By default, conntrack @@ -871,7 +940,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,  	 * IP_VS_CONN_F_ONE_PACKET too.  	 */ -	if (ip_vs_conntrack_enabled()) +	if (ip_vs_conntrack_enabled(ipvs))  		cp->flags |= IP_VS_CONN_F_NFCT;  	/* Hash it in the ip_vs_conn_tab finally */ @@ -884,36 +953,49 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,   *	/proc/net/ip_vs_conn entries   */  #ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { +	struct seq_net_private	p; +	struct hlist_head	*l; +};  static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)  {  	int idx;  	struct ip_vs_conn *cp; +	struct ip_vs_iter_state *iter = seq->private;  	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { -		ct_read_lock_bh(idx); -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { +		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { +			/* __ip_vs_conn_get() is not needed by +			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show +			 */  			if (pos-- == 0) { -				seq->private = &ip_vs_conn_tab[idx]; -			return cp; +				iter->l = &ip_vs_conn_tab[idx]; +				return cp;  			}  		} -		ct_read_unlock_bh(idx); +		cond_resched_rcu();  	}  	return NULL;  }  static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +	__acquires(RCU)  { -	seq->private = NULL; +	struct ip_vs_iter_state *iter = seq->private; + +	iter->l = NULL; +	rcu_read_lock();  	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;  }  static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)  {  	struct ip_vs_conn *cp = v; -	struct list_head *e, *l = seq->private; +	struct ip_vs_iter_state *iter = seq->private; +	struct hlist_node *e; +	struct hlist_head *l = iter->l;  	int idx;  	++*pos; @@ -921,30 +1003,26 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)  		return ip_vs_conn_array(seq, 0);  	/* more on same hash chain? 
*/ -	if ((e = cp->c_list.next) != l) -		return list_entry(e, struct ip_vs_conn, c_list); +	e = rcu_dereference(hlist_next_rcu(&cp->c_list)); +	if (e) +		return hlist_entry(e, struct ip_vs_conn, c_list);  	idx = l - ip_vs_conn_tab; -	ct_read_unlock_bh(idx); -  	while (++idx < ip_vs_conn_tab_size) { -		ct_read_lock_bh(idx); -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { -			seq->private = &ip_vs_conn_tab[idx]; +		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { +			iter->l = &ip_vs_conn_tab[idx];  			return cp;  		} -		ct_read_unlock_bh(idx); +		cond_resched_rcu();  	} -	seq->private = NULL; +	iter->l = NULL;  	return NULL;  }  static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +	__releases(RCU)  { -	struct list_head *l = seq->private; - -	if (l) -		ct_read_unlock_bh(l - ip_vs_conn_tab); +	rcu_read_unlock();  }  static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) @@ -955,18 +1033,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)     "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");  	else {  		const struct ip_vs_conn *cp = v; +		struct net *net = seq_file_net(seq);  		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];  		size_t len = 0; -		if (cp->dest && cp->pe_data && -		    cp->dest->svc->pe->show_pe_data) { +		if (!ip_vs_conn_net_eq(cp, net)) +			return 0; +		if (cp->pe_data) {  			pe_data[0] = ' '; -			len = strlen(cp->dest->svc->pe->name); -			memcpy(pe_data + 1, cp->dest->svc->pe->name, len); +			len = strlen(cp->pe->name); +			memcpy(pe_data + 1, cp->pe->name, len);  			pe_data[len + 1] = ' ';  			len += 2; -			len += cp->dest->svc->pe->show_pe_data(cp, -							       pe_data + len); +			len += cp->pe->show_pe_data(cp, pe_data + len);  		}  		pe_data[len] = '\0'; @@ -1004,7 +1083,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {  static int ip_vs_conn_open(struct inode *inode, struct file *file)  { -	return seq_open(file, &ip_vs_conn_seq_ops); +	return seq_open_net(inode, file, &ip_vs_conn_seq_ops, +			    sizeof(struct ip_vs_iter_state));  }  static const struct file_operations ip_vs_conn_fops = { @@ -1012,10 +1092,10 @@ static const struct file_operations ip_vs_conn_fops = {  	.open    = ip_vs_conn_open,  	.read    = seq_read,  	.llseek  = seq_lseek, -	.release = seq_release, +	.release = seq_release_net,  }; -static const char *ip_vs_origin_name(unsigned flags) +static const char *ip_vs_origin_name(unsigned int flags)  {  	if (flags & IP_VS_CONN_F_SYNC)  		return "SYNC"; @@ -1031,6 +1111,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)     "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");  	else {  		const struct ip_vs_conn *cp = v; +		struct net *net = seq_file_net(seq); + +		if (!ip_vs_conn_net_eq(cp, net)) +			return 0;  #ifdef CONFIG_IP_VS_IPV6  		if (cp->af == AF_INET6) @@ -1067,7 +1151,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {  static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)  { -	return seq_open(file, &ip_vs_conn_sync_seq_ops); +	return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, +			    sizeof(struct ip_vs_iter_state));  }  static const struct file_operations ip_vs_conn_sync_fops = { @@ -1075,7 +1160,7 @@ static const struct file_operations ip_vs_conn_sync_fops = {  	.open    = ip_vs_conn_sync_open,  	.read    = seq_read,  	.llseek  = seq_lseek, -	.release = seq_release, +	.release = seq_release_net,  };  #endif @@ -1113,27 +1198,24 @@ static 
inline int todrop_entry(struct ip_vs_conn *cp)  }  /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) +void ip_vs_random_dropentry(struct net *net)  {  	int idx; -	struct ip_vs_conn *cp; +	struct ip_vs_conn *cp, *cp_c; +	rcu_read_lock();  	/*  	 * Randomly scan 1/32 of the whole table every second  	 */  	for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { -		unsigned hash = net_random() & ip_vs_conn_tab_mask; +		unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; -		/* -		 *  Lock is actually needed in this loop. -		 */ -		ct_write_lock_bh(hash); - -		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { +		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {  			if (cp->flags & IP_VS_CONN_F_TEMPLATE)  				/* connection template */  				continue; - +			if (!ip_vs_conn_net_eq(cp, net)) +				continue;  			if (cp->protocol == IPPROTO_TCP) {  				switch(cp->state) {  				case IP_VS_TCP_S_SYN_RECV: @@ -1148,6 +1230,18 @@ void ip_vs_random_dropentry(void)  				default:  					continue;  				} +			} else if (cp->protocol == IPPROTO_SCTP) { +				switch (cp->state) { +				case IP_VS_SCTP_S_INIT1: +				case IP_VS_SCTP_S_INIT: +					break; +				case IP_VS_SCTP_S_ESTABLISHED: +					if (todrop_entry(cp)) +						break; +					continue; +				default: +					continue; +				}  			} else {  				if (!todrop_entry(cp))  					continue; @@ -1155,51 +1249,78 @@ void ip_vs_random_dropentry(void)  			IP_VS_DBG(4, "del connection\n");  			ip_vs_conn_expire_now(cp); -			if (cp->control) { +			cp_c = cp->control; +			/* cp->control is valid only with reference to cp */ +			if (cp_c && __ip_vs_conn_get(cp)) {  				IP_VS_DBG(4, "del conn template\n"); -				ip_vs_conn_expire_now(cp->control); +				ip_vs_conn_expire_now(cp_c); +				__ip_vs_conn_put(cp);  			}  		} -		ct_write_unlock_bh(hash); +		cond_resched_rcu();  	} +	rcu_read_unlock();  }  /*   *      Flush all the connection entries in the ip_vs_conn_tab   */ -static void ip_vs_conn_flush(void) +static void ip_vs_conn_flush(struct net *net)  {  	int idx; -	struct ip_vs_conn *cp; +	struct ip_vs_conn *cp, *cp_c; +	struct netns_ipvs *ipvs = net_ipvs(net); -  flush_again: +flush_again: +	rcu_read_lock();  	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { -		/* -		 *  Lock is actually needed in this loop. 
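ip_vs_random_dropentry() (above) and ip_vs_conn_flush() (below) now scan the table under rcu_read_lock() instead of write-locking every bucket; entries are pushed toward expiry with ip_vs_conn_expire_now() rather than unlinked in place, and cond_resched_rcu() keeps the long scan preemptible by briefly leaving the read-side critical section. A sketch of the traversal pattern, assuming a hypothetical consider_expire() helper:

	rcu_read_lock();
	for (idx = 0; idx < tab_size; idx++) {
		hlist_for_each_entry_rcu(cp, &tab[idx], c_list)
			consider_expire(cp);
		cond_resched_rcu();	/* unlock RCU, maybe reschedule, relock */
	}
	rcu_read_unlock();

Note also that cp->control is acted on only after __ip_vs_conn_get(cp) succeeds, since the control pointer is valid only while cp itself is referenced.
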
-		 */ -		ct_write_lock_bh(idx); - -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { +		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { +			if (!ip_vs_conn_net_eq(cp, net)) +				continue;  			IP_VS_DBG(4, "del connection\n");  			ip_vs_conn_expire_now(cp); -			if (cp->control) { +			cp_c = cp->control; +			/* cp->control is valid only with reference to cp */ +			if (cp_c && __ip_vs_conn_get(cp)) {  				IP_VS_DBG(4, "del conn template\n"); -				ip_vs_conn_expire_now(cp->control); +				ip_vs_conn_expire_now(cp_c); +				__ip_vs_conn_put(cp);  			}  		} -		ct_write_unlock_bh(idx); +		cond_resched_rcu();  	} +	rcu_read_unlock();  	/* the counter may be not NULL, because maybe some conn entries  	   are run by slow timer handler or unhashed but still referred */ -	if (atomic_read(&ip_vs_conn_count) != 0) { +	if (atomic_read(&ipvs->conn_count) != 0) {  		schedule();  		goto flush_again;  	}  } +/* + * per netns init and exit + */ +int __net_init ip_vs_conn_net_init(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	atomic_set(&ipvs->conn_count, 0); + +	proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); +	proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); +	return 0; +} + +void __net_exit ip_vs_conn_net_cleanup(struct net *net) +{ +	/* flush all the connection entries first */ +	ip_vs_conn_flush(net); +	remove_proc_entry("ip_vs_conn", net->proc_net); +	remove_proc_entry("ip_vs_conn_sync", net->proc_net); +}  int __init ip_vs_conn_init(void)  { @@ -1212,8 +1333,7 @@ int __init ip_vs_conn_init(void)  	/*  	 * Allocate the connection hash table and initialize its list heads  	 */ -	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * -				 sizeof(struct list_head)); +	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));  	if (!ip_vs_conn_tab)  		return -ENOMEM; @@ -1233,32 +1353,24 @@ int __init ip_vs_conn_init(void)  	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",  		  sizeof(struct ip_vs_conn)); -	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { -		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); -	} +	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) +		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);  	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  { -		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); +		spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);  	} -	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); -	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); -  	/* calculate the random value for connection hash */  	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));  	return 0;  } -  void ip_vs_conn_cleanup(void)  { -	/* flush all the connection entries first */ -	ip_vs_conn_flush(); - +	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */ +	rcu_barrier();  	/* Release the empty cache */  	kmem_cache_destroy(ip_vs_conn_cachep); -	proc_net_remove(&init_net, "ip_vs_conn"); -	proc_net_remove(&init_net, "ip_vs_conn_sync");  	vfree(ip_vs_conn_tab);  } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b4e51e9c5a0..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -41,6 +41,7 @@  #include <net/icmp.h>                   /* for icmp_send */  #include <net/route.h>  #include <net/ip6_checksum.h> +#include <net/netns/generic.h>		/* net_generic() */  #include <linux/netfilter.h>  #include <linux/netfilter_ipv4.h> @@ -68,12 +69,15 @@ EXPORT_SYMBOL(ip_vs_conn_put);  
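A detail in ip_vs_conn_cleanup() above that is easy to miss: once connections are freed via call_rcu(), module unload must drain every queued callback before destroying the SLAB cache, and rcu_barrier() provides exactly that guarantee (synchronize_rcu() would not, as it waits only for readers, not for pending callbacks). The required ordering, as a minimal sketch:

	void example_cleanup(void)
	{
		rcu_barrier();		/* wait for queued call_rcu() callbacks */
		kmem_cache_destroy(example_cachep);	/* no free can race now */
		vfree(example_tab);
	}
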
EXPORT_SYMBOL(ip_vs_get_debug_level);  #endif +static int ip_vs_net_id __read_mostly; +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);  /* ID used in ICMP lookups */  #define icmp_id(icmph)          (((icmph)->un).echo.id)  #define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier) -const char *ip_vs_proto_name(unsigned proto) +const char *ip_vs_proto_name(unsigned int proto)  {  	static char buf[20]; @@ -93,7 +97,7 @@ const char *ip_vs_proto_name(unsigned proto)  		return "ICMPv6";  #endif  	default: -		sprintf(buf, "IP_%d", proto); +		sprintf(buf, "IP_%u", proto);  		return buf;  	}  } @@ -108,21 +112,32 @@ static inline void  ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)  {  	struct ip_vs_dest *dest = cp->dest; +	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); +  	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { -		spin_lock(&dest->stats.lock); -		dest->stats.ustats.inpkts++; -		dest->stats.ustats.inbytes += skb->len; -		spin_unlock(&dest->stats.lock); - -		spin_lock(&dest->svc->stats.lock); -		dest->svc->stats.ustats.inpkts++; -		dest->svc->stats.ustats.inbytes += skb->len; -		spin_unlock(&dest->svc->stats.lock); - -		spin_lock(&ip_vs_stats.lock); -		ip_vs_stats.ustats.inpkts++; -		ip_vs_stats.ustats.inbytes += skb->len; -		spin_unlock(&ip_vs_stats.lock); +		struct ip_vs_cpu_stats *s; +		struct ip_vs_service *svc; + +		s = this_cpu_ptr(dest->stats.cpustats); +		s->ustats.inpkts++; +		u64_stats_update_begin(&s->syncp); +		s->ustats.inbytes += skb->len; +		u64_stats_update_end(&s->syncp); + +		rcu_read_lock(); +		svc = rcu_dereference(dest->svc); +		s = this_cpu_ptr(svc->stats.cpustats); +		s->ustats.inpkts++; +		u64_stats_update_begin(&s->syncp); +		s->ustats.inbytes += skb->len; +		u64_stats_update_end(&s->syncp); +		rcu_read_unlock(); + +		s = this_cpu_ptr(ipvs->tot_stats.cpustats); +		s->ustats.inpkts++; +		u64_stats_update_begin(&s->syncp); +		s->ustats.inbytes += skb->len; +		u64_stats_update_end(&s->syncp);  	}  } @@ -131,21 +146,32 @@ static inline void  ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)  {  	struct ip_vs_dest *dest = cp->dest; +	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); +  	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { -		spin_lock(&dest->stats.lock); -		dest->stats.ustats.outpkts++; -		dest->stats.ustats.outbytes += skb->len; -		spin_unlock(&dest->stats.lock); - -		spin_lock(&dest->svc->stats.lock); -		dest->svc->stats.ustats.outpkts++; -		dest->svc->stats.ustats.outbytes += skb->len; -		spin_unlock(&dest->svc->stats.lock); - -		spin_lock(&ip_vs_stats.lock); -		ip_vs_stats.ustats.outpkts++; -		ip_vs_stats.ustats.outbytes += skb->len; -		spin_unlock(&ip_vs_stats.lock); +		struct ip_vs_cpu_stats *s; +		struct ip_vs_service *svc; + +		s = this_cpu_ptr(dest->stats.cpustats); +		s->ustats.outpkts++; +		u64_stats_update_begin(&s->syncp); +		s->ustats.outbytes += skb->len; +		u64_stats_update_end(&s->syncp); + +		rcu_read_lock(); +		svc = rcu_dereference(dest->svc); +		s = this_cpu_ptr(svc->stats.cpustats); +		s->ustats.outpkts++; +		u64_stats_update_begin(&s->syncp); +		s->ustats.outbytes += skb->len; +		u64_stats_update_end(&s->syncp); +		rcu_read_unlock(); + +		s = this_cpu_ptr(ipvs->tot_stats.cpustats); +		s->ustats.outpkts++; +		u64_stats_update_begin(&s->syncp); +		s->ustats.outbytes += skb->len; +		u64_stats_update_end(&s->syncp);  	}  } @@ -153,41 +179,43 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)  static inline void  ip_vs_conn_stats(struct 
ip_vs_conn *cp, struct ip_vs_service *svc)  { -	spin_lock(&cp->dest->stats.lock); -	cp->dest->stats.ustats.conns++; -	spin_unlock(&cp->dest->stats.lock); +	struct netns_ipvs *ipvs = net_ipvs(svc->net); +	struct ip_vs_cpu_stats *s; -	spin_lock(&svc->stats.lock); -	svc->stats.ustats.conns++; -	spin_unlock(&svc->stats.lock); +	s = this_cpu_ptr(cp->dest->stats.cpustats); +	s->ustats.conns++; -	spin_lock(&ip_vs_stats.lock); -	ip_vs_stats.ustats.conns++; -	spin_unlock(&ip_vs_stats.lock); +	s = this_cpu_ptr(svc->stats.cpustats); +	s->ustats.conns++; + +	s = this_cpu_ptr(ipvs->tot_stats.cpustats); +	s->ustats.conns++;  } -static inline int +static inline void  ip_vs_set_state(struct ip_vs_conn *cp, int direction,  		const struct sk_buff *skb, -		struct ip_vs_protocol *pp) +		struct ip_vs_proto_data *pd)  { -	if (unlikely(!pp->state_transition)) -		return 0; -	return pp->state_transition(cp, direction, skb, pp); +	if (likely(pd->pp->state_transition)) +		pd->pp->state_transition(cp, direction, skb, pd);  } -static inline void +static inline int  ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,  			      struct sk_buff *skb, int protocol,  			      const union nf_inet_addr *caddr, __be16 cport,  			      const union nf_inet_addr *vaddr, __be16 vport,  			      struct ip_vs_conn_param *p)  { -	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); -	p->pe = svc->pe; +	ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, +			      vport, p); +	p->pe = rcu_dereference(svc->pe);  	if (p->pe && p->pe->fill_param) -		p->pe->fill_param(p, skb); +		return p->pe->fill_param(p, skb); + +	return 0;  }  /* @@ -199,33 +227,32 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,   */  static struct ip_vs_conn *  ip_vs_sched_persist(struct ip_vs_service *svc, -		    struct sk_buff *skb, -		    __be16 ports[2]) +		    struct sk_buff *skb, __be16 src_port, __be16 dst_port, +		    int *ignored, struct ip_vs_iphdr *iph)  {  	struct ip_vs_conn *cp = NULL; -	struct ip_vs_iphdr iph;  	struct ip_vs_dest *dest;  	struct ip_vs_conn *ct;  	__be16 dport = 0;		/* destination port to forward */  	unsigned int flags;  	struct ip_vs_conn_param param; +	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };  	union nf_inet_addr snet;	/* source network of the client,  					   after masking */ -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); -  	/* Mask saddr with the netmask to adjust template granularity */  #ifdef CONFIG_IP_VS_IPV6  	if (svc->af == AF_INET6) -		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); +		ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, +				 (__force __u32) svc->netmask);  	else  #endif -		snet.ip = iph.saddr.ip & svc->netmask; +		snet.ip = iph->saddr.ip & svc->netmask;  	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "  		      "mnet %s\n", -		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), -		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), +		      IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), +		      IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),  		      IP_VS_DBG_ADDR(svc->af, &snet));  	/* @@ -242,19 +269,18 @@ ip_vs_sched_persist(struct ip_vs_service *svc,  	 * is created for other persistent services.  	 
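Two things in the hunks above deserve a note. First, the stats paths drop the global spinlocks in favor of per-CPU counters, with the 64-bit byte counters updated between u64_stats_update_begin() and u64_stats_update_end() so that 32-bit readers can detect torn values through the seqcount. Second, persistence granularity comes from masking the client address before it keys the template; for example, with an IPv4 svc->netmask of 255.255.255.0:

	/* illustrative addresses: 10.1.1.5 and 10.1.1.9 both map to 10.1.1.0,
	 * so they share one template and stick to the same real server */
	snet.ip = iph->saddr.ip & svc->netmask;
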
*/  	{ -		int protocol = iph.protocol; -		const union nf_inet_addr *vaddr = &iph.daddr; -		const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; +		int protocol = iph->protocol; +		const union nf_inet_addr *vaddr = &iph->daddr;  		__be16 vport = 0; -		if (ports[1] == svc->port) { +		if (dst_port == svc->port) {  			/* non-FTP template:  			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>  			 * FTP template:  			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>  			 */  			if (svc->port != FTPPORT) -				vport = ports[1]; +				vport = dst_port;  		} else {  			/* Note: persistent fwmark-based services and  			 * persistent port zero service are handled here. @@ -268,24 +294,34 @@ ip_vs_sched_persist(struct ip_vs_service *svc,  				vaddr = &fwmark;  			}  		} -		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, -					      vaddr, vport, &param); +		/* return *ignored = -1 so NF_DROP can be used */ +		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, +						  vaddr, vport, &param) < 0) { +			*ignored = -1; +			return NULL; +		}  	}  	/* Check if a template already exists */  	ct = ip_vs_ct_in_get(&param);  	if (!ct || !ip_vs_check_template(ct)) { -		/* No template found or the dest of the connection +		struct ip_vs_scheduler *sched; + +		/* +		 * No template found or the dest of the connection  		 * template is not available. +		 * return *ignored=0 i.e. ICMP and NF_DROP  		 */ -		dest = svc->scheduler->schedule(svc, skb); +		sched = rcu_dereference(svc->scheduler); +		dest = sched->schedule(svc, skb, iph);  		if (!dest) {  			IP_VS_DBG(1, "p-schedule: no dest found.\n");  			kfree(param.pe_data); +			*ignored = 0;  			return NULL;  		} -		if (ports[1] == svc->port && svc->port != FTPPORT) +		if (dst_port == svc->port && svc->port != FTPPORT)  			dport = dest->port;  		/* Create a template @@ -293,9 +329,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,  		 * and thus param.pe_data will be destroyed  		 * when the template expires */  		ct = ip_vs_conn_new(&param, &dest->addr, dport, -				    IP_VS_CONN_F_TEMPLATE, dest); +				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);  		if (ct == NULL) {  			kfree(param.pe_data); +			*ignored = -1;  			return NULL;  		} @@ -306,22 +343,24 @@ ip_vs_sched_persist(struct ip_vs_service *svc,  		kfree(param.pe_data);  	} -	dport = ports[1]; +	dport = dst_port;  	if (dport == svc->port && dest->port)  		dport = dest->port;  	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET -		 && iph.protocol == IPPROTO_UDP)? +		 && iph->protocol == IPPROTO_UDP) ?  		IP_VS_CONN_F_ONE_PACKET : 0;  	/*  	 *    Create a new connection according to the template  	 */ -	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], -			      &iph.daddr, ports[1], &param); -	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest); +	ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, +			      src_port, &iph->daddr, dst_port, &param); + +	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);  	if (cp == NULL) {  		ip_vs_conn_put(ct); +		*ignored = -1;  		return NULL;  	} @@ -341,20 +380,39 @@ ip_vs_sched_persist(struct ip_vs_service *svc,   *  It selects a server according to the virtual service, and   *  creates a connection entry.   *  Protocols supported: TCP, UDP + * + *  Usage of *ignored + * + * 1 :   protocol tried to schedule (eg. on SYN), found svc but the + *       svc/scheduler decides that this packet should be accepted with + *       NF_ACCEPT because it must not be scheduled.
+ * + * 0 :   scheduler can not find destination, so try bypass or + *       return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 :  scheduler tried to schedule but fatal error occurred, eg. + *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + *       failure such as missing Call-ID, ENOMEM on skb_linearize + *       or pe_data. In this case we should return NF_DROP without + *       any attempts to send ICMP with ip_vs_leave.   */  struct ip_vs_conn *  ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, -	       struct ip_vs_protocol *pp, int *ignored) +	       struct ip_vs_proto_data *pd, int *ignored, +	       struct ip_vs_iphdr *iph)  { +	struct ip_vs_protocol *pp = pd->pp;  	struct ip_vs_conn *cp = NULL; -	struct ip_vs_iphdr iph; +	struct ip_vs_scheduler *sched;  	struct ip_vs_dest *dest;  	__be16 _ports[2], *pptr;  	unsigned int flags;  	*ignored = 1; -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); -	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); +	/* +	 * IPv6 frags, only the first hit here. +	 */ +	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);  	if (pptr == NULL)  		return NULL; @@ -371,12 +429,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,  	}  	/* -	 * Do not schedule replies from local real server. It is risky -	 * for fwmark services but mostly for persistent services. +	 *    Do not schedule replies from local real server.  	 */  	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && -	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && -	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { +	    (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {  		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,  			      "Not scheduling reply for existing connection");  		__ip_vs_conn_put(cp); @@ -386,10 +442,11 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,  	/*  	 *    Persistent service  	 */ -	if (svc->flags & IP_VS_SVC_F_PERSISTENT) { -		*ignored = 0; -		return ip_vs_sched_persist(svc, skb, pptr); -	} +	if (svc->flags & IP_VS_SVC_F_PERSISTENT) +		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, +					   iph); + +	*ignored = 0;  	/*  	 *    Non-persistent service @@ -402,16 +459,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,  		return NULL;  	} -	*ignored = 0; - -	dest = svc->scheduler->schedule(svc, skb); +	sched = rcu_dereference(svc->scheduler); +	dest = sched->schedule(svc, skb, iph);  	if (dest == NULL) {  		IP_VS_DBG(1, "Schedule: no dest found.\n");  		return NULL;  	}  	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET -		 && iph.protocol == IPPROTO_UDP)? +		 && iph->protocol == IPPROTO_UDP) ?  		IP_VS_CONN_F_ONE_PACKET : 0;  	/* @@ -419,13 +475,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,  	 */  	{  		struct ip_vs_conn_param p; -		ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, -				      pptr[0], &iph.daddr, pptr[1], &p); + +		ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, +				      &iph->saddr, pptr[0], &iph->daddr, +				      pptr[1], &p);  		cp = ip_vs_conn_new(&p, &dest->addr,  				    dest->port ? dest->port : pptr[1], -				    flags, dest); -		if (!cp) +				    flags, dest, skb->mark); +		if (!cp) { +			*ignored = -1;  			return NULL; +		}  	}  	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " @@ -447,49 +507,52 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,   *  no destination is available for a new connection.   
*/  int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, -		struct ip_vs_protocol *pp) +		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)  {  	__be16 _ports[2], *pptr; -	struct ip_vs_iphdr iph; +#ifdef CONFIG_SYSCTL +	struct net *net; +	struct netns_ipvs *ipvs;  	int unicast; -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); +#endif -	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); +	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);  	if (pptr == NULL) { -		ip_vs_service_put(svc);  		return NF_DROP;  	} +#ifdef CONFIG_SYSCTL +	net = skb_net(skb); +  #ifdef CONFIG_IP_VS_IPV6  	if (svc->af == AF_INET6) -		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; +		unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;  	else  #endif -		unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); +		unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);  	/* if it is fwmark-based service, the cache_bypass sysctl is up  	   and the destination is a non-local unicast, then create  	   a cache_bypass connection entry */ -	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { -		int ret, cs; +	ipvs = net_ipvs(net); +	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { +		int ret;  		struct ip_vs_conn *cp;  		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && -				      iph.protocol == IPPROTO_UDP)? +				      iph->protocol == IPPROTO_UDP) ?  				      IP_VS_CONN_F_ONE_PACKET : 0;  		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } }; -		ip_vs_service_put(svc); -  		/* create a new connection entry */  		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);  		{  			struct ip_vs_conn_param p; -			ip_vs_conn_fill_param(svc->af, iph.protocol, -					      &iph.saddr, pptr[0], -					      &iph.daddr, pptr[1], &p); +			ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, +					      &iph->saddr, pptr[0], +					      &iph->daddr, pptr[1], &p);  			cp = ip_vs_conn_new(&p, &daddr, 0,  					    IP_VS_CONN_F_BYPASS | flags, -					    NULL); +					    NULL, skb->mark);  			if (!cp)  				return NF_DROP;  		} @@ -498,16 +561,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,  		ip_vs_in_stats(cp, skb);  		/* set state */ -		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); +		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);  		/* transmit the first SYN packet */ -		ret = cp->packet_xmit(skb, cp, pp); +		ret = cp->packet_xmit(skb, cp, pd->pp, iph);  		/* do not touch skb anymore */  		atomic_inc(&cp->in_pkts);  		ip_vs_conn_put(cp);  		return ret;  	} +#endif  	/*  	 * When the virtual ftp service is presented, packets destined @@ -515,12 +579,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,  	 * listed in the ipvs table), pass the packets, because it is  	 * not ipvs job to decide to drop the packets.  	 
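The cache_bypass branch above builds a connection with a zero destination address and IP_VS_CONN_F_BYPASS set, so traffic for a fwmark service with no usable real server is forwarded as-is instead of being refused. The branch compiles away entirely without CONFIG_SYSCTL, in the same style as the stub accessors added below; the shape of that pattern, using a hypothetical helper (this hunk actually reads ipvs->sysctl_cache_bypass directly):

	#ifdef CONFIG_SYSCTL
	static int sysctl_cache_bypass(struct netns_ipvs *ipvs)
	{
		return ipvs->sysctl_cache_bypass;	/* per-netns knob */
	}
	#else
	static int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return 0; }
	#endif
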
*/ -	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { -		ip_vs_service_put(svc); +	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))  		return NF_ACCEPT; -	} - -	ip_vs_service_put(svc);  	/*  	 * Notify the client that the destination is unreachable, and @@ -532,9 +592,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,  #ifdef CONFIG_IP_VS_IPV6  	if (svc->af == AF_INET6) {  		if (!skb->dev) { -			struct net *net = dev_net(skb_dst(skb)->dev); +			struct net *net_ = dev_net(skb_dst(skb)->dev); -			skb->dev = net->loopback_dev; +			skb->dev = net_->loopback_dev;  		}  		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);  	} else @@ -544,6 +604,33 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,  	return NF_DROP;  } +#ifdef CONFIG_SYSCTL + +static int sysctl_snat_reroute(struct sk_buff *skb) +{ +	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); +	return ipvs->sysctl_snat_reroute; +} + +static int sysctl_nat_icmp_send(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	return ipvs->sysctl_nat_icmp_send; +} + +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ +	return ipvs->sysctl_expire_nodest_conn; +} + +#else + +static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } +static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } + +#endif +  __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)  {  	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -560,21 +647,32 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)  static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)  { -	int err = ip_defrag(skb, user); +	int err; +	local_bh_disable(); +	err = ip_defrag(skb, user); +	local_bh_enable();  	if (!err)  		ip_send_check(ip_hdr(skb));  	return err;  } -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +static int ip_vs_route_me_harder(int af, struct sk_buff *skb)  { -	/* TODO IPv6: Find out what to do here for IPv6 */ +#ifdef CONFIG_IP_VS_IPV6 +	if (af == AF_INET6) { +		if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) +			return 1; +	} else +#endif +		if ((sysctl_snat_reroute(skb) || +		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) && +		    ip_route_me_harder(skb, RTN_LOCAL) != 0) +			return 1; +  	return 0;  } -#endif  /*   * Packet has been made sufficiently writable in caller @@ -630,10 +728,19 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,  		    struct ip_vs_conn *cp, int inout)  {  	struct ipv6hdr *iph	 = ipv6_hdr(skb); -	unsigned int icmp_offset = sizeof(struct ipv6hdr); -	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) + -						      icmp_offset); -	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1); +	unsigned int icmp_offset = 0; +	unsigned int offs	 = 0; /* header offset*/ +	int protocol; +	struct icmp6hdr *icmph; +	struct ipv6hdr *ciph; +	unsigned short fragoffs; + +	ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); +	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); +	offs = icmp_offset + sizeof(struct icmp6hdr); +	ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); + +	protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);  	if (inout) {  		iph->saddr = cp->vaddr.in6; @@ -644,10 +751,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,  	}  	/* the 
TCP/UDP/SCTP port */ -	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr || -	    IPPROTO_SCTP == ciph->nexthdr) { -		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); +	if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || +			  IPPROTO_SCTP == protocol)) { +		__be16 *ports = (void *)(skb_network_header(skb) + offs); +		IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, +			      ntohs(inout ? ports[1] : ports[0]), +			      ntohs(inout ? cp->vport : cp->dport));  		if (inout)  			ports[1] = cp->vport;  		else @@ -674,7 +784,7 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,  #endif  /* Handle relevant response ICMP messages - forward to the right - * destination host. Used for NAT and local client. + * destination host.   */  static int handle_response_icmp(int af, struct sk_buff *skb,  				union nf_inet_addr *snet, @@ -710,16 +820,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,  #endif  		ip_vs_nat_icmp(skb, pp, cp, 1); -#ifdef CONFIG_IP_VS_IPV6 -	if (af == AF_INET6) { -		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) -			goto out; -	} else -#endif -		if ((sysctl_ip_vs_snat_reroute || -		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) && -		    ip_route_me_harder(skb, RTN_LOCAL) != 0) -			goto out; +	if (ip_vs_route_me_harder(af, skb)) +		goto out;  	/* do the statistics and put it back */  	ip_vs_out_stats(cp, skb); @@ -757,7 +859,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,  	*related = 1;  	/* reassemble IP fragments */ -	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { +	if (ip_is_fragment(ip_hdr(skb))) {  		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))  			return NF_STOLEN;  	} @@ -804,51 +906,35 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,  	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,  		      "Checking outgoing ICMP for"); -	offset += cih->ihl * 4; - -	ip_vs_fill_iphdr(AF_INET, cih, &ciph); +	ip_vs_fill_ip4hdr(cih, &ciph); +	ciph.len += offset;  	/* The embedded headers contain source and dest in reverse order */ -	cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); +	cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);  	if (!cp)  		return NF_ACCEPT;  	snet.ip = iph->saddr;  	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, -				    pp, offset, ihl); +				    pp, ciph.len, ihl);  }  #ifdef CONFIG_IP_VS_IPV6  static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, -			     unsigned int hooknum) +			     unsigned int hooknum, struct ip_vs_iphdr *ipvsh)  { -	struct ipv6hdr *iph;  	struct icmp6hdr	_icmph, *ic; -	struct ipv6hdr	_ciph, *cih;	/* The ip header contained -					   within the ICMP */ -	struct ip_vs_iphdr ciph; +	struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ +	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */  	struct ip_vs_conn *cp;  	struct ip_vs_protocol *pp; -	unsigned int offset;  	union nf_inet_addr snet; +	unsigned int writable;  	*related = 1; - -	/* reassemble IP fragments */ -	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { -		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) -			return NF_STOLEN; -	} - -	iph = ipv6_hdr(skb); -	offset = sizeof(struct ipv6hdr); -	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); +	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);  	if (ic == NULL)  		return NF_DROP; -	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n", -		  ic->icmp6_type, 
ntohs(icmpv6_id(ic)), -		  &iph->saddr, &iph->daddr); -  	/*  	 * Work through seeing if this is for us.  	 * These checks are supposed to be in an order that means easy @@ -856,42 +942,45 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,  	 * this means that some packets will manage to get a long way  	 * down this stack and then be rejected, but that's life.  	 */ -	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && -	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && -	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { +	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {  		*related = 0;  		return NF_ACCEPT;  	} +	/* Fragment header that is before ICMP header tells us that: +	 * it's not an error message since they can't be fragmented. +	 */ +	if (ipvsh->flags & IP6_FH_F_FRAG) +		return NF_DROP; + +	IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", +		  ic->icmp6_type, ntohs(icmpv6_id(ic)), +		  &ipvsh->saddr, &ipvsh->daddr);  	/* Now find the contained IP header */ -	offset += sizeof(_icmph); -	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); -	if (cih == NULL) +	ciph.len = ipvsh->len + sizeof(_icmph); +	ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); +	if (ip6h == NULL)  		return NF_ACCEPT; /* The packet looks wrong, ignore */ - -	pp = ip_vs_proto_get(cih->nexthdr); +	ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ +	ciph.daddr.in6 = ip6h->daddr; +	/* skip possible IPv6 exthdrs of contained IPv6 packet */ +	ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); +	if (ciph.protocol < 0) +		return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + +	pp = ip_vs_proto_get(ciph.protocol);  	if (!pp)  		return NF_ACCEPT; -	/* Is the embedded protocol header present? */ -	/* TODO: we don't support fragmentation at the moment anyways */ -	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) -		return NF_ACCEPT; - -	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, -		      "Checking outgoing ICMPv6 for"); - -	offset += sizeof(struct ipv6hdr); - -	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);  	/* The embedded headers contain source and dest in reverse order */ -	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); +	cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);  	if (!cp)  		return NF_ACCEPT; -	ipv6_addr_copy(&snet.in6, &iph->saddr); -	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, -				    pp, offset, sizeof(struct ipv6hdr)); +	snet.in6 = ciph.saddr.in6; +	writable = ciph.len; +	return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, +				    pp, writable, sizeof(struct ipv6hdr));  }  #endif @@ -920,20 +1009,47 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)  	return th->rst;  } +static inline bool is_new_conn(const struct sk_buff *skb, +			       struct ip_vs_iphdr *iph) +{ +	switch (iph->protocol) { +	case IPPROTO_TCP: { +		struct tcphdr _tcph, *th; + +		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); +		if (th == NULL) +			return false; +		return th->syn; +	} +	case IPPROTO_SCTP: { +		sctp_chunkhdr_t *sch, schunk; + +		sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), +					 sizeof(schunk), &schunk); +		if (sch == NULL) +			return false; +		return sch->type == SCTP_CID_INIT; +	} +	default: +		return false; +	} +} +  /* Handle response packets: rewrite addresses and send away... - * Used for NAT and local client.   
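In the reworked ip_vs_out_icmp_v6() above, the old explicit list of error types (DEST_UNREACH, PKT_TOOBIG, TIME_EXCEED) becomes a single classification test: ICMPv6 reserves types with the high bit set for informational messages, so anything matching ICMPV6_INFOMSG_MASK (0x80) cannot be an error carrying an embedded packet. For instance:

	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
		/* e.g. ICMPV6_ECHO_REQUEST (128): informational, not related */
		*related = 0;
		return NF_ACCEPT;
	}
	/* e.g. ICMPV6_DEST_UNREACH (1): an error, parse the inner header */

Similarly deliberate is the fragment rule: a fragment header seen before the ICMPv6 header means the packet cannot be a valid error message, so it is dropped outright rather than reassembled, replacing the old ip_vs_gather_frags_v6() no-op TODO stub.
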
*/  static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, -		struct ip_vs_conn *cp, int ihl) +handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  { +	struct ip_vs_protocol *pp = pd->pp; +  	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); -	if (!skb_make_writable(skb, ihl)) +	if (!skb_make_writable(skb, iph->len))  		goto drop;  	/* mangle the packet */ -	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) +	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))  		goto drop;  #ifdef CONFIG_IP_VS_IPV6 @@ -961,21 +1077,13 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,  	 * if it came from this machine itself.  So re-compute  	 * the routing information.  	 */ -#ifdef CONFIG_IP_VS_IPV6 -	if (af == AF_INET6) { -		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) -			goto drop; -	} else -#endif -		if ((sysctl_ip_vs_snat_reroute || -		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) && -		    ip_route_me_harder(skb, RTN_LOCAL) != 0) -			goto drop; +	if (ip_vs_route_me_harder(af, skb)) +		goto drop;  	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");  	ip_vs_out_stats(cp, skb); -	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); +	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);  	skb->ipvs_property = 1;  	if (!(cp->flags & IP_VS_CONN_F_NFCT))  		ip_vs_notrack(skb); @@ -999,8 +1107,10 @@ drop:  static unsigned int  ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)  { +	struct net *net = NULL;  	struct ip_vs_iphdr iph;  	struct ip_vs_protocol *pp; +	struct ip_vs_proto_data *pd;  	struct ip_vs_conn *cp;  	EnterFunction(11); @@ -1022,17 +1132,20 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)  	if (unlikely(!skb_dst(skb)))  		return NF_ACCEPT; -	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +	net = skb_net(skb); +	if (!net_ipvs(net)->enable) +		return NF_ACCEPT; + +	ip_vs_fill_iph_skb(af, skb, &iph);  #ifdef CONFIG_IP_VS_IPV6  	if (af == AF_INET6) {  		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {  			int related;  			int verdict = ip_vs_out_icmp_v6(skb, &related, -							hooknum); +							hooknum, &iph);  			if (related)  				return verdict; -			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);  		}  	} else  #endif @@ -1042,54 +1155,44 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)  			if (related)  				return verdict; -			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);  		} -	pp = ip_vs_proto_get(iph.protocol); -	if (unlikely(!pp)) +	pd = ip_vs_proto_data_get(net, iph.protocol); +	if (unlikely(!pd))  		return NF_ACCEPT; +	pp = pd->pp;  	/* reassemble IP fragments */  #ifdef CONFIG_IP_VS_IPV6 -	if (af == AF_INET6) { -		if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { -			if (ip_vs_gather_frags_v6(skb, -						  ip_vs_defrag_user(hooknum))) -				return NF_STOLEN; -		} - -		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); -	} else +	if (af == AF_INET)  #endif -		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && -			     !pp->dont_defrag)) { +		if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {  			if (ip_vs_gather_frags(skb,  					       ip_vs_defrag_user(hooknum)))  				return NF_STOLEN; -			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +			ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);  		}  	/*  	 * Check if the packet belongs to an existing entry  	 */ -	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); +	cp = pp->conn_out_get(af, 
skb, &iph, 0);  	if (likely(cp)) -		return handle_response(af, skb, pp, cp, iph.len); -	if (sysctl_ip_vs_nat_icmp_send && +		return handle_response(af, skb, pd, cp, &iph); +	if (sysctl_nat_icmp_send(net) &&  	    (pp->protocol == IPPROTO_TCP ||  	     pp->protocol == IPPROTO_UDP ||  	     pp->protocol == IPPROTO_SCTP)) {  		__be16 _ports[2], *pptr; -		pptr = skb_header_pointer(skb, iph.len, -					  sizeof(_ports), _ports); +		pptr = frag_safe_skb_hp(skb, iph.len, +					 sizeof(_ports), _ports, &iph);  		if (pptr == NULL)  			return NF_ACCEPT;	/* Not for me */ -		if (ip_vs_lookup_real_service(af, iph.protocol, -					      &iph.saddr, -					      pptr[0])) { +		if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, +					   pptr[0])) {  			/*  			 * Notify the real server: there is no  			 * existing entry if it is not RST @@ -1104,9 +1207,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)  						iph.len)))) {  #ifdef CONFIG_IP_VS_IPV6  				if (af == AF_INET6) { -					struct net *net = -						dev_net(skb_dst(skb)->dev); -  					if (!skb->dev)  						skb->dev = net->loopback_dev;  					icmpv6_send(skb, @@ -1133,11 +1233,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)   *	Check if packet is reply for established ip_vs_conn.   */  static unsigned int -ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,  	     const struct net_device *in, const struct net_device *out,  	     int (*okfn)(struct sk_buff *))  { -	return ip_vs_out(hooknum, skb, AF_INET); +	return ip_vs_out(ops->hooknum, skb, AF_INET);  }  /* @@ -1145,17 +1245,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,   *	Check if packet is reply for established ip_vs_conn.   */  static unsigned int -ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,  		   const struct net_device *in, const struct net_device *out,  		   int (*okfn)(struct sk_buff *))  { -	unsigned int verdict; - -	/* Disable BH in LOCAL_OUT until all places are fixed */ -	local_bh_disable(); -	verdict = ip_vs_out(hooknum, skb, AF_INET); -	local_bh_enable(); -	return verdict; +	return ip_vs_out(ops->hooknum, skb, AF_INET);  }  #ifdef CONFIG_IP_VS_IPV6 @@ -1166,11 +1260,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,   *	Check if packet is reply for established ip_vs_conn.   */  static unsigned int -ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,  	     const struct net_device *in, const struct net_device *out,  	     int (*okfn)(struct sk_buff *))  { -	return ip_vs_out(hooknum, skb, AF_INET6); +	return ip_vs_out(ops->hooknum, skb, AF_INET6);  }  /* @@ -1178,17 +1272,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,   *	Check if packet is reply for established ip_vs_conn.   
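
When no connection matches, ip_vs_out() above peeks at the transport ports to decide whether the reply comes from a configured real server and deserves an ICMP error. The peek now goes through frag_safe_skb_hp(), which refuses to read a transport header that is not actually present in this fragment. A userspace model of that guard, with struct pkt standing in for the skb plus its parsed ip_vs_iphdr (the names here are illustrative):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	struct pkt {
		const uint8_t *data;	/* packet bytes */
		size_t len;		/* total length */
		size_t l4_off;		/* offset of transport header */
		unsigned int fragoffs;	/* nonzero: non-first fragment */
	};

	/* Fill ports[0]=sport, ports[1]=dport (network byte order) only
	 * when the transport header really is in this fragment, mirroring
	 * the refusal frag_safe_skb_hp() applies in the skb case. */
	static bool peek_ports(const struct pkt *p, uint16_t ports[2])
	{
		if (p->fragoffs || p->l4_off + 4 > p->len)
			return false;
		memcpy(ports, p->data + p->l4_off, 2 * sizeof(uint16_t));
		return true;
	}
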
*/  static unsigned int -ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		   const struct net_device *in, const struct net_device *out,  		   int (*okfn)(struct sk_buff *))  { -	unsigned int verdict; - -	/* Disable BH in LOCAL_OUT until all places are fixed */ -	local_bh_disable(); -	verdict = ip_vs_out(hooknum, skb, AF_INET6); -	local_bh_enable(); -	return verdict; +	return ip_vs_out(ops->hooknum, skb, AF_INET6);  }  #endif @@ -1202,19 +1290,21 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,  static int  ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  { +	struct net *net = NULL;  	struct iphdr *iph;  	struct icmphdr	_icmph, *ic;  	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */  	struct ip_vs_iphdr ciph;  	struct ip_vs_conn *cp;  	struct ip_vs_protocol *pp; -	unsigned int offset, ihl, verdict; -	union nf_inet_addr snet; +	struct ip_vs_proto_data *pd; +	unsigned int offset, offset2, ihl, verdict; +	bool ipip;  	*related = 1;  	/* reassemble IP fragments */ -	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { +	if (ip_is_fragment(ip_hdr(skb))) {  		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))  			return NF_STOLEN;  	} @@ -1249,9 +1339,27 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  	if (cih == NULL)  		return NF_ACCEPT; /* The packet looks wrong, ignore */ -	pp = ip_vs_proto_get(cih->protocol); -	if (!pp) +	net = skb_net(skb); + +	/* Special case for errors for IPIP packets */ +	ipip = false; +	if (cih->protocol == IPPROTO_IPIP) { +		if (unlikely(cih->frag_off & htons(IP_OFFSET))) +			return NF_ACCEPT; +		/* Error for our IPIP must arrive at LOCAL_IN */ +		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) +			return NF_ACCEPT; +		offset += cih->ihl * 4; +		cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); +		if (cih == NULL) +			return NF_ACCEPT; /* The packet looks wrong, ignore */ +		ipip = true; +	} + +	pd = ip_vs_proto_data_get(net, cih->protocol); +	if (!pd)  		return NF_ACCEPT; +	pp = pd->pp;  	/* Is the embedded protocol header present? */  	if (unlikely(cih->frag_off & htons(IP_OFFSET) && @@ -1261,22 +1369,16 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,  		      "Checking incoming ICMP for"); -	offset += cih->ihl * 4; - -	ip_vs_fill_iphdr(AF_INET, cih, &ciph); -	/* The embedded headers contain source and dest in reverse order */ -	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); -	if (!cp) { -		/* The packet could also belong to a local client */ -		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); -		if (cp) { -			snet.ip = iph->saddr; -			return handle_response_icmp(AF_INET, skb, &snet, -						    cih->protocol, cp, pp, -						    offset, ihl); -		} +	offset2 = offset; +	ip_vs_fill_ip4hdr(cih, &ciph); +	ciph.len += offset; +	offset = ciph.len; +	/* The embedded headers contain source and dest in reverse order. +	 * For IPIP this is error for request, not for reply. +	 */ +	cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 
0 : 1); +	if (!cp)  		return NF_ACCEPT; -	}  	verdict = NF_DROP; @@ -1288,59 +1390,95 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)  		goto out;  	} +	if (ipip) { +		__be32 info = ic->un.gateway; +		__u8 type = ic->type; +		__u8 code = ic->code; + +		/* Update the MTU */ +		if (ic->type == ICMP_DEST_UNREACH && +		    ic->code == ICMP_FRAG_NEEDED) { +			struct ip_vs_dest *dest = cp->dest; +			u32 mtu = ntohs(ic->un.frag.mtu); +			__be16 frag_off = cih->frag_off; + +			/* Strip outer IP and ICMP, go to IPIP header */ +			if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) +				goto ignore_ipip; +			offset2 -= ihl + sizeof(_icmph); +			skb_reset_network_header(skb); +			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", +				&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); +			ipv4_update_pmtu(skb, dev_net(skb->dev), +					 mtu, 0, 0, 0, 0); +			/* Client uses PMTUD? */ +			if (!(frag_off & htons(IP_DF))) +				goto ignore_ipip; +			/* Prefer the resulting PMTU */ +			if (dest) { +				struct ip_vs_dest_dst *dest_dst; + +				rcu_read_lock(); +				dest_dst = rcu_dereference(dest->dest_dst); +				if (dest_dst) +					mtu = dst_mtu(dest_dst->dst_cache); +				rcu_read_unlock(); +			} +			if (mtu > 68 + sizeof(struct iphdr)) +				mtu -= sizeof(struct iphdr); +			info = htonl(mtu); +		} +		/* Strip outer IP, ICMP and IPIP, go to IP header of +		 * original request. +		 */ +		if (pskb_pull(skb, offset2) == NULL) +			goto ignore_ipip; +		skb_reset_network_header(skb); +		IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", +			&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, +			type, code, ntohl(info)); +		icmp_send(skb, type, code, info); +		/* ICMP can be shorter but anyways, account it */ +		ip_vs_out_stats(cp, skb); + +ignore_ipip: +		consume_skb(skb); +		verdict = NF_STOLEN; +		goto out; +	} +  	/* do the statistics and put it back */  	ip_vs_in_stats(cp, skb); -	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) +	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || +	    IPPROTO_SCTP == cih->protocol)  		offset += 2 * sizeof(__u16); -	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); -	/* LOCALNODE from FORWARD hook is not supported */ -	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && -	    skb_rtable(skb)->rt_flags & RTCF_LOCAL) { -		IP_VS_DBG(1, "%s(): " -			  "local delivery to %pI4 but in FORWARD\n", -			  __func__, &skb_rtable(skb)->rt_dst); -		verdict = NF_DROP; -	} +	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); -  out: +out:  	__ip_vs_conn_put(cp);  	return verdict;  }  #ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, +			    unsigned int hooknum, struct ip_vs_iphdr *iph)  { -	struct ipv6hdr *iph; +	struct net *net = NULL; +	struct ipv6hdr _ip6h, *ip6h;  	struct icmp6hdr	_icmph, *ic; -	struct ipv6hdr	_ciph, *cih;	/* The ip header contained -					   within the ICMP */ -	struct ip_vs_iphdr ciph; +	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */  	struct ip_vs_conn *cp;  	struct ip_vs_protocol *pp; -	unsigned int offset, verdict; -	union nf_inet_addr snet; -	struct rt6_info *rt; +	struct ip_vs_proto_data *pd; +	unsigned int offs_ciph, writable, verdict;  	*related = 1; -	/* reassemble IP fragments */ -	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { -		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) -			return NF_STOLEN; -	} - -	iph = 
ipv6_hdr(skb); -	offset = sizeof(struct ipv6hdr); -	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); +	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);  	if (ic == NULL)  		return NF_DROP; -	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n", -		  ic->icmp6_type, ntohs(icmpv6_id(ic)), -		  &iph->saddr, &iph->daddr); -  	/*  	 * Work through seeing if this is for us.  	 * These checks are supposed to be in an order that means easy @@ -1348,66 +1486,71 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)  	 * this means that some packets will manage to get a long way  	 * down this stack and then be rejected, but that's life.  	 */ -	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && -	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && -	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { +	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {  		*related = 0;  		return NF_ACCEPT;  	} +	/* Fragment header that is before ICMP header tells us that: +	 * it's not an error message since they can't be fragmented. +	 */ +	if (iph->flags & IP6_FH_F_FRAG) +		return NF_DROP; + +	IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", +		  ic->icmp6_type, ntohs(icmpv6_id(ic)), +		  &iph->saddr, &iph->daddr);  	/* Now find the contained IP header */ -	offset += sizeof(_icmph); -	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); -	if (cih == NULL) +	ciph.len = iph->len + sizeof(_icmph); +	offs_ciph = ciph.len; /* Save ip header offset */ +	ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); +	if (ip6h == NULL)  		return NF_ACCEPT; /* The packet looks wrong, ignore */ - -	pp = ip_vs_proto_get(cih->nexthdr); -	if (!pp) +	ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ +	ciph.daddr.in6 = ip6h->daddr; +	/* skip possible IPv6 exthdrs of contained IPv6 packet */ +	ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); +	if (ciph.protocol < 0) +		return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + +	net = skb_net(skb); +	pd = ip_vs_proto_data_get(net, ciph.protocol); +	if (!pd)  		return NF_ACCEPT; +	pp = pd->pp; -	/* Is the embedded protocol header present? */ -	/* TODO: we don't support fragmentation at the moment anyways */ -	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) +	/* Cannot handle fragmented embedded protocol */ +	if (ciph.fragoffs)  		return NF_ACCEPT; -	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, +	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,  		      "Checking incoming ICMPv6 for"); -	offset += sizeof(struct ipv6hdr); +	/* The embedded headers contain source and dest in reverse order +	 * if not from localhost +	 */ +	cp = pp->conn_in_get(AF_INET6, skb, &ciph, +			     (hooknum == NF_INET_LOCAL_OUT) ? 
0 : 1); -	ip_vs_fill_iphdr(AF_INET6, cih, &ciph); -	/* The embedded headers contain source and dest in reverse order */ -	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); -	if (!cp) { -		/* The packet could also belong to a local client */ -		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); -		if (cp) { -			ipv6_addr_copy(&snet.in6, &iph->saddr); -			return handle_response_icmp(AF_INET6, skb, &snet, -						    cih->nexthdr, -						    cp, pp, offset, -						    sizeof(struct ipv6hdr)); -		} +	if (!cp) +		return NF_ACCEPT; +	/* VS/TUN, VS/DR and LOCALNODE just let it go */ +	if ((hooknum == NF_INET_LOCAL_OUT) && +	    (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { +		__ip_vs_conn_put(cp);  		return NF_ACCEPT;  	} -	verdict = NF_DROP; -  	/* do the statistics and put it back */  	ip_vs_in_stats(cp, skb); -	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr || -	    IPPROTO_SCTP == cih->nexthdr) -		offset += 2 * sizeof(__u16); -	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); -	/* LOCALNODE from FORWARD hook is not supported */ -	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && -	    (rt = (struct rt6_info *) skb_dst(skb)) && -	    rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) { -		IP_VS_DBG(1, "%s(): " -			  "local delivery to %pI6 but in FORWARD\n", -			  __func__, &rt->rt6i_dst); -		verdict = NF_DROP; -	} + +	/* Need to mangle contained IPv6 header in ICMPv6 packet */ +	writable = ciph.len; +	if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || +	    IPPROTO_SCTP == ciph.protocol) +		writable += 2 * sizeof(__u16); /* Also mangle ports */ + +	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);  	__ip_vs_conn_put(cp); @@ -1423,10 +1566,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)  static unsigned int  ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  { +	struct net *net;  	struct ip_vs_iphdr iph;  	struct ip_vs_protocol *pp; +	struct ip_vs_proto_data *pd;  	struct ip_vs_conn *cp; -	int ret, restart, pkts; +	int ret, pkts; +	struct netns_ipvs *ipvs;  	/* Already marked as IPVS request or reply? */  	if (skb->ipvs_property) @@ -1440,14 +1586,20 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  	if (unlikely((skb->pkt_type != PACKET_HOST &&  		      hooknum != NF_INET_LOCAL_OUT) ||  		     !skb_dst(skb))) { -		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +		ip_vs_fill_iph_skb(af, skb, &iph);  		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"  			      " ignored in hook %u\n",  			      skb->pkt_type, iph.protocol,  			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);  		return NF_ACCEPT;  	} -	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +	/* ipvs enabled in this netns ? */ +	net = skb_net(skb); +	ipvs = net_ipvs(net); +	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) +		return NF_ACCEPT; + +	ip_vs_fill_iph_skb(af, skb, &iph);  	/* Bad... 
Do not break raw sockets */  	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && @@ -1463,11 +1615,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  	if (af == AF_INET6) {  		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {  			int related; -			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); +			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, +						       &iph);  			if (related)  				return verdict; -			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);  		}  	} else  #endif @@ -1477,23 +1629,34 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  			if (related)  				return verdict; -			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);  		}  	/* Protocol supported? */ -	pp = ip_vs_proto_get(iph.protocol); -	if (unlikely(!pp)) +	pd = ip_vs_proto_data_get(net, iph.protocol); +	if (unlikely(!pd))  		return NF_ACCEPT; - +	pp = pd->pp;  	/*  	 * Check if the packet belongs to an existing connection entry  	 */ -	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); +	cp = pp->conn_in_get(af, skb, &iph, 0); -	if (unlikely(!cp)) { +	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && +	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && +	    is_new_conn(skb, &iph)) { +		ip_vs_conn_expire_now(cp); +		__ip_vs_conn_put(cp); +		cp = NULL; +	} + +	if (unlikely(!cp) && !iph.fragoffs) { +		/* No (second) fragments need to enter here, as nf_defrag_ipv6 +		 * replayed fragment zero will already have created the cp +		 */  		int v; -		if (!pp->conn_schedule(af, skb, pp, &v, &cp)) +		/* Schedule and create new connection entry into &cp */ +		if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))  			return v;  	} @@ -1501,16 +1664,22 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  		/* sorry, all this trouble for a no-hit :) */  		IP_VS_DBG_PKT(12, af, pp, skb, 0,  			      "ip_vs_in: packet continues traversal as normal"); +		if (iph.fragoffs) { +			/* Fragment that couldn't be mapped to a conn entry +			 * is missing module nf_defrag_ipv6 +			 */ +			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); +			IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); +		}  		return NF_ACCEPT;  	}  	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); -  	/* Check the server status */  	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {  		/* the destination server is not available */ -		if (sysctl_ip_vs_expire_nodest_conn) { +		if (sysctl_expire_nodest_conn(ipvs)) {  			/* try to expire the connection immediately */  			ip_vs_conn_expire_now(cp);  		} @@ -1521,9 +1690,9 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  	}  	ip_vs_in_stats(cp, skb); -	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); +	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);  	if (cp->packet_xmit) -		ret = cp->packet_xmit(skb, cp, pp); +		ret = cp->packet_xmit(skb, cp, pp, &iph);  		/* do not touch skb anymore */  	else {  		IP_VS_DBG_RL("warning: packet_xmit is null"); @@ -1535,37 +1704,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)  	 *  	 * Sync connection if it is about to close to  	 * encorage the standby servers to update the connections timeout +	 * +	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.  	 
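
The per-protocol sync filter removed below used a simple modulo rule: once a connection has carried sync_threshold packets, replicate its state to the backup, and then again every sync_period packets (plus extra syncs on significant state changes such as TCP FIN/CLOSE, which the removed branches enumerate per protocol). That rule now lives in ip_vs_sync_conn(), which receives the packet count; reduced to a predicate over the two-element threshold sysctl of the removed code:

	#include <stdbool.h>

	/* Modulo-based sync rate limiting: with threshold T and period P,
	 * state is synced on packets T, T+P, T+2P, ... */
	static bool sync_due(int pkts, int threshold, int period)
	{
		if (period <= 1)	/* degenerate: sync every packet */
			return true;
		return pkts % period == threshold;
	}

With the historical defaults {3, 50}, a connection is synced on its 3rd, 53rd, 103rd, ... packet; for one-packet (ONE_PKT) connections the count is pinned to the threshold itself, so the predicate holds for every packet.
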
*/ -	pkts = atomic_add_return(1, &cp->in_pkts); -	if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && -	    cp->protocol == IPPROTO_SCTP) { -		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && -			(pkts % sysctl_ip_vs_sync_threshold[1] -			 == sysctl_ip_vs_sync_threshold[0])) || -				(cp->old_state != cp->state && -				 ((cp->state == IP_VS_SCTP_S_CLOSED) || -				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || -				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { -			ip_vs_sync_conn(cp); -			goto out; -		} -	} -	/* Keep this block last: TCP and others with pp->num_states <= 1 */ -	else if (af == AF_INET && -	    (ip_vs_sync_state & IP_VS_STATE_MASTER) && -	    (((cp->protocol != IPPROTO_TCP || -	       cp->state == IP_VS_TCP_S_ESTABLISHED) && -	      (pkts % sysctl_ip_vs_sync_threshold[1] -	       == sysctl_ip_vs_sync_threshold[0])) || -	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && -	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) || -	       (cp->state == IP_VS_TCP_S_CLOSE) || -	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || -	       (cp->state == IP_VS_TCP_S_TIME_WAIT))))) -		ip_vs_sync_conn(cp); -out: -	cp->old_state = cp->state; +	if (cp->flags & IP_VS_CONN_F_ONE_PACKET) +		pkts = sysctl_sync_threshold(ipvs); +	else +		pkts = atomic_add_return(1, &cp->in_pkts); + +	if (ipvs->sync_state & IP_VS_STATE_MASTER) +		ip_vs_sync_conn(net, cp, pkts);  	ip_vs_conn_put(cp);  	return ret; @@ -1576,12 +1725,12 @@ out:   *	Schedule and forward packets from remote clients   */  static unsigned int -ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in,  		      const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  { -	return ip_vs_in(hooknum, skb, AF_INET); +	return ip_vs_in(ops->hooknum, skb, AF_INET);  }  /* @@ -1589,17 +1738,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,   *	Schedule and forward packets from local clients   */  static unsigned int -ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,  		     const struct net_device *in, const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  { -	unsigned int verdict; - -	/* Disable BH in LOCAL_OUT until all places are fixed */ -	local_bh_disable(); -	verdict = ip_vs_in(hooknum, skb, AF_INET); -	local_bh_enable(); -	return verdict; +	return ip_vs_in(ops->hooknum, skb, AF_INET);  }  #ifdef CONFIG_IP_VS_IPV6 @@ -1609,12 +1752,12 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,   *	Schedule and forward packets from remote clients   */  static unsigned int -ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in,  		      const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  { -	return ip_vs_in(hooknum, skb, AF_INET6); +	return ip_vs_in(ops->hooknum, skb, AF_INET6);  }  /* @@ -1622,17 +1765,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,   *	Schedule and forward packets from local clients   */  static unsigned int -ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		     const struct net_device *in, const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  { -	unsigned int verdict; - -	/* 
Disable BH in LOCAL_OUT until all places are fixed */ -	local_bh_disable(); -	verdict = ip_vs_in(hooknum, skb, AF_INET6); -	local_bh_enable(); -	return verdict; +	return ip_vs_in(ops->hooknum, skb, AF_INET6);  }  #endif @@ -1648,30 +1785,48 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,   *      and send them to ip_vs_in_icmp.   */  static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,  		   const struct net_device *in, const struct net_device *out,  		   int (*okfn)(struct sk_buff *))  {  	int r; +	struct net *net; +	struct netns_ipvs *ipvs;  	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)  		return NF_ACCEPT; -	return ip_vs_in_icmp(skb, &r, hooknum); +	/* ipvs enabled in this netns ? */ +	net = skb_net(skb); +	ipvs = net_ipvs(net); +	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) +		return NF_ACCEPT; + +	return ip_vs_in_icmp(skb, &r, ops->hooknum);  }  #ifdef CONFIG_IP_VS_IPV6  static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in, const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  {  	int r; +	struct net *net; +	struct netns_ipvs *ipvs; +	struct ip_vs_iphdr iphdr; + +	ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); +	if (iphdr.protocol != IPPROTO_ICMPV6) +		return NF_ACCEPT; -	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) +	/* ipvs enabled in this netns ? */ +	net = skb_net(skb); +	ipvs = net_ipvs(net); +	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))  		return NF_ACCEPT; -	return ip_vs_in_icmp_v6(skb, &r, hooknum); +	return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);  }  #endif @@ -1681,9 +1836,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  	{  		.hook		= ip_vs_reply4,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_LOCAL_IN, -		.priority	= 99, +		.priority	= NF_IP_PRI_NAT_SRC - 2,  	},  	/* After packet filtering, forward packet through VS/DR, VS/TUN,  	 * or VS/NAT(change destination), so that filtering rules can be @@ -1691,32 +1846,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  	{  		.hook		= ip_vs_remote_request4,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_LOCAL_IN, -		.priority	= 101, +		.priority	= NF_IP_PRI_NAT_SRC - 1,  	},  	/* Before ip_vs_in, change source only for VS/NAT */  	{  		.hook		= ip_vs_local_reply4,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_LOCAL_OUT, -		.priority	= -99, +		.priority	= NF_IP_PRI_NAT_DST + 1,  	},  	/* After mangle, schedule and forward local requests */  	{  		.hook		= ip_vs_local_request4,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_LOCAL_OUT, -		.priority	= -98, +		.priority	= NF_IP_PRI_NAT_DST + 2,  	},  	/* After packet filtering (but before ip_vs_out_icmp), catch icmp  	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */  	{  		.hook		= ip_vs_forward_icmp,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_FORWARD,  		.priority	= 99,  	}, @@ -1724,7 +1879,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  	{  		.hook		= ip_vs_reply4,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_FORWARD,  		.priority	= 100,  	}, @@ 
-1733,9 +1888,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  	{  		.hook		= ip_vs_reply6,  		.owner		= THIS_MODULE, -		.pf		= PF_INET6, +		.pf		= NFPROTO_IPV6,  		.hooknum	= NF_INET_LOCAL_IN, -		.priority	= 99, +		.priority	= NF_IP6_PRI_NAT_SRC - 2,  	},  	/* After packet filtering, forward packet through VS/DR, VS/TUN,  	 * or VS/NAT(change destination), so that filtering rules can be @@ -1743,32 +1898,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  	{  		.hook		= ip_vs_remote_request6,  		.owner		= THIS_MODULE, -		.pf		= PF_INET6, +		.pf		= NFPROTO_IPV6,  		.hooknum	= NF_INET_LOCAL_IN, -		.priority	= 101, +		.priority	= NF_IP6_PRI_NAT_SRC - 1,  	},  	/* Before ip_vs_in, change source only for VS/NAT */  	{  		.hook		= ip_vs_local_reply6,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_LOCAL_OUT, -		.priority	= -99, +		.priority	= NF_IP6_PRI_NAT_DST + 1,  	},  	/* After mangle, schedule and forward local requests */  	{  		.hook		= ip_vs_local_request6,  		.owner		= THIS_MODULE, -		.pf		= PF_INET6, +		.pf		= NFPROTO_IPV6,  		.hooknum	= NF_INET_LOCAL_OUT, -		.priority	= -98, +		.priority	= NF_IP6_PRI_NAT_DST + 2,  	},  	/* After packet filtering (but before ip_vs_out_icmp), catch icmp  	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */  	{  		.hook		= ip_vs_forward_icmp_v6,  		.owner		= THIS_MODULE, -		.pf		= PF_INET6, +		.pf		= NFPROTO_IPV6,  		.hooknum	= NF_INET_FORWARD,  		.priority	= 99,  	}, @@ -1776,13 +1931,102 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {  	{  		.hook		= ip_vs_reply6,  		.owner		= THIS_MODULE, -		.pf		= PF_INET6, +		.pf		= NFPROTO_IPV6,  		.hooknum	= NF_INET_FORWARD,  		.priority	= 100,  	},  #endif  }; +/* + *	Initialize IP Virtual Server netns mem. 
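
Everything that used to be a file-scope global (sysctls, defense state, tables, counters) moves into struct netns_ipvs, one instance per network namespace. The pernet_operations below carry .id and .size, so the core pre-allocates and zeroes that block for each net before calling .init; a minimal self-contained example of the same registration pattern (the demo_* names are illustrative, not IPVS symbols):

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	static int demo_net_id __read_mostly;

	struct demo_pernet {
		int enable;		/* analogous to ipvs->enable */
	};

	static int __net_init demo_net_init(struct net *net)
	{
		struct demo_pernet *dp = net_generic(net, demo_net_id);

		dp->enable = 0;		/* storage arrives zeroed; init is
					 * for non-zero defaults and subobjects */
		return 0;
	}

	static void __net_exit demo_net_exit(struct net *net)
	{
		/* undo whatever demo_net_init() set up for this netns */
	}

	static struct pernet_operations demo_net_ops = {
		.init = demo_net_init,
		.exit = demo_net_exit,
		.id   = &demo_net_id,
		.size = sizeof(struct demo_pernet),
	};

register_pernet_subsys(&demo_net_ops) then instantiates one demo_pernet in every existing and future namespace, and unregister_pernet_subsys() tears them all down. IPVS additionally registers a second, exit-only pernet device ops so packet reception is switched off before the heavier per-netns state is freed.
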
+ */ +static int __net_init __ip_vs_init(struct net *net) +{ +	struct netns_ipvs *ipvs; + +	ipvs = net_generic(net, ip_vs_net_id); +	if (ipvs == NULL) +		return -ENOMEM; + +	/* Hold the beast until a service is registerd */ +	ipvs->enable = 0; +	ipvs->net = net; +	/* Counters used for creating unique names */ +	ipvs->gen = atomic_read(&ipvs_netns_cnt); +	atomic_inc(&ipvs_netns_cnt); +	net->ipvs = ipvs; + +	if (ip_vs_estimator_net_init(net) < 0) +		goto estimator_fail; + +	if (ip_vs_control_net_init(net) < 0) +		goto control_fail; + +	if (ip_vs_protocol_net_init(net) < 0) +		goto protocol_fail; + +	if (ip_vs_app_net_init(net) < 0) +		goto app_fail; + +	if (ip_vs_conn_net_init(net) < 0) +		goto conn_fail; +	if (ip_vs_sync_net_init(net) < 0) +		goto sync_fail; + +	printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", +			 sizeof(struct netns_ipvs), ipvs->gen); +	return 0; +/* + * Error handling + */ + +sync_fail: +	ip_vs_conn_net_cleanup(net); +conn_fail: +	ip_vs_app_net_cleanup(net); +app_fail: +	ip_vs_protocol_net_cleanup(net); +protocol_fail: +	ip_vs_control_net_cleanup(net); +control_fail: +	ip_vs_estimator_net_cleanup(net); +estimator_fail: +	net->ipvs = NULL; +	return -ENOMEM; +} + +static void __net_exit __ip_vs_cleanup(struct net *net) +{ +	ip_vs_service_net_cleanup(net);	/* ip_vs_flush() with locks */ +	ip_vs_conn_net_cleanup(net); +	ip_vs_app_net_cleanup(net); +	ip_vs_protocol_net_cleanup(net); +	ip_vs_control_net_cleanup(net); +	ip_vs_estimator_net_cleanup(net); +	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); +	net->ipvs = NULL; +} + +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ +	EnterFunction(2); +	net_ipvs(net)->enable = 0;	/* Disable packet reception */ +	smp_wmb(); +	ip_vs_sync_net_cleanup(net); +	LeaveFunction(2); +} + +static struct pernet_operations ipvs_core_ops = { +	.init = __ip_vs_init, +	.exit = __ip_vs_cleanup, +	.id   = &ip_vs_net_id, +	.size = sizeof(struct netns_ipvs), +}; + +static struct pernet_operations ipvs_core_dev_ops = { +	.exit = __ip_vs_dev_cleanup, +};  /*   *	Initialize IP Virtual Server @@ -1791,57 +2035,68 @@ static int __init ip_vs_init(void)  {  	int ret; -	ip_vs_estimator_init(); -  	ret = ip_vs_control_init();  	if (ret < 0) {  		pr_err("can't setup control.\n"); -		goto cleanup_estimator; +		goto exit;  	}  	ip_vs_protocol_init(); -	ret = ip_vs_app_init(); -	if (ret < 0) { -		pr_err("can't setup application helper.\n"); -		goto cleanup_protocol; -	} -  	ret = ip_vs_conn_init();  	if (ret < 0) {  		pr_err("can't setup connection table.\n"); -		goto cleanup_app; +		goto cleanup_protocol;  	} +	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */ +	if (ret < 0) +		goto cleanup_conn; + +	ret = register_pernet_device(&ipvs_core_dev_ops); +	if (ret < 0) +		goto cleanup_sub; +  	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));  	if (ret < 0) {  		pr_err("can't register hooks.\n"); -		goto cleanup_conn; +		goto cleanup_dev; +	} + +	ret = ip_vs_register_nl_ioctl(); +	if (ret < 0) { +		pr_err("can't register netlink/ioctl.\n"); +		goto cleanup_hooks;  	}  	pr_info("ipvs loaded.\n"); +  	return ret; -  cleanup_conn: +cleanup_hooks: +	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); +cleanup_dev: +	unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: +	unregister_pernet_subsys(&ipvs_core_ops); +cleanup_conn:  	ip_vs_conn_cleanup(); -  cleanup_app: -	ip_vs_app_cleanup(); -  cleanup_protocol: +cleanup_protocol:  	ip_vs_protocol_cleanup();  	ip_vs_control_cleanup(); -  
cleanup_estimator: -	ip_vs_estimator_cleanup(); +exit:  	return ret;  }  static void __exit ip_vs_cleanup(void)  { +	ip_vs_unregister_nl_ioctl();  	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); +	unregister_pernet_device(&ipvs_core_dev_ops); +	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */  	ip_vs_conn_cleanup(); -	ip_vs_app_cleanup();  	ip_vs_protocol_cleanup();  	ip_vs_control_cleanup(); -	ip_vs_estimator_cleanup();  	pr_info("ipvs unloaded.\n");  } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c6f29363922..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -38,6 +38,7 @@  #include <linux/mutex.h>  #include <net/net_namespace.h> +#include <linux/nsproxy.h>  #include <net/ip.h>  #ifdef CONFIG_IP_VS_IPV6  #include <net/ipv6.h> @@ -54,45 +55,7 @@  /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */  static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_SPINLOCK(ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; -  /* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; -#ifdef CONFIG_IP_VS_NFCT -int sysctl_ip_vs_conntrack; -#endif -int sysctl_ip_vs_snat_reroute = 1; -  #ifdef CONFIG_IP_VS_DEBUG  static int sysctl_ip_vs_debug_level = 0; @@ -103,29 +66,35 @@ int ip_vs_get_debug_level(void)  }  #endif + +/*  Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); + +  #ifdef CONFIG_IP_VS_IPV6  /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
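
Both __ip_vs_init() and the reworked ip_vs_init() above follow the standard kernel unwind ladder: each successfully completed step gains a label, and a failure at step N jumps to the label that undoes steps N-1 back to 1 in reverse order. The skeleton, with illustrative step names:

	static int __init demo_init(void)
	{
		int ret;

		ret = step_a();			/* e.g. control init */
		if (ret < 0)
			goto out;
		ret = step_b();			/* e.g. conn tables */
		if (ret < 0)
			goto undo_a;
		ret = step_c();			/* e.g. register hooks */
		if (ret < 0)
			goto undo_b;
		return 0;

	undo_b:
		undo_step_b();
	undo_a:
		undo_step_a();
	out:
		return ret;
	}
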
*/ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +static bool __ip_vs_addr_is_local_v6(struct net *net, +				     const struct in6_addr *addr)  { -	struct rt6_info *rt; -	struct flowi fl = { -		.oif = 0, -		.fl6_dst = *addr, -		.fl6_src = { .s6_addr32 = {0, 0, 0, 0} }, +	struct flowi6 fl6 = { +		.daddr = *addr,  	}; +	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); +	bool is_local; -	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); -	if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) -			return 1; +	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); -	return 0; +	dst_release(dst); +	return is_local;  }  #endif + +#ifdef CONFIG_SYSCTL  /*   *	update_defense_level is called from keventd and from sysctl,   *	so it needs to protect itself from softirqs   */ -static void update_defense_level(void) +static void update_defense_level(struct netns_ipvs *ipvs)  {  	struct sysinfo i;  	static int old_secure_tcp = 0; @@ -141,73 +110,73 @@ static void update_defense_level(void)  	/* si_swapinfo(&i); */  	/* availmem = availmem - (i.totalswap - i.freeswap); */ -	nomem = (availmem < sysctl_ip_vs_amemthresh); +	nomem = (availmem < ipvs->sysctl_amemthresh);  	local_bh_disable();  	/* drop_entry */ -	spin_lock(&__ip_vs_dropentry_lock); -	switch (sysctl_ip_vs_drop_entry) { +	spin_lock(&ipvs->dropentry_lock); +	switch (ipvs->sysctl_drop_entry) {  	case 0: -		atomic_set(&ip_vs_dropentry, 0); +		atomic_set(&ipvs->dropentry, 0);  		break;  	case 1:  		if (nomem) { -			atomic_set(&ip_vs_dropentry, 1); -			sysctl_ip_vs_drop_entry = 2; +			atomic_set(&ipvs->dropentry, 1); +			ipvs->sysctl_drop_entry = 2;  		} else { -			atomic_set(&ip_vs_dropentry, 0); +			atomic_set(&ipvs->dropentry, 0);  		}  		break;  	case 2:  		if (nomem) { -			atomic_set(&ip_vs_dropentry, 1); +			atomic_set(&ipvs->dropentry, 1);  		} else { -			atomic_set(&ip_vs_dropentry, 0); -			sysctl_ip_vs_drop_entry = 1; +			atomic_set(&ipvs->dropentry, 0); +			ipvs->sysctl_drop_entry = 1;  		};  		break;  	case 3: -		atomic_set(&ip_vs_dropentry, 1); +		atomic_set(&ipvs->dropentry, 1);  		break;  	} -	spin_unlock(&__ip_vs_dropentry_lock); +	spin_unlock(&ipvs->dropentry_lock);  	/* drop_packet */ -	spin_lock(&__ip_vs_droppacket_lock); -	switch (sysctl_ip_vs_drop_packet) { +	spin_lock(&ipvs->droppacket_lock); +	switch (ipvs->sysctl_drop_packet) {  	case 0: -		ip_vs_drop_rate = 0; +		ipvs->drop_rate = 0;  		break;  	case 1:  		if (nomem) { -			ip_vs_drop_rate = ip_vs_drop_counter -				= sysctl_ip_vs_amemthresh / -				(sysctl_ip_vs_amemthresh-availmem); -			sysctl_ip_vs_drop_packet = 2; +			ipvs->drop_rate = ipvs->drop_counter +				= ipvs->sysctl_amemthresh / +				(ipvs->sysctl_amemthresh-availmem); +			ipvs->sysctl_drop_packet = 2;  		} else { -			ip_vs_drop_rate = 0; +			ipvs->drop_rate = 0;  		}  		break;  	case 2:  		if (nomem) { -			ip_vs_drop_rate = ip_vs_drop_counter -				= sysctl_ip_vs_amemthresh / -				(sysctl_ip_vs_amemthresh-availmem); +			ipvs->drop_rate = ipvs->drop_counter +				= ipvs->sysctl_amemthresh / +				(ipvs->sysctl_amemthresh-availmem);  		} else { -			ip_vs_drop_rate = 0; -			sysctl_ip_vs_drop_packet = 1; +			ipvs->drop_rate = 0; +			ipvs->sysctl_drop_packet = 1;  		}  		break;  	case 3: -		ip_vs_drop_rate = sysctl_ip_vs_am_droprate; +		ipvs->drop_rate = ipvs->sysctl_am_droprate;  		break;  	} -	spin_unlock(&__ip_vs_droppacket_lock); +	spin_unlock(&ipvs->droppacket_lock);  	/* secure_tcp */ -	spin_lock(&ip_vs_securetcp_lock); -	switch 
(sysctl_ip_vs_secure_tcp) { +	spin_lock(&ipvs->securetcp_lock); +	switch (ipvs->sysctl_secure_tcp) {  	case 0:  		if (old_secure_tcp >= 2)  			to_change = 0; @@ -216,7 +185,7 @@ static void update_defense_level(void)  		if (nomem) {  			if (old_secure_tcp < 2)  				to_change = 1; -			sysctl_ip_vs_secure_tcp = 2; +			ipvs->sysctl_secure_tcp = 2;  		} else {  			if (old_secure_tcp >= 2)  				to_change = 0; @@ -229,7 +198,7 @@ static void update_defense_level(void)  		} else {  			if (old_secure_tcp >= 2)  				to_change = 0; -			sysctl_ip_vs_secure_tcp = 1; +			ipvs->sysctl_secure_tcp = 1;  		}  		break;  	case 3: @@ -237,10 +206,11 @@ static void update_defense_level(void)  			to_change = 1;  		break;  	} -	old_secure_tcp = sysctl_ip_vs_secure_tcp; +	old_secure_tcp = ipvs->sysctl_secure_tcp;  	if (to_change >= 0) -		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); -	spin_unlock(&ip_vs_securetcp_lock); +		ip_vs_protocol_timeout_change(ipvs, +					      ipvs->sysctl_secure_tcp > 1); +	spin_unlock(&ipvs->securetcp_lock);  	local_bh_enable();  } @@ -250,17 +220,18 @@ static void update_defense_level(void)   *	Timer for checking the defense   */  #define DEFENSE_TIMER_PERIOD	1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);  static void defense_work_handler(struct work_struct *work)  { -	update_defense_level(); -	if (atomic_read(&ip_vs_dropentry)) -		ip_vs_random_dropentry(); +	struct netns_ipvs *ipvs = +		container_of(work, struct netns_ipvs, defense_work.work); -	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +	update_defense_level(ipvs); +	if (atomic_read(&ipvs->dropentry)) +		ip_vs_random_dropentry(ipvs->net); +	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);  } +#endif  int  ip_vs_use_count_inc(void) @@ -283,67 +254,50 @@ ip_vs_use_count_dec(void)  #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)  /* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];  /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - *	Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - *	Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - *	FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];  /*   *	Returns hash value for virtual service   */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, -		  __be16 port) +static inline unsigned int +ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, +		  const union nf_inet_addr *addr, __be16 port)  { -	register unsigned porth = ntohs(port); +	register unsigned int porth = ntohs(port);  	__be32 addr_fold = addr->ip; +	__u32 ahash;  #ifdef CONFIG_IP_VS_IPV6  	if (af == AF_INET6)  		addr_fold = addr->ip6[0]^addr->ip6[1]^  			    addr->ip6[2]^addr->ip6[3];  #endif +	ahash = ntohl(addr_fold); +	ahash ^= ((size_t) net >> 8); -	return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) -		& IP_VS_SVC_TAB_MASK; +	return (proto ^ ahash ^ (porth >> 
IP_VS_SVC_TAB_BITS) ^ porth) & +	       IP_VS_SVC_TAB_MASK;  }  /*   *	Returns hash value of fwmark for virtual service lookup   */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)  { -	return fwmark & IP_VS_SVC_TAB_MASK; +	return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;  }  /* - *	Hashes a service in the ip_vs_svc_table by <proto,addr,port> + *	Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>   *	or in the ip_vs_svc_fwm_table by fwmark.   *	Should be called with locked tables.   */  static int ip_vs_svc_hash(struct ip_vs_service *svc)  { -	unsigned hash; +	unsigned int hash;  	if (svc->flags & IP_VS_SVC_F_HASHED) {  		pr_err("%s(): request for already hashed, called from %pF\n", @@ -353,17 +307,17 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)  	if (svc->fwmark == 0) {  		/* -		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table +		 *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table  		 */ -		hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, -					 svc->port); -		list_add(&svc->s_list, &ip_vs_svc_table[hash]); +		hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, +					 &svc->addr, svc->port); +		hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);  	} else {  		/* -		 *  Hash it by fwmark in ip_vs_svc_fwm_table +		 *  Hash it by fwmark in svc_fwm_table  		 */ -		hash = ip_vs_svc_fwm_hashkey(svc->fwmark); -		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); +		hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); +		hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);  	}  	svc->flags |= IP_VS_SVC_F_HASHED; @@ -374,7 +328,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)  /* - *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + *	Unhashes a service from svc_table / svc_fwm_table.   *	Should be called with locked tables.   */  static int ip_vs_svc_unhash(struct ip_vs_service *svc) @@ -386,11 +340,11 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)  	}  	if (svc->fwmark == 0) { -		/* Remove it from the ip_vs_svc_table table */ -		list_del(&svc->s_list); +		/* Remove it from the svc_table table */ +		hlist_del_rcu(&svc->s_list);  	} else { -		/* Remove it from the ip_vs_svc_fwm_table table */ -		list_del(&svc->f_list); +		/* Remove it from the svc_fwm_table table */ +		hlist_del_rcu(&svc->f_list);  	}  	svc->flags &= ~IP_VS_SVC_F_HASHED; @@ -400,23 +354,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)  /* - *	Get service by {proto,addr,port} in the service table. + *	Get service by {netns, proto,addr,port} in the service table.   
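
Since the service hash tables stay global but now hold entries from every namespace, the struct net pointer is folded into the bucket index; the >> 8 discards low pointer bits that are identical for all slab-allocated struct net instances. A standalone model of the key computation, assuming the usual 8-bit table size of IP_VS_SVC_TAB_BITS:

	#include <arpa/inet.h>
	#include <stdint.h>

	#define SVC_TAB_BITS 8			/* as IP_VS_SVC_TAB_BITS */
	#define SVC_TAB_MASK ((1u << SVC_TAB_BITS) - 1)

	/* Model of ip_vs_svc_hashkey(): protocol, folded address, port and
	 * a per-namespace cookie all perturb the bucket index.  'net' is
	 * the struct net address passed as an integer. */
	static unsigned int svc_hashkey(unsigned int proto, uint32_t addr_fold,
					uint16_t port_net, uintptr_t net)
	{
		uint32_t ahash = ntohl(addr_fold) ^ (uint32_t)(net >> 8);
		uint16_t porth = ntohs(port_net);

		return (proto ^ ahash ^ (porth >> SVC_TAB_BITS) ^ porth)
			& SVC_TAB_MASK;
	}
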
*/  static inline struct ip_vs_service * -__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, -		    __be16 vport) +__ip_vs_service_find(struct net *net, int af, __u16 protocol, +		     const union nf_inet_addr *vaddr, __be16 vport)  { -	unsigned hash; +	unsigned int hash;  	struct ip_vs_service *svc;  	/* Check for "full" addressed entries */ -	hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); +	hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); -	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ +	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {  		if ((svc->af == af)  		    && ip_vs_addr_equal(af, &svc->addr, vaddr)  		    && (svc->port == vport) -		    && (svc->protocol == protocol)) { +		    && (svc->protocol == protocol) +		    && net_eq(svc->net, net)) {  			/* HIT */  			return svc;  		} @@ -430,16 +385,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,   *	Get service by {fwmark} in the service table.   */  static inline struct ip_vs_service * -__ip_vs_svc_fwm_find(int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)  { -	unsigned hash; +	unsigned int hash;  	struct ip_vs_service *svc;  	/* Check for fwmark addressed entries */ -	hash = ip_vs_svc_fwm_hashkey(fwmark); +	hash = ip_vs_svc_fwm_hashkey(net, fwmark); -	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { -		if (svc->fwmark == fwmark && svc->af == af) { +	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { +		if (svc->fwmark == fwmark && svc->af == af +		    && net_eq(svc->net, net)) {  			/* HIT */  			return svc;  		} @@ -448,50 +404,49 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)  	return NULL;  } +/* Find service, called under RCU lock */  struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, -		  const union nf_inet_addr *vaddr, __be16 vport) +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, +		   const union nf_inet_addr *vaddr, __be16 vport)  {  	struct ip_vs_service *svc; - -	read_lock(&__ip_vs_svc_lock); +	struct netns_ipvs *ipvs = net_ipvs(net);  	/*  	 *	Check the table hashed by fwmark first  	 */ -	if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark))) -		goto out; +	if (fwmark) { +		svc = __ip_vs_svc_fwm_find(net, af, fwmark); +		if (svc) +			goto out; +	}  	/*  	 *	Check the table hashed by <protocol,addr,port>  	 *	for "full" addressed entries  	 */ -	svc = __ip_vs_service_find(af, protocol, vaddr, vport); +	svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);  	if (svc == NULL  	    && protocol == IPPROTO_TCP -	    && atomic_read(&ip_vs_ftpsvc_counter) +	    && atomic_read(&ipvs->ftpsvc_counter)  	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {  		/*  		 * Check if ftp service entry exists, the packet  		 * might belong to FTP data connections.  		 
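
ip_vs_service_find() resolves a packet to a virtual service with a fixed fallback order: the fwmark table first, then the exact <protocol, addr, port> entry, then the FTP control port (so passive-FTP data connections arriving on arbitrary high ports still map to the FTP service), and finally port 0 as the catch-all. The policy in isolation, with svc_fwm_find()/svc_find() standing in for the two hashed lookups:

	#include <stddef.h>
	#include <stdint.h>

	struct svc;				/* opaque service handle */
	/* stand-ins for __ip_vs_svc_fwm_find()/__ip_vs_service_find() */
	extern struct svc *svc_fwm_find(uint32_t fwmark);
	extern struct svc *svc_find(uint16_t proto, uint32_t vaddr,
				    uint16_t vport);
	extern int ftpsvc_count, nullsvc_count;	/* like ipvs->*_counter */

	static struct svc *find_service(uint32_t fwmark, uint16_t proto,
					uint32_t vaddr, uint16_t vport)
	{
		struct svc *svc;

		if (fwmark && (svc = svc_fwm_find(fwmark)))
			return svc;
		if ((svc = svc_find(proto, vaddr, vport)))
			return svc;
		if (proto == 6 /* TCP */ && ftpsvc_count &&
		    (svc = svc_find(proto, vaddr, 21)))	/* FTP control */
			return svc;
		return nullsvc_count ? svc_find(proto, vaddr, 0) : NULL;
	}

The real code additionally requires the port to be FTPDATA or an unprivileged port before trying the FTP fallback, and the two counters let both fallbacks be skipped entirely when no such service is configured.
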
*/ -		svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT); +		svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);  	}  	if (svc == NULL -	    && atomic_read(&ip_vs_nullsvc_counter)) { +	    && atomic_read(&ipvs->nullsvc_counter)) {  		/*  		 * Check if the catch-all port (port zero) exists  		 */ -		svc = __ip_vs_service_find(af, protocol, vaddr, 0); +		svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);  	}    out: -	if (svc) -		atomic_inc(&svc->usecnt); -	read_unlock(&__ip_vs_svc_lock); -  	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",  		      fwmark, ip_vs_proto_name(protocol),  		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), @@ -505,21 +460,35 @@ static inline void  __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)  {  	atomic_inc(&svc->refcnt); -	dest->svc = svc; +	rcu_assign_pointer(dest->svc, svc);  } -static void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) +static void ip_vs_service_free(struct ip_vs_service *svc) +{ +	if (svc->stats.cpustats) +		free_percpu(svc->stats.cpustats); +	kfree(svc); +} + +static void ip_vs_service_rcu_free(struct rcu_head *head)  { -	struct ip_vs_service *svc = dest->svc; +	struct ip_vs_service *svc; -	dest->svc = NULL; +	svc = container_of(head, struct ip_vs_service, rcu_head); +	ip_vs_service_free(svc); +} + +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{  	if (atomic_dec_and_test(&svc->refcnt)) { -		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", +		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",  			      svc->fwmark,  			      IP_VS_DBG_ADDR(svc->af, &svc->addr), -			      ntohs(svc->port), atomic_read(&svc->usecnt)); -		kfree(svc); +			      ntohs(svc->port)); +		if (do_delay) +			call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); +		else +			ip_vs_service_free(svc);  	}  } @@ -527,11 +496,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)  /*   *	Returns hash value for real service   */ -static inline unsigned ip_vs_rs_hashkey(int af, +static inline unsigned int ip_vs_rs_hashkey(int af,  					    const union nf_inet_addr *addr,  					    __be16 port)  { -	register unsigned porth = ntohs(port); +	register unsigned int porth = ntohs(port);  	__be32 addr_fold = addr->ip;  #ifdef CONFIG_IP_VS_IPV6 @@ -544,17 +513,13 @@ static inline unsigned ip_vs_rs_hashkey(int af,  		& IP_VS_RTAB_MASK;  } -/* - *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. - *	should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)  { -	unsigned hash; +	unsigned int hash; -	if (!list_empty(&dest->d_list)) { -		return 0; -	} +	if (dest->in_rs_table) +		return;  	/*  	 *	Hash by proto,addr,port, @@ -562,64 +527,51 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)  	 */  	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); -	list_add(&dest->d_list, &ip_vs_rtable[hash]); - -	return 1; +	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); +	dest->in_rs_table = 1;  } -/* - *	UNhashes ip_vs_dest from ip_vs_rtable. - *	should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest)  {  	/* -	 * Remove it from the ip_vs_rtable table. +	 * Remove it from the rs_table table.  	 
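
The service and real-server chains are now traversed under RCU instead of the global rwlocks deleted above, so an unlinked object must survive until every pre-existing reader has left its read-side critical section. __ip_vs_svc_put() achieves this with an rcu_head embedded in the object and a call_rcu() callback that performs the actual free; the pattern in isolation (demo_obj is illustrative):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct demo_obj {
		int payload;
		struct rcu_head rcu_head;	/* storage for deferred free */
	};

	static void demo_obj_rcu_free(struct rcu_head *head)
	{
		/* Runs only after a grace period: no reader that entered
		 * rcu_read_lock() before the unlink can still be running. */
		kfree(container_of(head, struct demo_obj, rcu_head));
	}

	static void demo_obj_release(struct demo_obj *obj)
	{
		/* caller has already unlinked obj, e.g. hlist_del_rcu() */
		call_rcu(&obj->rcu_head, demo_obj_rcu_free);
	}
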
*/ -	if (!list_empty(&dest->d_list)) { -		list_del(&dest->d_list); -		INIT_LIST_HEAD(&dest->d_list); +	if (dest->in_rs_table) { +		hlist_del_rcu(&dest->d_list); +		dest->in_rs_table = 0;  	} - -	return 1;  } -/* - *	Lookup real service by <proto,addr,port> in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, -			  const union nf_inet_addr *daddr, -			  __be16 dport) +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, +			    const union nf_inet_addr *daddr, __be16 dport)  { -	unsigned hash; +	struct netns_ipvs *ipvs = net_ipvs(net); +	unsigned int hash;  	struct ip_vs_dest *dest; -	/* -	 *	Check for "full" addressed entries -	 *	Return the first found entry -	 */ +	/* Check for "full" addressed entries */  	hash = ip_vs_rs_hashkey(af, daddr, dport); -	read_lock(&__ip_vs_rs_lock); -	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { -		if ((dest->af == af) -		    && ip_vs_addr_equal(af, &dest->addr, daddr) -		    && (dest->port == dport) -		    && ((dest->protocol == protocol) || -			dest->vfwmark)) { +	rcu_read_lock(); +	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { +		if (dest->port == dport && +		    dest->af == af && +		    ip_vs_addr_equal(af, &dest->addr, daddr) && +		    (dest->protocol == protocol || dest->vfwmark)) {  			/* HIT */ -			read_unlock(&__ip_vs_rs_lock); -			return dest; +			rcu_read_unlock(); +			return true;  		}  	} -	read_unlock(&__ip_vs_rs_lock); +	rcu_read_unlock(); -	return NULL; +	return false;  } -/* - *	Lookup destination by {addr,port} in the given service +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock.   */  static struct ip_vs_dest *  ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, @@ -630,7 +582,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,  	/*  	 * Find the destination for the given service  	 */ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if ((dest->af == svc->af)  		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)  		    && (dest->port == dport)) { @@ -644,32 +596,56 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,  /*   * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in + * Created to be used in ip_vs_process_message() in   * the backup synchronization daemon. It finds the   * destination to be bound to the received connection   * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. + * Called under RCU lock, no refcnt is returned.   
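
The read side pairs with that deferred free: a lookup such as ip_vs_has_real_service() takes only rcu_read_lock(), walks the chain with the _rcu list primitives, and must not dereference the result after rcu_read_unlock() unless it took a reference first. A compact sketch of the same shape (demo_* names are illustrative):

	#include <linux/rculist.h>
	#include <linux/types.h>

	struct demo_ent {
		struct hlist_node node;
		__be16 port;
	};

	#define DEMO_TAB_SIZE 16
	static struct hlist_head demo_table[DEMO_TAB_SIZE];

	/* Lock-free presence test: writers use hlist_add_head_rcu()/
	 * hlist_del_rcu() and free via call_rcu(), so readers need only
	 * the RCU read-side critical section. */
	static bool demo_present(unsigned int hash, __be16 port)
	{
		struct demo_ent *e;
		bool found = false;

		rcu_read_lock();
		hlist_for_each_entry_rcu(e, &demo_table[hash % DEMO_TAB_SIZE],
					 node) {
			if (e->port == port) {
				/* 'e' itself must not be used after
				 * rcu_read_unlock() without a refcount */
				found = true;
				break;
			}
		}
		rcu_read_unlock();
		return found;
	}
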
*/ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, +struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af, +				   const union nf_inet_addr *daddr,  				   __be16 dport,  				   const union nf_inet_addr *vaddr, -				   __be16 vport, __u16 protocol) +				   __be16 vport, __u16 protocol, __u32 fwmark, +				   __u32 flags)  {  	struct ip_vs_dest *dest;  	struct ip_vs_service *svc; +	__be16 port = dport; -	svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); +	svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);  	if (!svc)  		return NULL; -	dest = ip_vs_lookup_dest(svc, daddr, dport); -	if (dest) -		atomic_inc(&dest->refcnt); -	ip_vs_service_put(svc); +	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) +		port = 0; +	dest = ip_vs_lookup_dest(svc, daddr, port); +	if (!dest) +		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);  	return dest;  } +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ +	struct ip_vs_dest_dst *dest_dst = container_of(head, +						       struct ip_vs_dest_dst, +						       rcu_head); + +	dst_release(dest_dst->dst_cache); +	kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ +	struct ip_vs_dest_dst *old; + +	old = rcu_dereference_protected(dest->dest_dst, 1); +	if (old) { +		RCU_INIT_POINTER(dest->dest_dst, NULL); +		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); +	} +} +  /*   *  Lookup dest by {svc,addr,port} in the destination trash.   *  The destination trash is used to hold the destinations that are removed @@ -684,12 +660,14 @@ static struct ip_vs_dest *  ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,  		     __be16 dport)  { -	struct ip_vs_dest *dest, *nxt; +	struct ip_vs_dest *dest; +	struct netns_ipvs *ipvs = net_ipvs(svc->net);  	/*  	 * Find the destination in trash  	 */ -	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { +	spin_lock_bh(&ipvs->dest_trash_lock); +	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {  		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "  			      "dest->refcnt=%d\n",  			      dest->vfwmark, @@ -705,28 +683,29 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,  		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&  		      dest->vport == svc->port))) {  			/* HIT */ -			return dest; -		} - -		/* -		 * Try to purge the destination from trash if not referenced -		 */ -		if (atomic_read(&dest->refcnt) == 1) { -			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " -				      "from trash\n", -				      dest->vfwmark, -				      IP_VS_DBG_ADDR(svc->af, &dest->addr), -				      ntohs(dest->port)); -			list_del(&dest->n_list); -			ip_vs_dst_reset(dest); -			__ip_vs_unbind_svc(dest); -			kfree(dest); +			list_del(&dest->t_list); +			ip_vs_dest_hold(dest); +			goto out;  		}  	} -	return NULL; +	dest = NULL; + +out: +	spin_unlock_bh(&ipvs->dest_trash_lock); + +	return dest;  } +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ +	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + +	__ip_vs_dst_cache_reset(dest); +	__ip_vs_svc_put(svc, false); +	free_percpu(dest->stats.cpustats); +	ip_vs_dest_put_and_free(dest); +}  /*   *  Clean up all the destinations in the trash @@ -735,27 +714,54 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,   *  When the ip_vs_control_clearup is activated by ipvs module exit,   *  
the service tables must have been flushed and all the connections   *  are expired, and the refcnt of each destination in the trash must - *  be 1, so we simply release them here. + *  be 0, so we simply release them here.   */ -static void ip_vs_trash_cleanup(void) +static void ip_vs_trash_cleanup(struct net *net)  {  	struct ip_vs_dest *dest, *nxt; +	struct netns_ipvs *ipvs = net_ipvs(net); -	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { -		list_del(&dest->n_list); -		ip_vs_dst_reset(dest); -		__ip_vs_unbind_svc(dest); -		kfree(dest); +	del_timer_sync(&ipvs->dest_trash_timer); +	/* No need to use dest_trash_lock */ +	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { +		list_del(&dest->t_list); +		ip_vs_dest_free(dest);  	}  } +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c + +	spin_lock_bh(&src->lock); + +	IP_VS_SHOW_STATS_COUNTER(conns); +	IP_VS_SHOW_STATS_COUNTER(inpkts); +	IP_VS_SHOW_STATS_COUNTER(outpkts); +	IP_VS_SHOW_STATS_COUNTER(inbytes); +	IP_VS_SHOW_STATS_COUNTER(outbytes); + +	ip_vs_read_estimator(dst, src); + +	spin_unlock_bh(&src->lock); +}  static void  ip_vs_zero_stats(struct ip_vs_stats *stats)  {  	spin_lock_bh(&stats->lock); -	memset(&stats->ustats, 0, sizeof(stats->ustats)); +	/* get current counters as zero point, rates are zeroed */ + +#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c + +	IP_VS_ZERO_STATS_COUNTER(conns); +	IP_VS_ZERO_STATS_COUNTER(inpkts); +	IP_VS_ZERO_STATS_COUNTER(outpkts); +	IP_VS_ZERO_STATS_COUNTER(inbytes); +	IP_VS_ZERO_STATS_COUNTER(outbytes); +  	ip_vs_zero_estimator(stats);  	spin_unlock_bh(&stats->lock); @@ -768,6 +774,9 @@ static void  __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,  		    struct ip_vs_dest_user_kern *udest, int add)  { +	struct netns_ipvs *ipvs = net_ipvs(svc->net); +	struct ip_vs_service *old_svc; +	struct ip_vs_scheduler *sched;  	int conn_flags;  	/* set the weight and the flags */ @@ -780,23 +789,22 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,  		conn_flags |= IP_VS_CONN_F_NOOUTPUT;  	} else {  		/* -		 *    Put the real service in ip_vs_rtable if not present. +		 *    Put the real service in rs_table if not present.  		 *    For now only for NAT!  		 
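[Note] ip_vs_zero_stats() above no longer memsets the counters; it snapshots the current values into ustats0, and ip_vs_copy_stats() reports the difference. The hot path stays increment-only, "zeroing" is cheap, and unsigned subtraction survives counter wraparound. A small sketch of the same idea, with a hypothetical single-counter struct:

    #include <stdio.h>

    struct stats {
        unsigned long long conns;   /* monotonically increasing */
        unsigned long long conns0;  /* snapshot taken at "zero" time */
    };

    static void zero_stats(struct stats *s)
    {
        s->conns0 = s->conns;       /* record current value as zero point */
    }

    static unsigned long long show_conns(const struct stats *s)
    {
        return s->conns - s->conns0;    /* wrap-safe for unsigned types */
    }

    int main(void)
    {
        struct stats s = { 0, 0 };

        s.conns = 41;
        zero_stats(&s);
        s.conns += 7;
        printf("%llu\n", show_conns(&s));   /* prints 7 */
        return 0;
    }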
*/ -		write_lock_bh(&__ip_vs_rs_lock); -		ip_vs_rs_hash(dest); -		write_unlock_bh(&__ip_vs_rs_lock); +		ip_vs_rs_hash(ipvs, dest);  	}  	atomic_set(&dest->conn_flags, conn_flags);  	/* bind the service */ -	if (!dest->svc) { +	old_svc = rcu_dereference_protected(dest->svc, 1); +	if (!old_svc) {  		__ip_vs_bind_svc(dest, svc);  	} else { -		if (dest->svc != svc) { -			__ip_vs_unbind_svc(dest); +		if (old_svc != svc) {  			ip_vs_zero_stats(&dest->stats);  			__ip_vs_bind_svc(dest, svc); +			__ip_vs_svc_put(old_svc, true);  		}  	} @@ -808,28 +816,21 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,  	dest->u_threshold = udest->u_threshold;  	dest->l_threshold = udest->l_threshold; -	spin_lock(&dest->dst_lock); -	ip_vs_dst_reset(dest); -	spin_unlock(&dest->dst_lock); - -	if (add) -		ip_vs_new_estimator(&dest->stats); - -	write_lock_bh(&__ip_vs_svc_lock); - -	/* Wait until all other svc users go away */ -	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); +	spin_lock_bh(&dest->dst_lock); +	__ip_vs_dst_cache_reset(dest); +	spin_unlock_bh(&dest->dst_lock); +	sched = rcu_dereference_protected(svc->scheduler, 1);  	if (add) { -		list_add(&dest->n_list, &svc->destinations); +		ip_vs_start_estimator(svc->net, &dest->stats); +		list_add_rcu(&dest->n_list, &svc->destinations);  		svc->num_dests++; +		if (sched->add_dest) +			sched->add_dest(svc, dest); +	} else { +		if (sched->upd_dest) +			sched->upd_dest(svc, dest);  	} - -	/* call the update_service, because server weight may be changed */ -	if (svc->scheduler->update_service) -		svc->scheduler->update_service(svc); - -	write_unlock_bh(&__ip_vs_svc_lock);  } @@ -841,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,  	       struct ip_vs_dest **dest_p)  {  	struct ip_vs_dest *dest; -	unsigned atype; +	unsigned int atype, i;  	EnterFunction(2); @@ -850,20 +851,28 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,  		atype = ipv6_addr_type(&udest->addr.in6);  		if ((!(atype & IPV6_ADDR_UNICAST) ||  			atype & IPV6_ADDR_LINKLOCAL) && -			!__ip_vs_addr_is_local_v6(&udest->addr.in6)) +			!__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))  			return -EINVAL;  	} else  #endif  	{ -		atype = inet_addr_type(&init_net, udest->addr.ip); +		atype = inet_addr_type(svc->net, udest->addr.ip);  		if (atype != RTN_LOCAL && atype != RTN_UNICAST)  			return -EINVAL;  	}  	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); -	if (dest == NULL) { -		pr_err("%s(): no memory.\n", __func__); +	if (dest == NULL)  		return -ENOMEM; + +	dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); +	if (!dest->stats.cpustats) +		goto err_alloc; + +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *ip_vs_dest_stats; +		ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); +		u64_stats_init(&ip_vs_dest_stats->syncp);  	}  	dest->af = svc->af; @@ -879,7 +888,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,  	atomic_set(&dest->persistconns, 0);  	atomic_set(&dest->refcnt, 1); -	INIT_LIST_HEAD(&dest->d_list); +	INIT_HLIST_NODE(&dest->d_list);  	spin_lock_init(&dest->dst_lock);  	spin_lock_init(&dest->stats.lock);  	__ip_vs_update_dest(svc, dest, udest, 1); @@ -888,6 +897,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,  	LeaveFunction(2);  	return 0; + +err_alloc: +	kfree(dest); +	return -ENOMEM;  } @@ -917,10 +930,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)  	
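[Note] ip_vs_new_dest() above now allocates per-CPU statistics with alloc_percpu() and initializes a u64_stats sync point for each possible CPU, so the packet path only touches CPU-local counters and readers fold them together on demand. A userspace approximation with a fixed per-"CPU" array (the kernel allocation helpers themselves are not modeled):

    #include <stdio.h>

    #define NR_CPUS 4

    struct cpu_stats { unsigned long long inpkts; };

    static struct cpu_stats percpu[NR_CPUS];

    /* Hot path: touch only the local CPU's slot, no shared cacheline. */
    static void count_packet(int cpu)
    {
        percpu[cpu].inpkts++;
    }

    /* Slow path (stats read): fold all per-CPU slots into one total. */
    static unsigned long long total_inpkts(void)
    {
        unsigned long long sum = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            sum += percpu[cpu].inpkts;
        return sum;
    }

    int main(void)
    {
        count_packet(0);
        count_packet(3);
        count_packet(3);
        printf("%llu\n", total_inpkts());   /* prints 3 */
        return 0;
    }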
ip_vs_addr_copy(svc->af, &daddr, &udest->addr); -	/* -	 * Check if the dest already exists in the list -	 */ +	/* We use function that requires RCU lock */ +	rcu_read_lock();  	dest = ip_vs_lookup_dest(svc, &daddr, dport); +	rcu_read_unlock();  	if (dest != NULL) {  		IP_VS_DBG(1, "%s(): dest already exists\n", __func__); @@ -942,11 +955,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)  			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),  			      ntohs(dest->vport)); -		/* -		 * Get the destination from the trash -		 */ -		list_del(&dest->n_list); -  		__ip_vs_update_dest(svc, dest, udest, 1);  		ret = 0;  	} else { @@ -986,10 +994,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)  	ip_vs_addr_copy(svc->af, &daddr, &udest->addr); -	/* -	 *  Lookup the destination list -	 */ +	/* We use function that requires RCU lock */ +	rcu_read_lock();  	dest = ip_vs_lookup_dest(svc, &daddr, dport); +	rcu_read_unlock();  	if (dest == NULL) {  		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); @@ -1002,48 +1010,33 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)  	return 0;  } -  /*   *	Delete a destination (must be already unlinked from the service)   */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, +			     bool cleanup)  { -	ip_vs_kill_estimator(&dest->stats); +	struct netns_ipvs *ipvs = net_ipvs(net); + +	ip_vs_stop_estimator(net, &dest->stats);  	/*  	 *  Remove it from the d-linked list with the real services.  	 */ -	write_lock_bh(&__ip_vs_rs_lock);  	ip_vs_rs_unhash(dest); -	write_unlock_bh(&__ip_vs_rs_lock); -	/* -	 *  Decrease the refcnt of the dest, and free the dest -	 *  if nobody refers to it (refcnt=0). Otherwise, throw -	 *  the destination into the trash. -	 */ -	if (atomic_dec_and_test(&dest->refcnt)) { -		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", -			      dest->vfwmark, -			      IP_VS_DBG_ADDR(dest->af, &dest->addr), -			      ntohs(dest->port)); -		ip_vs_dst_reset(dest); -		/* simply decrease svc->refcnt here, let the caller check -		   and release the service if nobody refers to it. -		   Only user context can release destination and service, -		   and only one user context can update virtual service at a -		   time, so the operation here is OK */ -		atomic_dec(&dest->svc->refcnt); -		kfree(dest); -	} else { -		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " -			      "dest->refcnt=%d\n", -			      IP_VS_DBG_ADDR(dest->af, &dest->addr), -			      ntohs(dest->port), -			      atomic_read(&dest->refcnt)); -		list_add(&dest->n_list, &ip_vs_dest_trash); -		atomic_inc(&dest->refcnt); -	} +	spin_lock_bh(&ipvs->dest_trash_lock); +	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", +		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), +		      atomic_read(&dest->refcnt)); +	if (list_empty(&ipvs->dest_trash) && !cleanup) +		mod_timer(&ipvs->dest_trash_timer, +			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); +	/* dest lives in trash without reference */ +	list_add(&dest->t_list, &ipvs->dest_trash); +	dest->idle_start = 0; +	spin_unlock_bh(&ipvs->dest_trash_lock); +	ip_vs_dest_put(dest);  } @@ -1059,14 +1052,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,  	/*  	 *  Remove it from the d-linked destination list.  	 
*/ -	list_del(&dest->n_list); +	list_del_rcu(&dest->n_list);  	svc->num_dests--; -	/* -	 *  Call the update_service function of its scheduler -	 */ -	if (svcupd && svc->scheduler->update_service) -			svc->scheduler->update_service(svc); +	if (svcupd) { +		struct ip_vs_scheduler *sched; + +		sched = rcu_dereference_protected(svc->scheduler, 1); +		if (sched->del_dest) +			sched->del_dest(svc, dest); +	}  } @@ -1081,49 +1076,75 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)  	EnterFunction(2); +	/* We use function that requires RCU lock */ +	rcu_read_lock();  	dest = ip_vs_lookup_dest(svc, &udest->addr, dport); +	rcu_read_unlock();  	if (dest == NULL) {  		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);  		return -ENOENT;  	} -	write_lock_bh(&__ip_vs_svc_lock); - -	/* -	 *	Wait until all other svc users go away. -	 */ -	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); -  	/*  	 *	Unlink dest from the service  	 */  	__ip_vs_unlink_dest(svc, dest, 1); -	write_unlock_bh(&__ip_vs_svc_lock); -  	/*  	 *	Delete the destination  	 */ -	__ip_vs_del_dest(dest); +	__ip_vs_del_dest(svc->net, dest, false);  	LeaveFunction(2);  	return 0;  } +static void ip_vs_dest_trash_expire(unsigned long data) +{ +	struct net *net = (struct net *) data; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_dest *dest, *next; +	unsigned long now = jiffies; + +	spin_lock(&ipvs->dest_trash_lock); +	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { +		if (atomic_read(&dest->refcnt) > 0) +			continue; +		if (dest->idle_start) { +			if (time_before(now, dest->idle_start + +					     IP_VS_DEST_TRASH_PERIOD)) +				continue; +		} else { +			dest->idle_start = max(1UL, now); +			continue; +		} +		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", +			      dest->vfwmark, +			      IP_VS_DBG_ADDR(dest->af, &dest->addr), +			      ntohs(dest->port)); +		list_del(&dest->t_list); +		ip_vs_dest_free(dest); +	} +	if (!list_empty(&ipvs->dest_trash)) +		mod_timer(&ipvs->dest_trash_timer, +			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); +	spin_unlock(&ipvs->dest_trash_lock); +}  /*   *	Add a service into the service hash table   */  static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,  		  struct ip_vs_service **svc_p)  { -	int ret = 0; +	int ret = 0, i;  	struct ip_vs_scheduler *sched = NULL;  	struct ip_vs_pe *pe = NULL;  	struct ip_vs_service *svc = NULL; +	struct netns_ipvs *ipvs = net_ipvs(net);  	/* increase the module use count */  	ip_vs_use_count_inc(); @@ -1137,7 +1158,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,  	}  	if (u->pe_name && *u->pe_name) { -		pe = ip_vs_pe_get(u->pe_name); +		pe = ip_vs_pe_getbyname(u->pe_name);  		if (pe == NULL) {  			pr_info("persistence engine module ip_vs_pe_%s "  				"not found\n", u->pe_name); @@ -1147,9 +1168,13 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,  	}  #ifdef CONFIG_IP_VS_IPV6 -	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { -		ret = -EINVAL; -		goto out_err; +	if (u->af == AF_INET6) { +		__u32 plen = (__force __u32) u->netmask; + +		if (plen < 1 || plen > 128) { +			ret = -EINVAL; +			goto out_err; +		}  	}  #endif @@ -1159,9 +1184,20 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,  		ret = -ENOMEM;  		goto out_err;  	} +	svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); +	if (!svc->stats.cpustats) { +		ret = -ENOMEM; +		goto out_err; +	} + +	
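[Note] ip_vs_dest_trash_expire() above frees trashed destinations in two phases: the first timer pass that sees refcnt == 0 only stamps idle_start, and a later pass frees the entry only once that stamp is older than IP_VS_DEST_TRASH_PERIOD, so a destination must stay unreferenced for a full period before release. A self-contained sketch of that state machine, using an integer clock in place of jiffies:

    #include <stdio.h>

    #define TRASH_PERIOD 120    /* stand-in for IP_VS_DEST_TRASH_PERIOD */

    struct dest {
        int refcnt;
        unsigned long idle_start;   /* 0 = not yet stamped */
        int freed;
    };

    static void trash_expire(struct dest *d, unsigned long now)
    {
        if (d->freed || d->refcnt > 0)
            return;                 /* still referenced: skip */
        if (!d->idle_start) {
            /* first idle sighting: stamp it, free on a later pass */
            d->idle_start = now ? now : 1;
            return;
        }
        if (now - d->idle_start >= TRASH_PERIOD) {
            d->freed = 1;           /* kernel would ip_vs_dest_free() here */
            printf("freed at t=%lu\n", now);
        }
    }

    int main(void)
    {
        struct dest d = { 0, 0, 0 };

        trash_expire(&d, 10);   /* stamps idle_start = 10 */
        trash_expire(&d, 60);   /* too young, kept */
        trash_expire(&d, 140);  /* 130 >= 120: freed */
        return 0;
    }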
for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *ip_vs_stats; +		ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); +		u64_stats_init(&ip_vs_stats->syncp); +	} +  	/* I'm the first user of the service */ -	atomic_set(&svc->usecnt, 0);  	atomic_set(&svc->refcnt, 0);  	svc->af = u->af; @@ -1172,9 +1208,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,  	svc->flags = u->flags;  	svc->timeout = u->timeout * HZ;  	svc->netmask = u->netmask; +	svc->net = net;  	INIT_LIST_HEAD(&svc->destinations); -	rwlock_init(&svc->sched_lock); +	spin_lock_init(&svc->sched_lock);  	spin_lock_init(&svc->stats.lock);  	/* Bind the scheduler */ @@ -1184,38 +1221,34 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,  	sched = NULL;  	/* Bind the ct retriever */ -	ip_vs_bind_pe(svc, pe); +	RCU_INIT_POINTER(svc->pe, pe);  	pe = NULL;  	/* Update the virtual service counters */  	if (svc->port == FTPPORT) -		atomic_inc(&ip_vs_ftpsvc_counter); +		atomic_inc(&ipvs->ftpsvc_counter);  	else if (svc->port == 0) -		atomic_inc(&ip_vs_nullsvc_counter); +		atomic_inc(&ipvs->nullsvc_counter); -	ip_vs_new_estimator(&svc->stats); +	ip_vs_start_estimator(net, &svc->stats);  	/* Count only IPv4 services for old get/setsockopt interface */  	if (svc->af == AF_INET) -		ip_vs_num_services++; +		ipvs->num_services++;  	/* Hash the service into the service table */ -	write_lock_bh(&__ip_vs_svc_lock);  	ip_vs_svc_hash(svc); -	write_unlock_bh(&__ip_vs_svc_lock);  	*svc_p = svc; +	/* Now there is a service - full throttle */ +	ipvs->enable = 1;  	return 0; +   out_err:  	if (svc != NULL) { -		ip_vs_unbind_scheduler(svc); -		if (svc->inc) { -			local_bh_disable(); -			ip_vs_app_inc_put(svc->inc); -			local_bh_enable(); -		} -		kfree(svc); +		ip_vs_unbind_scheduler(svc, sched); +		ip_vs_service_free(svc);  	}  	ip_vs_scheduler_put(sched);  	ip_vs_pe_put(pe); @@ -1248,7 +1281,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)  	old_sched = sched;  	if (u->pe_name && *u->pe_name) { -		pe = ip_vs_pe_get(u->pe_name); +		pe = ip_vs_pe_getbyname(u->pe_name);  		if (pe == NULL) {  			pr_info("persistence engine module ip_vs_pe_%s "  				"not found\n", u->pe_name); @@ -1259,18 +1292,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)  	}  #ifdef CONFIG_IP_VS_IPV6 -	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { -		ret = -EINVAL; -		goto out; +	if (u->af == AF_INET6) { +		__u32 plen = (__force __u32) u->netmask; + +		if (plen < 1 || plen > 128) { +			ret = -EINVAL; +			goto out; +		}  	}  #endif -	write_lock_bh(&__ip_vs_svc_lock); - -	/* -	 * Wait until all other svc users go away. 
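[Note] ip_vs_edit_service() above reverses the old bind/unbind order: the new scheduler is bound first, and the old one is released only if binding succeeded, so a failed bind can no longer leave the service schedulerless (the removed code unbound first and then tried to re-bind the old scheduler on failure, which its own comment admitted could crash if that also failed). A sketch of the pattern with a hypothetical scheduler struct:

    #include <stdio.h>

    struct sched { const char *name; };

    /* Pretend binding can fail (e.g. -ENOMEM in the kernel). */
    static int bind_scheduler(struct sched **slot, struct sched *new, int fail)
    {
        if (fail)
            return -1;
        *slot = new;
        return 0;
    }

    static void put_scheduler(struct sched *s)
    {
        if (s)
            printf("released %s\n", s->name);
    }

    static int swap_scheduler(struct sched **slot, struct sched *new, int fail)
    {
        struct sched *old = *slot;

        if (new != old) {
            /* bind the new one first ... */
            if (bind_scheduler(slot, new, fail))
                return -1;      /* old scheduler still bound: safe */
            /* ... and drop the old one only on success */
            put_scheduler(old);
        }
        return 0;
    }

    int main(void)
    {
        struct sched rr = { "rr" }, wlc = { "wlc" };
        struct sched *cur = &rr;

        swap_scheduler(&cur, &wlc, 1);  /* bind fails, "rr" stays bound */
        swap_scheduler(&cur, &wlc, 0);  /* succeeds, releases "rr" */
        printf("current: %s\n", cur->name);
        return 0;
    }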
-	 */ -	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); +	old_sched = rcu_dereference_protected(svc->scheduler, 1); +	if (sched != old_sched) { +		/* Bind the new scheduler */ +		ret = ip_vs_bind_scheduler(svc, sched); +		if (ret) { +			old_sched = sched; +			goto out; +		} +		/* Unbind the old scheduler on success */ +		ip_vs_unbind_scheduler(svc, old_sched); +	}  	/*  	 * Set the flags and timeout value @@ -1279,112 +1321,65 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)  	svc->timeout = u->timeout * HZ;  	svc->netmask = u->netmask; -	old_sched = svc->scheduler; -	if (sched != old_sched) { -		/* -		 * Unbind the old scheduler -		 */ -		if ((ret = ip_vs_unbind_scheduler(svc))) { -			old_sched = sched; -			goto out_unlock; -		} - -		/* -		 * Bind the new scheduler -		 */ -		if ((ret = ip_vs_bind_scheduler(svc, sched))) { -			/* -			 * If ip_vs_bind_scheduler fails, restore the old -			 * scheduler. -			 * The main reason of failure is out of memory. -			 * -			 * The question is if the old scheduler can be -			 * restored all the time. TODO: if it cannot be -			 * restored some time, we must delete the service, -			 * otherwise the system may crash. -			 */ -			ip_vs_bind_scheduler(svc, old_sched); -			old_sched = sched; -			goto out_unlock; -		} -	} - -	old_pe = svc->pe; -	if (pe != old_pe) { -		ip_vs_unbind_pe(svc); -		ip_vs_bind_pe(svc, pe); -	} +	old_pe = rcu_dereference_protected(svc->pe, 1); +	if (pe != old_pe) +		rcu_assign_pointer(svc->pe, pe); -  out_unlock: -	write_unlock_bh(&__ip_vs_svc_lock); -  out: +out:  	ip_vs_scheduler_put(old_sched);  	ip_vs_pe_put(old_pe);  	return ret;  } -  /*   *	Delete a service from the service list   *	- The service must be unlinked, unlocked and not referenced!   *	- We are called under _bh lock   */ -static void __ip_vs_del_service(struct ip_vs_service *svc) +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)  {  	struct ip_vs_dest *dest, *nxt;  	struct ip_vs_scheduler *old_sched;  	struct ip_vs_pe *old_pe; +	struct netns_ipvs *ipvs = net_ipvs(svc->net);  	pr_info("%s: enter\n", __func__);  	/* Count only IPv4 services for old get/setsockopt interface */  	if (svc->af == AF_INET) -		ip_vs_num_services--; +		ipvs->num_services--; -	ip_vs_kill_estimator(&svc->stats); +	ip_vs_stop_estimator(svc->net, &svc->stats);  	/* Unbind scheduler */ -	old_sched = svc->scheduler; -	ip_vs_unbind_scheduler(svc); +	old_sched = rcu_dereference_protected(svc->scheduler, 1); +	ip_vs_unbind_scheduler(svc, old_sched);  	ip_vs_scheduler_put(old_sched); -	/* Unbind persistence engine */ -	old_pe = svc->pe; -	ip_vs_unbind_pe(svc); +	/* Unbind persistence engine, keep svc->pe */ +	old_pe = rcu_dereference_protected(svc->pe, 1);  	ip_vs_pe_put(old_pe); -	/* Unbind app inc */ -	if (svc->inc) { -		ip_vs_app_inc_put(svc->inc); -		svc->inc = NULL; -	} -  	/*  	 *    Unlink the whole destination list  	 */  	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {  		__ip_vs_unlink_dest(svc, dest, 0); -		__ip_vs_del_dest(dest); +		__ip_vs_del_dest(svc->net, dest, cleanup);  	}  	/*  	 *    Update the virtual service counters  	 */  	if (svc->port == FTPPORT) -		atomic_dec(&ip_vs_ftpsvc_counter); +		atomic_dec(&ipvs->ftpsvc_counter);  	else if (svc->port == 0) -		atomic_dec(&ip_vs_nullsvc_counter); +		atomic_dec(&ipvs->nullsvc_counter);  	/*  	 *    Free the service if nobody refers to it  	 */ -	if (atomic_read(&svc->refcnt) == 0) { -		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", -			      
svc->fwmark, -			      IP_VS_DBG_ADDR(svc->af, &svc->addr), -			      ntohs(svc->port), atomic_read(&svc->usecnt)); -		kfree(svc); -	} +	__ip_vs_svc_put(svc, true);  	/* decrease the module use count */  	ip_vs_use_count_dec(); @@ -1393,23 +1388,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)  /*   * Unlink a service from list and try to delete it if its refcnt reached 0   */ -static void ip_vs_unlink_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)  { +	/* Hold svc to avoid double release from dest_trash */ +	atomic_inc(&svc->refcnt);  	/*  	 * Unhash it from the service table  	 */ -	write_lock_bh(&__ip_vs_svc_lock); -  	ip_vs_svc_unhash(svc); -	/* -	 * Wait until all the svc users go away. -	 */ -	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - -	__ip_vs_del_service(svc); - -	write_unlock_bh(&__ip_vs_svc_lock); +	__ip_vs_del_service(svc, cleanup);  }  /* @@ -1419,7 +1407,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)  {  	if (svc == NULL)  		return -EEXIST; -	ip_vs_unlink_service(svc); +	ip_vs_unlink_service(svc, false);  	return 0;  } @@ -1428,17 +1416,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc)  /*   *	Flush all the virtual services   */ -static int ip_vs_flush(void) +static int ip_vs_flush(struct net *net, bool cleanup)  {  	int idx; -	struct ip_vs_service *svc, *nxt; +	struct ip_vs_service *svc; +	struct hlist_node *n;  	/* -	 * Flush the service table hashed by <protocol,addr,port> +	 * Flush the service table hashed by <netns,protocol,addr,port>  	 */  	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { -			ip_vs_unlink_service(svc); +		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], +					  s_list) { +			if (net_eq(svc->net, net)) +				ip_vs_unlink_service(svc, cleanup);  		}  	} @@ -1446,15 +1437,97 @@ static int ip_vs_flush(void)  	 * Flush the service table hashed by fwmark  	 */  	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry_safe(svc, nxt, -					 &ip_vs_svc_fwm_table[idx], f_list) { -			ip_vs_unlink_service(svc); +		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], +					  f_list) { +			if (net_eq(svc->net, net)) +				ip_vs_unlink_service(svc, cleanup);  		}  	}  	return 0;  } +/* + *	Delete service by {netns} in the service table. 
+ *	Called by __ip_vs_cleanup() + */ +void ip_vs_service_net_cleanup(struct net *net) +{ +	EnterFunction(2); +	/* Check for "full" addressed entries */ +	mutex_lock(&__ip_vs_mutex); +	ip_vs_flush(net, true); +	mutex_unlock(&__ip_vs_mutex); +	LeaveFunction(2); +} + +/* Put all references for device (dst_cache) */ +static inline void +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) +{ +	struct ip_vs_dest_dst *dest_dst; + +	spin_lock_bh(&dest->dst_lock); +	dest_dst = rcu_dereference_protected(dest->dest_dst, 1); +	if (dest_dst && dest_dst->dst_cache->dev == dev) { +		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", +			      dev->name, +			      IP_VS_DBG_ADDR(dest->af, &dest->addr), +			      ntohs(dest->port), +			      atomic_read(&dest->refcnt)); +		__ip_vs_dst_cache_reset(dest); +	} +	spin_unlock_bh(&dest->dst_lock); + +} +/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, +			   void *ptr) +{ +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct net *net = dev_net(dev); +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_service *svc; +	struct ip_vs_dest *dest; +	unsigned int idx; + +	if (event != NETDEV_DOWN || !ipvs) +		return NOTIFY_DONE; +	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); +	EnterFunction(2); +	mutex_lock(&__ip_vs_mutex); +	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { +		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { +			if (net_eq(svc->net, net)) { +				list_for_each_entry(dest, &svc->destinations, +						    n_list) { +					ip_vs_forget_dev(dest, dev); +				} +			} +		} + +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { +			if (net_eq(svc->net, net)) { +				list_for_each_entry(dest, &svc->destinations, +						    n_list) { +					ip_vs_forget_dev(dest, dev); +				} +			} + +		} +	} + +	spin_lock_bh(&ipvs->dest_trash_lock); +	list_for_each_entry(dest, &ipvs->dest_trash, t_list) { +		ip_vs_forget_dev(dest, dev); +	} +	spin_unlock_bh(&ipvs->dest_trash_lock); +	mutex_unlock(&__ip_vs_mutex); +	LeaveFunction(2); +	return NOTIFY_DONE; +}  /*   *	Zero counters in a service or all services @@ -1463,41 +1536,46 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)  {  	struct ip_vs_dest *dest; -	write_lock_bh(&__ip_vs_svc_lock);  	list_for_each_entry(dest, &svc->destinations, n_list) {  		ip_vs_zero_stats(&dest->stats);  	}  	ip_vs_zero_stats(&svc->stats); -	write_unlock_bh(&__ip_vs_svc_lock);  	return 0;  } -static int ip_vs_zero_all(void) +static int ip_vs_zero_all(struct net *net)  {  	int idx;  	struct ip_vs_service *svc;  	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { -			ip_vs_zero_service(svc); +		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { +			if (net_eq(svc->net, net)) +				ip_vs_zero_service(svc);  		}  	}  	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { -			ip_vs_zero_service(svc); +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { +			if (net_eq(svc->net, net)) +				ip_vs_zero_service(svc);  		}  	} -	ip_vs_zero_stats(&ip_vs_stats); +	ip_vs_zero_stats(&net_ipvs(net)->tot_stats);  	return 0;  } +#ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3;  static int -proc_do_defense_mode(ctl_table *table, int write, +proc_do_defense_mode(struct ctl_table *table, int write,  		     
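[Note] proc_do_defense_mode() above follows the usual IPVS sysctl shape: remember the old value, let proc_dointvec() store the new one, then write the old value back if the result is out of range, so an invalid write is silently ignored rather than returning an error. A userspace sketch of that validate-and-restore pattern, with atoi() standing in for proc_dointvec() and assuming a 0..3 range like the defense-mode knobs:

    #include <stdio.h>
    #include <stdlib.h>

    /* Accept only 0..3, mirroring the defense-mode range check. */
    static int store_defense_mode(int *valp, const char *buf)
    {
        int val = *valp;        /* remember the current value */

        *valp = atoi(buf);      /* "proc_dointvec" stand-in */
        if (*valp < 0 || *valp > 3)
            *valp = val;        /* restore the correct value */
        return 0;               /* the kernel handler also reports success */
    }

    int main(void)
    {
        int mode = 1;

        store_defense_mode(&mode, "9"); /* rejected, stays 1 */
        printf("%d\n", mode);
        store_defense_mode(&mode, "2"); /* accepted */
        printf("%d\n", mode);
        return 0;
    }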
void __user *buffer, size_t *lenp, loff_t *ppos)  { +	struct net *net = current->nsproxy->net_ns;  	int *valp = table->data;  	int val = *valp;  	int rc; @@ -1508,15 +1586,14 @@ proc_do_defense_mode(ctl_table *table, int write,  			/* Restore the correct value */  			*valp = val;  		} else { -			update_defense_level(); +			update_defense_level(net_ipvs(net));  		}  	}  	return rc;  } -  static int -proc_do_sync_threshold(ctl_table *table, int write, +proc_do_sync_threshold(struct ctl_table *table, int write,  		       void __user *buffer, size_t *lenp, loff_t *ppos)  {  	int *valp = table->data; @@ -1527,52 +1604,77 @@ proc_do_sync_threshold(ctl_table *table, int write,  	memcpy(val, valp, sizeof(val));  	rc = proc_dointvec(table, write, buffer, lenp, ppos); -	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { +	if (write && (valp[0] < 0 || valp[1] < 0 || +	    (valp[0] >= valp[1] && valp[1]))) {  		/* Restore the correct value */  		memcpy(valp, val, sizeof(val));  	}  	return rc;  } +static int +proc_do_sync_mode(struct ctl_table *table, int write, +		     void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	int *valp = table->data; +	int val = *valp; +	int rc; + +	rc = proc_dointvec(table, write, buffer, lenp, ppos); +	if (write && (*valp != val)) { +		if ((*valp < 0) || (*valp > 1)) { +			/* Restore the correct value */ +			*valp = val; +		} +	} +	return rc; +} + +static int +proc_do_sync_ports(struct ctl_table *table, int write, +		   void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	int *valp = table->data; +	int val = *valp; +	int rc; + +	rc = proc_dointvec(table, write, buffer, lenp, ppos); +	if (write && (*valp != val)) { +		if (*valp < 1 || !is_power_of_2(*valp)) { +			/* Restore the correct value */ +			*valp = val; +		} +	} +	return rc; +}  /*   *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + *	Do not change order or insert new entries without + *	align with netns init in ip_vs_control_net_init()   */  static struct ctl_table vs_vars[] = {  	{  		.procname	= "amemthresh", -		.data		= &sysctl_ip_vs_amemthresh, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -#ifdef CONFIG_IP_VS_DEBUG -	{ -		.procname	= "debug_level", -		.data		= &sysctl_ip_vs_debug_level,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, -#endif  	{  		.procname	= "am_droprate", -		.data		= &sysctl_ip_vs_am_droprate,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "drop_entry", -		.data		= &sysctl_ip_vs_drop_entry,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_do_defense_mode,  	},  	{  		.procname	= "drop_packet", -		.data		= &sysctl_ip_vs_drop_packet,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_do_defense_mode, @@ -1580,7 +1682,6 @@ static struct ctl_table vs_vars[] = {  #ifdef CONFIG_IP_VS_NFCT  	{  		.procname	= "conntrack", -		.data		= &sysctl_ip_vs_conntrack,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= &proc_dointvec, @@ -1588,18 +1689,124 @@ static struct ctl_table vs_vars[] = {  #endif  	{  		.procname	= "secure_tcp", -		.data		= &sysctl_ip_vs_secure_tcp,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_do_defense_mode,  	},  	{  		.procname	= "snat_reroute", -		.data		= &sysctl_ip_vs_snat_reroute,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= &proc_dointvec,  	}, +	{ +		.procname	= "sync_version", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		
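[Note] proc_do_sync_ports() above only accepts values that are at least 1 and a power of two (is_power_of_2() in the kernel), since the sync code distributes connections over that many sockets with a mask. The classic bit trick behind that predicate, as a standalone check:

    #include <stdio.h>

    /* n & (n - 1) clears the lowest set bit: the result is 0 exactly
     * when n has a single bit set, i.e. when n is a power of two. */
    static int is_power_of_2(unsigned long n)
    {
        return n != 0 && (n & (n - 1)) == 0;
    }

    int main(void)
    {
        for (unsigned long n = 1; n <= 8; n++)
            printf("%lu -> %d\n", n, is_power_of_2(n));
        return 0;
    }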
.proc_handler	= &proc_do_sync_mode, +	}, +	{ +		.procname	= "sync_ports", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_do_sync_ports, +	}, +	{ +		.procname	= "sync_persist_mode", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "sync_qlen_max", +		.maxlen		= sizeof(unsigned long), +		.mode		= 0644, +		.proc_handler	= proc_doulongvec_minmax, +	}, +	{ +		.procname	= "sync_sock_size", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "cache_bypass", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "expire_nodest_conn", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "sloppy_tcp", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "sloppy_sctp", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "expire_quiescent_template", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "sync_threshold", +		.maxlen		= +			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), +		.mode		= 0644, +		.proc_handler	= proc_do_sync_threshold, +	}, +	{ +		.procname	= "sync_refresh_period", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_jiffies, +	}, +	{ +		.procname	= "sync_retries", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &three, +	}, +	{ +		.procname	= "nat_icmp_send", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "pmtu_disc", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "backup_only", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +#ifdef CONFIG_IP_VS_DEBUG +	{ +		.procname	= "debug_level", +		.data		= &sysctl_ip_vs_debug_level, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +#endif  #if 0  	{  		.procname	= "timeout_established", @@ -1686,58 +1893,16 @@ static struct ctl_table vs_vars[] = {  		.proc_handler	= proc_dointvec_jiffies,  	},  #endif -	{ -		.procname	= "cache_bypass", -		.data		= &sysctl_ip_vs_cache_bypass, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{ -		.procname	= "expire_nodest_conn", -		.data		= &sysctl_ip_vs_expire_nodest_conn, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{ -		.procname	= "expire_quiescent_template", -		.data		= &sysctl_ip_vs_expire_quiescent_template, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{ -		.procname	= "sync_threshold", -		.data		= &sysctl_ip_vs_sync_threshold, -		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold), -		.mode		= 0644, -		.proc_handler	= proc_do_sync_threshold, -	}, -	{ -		.procname	= "nat_icmp_send", -		.data		= &sysctl_ip_vs_nat_icmp_send, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{ } -}; - -const struct ctl_path net_vs_ctl_path[] = { -	{ .procname = "net", }, -	{ .procname = "ipv4", }, -	{ .procname = "vs", },  	{ }  }; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); -static struct ctl_table_header * sysctl_header; +#endif  #ifdef CONFIG_PROC_FS  struct ip_vs_iter { -	struct list_head *table; +	struct 
seq_net_private p;  /* Do not move this, netns depends upon it*/ +	struct hlist_head *table;  	int bucket;  }; @@ -1745,7 +1910,7 @@ struct ip_vs_iter {   *	Write the contents of the VS rule table to a PROCfs file.   *	(It is kept just for backward compatibility)   */ -static inline const char *ip_vs_fwd_name(unsigned flags) +static inline const char *ip_vs_fwd_name(unsigned int flags)  {  	switch (flags & IP_VS_CONN_F_FWD_MASK) {  	case IP_VS_CONN_F_LOCALNODE: @@ -1763,14 +1928,15 @@ static inline const char *ip_vs_fwd_name(unsigned flags)  /* Get the Nth entry in the two lists */  static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)  { +	struct net *net = seq_file_net(seq);  	struct ip_vs_iter *iter = seq->private;  	int idx;  	struct ip_vs_service *svc;  	/* look in hash by protocol */  	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { -			if (pos-- == 0){ +		hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { +			if (net_eq(svc->net, net) && pos-- == 0) {  				iter->table = ip_vs_svc_table;  				iter->bucket = idx;  				return svc; @@ -1780,8 +1946,9 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)  	/* keep looking in fwmark */  	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { -			if (pos-- == 0) { +		hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], +					 f_list) { +			if (net_eq(svc->net, net) && pos-- == 0) {  				iter->table = ip_vs_svc_fwm_table;  				iter->bucket = idx;  				return svc; @@ -1793,17 +1960,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)  }  static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) +	__acquires(RCU)  { - -	read_lock_bh(&__ip_vs_svc_lock); +	rcu_read_lock();  	return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;  }  static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)  { -	struct list_head *e; +	struct hlist_node *e;  	struct ip_vs_iter *iter;  	struct ip_vs_service *svc; @@ -1816,13 +1982,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)  	if (iter->table == ip_vs_svc_table) {  		/* next service in table hashed by protocol */ -		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) -			return list_entry(e, struct ip_vs_service, s_list); - +		e = rcu_dereference(hlist_next_rcu(&svc->s_list)); +		if (e) +			return hlist_entry(e, struct ip_vs_service, s_list);  		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { -			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], -					    s_list) { +			hlist_for_each_entry_rcu(svc, +						 &ip_vs_svc_table[iter->bucket], +						 s_list) {  				return svc;  			}  		} @@ -1833,13 +2000,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)  	}  	/* next service in hashed by fwmark */ -	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) -		return list_entry(e, struct ip_vs_service, f_list); +	e = rcu_dereference(hlist_next_rcu(&svc->f_list)); +	if (e) +		return hlist_entry(e, struct ip_vs_service, f_list);   scan_fwmark:  	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { -		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], -				    f_list) +		hlist_for_each_entry_rcu(svc, +					 &ip_vs_svc_fwm_table[iter->bucket], +					 f_list)  			return svc;  	} @@ -1847,9 +2016,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)  }  static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) +	__releases(RCU)  { -	read_unlock_bh(&__ip_vs_svc_lock); +	rcu_read_unlock();  } @@ -1867,6 +2036,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)  		const struct ip_vs_service *svc = v;  		const struct ip_vs_iter *iter = seq->private;  		const struct ip_vs_dest *dest; +		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);  		if (iter->table == ip_vs_svc_table) {  #ifdef CONFIG_IP_VS_IPV6 @@ -1875,18 +2045,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)  					   ip_vs_proto_name(svc->protocol),  					   &svc->addr.in6,  					   ntohs(svc->port), -					   svc->scheduler->name); +					   sched->name);  			else  #endif  				seq_printf(seq, "%s  %08X:%04X %s %s ",  					   ip_vs_proto_name(svc->protocol),  					   ntohl(svc->addr.ip),  					   ntohs(svc->port), -					   svc->scheduler->name, +					   sched->name,  					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");  		} else {  			seq_printf(seq, "FWM  %08X %s %s", -				   svc->fwmark, svc->scheduler->name, +				   svc->fwmark, sched->name,  				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");  		} @@ -1897,7 +2067,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)  		else  			seq_putc(seq, '\n'); -		list_for_each_entry(dest, &svc->destinations, n_list) { +		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  #ifdef CONFIG_IP_VS_IPV6  			if (dest->af == AF_INET6)  				seq_printf(seq, @@ -1935,7 +2105,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {  static int ip_vs_info_open(struct inode *inode, struct file *file)  { -	return seq_open_private(file, &ip_vs_info_seq_ops, +	return seq_open_net(inode, file, &ip_vs_info_seq_ops,  			sizeof(struct ip_vs_iter));  } @@ -1944,18 +2114,13 @@ static const struct file_operations 
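[Note] The /proc iterator above walks two hash tables in sequence (ip_vs_svc_table, then ip_vs_svc_fwm_table) while tracking the current table and bucket in struct ip_vs_iter, so ->next can resume exactly where ->start left off. A plain-C model of that two-table cursor (simplified: buckets are fixed arrays rather than RCU hlists, and locking is omitted):

    #include <stdio.h>

    #define TAB_SIZE 2
    #define BUCKET_MAX 2

    struct iter { int table; int bucket; int slot; };

    static const char *tab0[TAB_SIZE][BUCKET_MAX] = {
        { "tcp:80", NULL }, { "udp:53", "tcp:443" },
    };
    static const char *tab1[TAB_SIZE][BUCKET_MAX] = {
        { "fwm:1", NULL }, { NULL, NULL },
    };

    /* Advance the cursor to the next non-empty slot, spilling from the
     * first table into the second, like ip_vs_info_seq_next(). */
    static const char *iter_next(struct iter *it)
    {
        for (; it->table < 2; it->table++, it->bucket = 0) {
            for (; it->bucket < TAB_SIZE; it->bucket++, it->slot = 0) {
                for (; it->slot < BUCKET_MAX; it->slot++) {
                    const char *e = it->table == 0 ?
                        tab0[it->bucket][it->slot] :
                        tab1[it->bucket][it->slot];
                    if (e) {
                        it->slot++;     /* resume after this entry */
                        return e;
                    }
                }
            }
        }
        return NULL;    /* end of both tables */
    }

    int main(void)
    {
        struct iter it = { 0, 0, 0 };
        const char *e;

        while ((e = iter_next(&it)))
            printf("%s\n", e);
        return 0;
    }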
ip_vs_info_fops = {  	.open    = ip_vs_info_open,  	.read    = seq_read,  	.llseek  = seq_lseek, -	.release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats = { -	.lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), +	.release = seq_release_net,  }; -#ifdef CONFIG_PROC_FS  static int ip_vs_stats_show(struct seq_file *seq, void *v)  { +	struct net *net = seq_file_single_net(seq); +	struct ip_vs_stats_user show;  /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */  	seq_puts(seq, @@ -1963,29 +2128,25 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)  	seq_printf(seq,  		   "   Conns  Packets  Packets            Bytes            Bytes\n"); -	spin_lock_bh(&ip_vs_stats.lock); -	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, -		   ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, -		   (unsigned long long) ip_vs_stats.ustats.inbytes, -		   (unsigned long long) ip_vs_stats.ustats.outbytes); +	ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); +	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, +		   show.inpkts, show.outpkts, +		   (unsigned long long) show.inbytes, +		   (unsigned long long) show.outbytes);  /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */  	seq_puts(seq,  		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n"); -	seq_printf(seq,"%8X %8X %8X %16X %16X\n", -			ip_vs_stats.ustats.cps, -			ip_vs_stats.ustats.inpps, -			ip_vs_stats.ustats.outpps, -			ip_vs_stats.ustats.inbps, -			ip_vs_stats.ustats.outbps); -	spin_unlock_bh(&ip_vs_stats.lock); +	seq_printf(seq, "%8X %8X %8X %16X %16X\n", +			show.cps, show.inpps, show.outpps, +			show.inbps, show.outbps);  	return 0;  }  static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)  { -	return single_open(file, ip_vs_stats_show, NULL); +	return single_open_net(inode, file, ip_vs_stats_show);  }  static const struct file_operations ip_vs_stats_fops = { @@ -1993,16 +2154,88 @@ static const struct file_operations ip_vs_stats_fops = {  	.open = ip_vs_stats_seq_open,  	.read = seq_read,  	.llseek = seq_lseek, -	.release = single_release, +	.release = single_release_net,  }; +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ +	struct net *net = seq_file_single_net(seq); +	struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; +	struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; +	struct ip_vs_stats_user rates; +	int i; + +/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */ +	seq_puts(seq, +		 "       Total Incoming Outgoing         Incoming         Outgoing\n"); +	seq_printf(seq, +		   "CPU    Conns  Packets  Packets            Bytes            Bytes\n"); + +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); +		unsigned int start; +		__u64 inbytes, outbytes; + +		do { +			start = u64_stats_fetch_begin_irq(&u->syncp); +			inbytes = u->ustats.inbytes; +			outbytes = u->ustats.outbytes; +		} while (u64_stats_fetch_retry_irq(&u->syncp, start)); + +		seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", +			   i, u->ustats.conns, u->ustats.inpkts, +			   u->ustats.outpkts, (__u64)inbytes, +			   (__u64)outbytes); +	} + +	spin_lock_bh(&tot_stats->lock); + +	seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n", +		   tot_stats->ustats.conns, tot_stats->ustats.inpkts, +		   tot_stats->ustats.outpkts, +		   (unsigned long long) tot_stats->ustats.inbytes, +		   (unsigned long long) 
tot_stats->ustats.outbytes); + +	ip_vs_read_estimator(&rates, tot_stats); + +	spin_unlock_bh(&tot_stats->lock); + +/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */ +	seq_puts(seq, +		   "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n"); +	seq_printf(seq, "    %8X %8X %8X %16X %16X\n", +			rates.cps, +			rates.inpps, +			rates.outpps, +			rates.inbps, +			rates.outbps); + +	return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ +	return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { +	.owner = THIS_MODULE, +	.open = ip_vs_stats_percpu_seq_open, +	.read = seq_read, +	.llseek = seq_lseek, +	.release = single_release_net, +};  #endif  /*   *	Set timeout values for tcp tcpfin udp in the timeout_table.   */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)  { +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) +	struct ip_vs_proto_data *pd; +#endif +  	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",  		  u->tcp_timeout,  		  u->tcp_fin_timeout, @@ -2010,19 +2243,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)  #ifdef CONFIG_IP_VS_PROTO_TCP  	if (u->tcp_timeout) { -		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] +		pd = ip_vs_proto_data_get(net, IPPROTO_TCP); +		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]  			= u->tcp_timeout * HZ;  	}  	if (u->tcp_fin_timeout) { -		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] +		pd = ip_vs_proto_data_get(net, IPPROTO_TCP); +		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]  			= u->tcp_fin_timeout * HZ;  	}  #endif  #ifdef CONFIG_IP_VS_PROTO_UDP  	if (u->udp_timeout) { -		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] +		pd = ip_vs_proto_data_get(net, IPPROTO_UDP); +		pd->timeout_table[IP_VS_UDP_S_NORMAL]  			= u->udp_timeout * HZ;  	}  #endif @@ -2087,6 +2323,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,  static int  do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  { +	struct net *net = sock_net(sk);  	int ret;  	unsigned char arg[MAX_ARG_LEN];  	struct ip_vs_service_user *usvc_compat; @@ -2094,8 +2331,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  	struct ip_vs_service *svc;  	struct ip_vs_dest_user *udest_compat;  	struct ip_vs_dest_user_kern udest; +	struct netns_ipvs *ipvs = net_ipvs(net); -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) @@ -2114,6 +2352,24 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  	/* increase the module use count */  	ip_vs_use_count_inc(); +	/* Handle daemons since they have another lock */ +	if (cmd == IP_VS_SO_SET_STARTDAEMON || +	    cmd == IP_VS_SO_SET_STOPDAEMON) { +		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + +		if (mutex_lock_interruptible(&ipvs->sync_mutex)) { +			ret = -ERESTARTSYS; +			goto out_dec; +		} +		if (cmd == IP_VS_SO_SET_STARTDAEMON) +			ret = start_sync_thread(net, dm->state, dm->mcast_ifn, +						dm->syncid); +		else +			ret = stop_sync_thread(net, dm->state); +		mutex_unlock(&ipvs->sync_mutex); +		goto out_dec; +	} +  	if (mutex_lock_interruptible(&__ip_vs_mutex)) {  		ret = -ERESTARTSYS;  		goto out_dec; @@ -2121,19 
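[Note] The per-CPU stats dump above reads 64-bit byte counters under u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq(), a seqcount: the writer bumps a sequence number around each update and the reader retries whenever it observes an odd or changed sequence, keeping 64-bit values consistent on 32-bit hosts without a lock. A C11 sketch of that protocol (shown single-threaded; real use has writer and reader on different CPUs):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct u64_stats {
        atomic_uint seq;
        uint64_t inbytes;
        uint64_t outbytes;
    };

    static void writer_update(struct u64_stats *s, uint64_t in, uint64_t out)
    {
        atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* odd: update in progress */
        s->inbytes += in;
        s->outbytes += out;
        atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even: stable again */
    }

    static void reader_snapshot(struct u64_stats *s, uint64_t *in, uint64_t *out)
    {
        unsigned int start;

        do {
            start = atomic_load_explicit(&s->seq, memory_order_acquire);
            *in = s->inbytes;
            *out = s->outbytes;
        } while ((start & 1) ||     /* writer active, or ... */
                 start != atomic_load_explicit(&s->seq, memory_order_acquire));
    }

    int main(void)
    {
        struct u64_stats s = { 0, 0, 0 };
        uint64_t in, out;

        writer_update(&s, 1500, 40);
        reader_snapshot(&s, &in, &out);
        printf("%llu %llu\n", (unsigned long long)in, (unsigned long long)out);
        return 0;
    }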
+2377,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  	if (cmd == IP_VS_SO_SET_FLUSH) {  		/* Flush the virtual service */ -		ret = ip_vs_flush(); +		ret = ip_vs_flush(net, false);  		goto out_unlock;  	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {  		/* Set timeout values for (tcp tcpfin udp) */ -		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); -		goto out_unlock; -	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) { -		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; -		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); -		goto out_unlock; -	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) { -		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; -		ret = stop_sync_thread(dm->state); +		ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);  		goto out_unlock;  	} @@ -2148,7 +2396,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  	if (cmd == IP_VS_SO_SET_ZERO) {  		/* if no service address is set, zero counters in all */  		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { -			ret = ip_vs_zero_all(); +			ret = ip_vs_zero_all(net);  			goto out_unlock;  		}  	} @@ -2164,11 +2412,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  	}  	/* Lookup the exact service by <protocol, addr, port> or fwmark */ +	rcu_read_lock();  	if (usvc.fwmark == 0) -		svc = __ip_vs_service_find(usvc.af, usvc.protocol, +		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,  					   &usvc.addr, usvc.port);  	else -		svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark); +		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); +	rcu_read_unlock();  	if (cmd != IP_VS_SO_SET_ADD  	    && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2181,7 +2431,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  		if (svc != NULL)  			ret = -EEXIST;  		else -			ret = ip_vs_add_service(&usvc, &svc); +			ret = ip_vs_add_service(net, &usvc, &svc);  		break;  	case IP_VS_SO_SET_EDIT:  		ret = ip_vs_edit_service(svc, &usvc); @@ -2218,21 +2468,16 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ -	spin_lock_bh(&src->lock); -	memcpy(dst, &src->ustats, sizeof(*dst)); -	spin_unlock_bh(&src->lock); -} - -static void  ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)  { +	struct ip_vs_scheduler *sched; + +	sched = rcu_dereference_protected(src->scheduler, 1);  	dst->protocol = src->protocol;  	dst->addr = src->addr.ip;  	dst->port = src->port;  	dst->fwmark = src->fwmark; -	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); +	strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));  	dst->flags = src->flags;  	dst->timeout = src->timeout / HZ;  	dst->netmask = src->netmask; @@ -2241,7 +2486,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)  }  static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, +__ip_vs_get_service_entries(struct net *net, +			    const struct ip_vs_get_services *get,  			    struct ip_vs_get_services __user *uptr)  {  	int idx, count=0; @@ -2250,9 +2496,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,  	int ret = 0;  	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { +		hlist_for_each_entry(svc, 
&ip_vs_svc_table[idx], s_list) {  			/* Only expose IPv4 entries to old interface */ -			if (svc->af != AF_INET) +			if (svc->af != AF_INET || !net_eq(svc->net, net))  				continue;  			if (count >= get->num_services) @@ -2269,9 +2515,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,  	}  	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { -		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {  			/* Only expose IPv4 entries to old interface */ -			if (svc->af != AF_INET) +			if (svc->af != AF_INET || !net_eq(svc->net, net))  				continue;  			if (count >= get->num_services) @@ -2286,29 +2532,32 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,  			count++;  		}  	} -  out: +out:  	return ret;  }  static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,  			 struct ip_vs_get_dests __user *uptr)  {  	struct ip_vs_service *svc;  	union nf_inet_addr addr = { .ip = get->addr };  	int ret = 0; +	rcu_read_lock();  	if (get->fwmark) -		svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark); +		svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);  	else -		svc = __ip_vs_service_find(AF_INET, get->protocol, &addr, +		svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,  					   get->port); +	rcu_read_unlock();  	if (svc) {  		int count = 0;  		struct ip_vs_dest *dest;  		struct ip_vs_dest_entry entry; +		memset(&entry, 0, sizeof(entry));  		list_for_each_entry(dest, &svc->destinations, n_list) {  			if (count >= get->num_dests)  				break; @@ -2336,17 +2585,23 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,  }  static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)  { +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) +	struct ip_vs_proto_data *pd; +#endif + +	memset(u, 0, sizeof (*u)); +  #ifdef CONFIG_IP_VS_PROTO_TCP -	u->tcp_timeout = -		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; -	u->tcp_fin_timeout = -		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +	pd = ip_vs_proto_data_get(net, IPPROTO_TCP); +	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; +	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;  #endif  #ifdef CONFIG_IP_VS_PROTO_UDP +	pd = ip_vs_proto_data_get(net, IPPROTO_UDP);  	u->udp_timeout = -		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;  #endif  } @@ -2375,8 +2630,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  	unsigned char arg[128];  	int ret = 0;  	unsigned int copylen; +	struct net *net = sock_net(sk); +	struct netns_ipvs *ipvs = net_ipvs(net); -	if (!capable(CAP_NET_ADMIN)) +	BUG_ON(!net); +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) @@ -2394,6 +2652,33 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  	if (copy_from_user(arg, user, copylen) != 0)  		return -EFAULT; +	/* +	 * Handle daemons first since it has its own locking +	 */ +	if (cmd == IP_VS_SO_GET_DAEMON) { +		struct ip_vs_daemon_user d[2]; + +		memset(&d, 0, sizeof(d)); +		if (mutex_lock_interruptible(&ipvs->sync_mutex)) +			return -ERESTARTSYS; + +		if (ipvs->sync_state & IP_VS_STATE_MASTER) { +			d[0].state = 
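[Note] __ip_vs_get_service_entries() above streams at most get->num_services entries into the user buffer, bumping a count and bailing out with -EFAULT if any copy_to_user() fails; the added net_eq() check keeps other namespaces' services invisible to the old sockopt interface. A userspace model of the bounded copy-out loop, with plain memcpy standing in for copy_to_user:

    #include <stdio.h>
    #include <string.h>

    struct entry { int id; };

    static const struct entry services[] = { {1}, {2}, {3}, {4} };

    /* Copy at most 'max' entries into 'out'; return how many were
     * written (the kernel loop returns -EFAULT on a faulted copy). */
    static int get_entries(struct entry *out, int max)
    {
        int count = 0;

        for (size_t i = 0; i < sizeof(services) / sizeof(services[0]); i++) {
            if (count >= max)
                break;          /* user buffer is full */
            memcpy(&out[count], &services[i], sizeof(out[0]));
            count++;
        }
        return count;
    }

    int main(void)
    {
        struct entry buf[2];
        int n = get_entries(buf, 2);

        printf("copied %d entries, last id %d\n", n, buf[n - 1].id);
        return 0;
    }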
IP_VS_STATE_MASTER; +			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, +				sizeof(d[0].mcast_ifn)); +			d[0].syncid = ipvs->master_syncid; +		} +		if (ipvs->sync_state & IP_VS_STATE_BACKUP) { +			d[1].state = IP_VS_STATE_BACKUP; +			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, +				sizeof(d[1].mcast_ifn)); +			d[1].syncid = ipvs->backup_syncid; +		} +		if (copy_to_user(user, &d, sizeof(d)) != 0) +			ret = -EFAULT; +		mutex_unlock(&ipvs->sync_mutex); +		return ret; +	}  	if (mutex_lock_interruptible(&__ip_vs_mutex))  		return -ERESTARTSYS; @@ -2418,7 +2703,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  		struct ip_vs_getinfo info;  		info.version = IP_VS_VERSION_CODE;  		info.size = ip_vs_conn_tab_size; -		info.num_services = ip_vs_num_services; +		info.num_services = ipvs->num_services;  		if (copy_to_user(user, &info, sizeof(info)) != 0)  			ret = -EFAULT;  	} @@ -2437,7 +2722,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  			ret = -EINVAL;  			goto out;  		} -		ret = __ip_vs_get_service_entries(get, user); +		ret = __ip_vs_get_service_entries(net, get, user);  	}  	break; @@ -2449,11 +2734,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  		entry = (struct ip_vs_service_entry *)arg;  		addr.ip = entry->addr; +		rcu_read_lock();  		if (entry->fwmark) -			svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark); +			svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);  		else -			svc = __ip_vs_service_find(AF_INET, entry->protocol, -						   &addr, entry->port); +			svc = __ip_vs_service_find(net, AF_INET, +						   entry->protocol, &addr, +						   entry->port); +		rcu_read_unlock();  		if (svc) {  			ip_vs_copy_service(entry, svc);  			if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2476,7 +2764,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  			ret = -EINVAL;  			goto out;  		} -		ret = __ip_vs_get_dest_entries(get, user); +		ret = __ip_vs_get_dest_entries(net, get, user);  	}  	break; @@ -2484,37 +2772,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  	{  		struct ip_vs_timeout_user t; -		__ip_vs_get_timeouts(&t); +		__ip_vs_get_timeouts(net, &t);  		if (copy_to_user(user, &t, sizeof(t)) != 0)  			ret = -EFAULT;  	}  	break; -	case IP_VS_SO_GET_DAEMON: -	{ -		struct ip_vs_daemon_user d[2]; - -		memset(&d, 0, sizeof(d)); -		if (ip_vs_sync_state & IP_VS_STATE_MASTER) { -			d[0].state = IP_VS_STATE_MASTER; -			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); -			d[0].syncid = ip_vs_master_syncid; -		} -		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { -			d[1].state = IP_VS_STATE_BACKUP; -			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); -			d[1].syncid = ip_vs_backup_syncid; -		} -		if (copy_to_user(user, &d, sizeof(d)) != 0) -			ret = -EFAULT; -	} -	break; -  	default:  		ret = -EINVAL;  	} -  out: +out:  	mutex_unlock(&__ip_vs_mutex);  	return ret;  } @@ -2542,6 +2810,7 @@ static struct genl_family ip_vs_genl_family = {  	.name		= IPVS_GENL_NAME,  	.version	= IPVS_GENL_VERSION,  	.maxattr	= IPVS_CMD_MAX, +	.netnsok        = true,         /* Make ipvsadm to work on netns */  };  /* Policy used for first-level command attributes */ @@ -2599,31 +2868,29 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {  static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,  				 struct ip_vs_stats *stats)  { +	struct ip_vs_stats_user ustats;  	struct 
nlattr *nl_stats = nla_nest_start(skb, container_type);  	if (!nl_stats)  		return -EMSGSIZE; -	spin_lock_bh(&stats->lock); - -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); -	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); -	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); -	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); - -	spin_unlock_bh(&stats->lock); - +	ip_vs_copy_stats(&ustats, stats); + +	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || +	    nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || +	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || +	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) +		goto nla_put_failure;  	nla_nest_end(skb, nl_stats);  	return 0;  nla_put_failure: -	spin_unlock_bh(&stats->lock);  	nla_nest_cancel(skb, nl_stats);  	return -EMSGSIZE;  } @@ -2631,6 +2898,8 @@ nla_put_failure:  static int ip_vs_genl_fill_service(struct sk_buff *skb,  				   struct ip_vs_service *svc)  { +	struct ip_vs_scheduler *sched; +	struct ip_vs_pe *pe;  	struct nlattr *nl_service;  	struct ip_vs_flags flags = { .flags = svc->flags,  				     .mask = ~0 }; @@ -2639,23 +2908,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,  	if (!nl_service)  		return -EMSGSIZE; -	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - +	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) +		goto nla_put_failure;  	if (svc->fwmark) { -		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); +		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) +			goto nla_put_failure;  	} else { -		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); -		NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); -		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); +		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || +		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || +		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) +			goto nla_put_failure;  	} -	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); -	if (svc->pe) -		NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); -	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); -	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); -	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - +	sched = rcu_dereference_protected(svc->scheduler, 1); +	pe = rcu_dereference_protected(svc->pe, 1); +	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || +	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || +	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || +	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || +	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, 
svc->netmask)) +		goto nla_put_failure;  	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))  		goto nla_put_failure; @@ -2674,7 +2946,7 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb,  {  	void *hdr; -	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,  			  &ip_vs_genl_family, NLM_F_MULTI,  			  IPVS_CMD_NEW_SERVICE);  	if (!hdr) @@ -2696,11 +2968,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,  	int idx = 0, i;  	int start = cb->args[0];  	struct ip_vs_service *svc; +	struct net *net = skb_sknet(skb);  	mutex_lock(&__ip_vs_mutex);  	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { -		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { -			if (++idx <= start) +		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { +			if (++idx <= start || !net_eq(svc->net, net))  				continue;  			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {  				idx--; @@ -2710,8 +2983,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,  	}  	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { -		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { -			if (++idx <= start) +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { +			if (++idx <= start || !net_eq(svc->net, net))  				continue;  			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {  				idx--; @@ -2727,7 +3000,8 @@ nla_put_failure:  	return skb->len;  } -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, +static int ip_vs_genl_parse_service(struct net *net, +				    struct ip_vs_service_user_kern *usvc,  				    struct nlattr *nla, int full_entry,  				    struct ip_vs_service **ret_svc)  { @@ -2765,15 +3039,17 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,  	} else {  		usvc->protocol = nla_get_u16(nla_protocol);  		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); -		usvc->port = nla_get_u16(nla_port); +		usvc->port = nla_get_be16(nla_port);  		usvc->fwmark = 0;  	} +	rcu_read_lock();  	if (usvc->fwmark) -		svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark); +		svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);  	else -		svc = __ip_vs_service_find(usvc->af, usvc->protocol, +		svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,  					   &usvc->addr, usvc->port); +	rcu_read_unlock();  	*ret_svc = svc;  	/* If a full entry was requested, check for the additional fields */ @@ -2803,19 +3079,20 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,  		usvc->sched_name = nla_data(nla_sched);  		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;  		usvc->timeout = nla_get_u32(nla_timeout); -		usvc->netmask = nla_get_u32(nla_netmask); +		usvc->netmask = nla_get_be32(nla_netmask);  	}  	return 0;  } -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, +						     struct nlattr *nla)  {  	struct ip_vs_service_user_kern usvc;  	struct ip_vs_service *svc;  	int ret; -	ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc); +	ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);  	return ret ? 
ERR_PTR(ret) : svc;  } @@ -2827,21 +3104,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)  	if (!nl_dest)  		return -EMSGSIZE; -	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); -	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, -		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, -		    atomic_read(&dest->activeconns)); -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, -		    atomic_read(&dest->inactconns)); -	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, -		    atomic_read(&dest->persistconns)); - +	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || +	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, +			(atomic_read(&dest->conn_flags) & +			 IP_VS_CONN_F_FWD_MASK)) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, +			atomic_read(&dest->weight)) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, +			atomic_read(&dest->activeconns)) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, +			atomic_read(&dest->inactconns)) || +	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, +			atomic_read(&dest->persistconns))) +		goto nla_put_failure;  	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))  		goto nla_put_failure; @@ -2859,7 +3137,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,  {  	void *hdr; -	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,  			  &ip_vs_genl_family, NLM_F_MULTI,  			  IPVS_CMD_NEW_DEST);  	if (!hdr) @@ -2883,6 +3161,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,  	struct ip_vs_service *svc;  	struct ip_vs_dest *dest;  	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; +	struct net *net = skb_sknet(skb);  	mutex_lock(&__ip_vs_mutex); @@ -2891,7 +3170,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,  			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))  		goto out_err; -	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + +	svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);  	if (IS_ERR(svc) || svc == NULL)  		goto out_err; @@ -2934,7 +3214,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,  	memset(udest, 0, sizeof(*udest));  	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); -	udest->port = nla_get_u16(nla_port); +	udest->port = nla_get_be16(nla_port);  	/* If a full entry was requested, check for the additional fields */  	if (full_entry) { @@ -2959,8 +3239,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,  	return 0;  } -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, -				  const char *mcast_ifn, __be32 syncid) +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, +				  const char *mcast_ifn, __u32 syncid)  {  	struct nlattr *nl_daemon; @@ -2968,10 +3248,10 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,  	if (!nl_daemon)  		return -EMSGSIZE; -	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); -	
NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); -	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - +	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || +	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || +	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) +		goto nla_put_failure;  	nla_nest_end(skb, nl_daemon);  	return 0; @@ -2981,12 +3261,12 @@ nla_put_failure:  	return -EMSGSIZE;  } -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, -				  const char *mcast_ifn, __be32 syncid, +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, +				  const char *mcast_ifn, __u32 syncid,  				  struct netlink_callback *cb)  {  	void *hdr; -	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,  			  &ip_vs_genl_family, NLM_F_MULTI,  			  IPVS_CMD_NEW_DAEMON);  	if (!hdr) @@ -3005,56 +3285,61 @@ nla_put_failure:  static int ip_vs_genl_dump_daemons(struct sk_buff *skb,  				   struct netlink_callback *cb)  { -	mutex_lock(&__ip_vs_mutex); -	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { +	struct net *net = skb_sknet(skb); +	struct netns_ipvs *ipvs = net_ipvs(net); + +	mutex_lock(&ipvs->sync_mutex); +	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {  		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, -					   ip_vs_master_mcast_ifn, -					   ip_vs_master_syncid, cb) < 0) +					   ipvs->master_mcast_ifn, +					   ipvs->master_syncid, cb) < 0)  			goto nla_put_failure;  		cb->args[0] = 1;  	} -	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { +	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {  		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, -					   ip_vs_backup_mcast_ifn, -					   ip_vs_backup_syncid, cb) < 0) +					   ipvs->backup_mcast_ifn, +					   ipvs->backup_syncid, cb) < 0)  			goto nla_put_failure;  		cb->args[1] = 1;  	}  nla_put_failure: -	mutex_unlock(&__ip_vs_mutex); +	mutex_unlock(&ipvs->sync_mutex);  	return skb->len;  } -static int ip_vs_genl_new_daemon(struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)  {  	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&  	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&  	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))  		return -EINVAL; -	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), +	return start_sync_thread(net, +				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),  				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),  				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));  } -static int ip_vs_genl_del_daemon(struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)  {  	if (!attrs[IPVS_DAEMON_ATTR_STATE])  		return -EINVAL; -	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +	return stop_sync_thread(net, +				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));  } -static int ip_vs_genl_set_config(struct nlattr **attrs) +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)  {  	struct ip_vs_timeout_user t; -	__ip_vs_get_timeouts(&t); +	__ip_vs_get_timeouts(net, &t);  	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])  		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3066,32 +3351,23 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)  	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])  		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); -	return ip_vs_set_timeout(&t); +	return ip_vs_set_timeout(net, &t); 
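The set-config path just above is a read-modify-write partial update: the current timeout values are loaded first, only the attributes actually present in the request override them, and the whole structure is committed in a single call, so omitted attributes keep their previous settings. A minimal sketch of that pattern, using hypothetical my_*/MY_ATTR_* stand-ins rather than the real IPVS symbols:

#include <net/netlink.h>

struct my_timeouts {
	int tcp_timeout;
	int udp_timeout;
};

enum { MY_ATTR_UNSPEC, MY_ATTR_TCP, MY_ATTR_UDP, __MY_ATTR_MAX };

/* Hypothetical accessors standing in for __ip_vs_get_timeouts() and
 * ip_vs_set_timeout(); assumptions for illustration, not IPVS symbols. */
static void my_get_timeouts(struct net *net, struct my_timeouts *t);
static int my_set_timeouts(struct net *net, struct my_timeouts *t);

static int my_set_config(struct net *net, struct nlattr **attrs)
{
	struct my_timeouts t;

	my_get_timeouts(net, &t);	/* start from the current values */

	/* Override only the fields the request actually carried. */
	if (attrs[MY_ATTR_TCP])
		t.tcp_timeout = nla_get_u32(attrs[MY_ATTR_TCP]);
	if (attrs[MY_ATTR_UDP])
		t.udp_timeout = nla_get_u32(attrs[MY_ATTR_UDP]);

	/* Commit in one call; omitted attributes keep their old values. */
	return my_set_timeouts(net, &t);
}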
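The fill-function hunks earlier in this file all make the same conversion: the old NLA_PUT_*() macros, which hid a goto nla_put_failure inside the macro body, become explicit nla_put_*() calls OR-ed into one condition, with a single cleanup label that cancels the half-built attribute nest. A compact sketch of the resulting shape, again with hypothetical MY_STAT_*/my_fill_stats() names:

#include <net/netlink.h>

enum { MY_STAT_UNSPEC, MY_STAT_CONNS, MY_STAT_INPKTS, __MY_STAT_MAX };

static int my_fill_stats(struct sk_buff *skb, int container_type,
			 u32 conns, u32 inpkts)
{
	struct nlattr *nest = nla_nest_start(skb, container_type);

	if (!nest)
		return -EMSGSIZE;

	/* Each nla_put_*() returns non-zero when skb tail room runs out,
	 * so the OR-chain short-circuits straight to the cleanup label. */
	if (nla_put_u32(skb, MY_STAT_CONNS, conns) ||
	    nla_put_u32(skb, MY_STAT_INPKTS, inpkts))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);	/* drop the partially built nest */
	return -EMSGSIZE;
}

Taking the spinlock out of the fill path, as ip_vs_genl_fill_stats() does above by snapshotting the per-cpu counters into a local ip_vs_stats_user first, also means the failure path no longer has to remember to unlock before cancelling the nest.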
 } -static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)  { -	struct ip_vs_service *svc = NULL; -	struct ip_vs_service_user_kern usvc; -	struct ip_vs_dest_user_kern udest;  	int ret = 0, cmd; -	int need_full_svc = 0, need_full_dest = 0; +	struct net *net; +	struct netns_ipvs *ipvs; +	net = skb_sknet(skb); +	ipvs = net_ipvs(net);  	cmd = info->genlhdr->cmd; -	mutex_lock(&__ip_vs_mutex); - -	if (cmd == IPVS_CMD_FLUSH) { -		ret = ip_vs_flush(); -		goto out; -	} else if (cmd == IPVS_CMD_SET_CONFIG) { -		ret = ip_vs_genl_set_config(info->attrs); -		goto out; -	} else if (cmd == IPVS_CMD_NEW_DAEMON || -		   cmd == IPVS_CMD_DEL_DAEMON) { - +	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {  		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; +		mutex_lock(&ipvs->sync_mutex);  		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||  		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,  				     info->attrs[IPVS_CMD_ATTR_DAEMON], @@ -3101,13 +3377,38 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)  		}  		if (cmd == IPVS_CMD_NEW_DAEMON) -			ret = ip_vs_genl_new_daemon(daemon_attrs); +			ret = ip_vs_genl_new_daemon(net, daemon_attrs);  		else -			ret = ip_vs_genl_del_daemon(daemon_attrs); +			ret = ip_vs_genl_del_daemon(net, daemon_attrs); +out: +		mutex_unlock(&ipvs->sync_mutex); +	} +	return ret; +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ +	struct ip_vs_service *svc = NULL; +	struct ip_vs_service_user_kern usvc; +	struct ip_vs_dest_user_kern udest; +	int ret = 0, cmd; +	int need_full_svc = 0, need_full_dest = 0; +	struct net *net; + +	net = skb_sknet(skb); +	cmd = info->genlhdr->cmd; + +	mutex_lock(&__ip_vs_mutex); + +	if (cmd == IPVS_CMD_FLUSH) { +		ret = ip_vs_flush(net, false); +		goto out; +	} else if (cmd == IPVS_CMD_SET_CONFIG) { +		ret = ip_vs_genl_set_config(net, info->attrs);  		goto out;  	} else if (cmd == IPVS_CMD_ZERO &&  		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) { -		ret = ip_vs_zero_all(); +		ret = ip_vs_zero_all(net);  		goto out;  	} @@ -3117,7 +3418,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)  	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)  		need_full_svc = 1; -	ret = ip_vs_genl_parse_service(&usvc, +	ret = ip_vs_genl_parse_service(net, &usvc,  				       info->attrs[IPVS_CMD_ATTR_SERVICE],  				       need_full_svc, &svc);  	if (ret) @@ -3147,7 +3448,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)  	switch (cmd) {  	case IPVS_CMD_NEW_SERVICE:  		if (svc == NULL) -			ret = ip_vs_add_service(&usvc, &svc); +			ret = ip_vs_add_service(net, &usvc, &svc);  		else  			ret = -EEXIST;  		break; @@ -3185,7 +3486,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)  	struct sk_buff *msg;  	void *reply;  	int ret, cmd, reply_cmd; +	struct net *net; +	net = skb_sknet(skb);  	cmd = info->genlhdr->cmd;  	if (cmd == IPVS_CMD_GET_SERVICE) @@ -3214,7 +3517,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)  	{  		struct ip_vs_service *svc; -		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); +		svc = ip_vs_genl_find_service(net, +					      info->attrs[IPVS_CMD_ATTR_SERVICE]);  		if (IS_ERR(svc)) {  			ret = PTR_ERR(svc);  			goto out_err; @@ -3234,23 +3538,28 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)  	{  		struct 
ip_vs_timeout_user t; -		__ip_vs_get_timeouts(&t); +		__ip_vs_get_timeouts(net, &t);  #ifdef CONFIG_IP_VS_PROTO_TCP -		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); -		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, -			    t.tcp_fin_timeout); +		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, +				t.tcp_timeout) || +		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, +				t.tcp_fin_timeout)) +			goto nla_put_failure;  #endif  #ifdef CONFIG_IP_VS_PROTO_UDP -		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) +			goto nla_put_failure;  #endif  		break;  	}  	case IPVS_CMD_GET_INFO: -		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); -		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, -			    ip_vs_conn_tab_size); +		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, +				IP_VS_VERSION_CODE) || +		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, +				ip_vs_conn_tab_size)) +			goto nla_put_failure;  		break;  	} @@ -3271,7 +3580,7 @@ out:  } -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = {  	{  		.cmd	= IPVS_CMD_NEW_SERVICE,  		.flags	= GENL_ADMIN_PERM, @@ -3325,13 +3634,13 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {  		.cmd	= IPVS_CMD_NEW_DAEMON,  		.flags	= GENL_ADMIN_PERM,  		.policy	= ip_vs_cmd_policy, -		.doit	= ip_vs_genl_set_cmd, +		.doit	= ip_vs_genl_set_daemon,  	},  	{  		.cmd	= IPVS_CMD_DEL_DAEMON,  		.flags	= GENL_ADMIN_PERM,  		.policy	= ip_vs_cmd_policy, -		.doit	= ip_vs_genl_set_cmd, +		.doit	= ip_vs_genl_set_daemon,  	},  	{  		.cmd	= IPVS_CMD_GET_DAEMON, @@ -3370,7 +3679,7 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {  static int __init ip_vs_genl_register(void)  {  	return genl_register_family_with_ops(&ip_vs_genl_family, -		ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops)); +					     ip_vs_genl_ops);  }  static void ip_vs_genl_unregister(void) @@ -3380,46 +3689,212 @@ static void ip_vs_genl_unregister(void)  /* End of Generic Netlink interface definitions */ - -int __init ip_vs_control_init(void) +/* + * per netns intit/exit func. 
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init ip_vs_control_net_init_sysctl(struct net *net)  { -	int ret;  	int idx; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ctl_table *tbl; + +	atomic_set(&ipvs->dropentry, 0); +	spin_lock_init(&ipvs->dropentry_lock); +	spin_lock_init(&ipvs->droppacket_lock); +	spin_lock_init(&ipvs->securetcp_lock); + +	if (!net_eq(net, &init_net)) { +		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); +		if (tbl == NULL) +			return -ENOMEM; + +		/* Don't export sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) +			tbl[0].procname = NULL; +	} else +		tbl = vs_vars; +	/* Initialize sysctl defaults */ +	idx = 0; +	ipvs->sysctl_amemthresh = 1024; +	tbl[idx++].data = &ipvs->sysctl_amemthresh; +	ipvs->sysctl_am_droprate = 10; +	tbl[idx++].data = &ipvs->sysctl_am_droprate; +	tbl[idx++].data = &ipvs->sysctl_drop_entry; +	tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT +	tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif +	tbl[idx++].data = &ipvs->sysctl_secure_tcp; +	ipvs->sysctl_snat_reroute = 1; +	tbl[idx++].data = &ipvs->sysctl_snat_reroute; +	ipvs->sysctl_sync_ver = 1; +	tbl[idx++].data = &ipvs->sysctl_sync_ver; +	ipvs->sysctl_sync_ports = 1; +	tbl[idx++].data = &ipvs->sysctl_sync_ports; +	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; +	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; +	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; +	ipvs->sysctl_sync_sock_size = 0; +	tbl[idx++].data = &ipvs->sysctl_sync_sock_size; +	tbl[idx++].data = &ipvs->sysctl_cache_bypass; +	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; +	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; +	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; +	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; +	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; +	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; +	tbl[idx].data = &ipvs->sysctl_sync_threshold; +	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); +	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; +	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; +	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); +	tbl[idx++].data = &ipvs->sysctl_sync_retries; +	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; +	ipvs->sysctl_pmtu_disc = 1; +	tbl[idx++].data = &ipvs->sysctl_pmtu_disc; +	tbl[idx++].data = &ipvs->sysctl_backup_only; + + +	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); +	if (ipvs->sysctl_hdr == NULL) { +		if (!net_eq(net, &init_net)) +			kfree(tbl); +		return -ENOMEM; +	} +	ip_vs_start_estimator(net, &ipvs->tot_stats); +	ipvs->sysctl_tbl = tbl; +	/* Schedule defense work */ +	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); +	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); -	EnterFunction(2); +	return 0; +} -	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ -	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  { -		INIT_LIST_HEAD(&ip_vs_svc_table[idx]); -		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); -	} -	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  { -		INIT_LIST_HEAD(&ip_vs_rtable[idx]); +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	cancel_delayed_work_sync(&ipvs->defense_work); +	cancel_work_sync(&ipvs->defense_work.work); +	unregister_net_sysctl_table(ipvs->sysctl_hdr); +	ip_vs_stop_estimator(net, &ipvs->tot_stats); +} + +#else + +static int __net_init ip_vs_control_net_init_sysctl(struct 
net *net) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } + +#endif + +static struct notifier_block ip_vs_dst_notifier = { +	.notifier_call = ip_vs_dst_event, +}; + +int __net_init ip_vs_control_net_init(struct net *net) +{ +	int i, idx; +	struct netns_ipvs *ipvs = net_ipvs(net); + +	/* Initialize rs_table */ +	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) +		INIT_HLIST_HEAD(&ipvs->rs_table[idx]); + +	INIT_LIST_HEAD(&ipvs->dest_trash); +	spin_lock_init(&ipvs->dest_trash_lock); +	setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, +		    (unsigned long) net); +	atomic_set(&ipvs->ftpsvc_counter, 0); +	atomic_set(&ipvs->nullsvc_counter, 0); + +	/* procfs stats */ +	ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); +	if (!ipvs->tot_stats.cpustats) +		return -ENOMEM; + +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *ipvs_tot_stats; +		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); +		u64_stats_init(&ipvs_tot_stats->syncp);  	} -	smp_wmb(); + +	spin_lock_init(&ipvs->tot_stats.lock); + +	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); +	proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); +	proc_create("ip_vs_stats_percpu", 0, net->proc_net, +		    &ip_vs_stats_percpu_fops); + +	if (ip_vs_control_net_init_sysctl(net)) +		goto err; + +	return 0; + +err: +	free_percpu(ipvs->tot_stats.cpustats); +	return -ENOMEM; +} + +void __net_exit ip_vs_control_net_cleanup(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	ip_vs_trash_cleanup(net); +	ip_vs_control_net_cleanup_sysctl(net); +	remove_proc_entry("ip_vs_stats_percpu", net->proc_net); +	remove_proc_entry("ip_vs_stats", net->proc_net); +	remove_proc_entry("ip_vs", net->proc_net); +	free_percpu(ipvs->tot_stats.cpustats); +} + +int __init ip_vs_register_nl_ioctl(void) +{ +	int ret;  	ret = nf_register_sockopt(&ip_vs_sockopts);  	if (ret) {  		pr_err("cannot register sockopt.\n"); -		return ret; +		goto err_sock;  	}  	ret = ip_vs_genl_register();  	if (ret) {  		pr_err("cannot register Generic Netlink interface.\n"); -		nf_unregister_sockopt(&ip_vs_sockopts); -		return ret; +		goto err_genl;  	} +	return 0; -	proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); -	proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); +err_genl: +	nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: +	return ret; +} -	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); +void ip_vs_unregister_nl_ioctl(void) +{ +	ip_vs_genl_unregister(); +	nf_unregister_sockopt(&ip_vs_sockopts); +} -	ip_vs_new_estimator(&ip_vs_stats); +int __init ip_vs_control_init(void) +{ +	int idx; +	int ret; -	/* Hook the defense timer */ -	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +	EnterFunction(2); + +	/* Initialize svc_table, ip_vs_svc_fwm_table */ +	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { +		INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); +		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); +	} + +	smp_wmb();	/* Do we really need it now ? 
*/ + +	ret = register_netdevice_notifier(&ip_vs_dst_notifier); +	if (ret < 0) +		return ret;  	LeaveFunction(2);  	return 0; @@ -3429,14 +3904,6 @@ int __init ip_vs_control_init(void)  void ip_vs_control_cleanup(void)  {  	EnterFunction(2); -	ip_vs_trash_cleanup(); -	cancel_rearming_delayed_work(&defense_work); -	cancel_work_sync(&defense_work.work); -	ip_vs_kill_estimator(&ip_vs_stats); -	unregister_sysctl_table(sysctl_header); -	proc_net_remove(&init_net, "ip_vs_stats"); -	proc_net_remove(&init_net, "ip_vs"); -	ip_vs_genl_unregister(); -	nf_unregister_sockopt(&ip_vs_sockopts); +	unregister_netdevice_notifier(&ip_vs_dst_notifier);  	LeaveFunction(2);  } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 95fd0d14200..c3b84546ea9 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -51,7 +51,7 @@   *      IPVS DH bucket   */  struct ip_vs_dh_bucket { -	struct ip_vs_dest       *dest;          /* real server (cache) */ +	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */  };  /* @@ -64,11 +64,15 @@ struct ip_vs_dh_bucket {  #define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)  #define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1) +struct ip_vs_dh_state { +	struct ip_vs_dh_bucket		buckets[IP_VS_DH_TAB_SIZE]; +	struct rcu_head			rcu_head; +};  /*   *	Returns hash value for IPVS DH entry   */ -static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)  {  	__be32 addr_fold = addr->ip; @@ -85,10 +89,9 @@ static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)   *      Get ip_vs_dest associated with supplied parameters.   */  static inline struct ip_vs_dest * -ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, -	     const union nf_inet_addr *addr) +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)  { -	return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; +	return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);  } @@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,   *      Assign all the hash buckets of the specified table with the service.   */  static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)  {  	int i;  	struct ip_vs_dh_bucket *b;  	struct list_head *p;  	struct ip_vs_dest *dest; +	bool empty; -	b = tbl; +	b = &s->buckets[0];  	p = &svc->destinations; +	empty = list_empty(p);  	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { -		if (list_empty(p)) { -			b->dest = NULL; -		} else { +		dest = rcu_dereference_protected(b->dest, 1); +		if (dest) +			ip_vs_dest_put(dest); +		if (empty) +			RCU_INIT_POINTER(b->dest, NULL); +		else {  			if (p == &svc->destinations)  				p = p->next;  			dest = list_entry(p, struct ip_vs_dest, n_list); -			atomic_inc(&dest->refcnt); -			b->dest = dest; +			ip_vs_dest_hold(dest); +			RCU_INIT_POINTER(b->dest, dest);  			p = p->next;  		} @@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)  /*   *      Flush all the hash buckets of the specified table.   
*/ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +static void ip_vs_dh_flush(struct ip_vs_dh_state *s)  {  	int i;  	struct ip_vs_dh_bucket *b; +	struct ip_vs_dest *dest; -	b = tbl; +	b = &s->buckets[0];  	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { -		if (b->dest) { -			atomic_dec(&b->dest->refcnt); -			b->dest = NULL; +		dest = rcu_dereference_protected(b->dest, 1); +		if (dest) { +			ip_vs_dest_put(dest); +			RCU_INIT_POINTER(b->dest, NULL);  		}  		b++;  	} @@ -145,52 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)  static int ip_vs_dh_init_svc(struct ip_vs_service *svc)  { -	struct ip_vs_dh_bucket *tbl; +	struct ip_vs_dh_state *s;  	/* allocate the DH table for this service */ -	tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, -		      GFP_ATOMIC); -	if (tbl == NULL) { -		pr_err("%s(): no memory\n", __func__); +	s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); +	if (s == NULL)  		return -ENOMEM; -	} -	svc->sched_data = tbl; + +	svc->sched_data = s;  	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "  		  "current service\n",  		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); -	/* assign the hash buckets with the updated service */ -	ip_vs_dh_assign(tbl, svc); +	/* assign the hash buckets with current dests */ +	ip_vs_dh_reassign(s, svc);  	return 0;  } -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +static void ip_vs_dh_done_svc(struct ip_vs_service *svc)  { -	struct ip_vs_dh_bucket *tbl = svc->sched_data; +	struct ip_vs_dh_state *s = svc->sched_data;  	/* got to clean up hash buckets here */ -	ip_vs_dh_flush(tbl); +	ip_vs_dh_flush(s);  	/* release the table itself */ -	kfree(svc->sched_data); +	kfree_rcu(s, rcu_head);  	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",  		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - -	return 0;  } -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, +				 struct ip_vs_dest *dest)  { -	struct ip_vs_dh_bucket *tbl = svc->sched_data; - -	/* got to clean up hash buckets here */ -	ip_vs_dh_flush(tbl); +	struct ip_vs_dh_state *s = svc->sched_data;  	/* assign the hash buckets with the updated service */ -	ip_vs_dh_assign(tbl, svc); +	ip_vs_dh_reassign(s, svc);  	return 0;  } @@ -210,27 +214,26 @@ static inline int is_overloaded(struct ip_vs_dest *dest)   *      Destination hashing scheduling   */  static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		  struct ip_vs_iphdr *iph)  {  	struct ip_vs_dest *dest; -	struct ip_vs_dh_bucket *tbl; -	struct ip_vs_iphdr iph; - -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); +	struct ip_vs_dh_state *s;  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); -	tbl = (struct ip_vs_dh_bucket *)svc->sched_data; -	dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); +	s = (struct ip_vs_dh_state *) svc->sched_data; +	dest = ip_vs_dh_get(svc->af, s, &iph->daddr);  	if (!dest  	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)  	    || atomic_read(&dest->weight) <= 0  	    || is_overloaded(dest)) { +		ip_vs_scheduler_err(svc, "no destination available");  		return NULL;  	}  	IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", -		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), +		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),  		      IP_VS_DBG_ADDR(svc->af, &dest->addr),  		      ntohs(dest->port)); @@ -249,7 +252,8 @@ static struct 
ip_vs_scheduler ip_vs_dh_scheduler =  	.n_list =		LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),  	.init_service =		ip_vs_dh_init_svc,  	.done_service =		ip_vs_dh_done_svc, -	.update_service =	ip_vs_dh_update_svc, +	.add_dest =		ip_vs_dh_dest_changed, +	.del_dest =		ip_vs_dh_dest_changed,  	.schedule =		ip_vs_dh_schedule,  }; @@ -263,6 +267,7 @@ static int __init ip_vs_dh_init(void)  static void __exit ip_vs_dh_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +	synchronize_rcu();  } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index ff28801962e..1425e9a924c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -8,8 +8,12 @@   *              as published by the Free Software Foundation; either version   *              2 of the License, or (at your option) any later version.   * - * Changes: - * + * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com> + *              Network name space (netns) aware. + *              Global data moved to netns i.e struct netns_ipvs + *              Affected data: est_list and est_lock. + *              estimation_timer() runs with timer per netns. + *              get_stats()) do the per cpu summing.   */  #define KMSG_COMPONENT "IPVS" @@ -48,11 +52,44 @@   */ -static void estimation_timer(unsigned long arg); +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, +				 struct ip_vs_cpu_stats __percpu *stats) +{ +	int i; +	bool add = false; + +	for_each_possible_cpu(i) { +		struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); +		unsigned int start; +		__u64 inbytes, outbytes; +		if (add) { +			sum->conns += s->ustats.conns; +			sum->inpkts += s->ustats.inpkts; +			sum->outpkts += s->ustats.outpkts; +			do { +				start = u64_stats_fetch_begin(&s->syncp); +				inbytes = s->ustats.inbytes; +				outbytes = s->ustats.outbytes; +			} while (u64_stats_fetch_retry(&s->syncp, start)); +			sum->inbytes += inbytes; +			sum->outbytes += outbytes; +		} else { +			add = true; +			sum->conns = s->ustats.conns; +			sum->inpkts = s->ustats.inpkts; +			sum->outpkts = s->ustats.outpkts; +			do { +				start = u64_stats_fetch_begin(&s->syncp); +				sum->inbytes = s->ustats.inbytes; +				sum->outbytes = s->ustats.outbytes; +			} while (u64_stats_fetch_retry(&s->syncp, start)); +		} +	} +} -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);  static void estimation_timer(unsigned long arg)  { @@ -62,12 +99,16 @@ static void estimation_timer(unsigned long arg)  	u32 n_inpkts, n_outpkts;  	u64 n_inbytes, n_outbytes;  	u32 rate; +	struct net *net = (struct net *)arg; +	struct netns_ipvs *ipvs; -	spin_lock(&est_lock); -	list_for_each_entry(e, &est_list, list) { +	ipvs = net_ipvs(net); +	spin_lock(&ipvs->est_lock); +	list_for_each_entry(e, &ipvs->est_list, list) {  		s = container_of(e, struct ip_vs_stats, est);  		spin_lock(&s->lock); +		ip_vs_read_cpu_stats(&s->ustats, s->cpustats);  		n_conns = s->ustats.conns;  		n_inpkts = s->ustats.inpkts;  		n_outpkts = s->ustats.outpkts; @@ -75,81 +116,64 @@ static void estimation_timer(unsigned long arg)  		n_outbytes = s->ustats.outbytes;  		/* scaled by 2^10, but divided 2 seconds */ -		rate = (n_conns - e->last_conns)<<9; +		rate = (n_conns - e->last_conns) << 9;  		e->last_conns = n_conns; -		e->cps += ((long)rate - (long)e->cps)>>2; -		s->ustats.cps = (e->cps+0x1FF)>>10; +		e->cps += ((long)rate - (long)e->cps) >> 2; -		rate = (n_inpkts - 
e->last_inpkts)<<9; +		rate = (n_inpkts - e->last_inpkts) << 9;  		e->last_inpkts = n_inpkts; -		e->inpps += ((long)rate - (long)e->inpps)>>2; -		s->ustats.inpps = (e->inpps+0x1FF)>>10; +		e->inpps += ((long)rate - (long)e->inpps) >> 2; -		rate = (n_outpkts - e->last_outpkts)<<9; +		rate = (n_outpkts - e->last_outpkts) << 9;  		e->last_outpkts = n_outpkts; -		e->outpps += ((long)rate - (long)e->outpps)>>2; -		s->ustats.outpps = (e->outpps+0x1FF)>>10; +		e->outpps += ((long)rate - (long)e->outpps) >> 2; -		rate = (n_inbytes - e->last_inbytes)<<4; +		rate = (n_inbytes - e->last_inbytes) << 4;  		e->last_inbytes = n_inbytes; -		e->inbps += ((long)rate - (long)e->inbps)>>2; -		s->ustats.inbps = (e->inbps+0xF)>>5; +		e->inbps += ((long)rate - (long)e->inbps) >> 2; -		rate = (n_outbytes - e->last_outbytes)<<4; +		rate = (n_outbytes - e->last_outbytes) << 4;  		e->last_outbytes = n_outbytes; -		e->outbps += ((long)rate - (long)e->outbps)>>2; -		s->ustats.outbps = (e->outbps+0xF)>>5; +		e->outbps += ((long)rate - (long)e->outbps) >> 2;  		spin_unlock(&s->lock);  	} -	spin_unlock(&est_lock); -	mod_timer(&est_timer, jiffies + 2*HZ); +	spin_unlock(&ipvs->est_lock); +	mod_timer(&ipvs->est_timer, jiffies + 2*HZ);  } -void ip_vs_new_estimator(struct ip_vs_stats *stats) +void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)  { +	struct netns_ipvs *ipvs = net_ipvs(net);  	struct ip_vs_estimator *est = &stats->est;  	INIT_LIST_HEAD(&est->list); -	est->last_conns = stats->ustats.conns; -	est->cps = stats->ustats.cps<<10; - -	est->last_inpkts = stats->ustats.inpkts; -	est->inpps = stats->ustats.inpps<<10; - -	est->last_outpkts = stats->ustats.outpkts; -	est->outpps = stats->ustats.outpps<<10; - -	est->last_inbytes = stats->ustats.inbytes; -	est->inbps = stats->ustats.inbps<<5; - -	est->last_outbytes = stats->ustats.outbytes; -	est->outbps = stats->ustats.outbps<<5; - -	spin_lock_bh(&est_lock); -	list_add(&est->list, &est_list); -	spin_unlock_bh(&est_lock); +	spin_lock_bh(&ipvs->est_lock); +	list_add(&est->list, &ipvs->est_list); +	spin_unlock_bh(&ipvs->est_lock);  } -void ip_vs_kill_estimator(struct ip_vs_stats *stats) +void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)  { +	struct netns_ipvs *ipvs = net_ipvs(net);  	struct ip_vs_estimator *est = &stats->est; -	spin_lock_bh(&est_lock); +	spin_lock_bh(&ipvs->est_lock);  	list_del(&est->list); -	spin_unlock_bh(&est_lock); +	spin_unlock_bh(&ipvs->est_lock);  }  void ip_vs_zero_estimator(struct ip_vs_stats *stats)  {  	struct ip_vs_estimator *est = &stats->est; - -	/* set counters zero, caller must hold the stats->lock lock */ -	est->last_inbytes = 0; -	est->last_outbytes = 0; -	est->last_conns = 0; -	est->last_inpkts = 0; -	est->last_outpkts = 0; +	struct ip_vs_stats_user *u = &stats->ustats; + +	/* reset counters, caller must hold the stats->lock lock */ +	est->last_inbytes = u->inbytes; +	est->last_outbytes = u->outbytes; +	est->last_conns = u->conns; +	est->last_inpkts = u->inpkts; +	est->last_outpkts = u->outpkts;  	est->cps = 0;  	est->inpps = 0;  	est->outpps = 0; @@ -157,13 +181,31 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)  	est->outbps = 0;  } -int __init ip_vs_estimator_init(void) +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_stats_user *dst, +			  struct ip_vs_stats *stats) +{ +	struct ip_vs_estimator *e = &stats->est; + +	dst->cps = (e->cps + 0x1FF) >> 10; +	dst->inpps = (e->inpps + 0x1FF) >> 10; +	dst->outpps = (e->outpps + 0x1FF) >> 10; +	dst->inbps = (e->inbps + 0xF) >> 5; +	
dst->outbps = (e->outbps + 0xF) >> 5; +} + +int __net_init ip_vs_estimator_net_init(struct net *net)  { -	mod_timer(&est_timer, jiffies + 2 * HZ); +	struct netns_ipvs *ipvs = net_ipvs(net); + +	INIT_LIST_HEAD(&ipvs->est_list); +	spin_lock_init(&ipvs->est_lock); +	setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); +	mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);  	return 0;  } -void ip_vs_estimator_cleanup(void) +void __net_exit ip_vs_estimator_net_cleanup(struct net *net)  { -	del_timer_sync(&est_timer); +	del_timer_sync(&net_ipvs(net)->est_timer);  } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 75455000ad1..77c173282f3 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -44,16 +44,17 @@  #include <net/ip_vs.h> -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " +#define SERVER_STRING "227 " +#define CLIENT_STRING "PORT"  /*   * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper   * First port is set to the default port.   */ +static unsigned int ports_count = 1;  static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); +module_param_array(ports, ushort, &ports_count, 0444);  MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); @@ -79,14 +80,17 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)  /*   * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. + * with the "pattern", ignoring before "skip" and terminated with + * the "term" character.   * <addr,port> is in network order.   */  static int ip_vs_ftp_get_addrport(char *data, char *data_limit, -				  const char *pattern, size_t plen, char term, +				  const char *pattern, size_t plen, +				  char skip, char term,  				  __be32 *addr, __be16 *port,  				  char **start, char **end)  { +	char *s, c;  	unsigned char p[6];  	int i = 0; @@ -101,19 +105,38 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,  	if (strnicmp(data, pattern, plen) != 0) {  		return 0;  	} -	*start = data + plen; +	s = data + plen; +	if (skip) { +		int found = 0; + +		for (;; s++) { +			if (s == data_limit) +				return -1; +			if (!found) { +				if (*s == skip) +					found = 1; +			} else if (*s != skip) { +				break; +			} +		} +	} -	for (data = *start; *data != term; data++) { +	for (data = s; ; data++) {  		if (data == data_limit)  			return -1; +		if (*data == term) +			break;  	}  	*end = data;  	memset(p, 0, sizeof(p)); -	for (data = *start; data != *end; data++) { -		if (*data >= '0' && *data <= '9') { -			p[i] = p[i]*10 + *data - '0'; -		} else if (*data == ',' && i < 5) { +	for (data = s; ; data++) { +		c = *data; +		if (c == term) +			break; +		if (c >= '0' && c <= '9') { +			p[i] = p[i]*10 + c - '0'; +		} else if (c == ',' && i < 5) {  			i++;  		} else {  			/* unexpected character */ @@ -124,8 +147,9 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,  	if (i != 5)  		return -1; -	*addr = get_unaligned((__be32 *)p); -	*port = get_unaligned((__be16 *)(p + 4)); +	*start = s; +	*addr = get_unaligned((__be32 *) p); +	*port = get_unaligned((__be16 *) (p + 4));  	return 1;  } @@ -153,10 +177,11 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,  	__be16 port;  	struct ip_vs_conn *n_cp;  	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */ -	unsigned buf_len; +	unsigned int buf_len;  	int ret = 0;  	enum 
ip_conntrack_info ctinfo;  	struct nf_conn *ct; +	struct net *net;  #ifdef CONFIG_IP_VS_IPV6  	/* This application helper doesn't work with IPv6 yet, @@ -184,7 +209,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,  		if (ip_vs_ftp_get_addrport(data, data_limit,  					   SERVER_STRING, -					   sizeof(SERVER_STRING)-1, ')', +					   sizeof(SERVER_STRING)-1, +					   '(', ')',  					   &from.ip, &port,  					   &start, &end) != 1)  			return 1; @@ -197,18 +223,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,  		 */  		{  			struct ip_vs_conn_param p; -			ip_vs_conn_fill_param(AF_INET, iph->protocol, -					      &from, port, &cp->caddr, 0, &p); +			ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, +					      iph->protocol, &from, port, +					      &cp->caddr, 0, &p);  			n_cp = ip_vs_conn_out_get(&p);  		}  		if (!n_cp) {  			struct ip_vs_conn_param p; -			ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr, +			ip_vs_conn_fill_param(ip_vs_conn_net(cp), +					      AF_INET, IPPROTO_TCP, &cp->caddr,  					      0, &cp->vaddr, port, &p);  			n_cp = ip_vs_conn_new(&p, &from, port,  					      IP_VS_CONN_F_NO_CPORT |  					      IP_VS_CONN_F_NFCT, -					      cp->dest); +					      cp->dest, skb->mark);  			if (!n_cp)  				return 0; @@ -239,9 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,  			 * hopefully it will succeed on the retransmitted  			 * packet.  			 */ +			rcu_read_lock();  			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, +						       iph->ihl * 4,  						       start-data, end-start,  						       buf, buf_len); +			rcu_read_unlock();  			if (ret) {  				ip_vs_nfct_expect_related(skb, ct, n_cp,  							  IPPROTO_TCP, 0, 0); @@ -257,8 +288,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,  		 * would be adjusted twice.  		 
*/ +		net = skb_net(skb);  		cp->app_data = NULL; -		ip_vs_tcp_conn_listen(n_cp); +		ip_vs_tcp_conn_listen(net, n_cp);  		ip_vs_conn_put(n_cp);  		return ret;  	} @@ -287,6 +319,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,  	union nf_inet_addr to;  	__be16 port;  	struct ip_vs_conn *n_cp; +	struct net *net;  #ifdef CONFIG_IP_VS_IPV6  	/* This application helper doesn't work with IPv6 yet, @@ -340,7 +373,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,  	 */  	if (ip_vs_ftp_get_addrport(data_start, data_limit,  				   CLIENT_STRING, sizeof(CLIENT_STRING)-1, -				   '\r', &to.ip, &port, +				   ' ', '\r', &to.ip, &port,  				   &start, &end) != 1)  		return 1; @@ -358,14 +391,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,  	{  		struct ip_vs_conn_param p; -		ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port, -				      &cp->vaddr, htons(ntohs(cp->vport)-1), -				      &p); +		ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, +				      iph->protocol, &to, port, &cp->vaddr, +				      htons(ntohs(cp->vport)-1), &p);  		n_cp = ip_vs_conn_in_get(&p);  		if (!n_cp) {  			n_cp = ip_vs_conn_new(&p, &cp->daddr,  					      htons(ntohs(cp->dport)-1), -					      IP_VS_CONN_F_NFCT, cp->dest); +					      IP_VS_CONN_F_NFCT, cp->dest, +					      skb->mark);  			if (!n_cp)  				return 0; @@ -377,7 +411,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,  	/*  	 *	Move tunnel to listen state  	 */ -	ip_vs_tcp_conn_listen(n_cp); +	net = skb_net(skb); +	ip_vs_tcp_conn_listen(net, n_cp);  	ip_vs_conn_put(n_cp);  	return 1; @@ -398,42 +433,66 @@ static struct ip_vs_app ip_vs_ftp = {  	.pkt_in =	ip_vs_ftp_in,  }; -  /* - *	ip_vs_ftp initialization + *	per netns ip_vs_ftp initialization   */ -static int __init ip_vs_ftp_init(void) +static int __net_init __ip_vs_ftp_init(struct net *net)  {  	int i, ret; -	struct ip_vs_app *app = &ip_vs_ftp; +	struct ip_vs_app *app; +	struct netns_ipvs *ipvs = net_ipvs(net); -	ret = register_ip_vs_app(app); -	if (ret) -		return ret; +	if (!ipvs) +		return -ENOENT; + +	app = register_ip_vs_app(net, &ip_vs_ftp); +	if (IS_ERR(app)) +		return PTR_ERR(app); -	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { +	for (i = 0; i < ports_count; i++) {  		if (!ports[i])  			continue; -		ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); +		ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);  		if (ret) -			break; +			goto err_unreg;  		pr_info("%s: loaded support on port[%d] = %d\n",  			app->name, i, ports[i]);  	} +	return 0; -	if (ret) -		unregister_ip_vs_app(app); - +err_unreg: +	unregister_ip_vs_app(net, &ip_vs_ftp);  	return ret;  } +/* + *	netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ +	unregister_ip_vs_app(net, &ip_vs_ftp); +} +static struct pernet_operations ip_vs_ftp_ops = { +	.init = __ip_vs_ftp_init, +	.exit = __ip_vs_ftp_exit, +}; + +static int __init ip_vs_ftp_init(void) +{ +	int rv; + +	rv = register_pernet_subsys(&ip_vs_ftp_ops); +	/* rcu_barrier() is called by netns on error */ +	return rv; +}  /*   *	ip_vs_ftp finish.   
*/  static void __exit ip_vs_ftp_exit(void)  { -	unregister_ip_vs_app(&ip_vs_ftp); +	unregister_pernet_subsys(&ip_vs_ftp_ops); +	/* rcu_barrier() is called by netns */  } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 9323f894419..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -63,6 +63,8 @@  #define CHECK_EXPIRE_INTERVAL   (60*HZ)  #define ENTRY_TIMEOUT           (6*60*HZ) +#define DEFAULT_EXPIRATION	(24*60*60*HZ) +  /*   *    It is for full expiration check.   *    When there is no partial expiration check (garbage collection) @@ -70,7 +72,6 @@   *    entries that haven't been touched for a day.   */  #define COUNT_FOR_FULL_EXPIRATION   30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;  /* @@ -89,11 +90,12 @@ static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;   *      IP address and its destination server   */  struct ip_vs_lblc_entry { -	struct list_head        list; +	struct hlist_node	list;  	int			af;		/* address family */  	union nf_inet_addr      addr;           /* destination IP address */ -	struct ip_vs_dest       *dest;          /* real server (cache) */ +	struct ip_vs_dest	*dest;          /* real server (cache) */  	unsigned long           lastuse;        /* last used time */ +	struct rcu_head		rcu_head;  }; @@ -101,48 +103,53 @@ struct ip_vs_lblc_entry {   *      IPVS lblc hash table   */  struct ip_vs_lblc_table { -	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */ +	struct rcu_head		rcu_head; +	struct hlist_head	bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */ +	struct timer_list       periodic_timer; /* collect stale entries */  	atomic_t                entries;        /* number of entries */  	int                     max_size;       /* maximum size of entries */ -	struct timer_list       periodic_timer; /* collect stale entries */  	int                     rover;          /* rover for expire check */  	int                     counter;        /* counter for no expire */ +	bool			dead;  };  /*   *      IPVS LBLC sysctl table   */ - -static ctl_table vs_vars_table[] = { +#ifdef CONFIG_SYSCTL +static struct ctl_table vs_vars_table[] = {  	{  		.procname	= "lblc_expiration", -		.data		= &sysctl_ip_vs_lblc_expiration, +		.data		= NULL,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{ }  }; +#endif -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +static void ip_vs_lblc_rcu_free(struct rcu_head *head)  { -	list_del(&en->list); -	/* -	 * We don't kfree dest because it is refered either by its service -	 * or the trash dest list. 
-	 */ -	atomic_dec(&en->dest->refcnt); +	struct ip_vs_lblc_entry *en = container_of(head, +						   struct ip_vs_lblc_entry, +						   rcu_head); + +	ip_vs_dest_put_and_free(en->dest);  	kfree(en);  } +static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) +{ +	hlist_del_rcu(&en->list); +	call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); +}  /*   *	Returns hash value for IPVS LBLC entry   */ -static inline unsigned +static inline unsigned int  ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)  {  	__be32 addr_fold = addr->ip; @@ -163,25 +170,22 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)  static void  ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)  { -	unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); +	unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); -	list_add(&en->list, &tbl->bucket[hash]); +	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);  	atomic_inc(&tbl->entries);  } -/* - *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read - *  lock - */ +/* Get ip_vs_lblc_entry associated with supplied parameters. */  static inline struct ip_vs_lblc_entry *  ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,  	       const union nf_inet_addr *addr)  { -	unsigned hash = ip_vs_lblc_hashkey(af, addr); +	unsigned int hash = ip_vs_lblc_hashkey(af, addr);  	struct ip_vs_lblc_entry *en; -	list_for_each_entry(en, &tbl->bucket[hash], list) +	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)  		if (ip_vs_addr_equal(af, &en->addr, addr))  			return en; @@ -191,7 +195,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,  /*   * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. + * address to a server. Called under spin lock.   */  static inline struct ip_vs_lblc_entry *  ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, @@ -200,26 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,  	struct ip_vs_lblc_entry *en;  	en = ip_vs_lblc_get(dest->af, tbl, daddr); -	if (!en) { -		en = kmalloc(sizeof(*en), GFP_ATOMIC); -		if (!en) { -			pr_err("%s(): no memory\n", __func__); -			return NULL; -		} +	if (en) { +		if (en->dest == dest) +			return en; +		ip_vs_lblc_del(en); +	} +	en = kmalloc(sizeof(*en), GFP_ATOMIC); +	if (!en) +		return NULL; -		en->af = dest->af; -		ip_vs_addr_copy(dest->af, &en->addr, daddr); -		en->lastuse = jiffies; +	en->af = dest->af; +	ip_vs_addr_copy(dest->af, &en->addr, daddr); +	en->lastuse = jiffies; -		atomic_inc(&dest->refcnt); -		en->dest = dest; +	ip_vs_dest_hold(dest); +	en->dest = dest; -		ip_vs_lblc_hash(tbl, en); -	} else if (en->dest != dest) { -		atomic_dec(&en->dest->refcnt); -		atomic_inc(&dest->refcnt); -		en->dest = dest; -	} +	ip_vs_lblc_hash(tbl, en);  	return en;  } @@ -228,40 +229,56 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,  /*   *      Flush all the entries of the specified table.   
*/ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +static void ip_vs_lblc_flush(struct ip_vs_service *svc)  { -	struct ip_vs_lblc_entry *en, *nxt; +	struct ip_vs_lblc_table *tbl = svc->sched_data; +	struct ip_vs_lblc_entry *en; +	struct hlist_node *next;  	int i; -	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { -		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { -			ip_vs_lblc_free(en); +	spin_lock_bh(&svc->sched_lock); +	tbl->dead = 1; +	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { +		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { +			ip_vs_lblc_del(en);  			atomic_dec(&tbl->entries);  		}  	} +	spin_unlock_bh(&svc->sched_lock);  } +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL +	struct netns_ipvs *ipvs = net_ipvs(svc->net); +	return ipvs->sysctl_lblc_expiration; +#else +	return DEFAULT_EXPIRATION; +#endif +}  static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)  {  	struct ip_vs_lblc_table *tbl = svc->sched_data; -	struct ip_vs_lblc_entry *en, *nxt; +	struct ip_vs_lblc_entry *en; +	struct hlist_node *next;  	unsigned long now = jiffies;  	int i, j; -	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { +	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {  		j = (j + 1) & IP_VS_LBLC_TAB_MASK; -		write_lock(&svc->sched_lock); -		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { +		spin_lock(&svc->sched_lock); +		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {  			if (time_before(now, -					en->lastuse + sysctl_ip_vs_lblc_expiration)) +					en->lastuse + +					sysctl_lblc_expiration(svc)))  				continue; -			ip_vs_lblc_free(en); +			ip_vs_lblc_del(en);  			atomic_dec(&tbl->entries);  		} -		write_unlock(&svc->sched_lock); +		spin_unlock(&svc->sched_lock);  	}  	tbl->rover = j;  } @@ -285,7 +302,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)  	unsigned long now = jiffies;  	int goal;  	int i, j; -	struct ip_vs_lblc_entry *en, *nxt; +	struct ip_vs_lblc_entry *en; +	struct hlist_node *next;  	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {  		/* do full expiration check */ @@ -303,26 +321,26 @@ static void ip_vs_lblc_check_expire(unsigned long data)  	if (goal > tbl->max_size/2)  		goal = tbl->max_size/2; -	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { +	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {  		j = (j + 1) & IP_VS_LBLC_TAB_MASK; -		write_lock(&svc->sched_lock); -		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { +		spin_lock(&svc->sched_lock); +		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {  			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))  				continue; -			ip_vs_lblc_free(en); +			ip_vs_lblc_del(en);  			atomic_dec(&tbl->entries);  			goal--;  		} -		write_unlock(&svc->sched_lock); +		spin_unlock(&svc->sched_lock);  		if (goal <= 0)  			break;  	}  	tbl->rover = j;    out: -	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);  } @@ -334,11 +352,10 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)  	/*  	 *    Allocate the ip_vs_lblc_table for this service  	 */ -	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); -	if (tbl == NULL) { -		pr_err("%s(): no memory\n", __func__); +	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); +	if (tbl == NULL)  		return -ENOMEM; -	} +  	svc->sched_data = tbl;  	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "  		  "current service\n", sizeof(*tbl)); @@ -346,12 +363,13 @@ static 
int ip_vs_lblc_init_svc(struct ip_vs_service *svc)  	/*  	 *    Initialize the hash buckets  	 */ -	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { -		INIT_LIST_HEAD(&tbl->bucket[i]); +	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { +		INIT_HLIST_HEAD(&tbl->bucket[i]);  	}  	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;  	tbl->rover = 0;  	tbl->counter = 1; +	tbl->dead = 0;  	/*  	 *    Hook periodic timer for garbage collection @@ -364,7 +382,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)  } -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)  {  	struct ip_vs_lblc_table *tbl = svc->sched_data; @@ -372,14 +390,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)  	del_timer_sync(&tbl->periodic_timer);  	/* got to clean up table entries here */ -	ip_vs_lblc_flush(tbl); +	ip_vs_lblc_flush(svc);  	/* release the table itself */ -	kfree(tbl); +	kfree_rcu(tbl, rcu_head);  	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",  		  sizeof(*tbl)); - -	return 0;  } @@ -390,12 +406,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)  	int loh, doh;  	/* -	 * We think the overhead of processing active connections is fifty -	 * times higher than that of inactive connections in average. (This -	 * fifty times might not be accurate, we will change it later.) We -	 * use the following formula to estimate the overhead: -	 *                dest->activeconns*50 + dest->inactconns -	 * and the load: +	 * We use the following formula to estimate the load:  	 *                (dest overhead) / dest->weight  	 *  	 * Remember -- no floats in kernel mode!!! @@ -406,13 +417,12 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)  	 * The server with weight=0 is quiesced and will not receive any  	 * new connection.  	 */ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue;  		if (atomic_read(&dest->weight) > 0) {  			least = dest; -			loh = atomic_read(&least->activeconns) * 50 -				+ atomic_read(&least->inactconns); +			loh = ip_vs_dest_conn_overhead(least);  			goto nextstage;  		}  	} @@ -422,14 +432,13 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)  	 *    Find the destination with the least load.  	 
*/    nextstage: -	list_for_each_entry_continue(dest, &svc->destinations, n_list) { +	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue; -		doh = atomic_read(&dest->activeconns) * 50 -			+ atomic_read(&dest->inactconns); -		if (loh * atomic_read(&dest->weight) > -		    doh * atomic_read(&least->weight)) { +		doh = ip_vs_dest_conn_overhead(dest); +		if ((__s64)loh * atomic_read(&dest->weight) > +		    (__s64)doh * atomic_read(&least->weight)) {  			least = dest;  			loh = doh;  		} @@ -457,7 +466,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)  	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {  		struct ip_vs_dest *d; -		list_for_each_entry(d, &svc->destinations, n_list) { +		list_for_each_entry_rcu(d, &svc->destinations, n_list) {  			if (atomic_read(&d->activeconns)*2  			    < atomic_read(&d->weight)) {  				return 1; @@ -472,20 +481,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)   *    Locality-Based (weighted) Least-Connection scheduling   */  static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		    struct ip_vs_iphdr *iph)  {  	struct ip_vs_lblc_table *tbl = svc->sched_data; -	struct ip_vs_iphdr iph;  	struct ip_vs_dest *dest = NULL;  	struct ip_vs_lblc_entry *en; -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); -  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);  	/* First look in our cache */ -	read_lock(&svc->sched_lock); -	en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); +	en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);  	if (en) {  		/* We only hold a read lock, but this is atomic */  		en->lastuse = jiffies; @@ -499,30 +505,28 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  		 * free up entries from the trash at any time.  		 
*/ -		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) -			dest = en->dest; +		dest = en->dest; +		if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && +		    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) +			goto out;  	} -	read_unlock(&svc->sched_lock); - -	/* If the destination has a weight and is not overloaded, use it */ -	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) -		goto out;  	/* No cache entry or it is invalid, time to schedule */  	dest = __ip_vs_lblc_schedule(svc);  	if (!dest) { -		IP_VS_ERR_RL("LBLC: no destination available\n"); +		ip_vs_scheduler_err(svc, "no destination available");  		return NULL;  	}  	/* If we fail to create a cache entry, we'll just use the valid dest */ -	write_lock(&svc->sched_lock); -	ip_vs_lblc_new(tbl, &iph.daddr, dest); -	write_unlock(&svc->sched_lock); +	spin_lock_bh(&svc->sched_lock); +	if (!tbl->dead) +		ip_vs_lblc_new(tbl, &iph->daddr, dest); +	spin_unlock_bh(&svc->sched_lock);  out:  	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", -		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), +		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),  		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));  	return dest; @@ -532,8 +536,7 @@ out:  /*   *      IPVS LBLC Scheduler structure   */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = {  	.name =			"lblc",  	.refcnt =		ATOMIC_INIT(0),  	.module =		THIS_MODULE, @@ -543,23 +546,85 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =  	.schedule =		ip_vs_lblc_schedule,  }; +/* + *  per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblc_init(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	if (!ipvs) +		return -ENOENT; + +	if (!net_eq(net, &init_net)) { +		ipvs->lblc_ctl_table = kmemdup(vs_vars_table, +						sizeof(vs_vars_table), +						GFP_KERNEL); +		if (ipvs->lblc_ctl_table == NULL) +			return -ENOMEM; + +		/* Don't export sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) +			ipvs->lblc_ctl_table[0].procname = NULL; + +	} else +		ipvs->lblc_ctl_table = vs_vars_table; +	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; +	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + +	ipvs->lblc_ctl_header = +		register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); +	if (!ipvs->lblc_ctl_header) { +		if (!net_eq(net, &init_net)) +			kfree(ipvs->lblc_ctl_table); +		return -ENOMEM; +	} + +	return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	unregister_net_sysctl_table(ipvs->lblc_ctl_header); + +	if (!net_eq(net, &init_net)) +		kfree(ipvs->lblc_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblc_ops = { +	.init = __ip_vs_lblc_init, +	.exit = __ip_vs_lblc_exit, +};  static int __init ip_vs_lblc_init(void)  {  	int ret; -	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); +	ret = register_pernet_subsys(&ip_vs_lblc_ops); +	if (ret) +		return ret; +  	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);  	if (ret) -		unregister_sysctl_table(sysctl_header); +		unregister_pernet_subsys(&ip_vs_lblc_ops);  	return ret;  } -  static void __exit ip_vs_lblc_cleanup(void)  { -	unregister_sysctl_table(sysctl_header);  	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +	
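
The new tbl->dead flag above closes a race between a late scheduler and table teardown: once the flush path marks the table dead under sched_lock, ip_vs_lblc_new() is no longer called. A user-space sketch of the pattern, assuming a pthread mutex as a stand-in for spin_lock_bh() and a bare counter for the hash table:

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

struct table {
	pthread_mutex_t lock;
	bool dead;
	int entries;
};

static void table_insert(struct table *t)
{
	pthread_mutex_lock(&t->lock);
	if (!t->dead)		/* mirrors "if (!tbl->dead) ip_vs_lblc_new(...)" */
		t->entries++;
	pthread_mutex_unlock(&t->lock);
}

static void table_flush(struct table *t)
{
	pthread_mutex_lock(&t->lock);
	t->dead = true;		/* no new entries from now on */
	t->entries = 0;
	pthread_mutex_unlock(&t->lock);
}

int main(void)
{
	struct table t = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

	table_insert(&t);
	table_flush(&t);
	table_insert(&t);	/* ignored: table is dead */
	printf("entries=%d dead=%d\n", t.entries, t.dead);
	return 0;
}
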
unregister_pernet_subsys(&ip_vs_lblc_ops); +	rcu_barrier();  } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index dbeed8ea421..3f21a2f47de 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -63,6 +63,8 @@  #define CHECK_EXPIRE_INTERVAL   (60*HZ)  #define ENTRY_TIMEOUT           (6*60*HZ) +#define DEFAULT_EXPIRATION	(24*60*60*HZ) +  /*   *    It is for full expiration check.   *    When there is no partial expiration check (garbage collection) @@ -70,8 +72,6 @@   *    entries that haven't been touched for a day.   */  #define COUNT_FOR_FULL_EXPIRATION   30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; -  /*   *     for IPVS lblcr entry hash table @@ -89,42 +89,49 @@ static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;   */  struct ip_vs_dest_set_elem {  	struct list_head	list;          /* list link */ -	struct ip_vs_dest       *dest;          /* destination server */ +	struct ip_vs_dest	*dest;		/* destination server */ +	struct rcu_head		rcu_head;  };  struct ip_vs_dest_set {  	atomic_t                size;           /* set size */  	unsigned long           lastmod;        /* last modified time */  	struct list_head	list;           /* destination list */ -	rwlock_t	        lock;           /* lock for this list */  }; -static struct ip_vs_dest_set_elem * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, +				  struct ip_vs_dest *dest, bool check)  {  	struct ip_vs_dest_set_elem *e; -	list_for_each_entry(e, &set->list, list) { -		if (e->dest == dest) -			/* already existed */ -			return NULL; +	if (check) { +		list_for_each_entry(e, &set->list, list) { +			if (e->dest == dest) +				return; +		}  	}  	e = kmalloc(sizeof(*e), GFP_ATOMIC); -	if (e == NULL) { -		pr_err("%s(): no memory\n", __func__); -		return NULL; -	} +	if (e == NULL) +		return; -	atomic_inc(&dest->refcnt); +	ip_vs_dest_hold(dest);  	e->dest = dest; -	list_add(&e->list, &set->list); +	list_add_rcu(&e->list, &set->list);  	atomic_inc(&set->size);  	set->lastmod = jiffies; -	return e; +} + +static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) +{ +	struct ip_vs_dest_set_elem *e; + +	e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); +	ip_vs_dest_put_and_free(e->dest); +	kfree(e);  }  static void @@ -137,9 +144,8 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)  			/* HIT */  			atomic_dec(&set->size);  			set->lastmod = jiffies; -			atomic_dec(&e->dest->refcnt); -			list_del(&e->list); -			kfree(e); +			list_del_rcu(&e->list); +			call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);  			break;  		}  	} @@ -149,17 +155,10 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)  {  	struct ip_vs_dest_set_elem *e, *ep; -	write_lock(&set->lock);  	list_for_each_entry_safe(e, ep, &set->list, list) { -		/* -		 * We don't kfree dest because it is refered either -		 * by its service or by the trash dest list. 
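
ip_vs_lblcr_elem_rcu_free() above relies on container_of() to recover the element from the embedded rcu_head that call_rcu() hands back after a grace period. A self-contained sketch of that recovery, with user-space stand-ins for rcu_head and the free path (no real RCU involved):

#include <stdio.h>
#include <stddef.h>

struct rcu_head { void (*func)(struct rcu_head *); };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct elem {
	int dest_id;
	struct rcu_head rcu_head;
};

static void elem_rcu_free(struct rcu_head *head)
{
	struct elem *e = container_of(head, struct elem, rcu_head);

	printf("freeing elem for dest %d\n", e->dest_id);
	/* the real callback does ip_vs_dest_put_and_free(e->dest); kfree(e); */
}

int main(void)
{
	struct elem e = { .dest_id = 42 };

	elem_rcu_free(&e.rcu_head);	/* what call_rcu() would invoke later */
	return 0;
}
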
-		 */ -		atomic_dec(&e->dest->refcnt); -		list_del(&e->list); -		kfree(e); +		list_del_rcu(&e->list); +		call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);  	} -	write_unlock(&set->lock);  }  /* get weighted least-connection node in the destination set */ @@ -169,19 +168,15 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)  	struct ip_vs_dest *dest, *least;  	int loh, doh; -	if (set == NULL) -		return NULL; -  	/* select the first destination server, whose weight > 0 */ -	list_for_each_entry(e, &set->list, list) { +	list_for_each_entry_rcu(e, &set->list, list) {  		least = e->dest;  		if (least->flags & IP_VS_DEST_F_OVERLOAD)  			continue;  		if ((atomic_read(&least->weight) > 0)  		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) { -			loh = atomic_read(&least->activeconns) * 50 -				+ atomic_read(&least->inactconns); +			loh = ip_vs_dest_conn_overhead(least);  			goto nextstage;  		}  	} @@ -189,15 +184,14 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)  	/* find the destination with the weighted least load */    nextstage: -	list_for_each_entry(e, &set->list, list) { +	list_for_each_entry_continue_rcu(e, &set->list, list) {  		dest = e->dest;  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue; -		doh = atomic_read(&dest->activeconns) * 50 -			+ atomic_read(&dest->inactconns); -		if ((loh * atomic_read(&dest->weight) > -		     doh * atomic_read(&least->weight)) +		doh = ip_vs_dest_conn_overhead(dest); +		if (((__s64)loh * atomic_read(&dest->weight) > +		     (__s64)doh * atomic_read(&least->weight))  		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {  			least = dest;  			loh = doh; @@ -230,8 +224,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)  	list_for_each_entry(e, &set->list, list) {  		most = e->dest;  		if (atomic_read(&most->weight) > 0) { -			moh = atomic_read(&most->activeconns) * 50 -				+ atomic_read(&most->inactconns); +			moh = ip_vs_dest_conn_overhead(most);  			goto nextstage;  		}  	} @@ -239,13 +232,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)  	/* find the destination with the weighted most load */    nextstage: -	list_for_each_entry(e, &set->list, list) { +	list_for_each_entry_continue(e, &set->list, list) {  		dest = e->dest; -		doh = atomic_read(&dest->activeconns) * 50 -			+ atomic_read(&dest->inactconns); +		doh = ip_vs_dest_conn_overhead(dest);  		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ -		if ((moh * atomic_read(&dest->weight) < -		     doh * atomic_read(&most->weight)) +		if (((__s64)moh * atomic_read(&dest->weight) < +		     (__s64)doh * atomic_read(&most->weight))  		    && (atomic_read(&dest->weight) > 0)) {  			most = dest;  			moh = doh; @@ -268,11 +260,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)   *      IP address and its destination server set   */  struct ip_vs_lblcr_entry { -	struct list_head        list; +	struct hlist_node       list;  	int			af;		/* address family */  	union nf_inet_addr      addr;           /* destination IP address */  	struct ip_vs_dest_set   set;            /* destination server set */  	unsigned long           lastuse;        /* last used time */ +	struct rcu_head		rcu_head;  }; @@ -280,44 +273,46 @@ struct ip_vs_lblcr_entry {   *      IPVS lblcr hash table   */  struct ip_vs_lblcr_table { -	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */ +	struct rcu_head		rcu_head; +	struct hlist_head	
bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */  	atomic_t                entries;        /* number of entries */  	int                     max_size;       /* maximum size of entries */  	struct timer_list       periodic_timer; /* collect stale entries */  	int                     rover;          /* rover for expire check */  	int                     counter;        /* counter for no expire */ +	bool			dead;  }; +#ifdef CONFIG_SYSCTL  /*   *      IPVS LBLCR sysctl table   */ -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = {  	{  		.procname	= "lblcr_expiration", -		.data		= &sysctl_ip_vs_lblcr_expiration, +		.data		= NULL,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{ }  }; - -static struct ctl_table_header * sysctl_header; +#endif  static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)  { -	list_del(&en->list); +	hlist_del_rcu(&en->list);  	ip_vs_dest_set_eraseall(&en->set); -	kfree(en); +	kfree_rcu(en, rcu_head);  }  /*   *	Returns hash value for IPVS LBLCR entry   */ -static inline unsigned +static inline unsigned int  ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)  {  	__be32 addr_fold = addr->ip; @@ -338,25 +333,22 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)  static void  ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)  { -	unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); +	unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); -	list_add(&en->list, &tbl->bucket[hash]); +	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);  	atomic_inc(&tbl->entries);  } -/* - *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under - *  read lock. - */ +/* Get ip_vs_lblcr_entry associated with supplied parameters. */  static inline struct ip_vs_lblcr_entry *  ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,  		const union nf_inet_addr *addr)  { -	unsigned hash = ip_vs_lblcr_hashkey(af, addr); +	unsigned int hash = ip_vs_lblcr_hashkey(af, addr);  	struct ip_vs_lblcr_entry *en; -	list_for_each_entry(en, &tbl->bucket[hash], list) +	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)  		if (ip_vs_addr_equal(af, &en->addr, addr))  			return en; @@ -366,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,  /*   * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. + * IP address to a server. Called under spin lock.   
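
Only the prototype of ip_vs_lblcr_hashkey() changes above (unsigned becomes unsigned int); the body stays a multiplicative fold of the address. A sketch of that hash, assuming the default 10-bit table and the golden-ratio multiplier the LBLC/LBLCR schedulers have traditionally used (both are assumptions, not shown in this hunk):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TAB_BITS 10
#define TAB_SIZE (1 << TAB_BITS)
#define TAB_MASK (TAB_SIZE - 1)

static unsigned int lblcr_hashkey(uint32_t addr_be)
{
	/* fold the big-endian address through a multiplicative hash */
	return (ntohl(addr_be) * 2654435761UL) & TAB_MASK;
}

int main(void)
{
	uint32_t a;

	inet_pton(AF_INET, "192.0.2.1", &a);
	printf("bucket=%u of %d\n", lblcr_hashkey(a), TAB_SIZE);
	return 0;
}
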
*/  static inline struct ip_vs_lblcr_entry *  ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, @@ -377,10 +369,8 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,  	en = ip_vs_lblcr_get(dest->af, tbl, daddr);  	if (!en) {  		en = kmalloc(sizeof(*en), GFP_ATOMIC); -		if (!en) { -			pr_err("%s(): no memory\n", __func__); +		if (!en)  			return NULL; -		}  		en->af = dest->af;  		ip_vs_addr_copy(dest->af, &en->addr, daddr); @@ -389,14 +379,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,  		/* initialize its dest set */  		atomic_set(&(en->set.size), 0);  		INIT_LIST_HEAD(&en->set.list); -		rwlock_init(&en->set.lock); + +		ip_vs_dest_set_insert(&en->set, dest, false);  		ip_vs_lblcr_hash(tbl, en); +		return en;  	} -	write_lock(&en->set.lock); -	ip_vs_dest_set_insert(&en->set, dest); -	write_unlock(&en->set.lock); +	ip_vs_dest_set_insert(&en->set, dest, true);  	return en;  } @@ -405,40 +395,54 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,  /*   *      Flush all the entries of the specified table.   */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +static void ip_vs_lblcr_flush(struct ip_vs_service *svc)  { +	struct ip_vs_lblcr_table *tbl = svc->sched_data;  	int i; -	struct ip_vs_lblcr_entry *en, *nxt; +	struct ip_vs_lblcr_entry *en; +	struct hlist_node *next; -	/* No locking required, only called during cleanup. */ -	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { -		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { +	spin_lock_bh(&svc->sched_lock); +	tbl->dead = 1; +	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { +		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {  			ip_vs_lblcr_free(en);  		}  	} +	spin_unlock_bh(&svc->sched_lock);  } +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL +	struct netns_ipvs *ipvs = net_ipvs(svc->net); +	return ipvs->sysctl_lblcr_expiration; +#else +	return DEFAULT_EXPIRATION; +#endif +}  static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)  {  	struct ip_vs_lblcr_table *tbl = svc->sched_data;  	unsigned long now = jiffies;  	int i, j; -	struct ip_vs_lblcr_entry *en, *nxt; +	struct ip_vs_lblcr_entry *en; +	struct hlist_node *next; -	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { +	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {  		j = (j + 1) & IP_VS_LBLCR_TAB_MASK; -		write_lock(&svc->sched_lock); -		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { -			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, -				       now)) +		spin_lock(&svc->sched_lock); +		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { +			if (time_after(en->lastuse + +				       sysctl_lblcr_expiration(svc), now))  				continue;  			ip_vs_lblcr_free(en);  			atomic_dec(&tbl->entries);  		} -		write_unlock(&svc->sched_lock); +		spin_unlock(&svc->sched_lock);  	}  	tbl->rover = j;  } @@ -462,7 +466,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)  	unsigned long now = jiffies;  	int goal;  	int i, j; -	struct ip_vs_lblcr_entry *en, *nxt; +	struct ip_vs_lblcr_entry *en; +	struct hlist_node *next;  	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {  		/* do full expiration check */ @@ -480,11 +485,11 @@ static void ip_vs_lblcr_check_expire(unsigned long data)  	if (goal > tbl->max_size/2)  		goal = tbl->max_size/2; -	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { +	for (i = 0, j = tbl->rover; i < 
IP_VS_LBLCR_TAB_SIZE; i++) {  		j = (j + 1) & IP_VS_LBLCR_TAB_MASK; -		write_lock(&svc->sched_lock); -		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { +		spin_lock(&svc->sched_lock); +		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {  			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))  				continue; @@ -492,7 +497,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)  			atomic_dec(&tbl->entries);  			goal--;  		} -		write_unlock(&svc->sched_lock); +		spin_unlock(&svc->sched_lock);  		if (goal <= 0)  			break;  	} @@ -510,11 +515,10 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)  	/*  	 *    Allocate the ip_vs_lblcr_table for this service  	 */ -	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); -	if (tbl == NULL) { -		pr_err("%s(): no memory\n", __func__); +	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); +	if (tbl == NULL)  		return -ENOMEM; -	} +  	svc->sched_data = tbl;  	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "  		  "current service\n", sizeof(*tbl)); @@ -522,12 +526,13 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)  	/*  	 *    Initialize the hash buckets  	 */ -	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { -		INIT_LIST_HEAD(&tbl->bucket[i]); +	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { +		INIT_HLIST_HEAD(&tbl->bucket[i]);  	}  	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;  	tbl->rover = 0;  	tbl->counter = 1; +	tbl->dead = 0;  	/*  	 *    Hook periodic timer for garbage collection @@ -540,7 +545,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)  } -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)  {  	struct ip_vs_lblcr_table *tbl = svc->sched_data; @@ -548,14 +553,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)  	del_timer_sync(&tbl->periodic_timer);  	/* got to clean up table entries here */ -	ip_vs_lblcr_flush(tbl); +	ip_vs_lblcr_flush(svc);  	/* release the table itself */ -	kfree(tbl); +	kfree_rcu(tbl, rcu_head);  	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",  		  sizeof(*tbl)); - -	return 0;  } @@ -566,12 +569,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)  	int loh, doh;  	/* -	 * We think the overhead of processing active connections is fifty -	 * times higher than that of inactive connections in average. (This -	 * fifty times might not be accurate, we will change it later.) We -	 * use the following formula to estimate the overhead: -	 *                dest->activeconns*50 + dest->inactconns -	 * and the load: +	 * We use the following formula to estimate the load:  	 *                (dest overhead) / dest->weight  	 *  	 * Remember -- no floats in kernel mode!!! @@ -582,14 +580,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)  	 * The server with weight=0 is quiesced and will not receive any  	 * new connection.  	 */ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue;  		if (atomic_read(&dest->weight) > 0) {  			least = dest; -			loh = atomic_read(&least->activeconns) * 50 -				+ atomic_read(&least->inactconns); +			loh = ip_vs_dest_conn_overhead(least);  			goto nextstage;  		}  	} @@ -599,14 +596,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)  	 *    Find the destination with the least load.  	 
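
The rover in ip_vs_lblcr_check_expire() above spreads garbage collection across timer ticks: each run resumes one bucket past where the last run stopped and evicts at most goal entries. A user-space sketch, with per-bucket stale-entry counts standing in for timestamped hash entries:

#include <stdio.h>

#define TAB_SIZE 8
#define TAB_MASK (TAB_SIZE - 1)

static int stale[TAB_SIZE] = { 2, 0, 1, 0, 3, 0, 0, 1 };
static int rover;

static void partial_expire(int goal)
{
	int i, j;

	for (i = 0, j = rover; i < TAB_SIZE; i++) {
		j = (j + 1) & TAB_MASK;
		while (stale[j] && goal > 0) {	/* evict stale entries */
			stale[j]--;
			goal--;
		}
		if (goal <= 0)
			break;
	}
	rover = j;	/* resume here on the next tick */
}

int main(void)
{
	partial_expire(4);
	for (int j = 0; j < TAB_SIZE; j++)
		printf("%d ", stale[j]);
	printf("(rover=%d)\n", rover);
	return 0;
}
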
*/    nextstage: -	list_for_each_entry_continue(dest, &svc->destinations, n_list) { +	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue; -		doh = atomic_read(&dest->activeconns) * 50 -			+ atomic_read(&dest->inactconns); -		if (loh * atomic_read(&dest->weight) > -		    doh * atomic_read(&least->weight)) { +		doh = ip_vs_dest_conn_overhead(dest); +		if ((__s64)loh * atomic_read(&dest->weight) > +		    (__s64)doh * atomic_read(&least->weight)) {  			least = dest;  			loh = doh;  		} @@ -634,7 +630,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)  	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {  		struct ip_vs_dest *d; -		list_for_each_entry(d, &svc->destinations, n_list) { +		list_for_each_entry_rcu(d, &svc->destinations, n_list) {  			if (atomic_read(&d->activeconns)*2  			    < atomic_read(&d->weight)) {  				return 1; @@ -649,65 +645,56 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)   *    Locality-Based (weighted) Least-Connection scheduling   */  static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		     struct ip_vs_iphdr *iph)  {  	struct ip_vs_lblcr_table *tbl = svc->sched_data; -	struct ip_vs_iphdr iph; -	struct ip_vs_dest *dest = NULL; +	struct ip_vs_dest *dest;  	struct ip_vs_lblcr_entry *en; -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); -  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);  	/* First look in our cache */ -	read_lock(&svc->sched_lock); -	en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); +	en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);  	if (en) { -		/* We only hold a read lock, but this is atomic */  		en->lastuse = jiffies;  		/* Get the least loaded destination */ -		read_lock(&en->set.lock);  		dest = ip_vs_dest_set_min(&en->set); -		read_unlock(&en->set.lock);  		/* More than one destination + enough time passed by, cleanup */  		if (atomic_read(&en->set.size) > 1 && -				time_after(jiffies, en->set.lastmod + -				sysctl_ip_vs_lblcr_expiration)) { -			struct ip_vs_dest *m; - -			write_lock(&en->set.lock); -			m = ip_vs_dest_set_max(&en->set); -			if (m) -				ip_vs_dest_set_erase(&en->set, m); -			write_unlock(&en->set.lock); +		    time_after(jiffies, en->set.lastmod + +				sysctl_lblcr_expiration(svc))) { +			spin_lock_bh(&svc->sched_lock); +			if (atomic_read(&en->set.size) > 1) { +				struct ip_vs_dest *m; + +				m = ip_vs_dest_set_max(&en->set); +				if (m) +					ip_vs_dest_set_erase(&en->set, m); +			} +			spin_unlock_bh(&svc->sched_lock);  		}  		/* If the destination is not overloaded, use it */ -		if (dest && !is_overloaded(dest, svc)) { -			read_unlock(&svc->sched_lock); +		if (dest && !is_overloaded(dest, svc))  			goto out; -		}  		/* The cache entry is invalid, time to schedule */  		dest = __ip_vs_lblcr_schedule(svc);  		if (!dest) { -			IP_VS_ERR_RL("LBLCR: no destination available\n"); -			read_unlock(&svc->sched_lock); +			ip_vs_scheduler_err(svc, "no destination available");  			return NULL;  		}  		/* Update our cache entry */ -		write_lock(&en->set.lock); -		ip_vs_dest_set_insert(&en->set, dest); -		write_unlock(&en->set.lock); -	} -	read_unlock(&svc->sched_lock); - -	if (dest) +		spin_lock_bh(&svc->sched_lock); +		if (!tbl->dead) +			ip_vs_dest_set_insert(&en->set, dest, true); +		spin_unlock_bh(&svc->sched_lock);  		goto out; +	}  	/* No cache entry, time to 
schedule */  	dest = __ip_vs_lblcr_schedule(svc); @@ -717,13 +704,14 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  	}  	/* If we fail to create a cache entry, we'll just use the valid dest */ -	write_lock(&svc->sched_lock); -	ip_vs_lblcr_new(tbl, &iph.daddr, dest); -	write_unlock(&svc->sched_lock); +	spin_lock_bh(&svc->sched_lock); +	if (!tbl->dead) +		ip_vs_lblcr_new(tbl, &iph->daddr, dest); +	spin_unlock_bh(&svc->sched_lock);  out:  	IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", -		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), +		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),  		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));  	return dest; @@ -744,23 +732,84 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =  	.schedule =		ip_vs_lblcr_schedule,  }; +/* + *  per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	if (!ipvs) +		return -ENOENT; + +	if (!net_eq(net, &init_net)) { +		ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, +						sizeof(vs_vars_table), +						GFP_KERNEL); +		if (ipvs->lblcr_ctl_table == NULL) +			return -ENOMEM; + +		/* Don't export sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) +			ipvs->lblcr_ctl_table[0].procname = NULL; +	} else +		ipvs->lblcr_ctl_table = vs_vars_table; +	ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; +	ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + +	ipvs->lblcr_ctl_header = +		register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); +	if (!ipvs->lblcr_ctl_header) { +		if (!net_eq(net, &init_net)) +			kfree(ipvs->lblcr_ctl_table); +		return -ENOMEM; +	} + +	return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + +	if (!net_eq(net, &init_net)) +		kfree(ipvs->lblcr_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblcr_ops = { +	.init = __ip_vs_lblcr_init, +	.exit = __ip_vs_lblcr_exit, +};  static int __init ip_vs_lblcr_init(void)  {  	int ret; -	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); +	ret = register_pernet_subsys(&ip_vs_lblcr_ops); +	if (ret) +		return ret; +  	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);  	if (ret) -		unregister_sysctl_table(sysctl_header); +		unregister_pernet_subsys(&ip_vs_lblcr_ops);  	return ret;  } -  static void __exit ip_vs_lblcr_cleanup(void)  { -	unregister_sysctl_table(sysctl_header);  	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +	unregister_pernet_subsys(&ip_vs_lblcr_ops); +	rcu_barrier();  } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 4f69db1fac5..2bdcb1cf212 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -22,27 +22,12 @@  #include <net/ip_vs.h> - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ -	/* -	 * We think the overhead of processing active connections is 256 -	 * times higher than that of inactive connections in average. 
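
The per-netns init above gives every non-init namespace its own copy of the sysctl template and points the table's .data slot at that namespace's expiration field, so a write through one net's procfs cannot leak into another. A user-space sketch of the wiring, with malloc() standing in for kmemdup() and plain structs for struct net / register_net_sysctl():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctl_table {
	const char *procname;
	int *data;
};

static struct ctl_table vs_vars_template[] = {
	{ .procname = "lblcr_expiration" },
	{ }
};

struct netns_ipvs {
	struct ctl_table *tbl;
	int sysctl_lblcr_expiration;
};

static int ipvs_net_init(struct netns_ipvs *ipvs)
{
	ipvs->tbl = malloc(sizeof(vs_vars_template));
	if (!ipvs->tbl)
		return -1;
	memcpy(ipvs->tbl, vs_vars_template, sizeof(vs_vars_template));
	ipvs->sysctl_lblcr_expiration = 24 * 60 * 60;	/* DEFAULT_EXPIRATION, sans HZ */
	ipvs->tbl[0].data = &ipvs->sysctl_lblcr_expiration;
	return 0;
}

int main(void)
{
	struct netns_ipvs a, b;

	if (ipvs_net_init(&a) || ipvs_net_init(&b))
		return 1;
	*b.tbl[0].data = 60;	/* tune one netns, the other is untouched */
	printf("a=%d b=%d\n", a.sysctl_lblcr_expiration,
	       b.sysctl_lblcr_expiration);
	free(a.tbl);
	free(b.tbl);
	return 0;
}
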
(This -	 * 256 times might not be accurate, we will change it later) We -	 * use the following formula to estimate the overhead now: -	 *		  dest->activeconns*256 + dest->inactconns -	 */ -	return (atomic_read(&dest->activeconns) << 8) + -		atomic_read(&dest->inactconns); -} - -  /*   *	Least Connection scheduling   */  static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		  struct ip_vs_iphdr *iph)  {  	struct ip_vs_dest *dest, *least = NULL;  	unsigned int loh = 0, doh; @@ -58,11 +43,11 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  	 * served, but no new connection is assigned to the server.  	 */ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||  		    atomic_read(&dest->weight) == 0)  			continue; -		doh = ip_vs_lc_dest_overhead(dest); +		doh = ip_vs_dest_conn_overhead(dest);  		if (!least || doh < loh) {  			least = dest;  			loh = doh; @@ -70,7 +55,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  	}  	if (!least) -		IP_VS_ERR_RL("LC: no destination available\n"); +		ip_vs_scheduler_err(svc, "no destination available");  	else  		IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "  			      "inactconns %d\n", @@ -100,6 +85,7 @@ static int __init ip_vs_lc_init(void)  static void __exit ip_vs_lc_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +	synchronize_rcu();  }  module_init(ip_vs_lc_init); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 4680647cd45..5882bbfd198 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -19,8 +19,7 @@   * GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   *   *   * Authors: @@ -63,6 +62,7 @@  #include <net/ip_vs.h>  #include <net/netfilter/nf_conntrack_core.h>  #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h>  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_zones.h> @@ -82,7 +82,7 @@ void  ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)  {  	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);  	struct nf_conntrack_tuple new_tuple;  	if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || @@ -97,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)  	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)  		return; +	/* Applications may adjust TCP seqs */ +	if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && +	    !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) +		return; +  	/*  	 * The connection is not yet in the hashtable, so we update it.  	 
* CIP->VIP will remain the same, so leave the tuple in @@ -127,7 +132,7 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)  	nf_conntrack_alter_reply(ct, &new_tuple);  } -int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +int ip_vs_confirm_conntrack(struct sk_buff *skb)  {  	return nf_conntrack_confirm(skb);  } @@ -141,6 +146,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,  	struct nf_conntrack_tuple *orig, new_reply;  	struct ip_vs_conn *cp;  	struct ip_vs_conn_param p; +	struct net *net = nf_ct_net(ct);  	if (exp->tuple.src.l3num != PF_INET)  		return; @@ -155,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,  	/* RS->CLIENT */  	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; -	ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum, +	ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,  			      &orig->src.u3, orig->src.u.tcp.port,  			      &orig->dst.u3, orig->dst.u.tcp.port, &p);  	cp = ip_vs_conn_out_get(&p); @@ -268,7 +274,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)  		" for conn " FMT_CONN "\n",  		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); -	h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); +	h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, +				  &tuple);  	if (h) {  		ct = nf_ct_tuplehash_to_ctrack(h);  		/* Show what happens instead of calling nf_ct_kill() */ diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index c413e183082..961a6de9bb2 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -40,7 +40,7 @@  #include <net/ip_vs.h> -static inline unsigned int +static inline int  ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)  {  	/* @@ -55,10 +55,11 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)   *	Weighted Least Connection scheduling   */  static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		  struct ip_vs_iphdr *iph)  {  	struct ip_vs_dest *dest, *least = NULL; -	unsigned int loh = 0, doh; +	int loh = 0, doh;  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -75,7 +76,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  	 * new connections.  	 
*/ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||  		    !atomic_read(&dest->weight)) @@ -91,15 +92,15 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  		}  		if (!least || -		    (loh * atomic_read(&dest->weight) > -		     doh * atomic_read(&least->weight))) { +		    ((__s64)loh * atomic_read(&dest->weight) > +		     (__s64)doh * atomic_read(&least->weight))) {  			least = dest;  			loh = doh;  		}  	}  	if (!least) { -		IP_VS_ERR_RL("NQ: no destination available\n"); +		ip_vs_scheduler_err(svc, "no destination available");  		return NULL;  	} @@ -133,6 +134,7 @@ static int __init ip_vs_nq_init(void)  static void __exit ip_vs_nq_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +	synchronize_rcu();  }  module_init(ip_vs_nq_init); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 3414af70ee1..1a82b29ce8e 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -13,33 +13,19 @@  /* IPVS pe list */  static LIST_HEAD(ip_vs_pe); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_pe_lock); - -/* Bind a service with a pe */ -void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) -{ -	svc->pe = pe; -} - -/* Unbind a service from its pe */ -void ip_vs_unbind_pe(struct ip_vs_service *svc) -{ -	svc->pe = NULL; -} +/* semaphore for IPVS PEs. */ +static DEFINE_MUTEX(ip_vs_pe_mutex);  /* Get pe in the pe list by name */ -static struct ip_vs_pe * -ip_vs_pe_getbyname(const char *pe_name) +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)  {  	struct ip_vs_pe *pe; -	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__, +	IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,  		  pe_name); -	spin_lock_bh(&ip_vs_pe_lock); - -	list_for_each_entry(pe, &ip_vs_pe, n_list) { +	rcu_read_lock(); +	list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {  		/* Test and get the modules atomically */  		if (pe->module &&  		    !try_module_get(pe->module)) { @@ -48,40 +34,34 @@ ip_vs_pe_getbyname(const char *pe_name)  		}  		if (strcmp(pe_name, pe->name)==0) {  			/* HIT */ -			spin_unlock_bh(&ip_vs_pe_lock); +			rcu_read_unlock();  			return pe;  		}  		if (pe->module)  			module_put(pe->module);  	} +	rcu_read_unlock(); -	spin_unlock_bh(&ip_vs_pe_lock);  	return NULL;  }  /* Lookup pe and try to load it if it doesn't exist */ -struct ip_vs_pe *ip_vs_pe_get(const char *name) +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)  {  	struct ip_vs_pe *pe;  	/* Search for the pe by name */ -	pe = ip_vs_pe_getbyname(name); +	pe = __ip_vs_pe_getbyname(name);  	/* If pe not found, load the module and search again */  	if (!pe) {  		request_module("ip_vs_pe_%s", name); -		pe = ip_vs_pe_getbyname(name); +		pe = __ip_vs_pe_getbyname(name);  	}  	return pe;  } -void ip_vs_pe_put(struct ip_vs_pe *pe) -{ -	if (pe && pe->module) -		module_put(pe->module); -} -  /* Register a pe in the pe list */  int register_ip_vs_pe(struct ip_vs_pe *pe)  { @@ -90,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)  	/* increase the module use count */  	ip_vs_use_count_inc(); -	spin_lock_bh(&ip_vs_pe_lock); - -	if (!list_empty(&pe->n_list)) { -		spin_unlock_bh(&ip_vs_pe_lock); -		ip_vs_use_count_dec(); -		pr_err("%s(): [%s] pe already linked\n", -		       __func__, pe->name); -		return -EINVAL; -	} - +	mutex_lock(&ip_vs_pe_mutex);  	/* Make sure that the pe with this name doesn't exist  	 * in the pe list.  	 
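
PE registration above now serializes writers on ip_vs_pe_mutex, rejects duplicate names, and only then publishes with list_add_rcu() so lock-free readers in __ip_vs_pe_getbyname() see a consistent list. A user-space sketch of the writer side, with an array plus pthread mutex standing in for the RCU-protected kernel list:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MAX_PE 8

static const char *pes[MAX_PE];
static int npe;
static pthread_mutex_t pe_mutex = PTHREAD_MUTEX_INITIALIZER;

static int register_pe(const char *name)
{
	int i, ret = 0;

	pthread_mutex_lock(&pe_mutex);
	for (i = 0; i < npe; i++)
		if (!strcmp(pes[i], name)) {
			ret = -1;	/* "pe already existed in the system" */
			goto out;
		}
	pes[npe++] = name;	/* kernel: list_add_rcu() publishes it */
out:
	pthread_mutex_unlock(&pe_mutex);
	return ret;
}

int main(void)
{
	printf("%d\n", register_pe("sip"));
	printf("%d\n", register_pe("sip"));	/* duplicate -> -1 */
	return 0;
}
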
*/  	list_for_each_entry(tmp, &ip_vs_pe, n_list) {  		if (strcmp(tmp->name, pe->name) == 0) { -			spin_unlock_bh(&ip_vs_pe_lock); +			mutex_unlock(&ip_vs_pe_mutex);  			ip_vs_use_count_dec();  			pr_err("%s(): [%s] pe already existed "  			       "in the system\n", __func__, pe->name); @@ -113,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)  		}  	}  	/* Add it into the d-linked pe list */ -	list_add(&pe->n_list, &ip_vs_pe); -	spin_unlock_bh(&ip_vs_pe_lock); +	list_add_rcu(&pe->n_list, &ip_vs_pe); +	mutex_unlock(&ip_vs_pe_mutex);  	pr_info("[%s] pe registered.\n", pe->name); @@ -125,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);  /* Unregister a pe from the pe list */  int unregister_ip_vs_pe(struct ip_vs_pe *pe)  { -	spin_lock_bh(&ip_vs_pe_lock); -	if (list_empty(&pe->n_list)) { -		spin_unlock_bh(&ip_vs_pe_lock); -		pr_err("%s(): [%s] pe is not in the list. failed\n", -		       __func__, pe->name); -		return -EINVAL; -	} - +	mutex_lock(&ip_vs_pe_mutex);  	/* Remove it from the d-linked pe list */ -	list_del(&pe->n_list); -	spin_unlock_bh(&ip_vs_pe_lock); +	list_del_rcu(&pe->n_list); +	mutex_unlock(&ip_vs_pe_mutex);  	/* decrease the module use count */  	ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index b8b4e9620f3..bed5f704252 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,  				    const char *callid, size_t callid_len,  				    int *idx)  { -	size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); +	size_t max_len = 64; +	size_t len = min3(max_len, callid_len, buf_len - *idx - 1);  	memcpy(buf + *idx, callid, len);  	buf[*idx+len] = '\0';  	*idx += len + 1; @@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff,  		if (ret > 0)  			break;  		if (!ret) -			return 0; +			return -EINVAL;  		dataoff += *matchoff;  	} -	/* Empty callid is useless */ -	if (!*matchlen) -		return -EINVAL; -  	/* Too large is useless */  	if (*matchlen > IP_VS_PEDATA_MAXLEN)  		return -EINVAL; @@ -71,32 +68,36 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)  	struct ip_vs_iphdr iph;  	unsigned int dataoff, datalen, matchoff, matchlen;  	const char *dptr; +	int retc; -	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); +	ip_vs_fill_iph_skb(p->af, skb, &iph);  	/* Only useful with UDP */  	if (iph.protocol != IPPROTO_UDP)  		return -EINVAL; - -	/* No Data ? */ +	/* todo: IPv6 fragments: +	 *       I think this only should be done for the first fragment. 
/HS +	 */  dataoff = iph.len + sizeof(struct udphdr); +  	if (dataoff >= skb->len)  		return -EINVAL; - +	retc = skb_linearize(skb); +	if (retc < 0) +		return retc;  	dptr = skb->data + dataoff;  	datalen = skb->len - dataoff;  	if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))  		return -EINVAL; -	p->pe_data = kmalloc(matchlen, GFP_ATOMIC); -	if (!p->pe_data) -		return -ENOMEM; -  	/* N.B: pe_data is only set on success,  	 * this allows fallback to the default persistence logic on failure  	 */ -	memcpy(p->pe_data, dptr + matchoff, matchlen); +	p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); +	if (!p->pe_data) +		return -ENOMEM; +  	p->pe_data_len = matchlen;  	return 0; @@ -106,7 +107,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,  				  struct ip_vs_conn *ct)  { -	bool ret = 0; +	bool ret = false;  	if (ct->af == p->af &&  	    ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && @@ -119,7 +120,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,  	    ct->protocol == p->protocol &&  	    ct->pe_data && ct->pe_data_len == p->pe_data_len &&  	    !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) -		ret = 1; +		ret = true;  	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",  		      ip_vs_proto_name(p->protocol), @@ -162,6 +163,7 @@ static int __init ip_vs_sip_init(void)  static void __exit ip_vs_sip_cleanup(void)  {  	unregister_ip_vs_pe(&ip_vs_sip_pe); +	synchronize_rcu();  }  module_init(ip_vs_sip_init); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index c5399839087..939f7fbe9b4 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -25,7 +25,6 @@  #include <net/protocol.h>  #include <net/tcp.h>  #include <net/udp.h> -#include <asm/system.h>  #include <linux/stat.h>  #include <linux/proc_fs.h> @@ -49,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];   */  static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)  { -	unsigned hash = IP_VS_PROTO_HASH(pp->protocol); +	unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);  	pp->next = ip_vs_proto_table[hash];  	ip_vs_proto_table[hash] = pp; @@ -60,6 +59,37 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)  	return 0;  } +/* + *	register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); +	struct ip_vs_proto_data *pd = +			kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); + +	if (!pd) +		return -ENOMEM; + +	pd->pp = pp;	/* For speed issues */ +	pd->next = ipvs->proto_data_table[hash]; +	ipvs->proto_data_table[hash] = pd; +	atomic_set(&pd->appcnt, 0);	/* Init app counter */ + +	if (pp->init_netns != NULL) { +		int ret = pp->init_netns(net, pd); +		if (ret) { +			/* unlink and free proto data */ +			ipvs->proto_data_table[hash] = pd->next; +			kfree(pd); +			return ret; +		} +	} + +	return 0; +}  /*   *	unregister an ipvs protocol @@ -67,7 +97,7 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)  static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)  {  	struct ip_vs_protocol **pp_p; -	unsigned hash = IP_VS_PROTO_HASH(pp->protocol); +	unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);  	pp_p = &ip_vs_proto_table[hash];  	for (; *pp_p; pp_p = &(*pp_p)->next) { @@ -82,6 +112,29 @@ static int 
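
The kmemdup() conversion in ip_vs_sip_fill_param() above also reorders the logic so p->pe_data is assigned only after the copy has succeeded, preserving the documented fallback to default persistence on failure. A user-space sketch, with malloc() standing in for kmalloc(GFP_ATOMIC):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void *kmemdup(const void *src, size_t len)	/* user-space stand-in */
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}

struct conn_param { char *pe_data; size_t pe_data_len; };

static int fill_param(struct conn_param *p, const char *callid, size_t len)
{
	char *copy = kmemdup(callid, len);

	if (!copy)
		return -1;	/* p->pe_data untouched on failure */
	p->pe_data = copy;
	p->pe_data_len = len;
	return 0;
}

int main(void)
{
	struct conn_param p = { 0 };

	if (!fill_param(&p, "abc123@host", 11))
		printf("pe_data_len=%zu\n", p.pe_data_len);
	free(p.pe_data);
	return 0;
}
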
unregister_ip_vs_protocol(struct ip_vs_protocol *pp)  	return -ESRCH;  } +/* + *	unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_proto_data **pd_p; +	unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); + +	pd_p = &ipvs->proto_data_table[hash]; +	for (; *pd_p; pd_p = &(*pd_p)->next) { +		if (*pd_p == pd) { +			*pd_p = pd->next; +			if (pd->pp->exit_netns != NULL) +				pd->pp->exit_netns(net, pd); +			kfree(pd); +			return 0; +		} +	} + +	return -ESRCH; +}  /*   *	get ip_vs_protocol object by its proto. @@ -89,7 +142,7 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)  struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)  {  	struct ip_vs_protocol *pp; -	unsigned hash = IP_VS_PROTO_HASH(proto); +	unsigned int hash = IP_VS_PROTO_HASH(proto);  	for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {  		if (pp->protocol == proto) @@ -100,19 +153,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)  }  EXPORT_SYMBOL(ip_vs_proto_get); +/* + *	get ip_vs_protocol object data by netns and proto + */ +static struct ip_vs_proto_data * +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ +	struct ip_vs_proto_data *pd; +	unsigned int hash = IP_VS_PROTO_HASH(proto); + +	for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { +		if (pd->pp->protocol == proto) +			return pd; +	} + +	return NULL; +} + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	return __ipvs_proto_data_get(ipvs, proto); +} +EXPORT_SYMBOL(ip_vs_proto_data_get);  /*   *	Propagate event for state change to all protocols   */ -void ip_vs_protocol_timeout_change(int flags) +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)  { -	struct ip_vs_protocol *pp; +	struct ip_vs_proto_data *pd;  	int i;  	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { -		for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { -			if (pp->timeout_change) -				pp->timeout_change(pp, flags); +		for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { +			if (pd->pp->timeout_change) +				pd->pp->timeout_change(pd, flags);  		}  	}  } @@ -121,7 +199,7 @@ void ip_vs_protocol_timeout_change(int flags)  int *  ip_vs_create_timeout_table(int *table, int size)  { -	return kmemdup(table, size, GFP_ATOMIC); +	return kmemdup(table, size, GFP_KERNEL);  } @@ -202,17 +280,17 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,  	if (ih == NULL)  		sprintf(buf, "TRUNCATED");  	else if (ih->nexthdr == IPPROTO_FRAGMENT) -		sprintf(buf, "%pI6->%pI6 frag",	&ih->saddr, &ih->daddr); +		sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr);  	else {  		__be16 _ports[2], *pptr;  		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),  					  sizeof(_ports), _ports);  		if (pptr == NULL) -			sprintf(buf, "TRUNCATED %pI6->%pI6", +			sprintf(buf, "TRUNCATED %pI6c->%pI6c",  				&ih->saddr, &ih->daddr);  		else -			sprintf(buf, "%pI6:%u->%pI6:%u", +			sprintf(buf, "%pI6c:%u->%pI6c:%u",  				&ih->saddr, ntohs(pptr[0]),  				&ih->daddr, ntohs(pptr[1]));  	} @@ -236,6 +314,54 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,  		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);  } +/* + * per network name-space init + */ +int __net_init ip_vs_protocol_net_init(struct net *net) +{ +	int i, ret; +	static struct ip_vs_protocol *protos[] = { +#ifdef 
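
unregister_ip_vs_proto_netns() above walks the bucket with a pointer-to-pointer (pd_p), which lets the loop splice an entry out of the singly linked chain without tracking a separate "previous" pointer. A self-contained sketch of that unlink idiom (the protocol numbers are just sample values):

#include <stdio.h>

struct pd { int proto; struct pd *next; };

static int unlink_pd(struct pd **head, struct pd *victim)
{
	struct pd **pd_p;

	/* pd_p points at the link that holds the current node */
	for (pd_p = head; *pd_p; pd_p = &(*pd_p)->next)
		if (*pd_p == victim) {
			*pd_p = victim->next;	/* splice out */
			return 0;
		}
	return -1;	/* -ESRCH in the kernel version */
}

int main(void)
{
	struct pd c = { 132, NULL }, b = { 17, &c }, a = { 6, &b };
	struct pd *head = &a;

	unlink_pd(&head, &b);
	for (struct pd *p = head; p; p = p->next)
		printf("%d ", p->proto);	/* prints: 6 132 */
	printf("\n");
	return 0;
}
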
CONFIG_IP_VS_PROTO_TCP +        &ip_vs_protocol_tcp, +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP +	&ip_vs_protocol_udp, +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP +	&ip_vs_protocol_sctp, +#endif +#ifdef CONFIG_IP_VS_PROTO_AH +	&ip_vs_protocol_ah, +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP +	&ip_vs_protocol_esp, +#endif +	}; + +	for (i = 0; i < ARRAY_SIZE(protos); i++) { +		ret = register_ip_vs_proto_netns(net, protos[i]); +		if (ret < 0) +			goto cleanup; +	} +	return 0; + +cleanup: +	ip_vs_protocol_net_cleanup(net); +	return ret; +} + +void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_proto_data *pd; +	int i; + +	/* unregister all the ipvs proto data for this netns */ +	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { +		while ((pd = ipvs->proto_data_table[i]) != NULL) +			unregister_ip_vs_proto_netns(net, pd); +	} +}  int __init ip_vs_protocol_init(void)  { diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 3a0461117d3..5de3dd312c0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -41,28 +41,30 @@ struct isakmp_hdr {  #define PORT_ISAKMP	500  static void -ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, -			     int inverse, struct ip_vs_conn_param *p) +ah_esp_conn_fill_param_proto(struct net *net, int af, +			     const struct ip_vs_iphdr *iph, int inverse, +			     struct ip_vs_conn_param *p)  {  	if (likely(!inverse)) -		ip_vs_conn_fill_param(af, IPPROTO_UDP, +		ip_vs_conn_fill_param(net, af, IPPROTO_UDP,  				      &iph->saddr, htons(PORT_ISAKMP),  				      &iph->daddr, htons(PORT_ISAKMP), p);  	else -		ip_vs_conn_fill_param(af, IPPROTO_UDP, +		ip_vs_conn_fill_param(net, af, IPPROTO_UDP,  				      &iph->daddr, htons(PORT_ISAKMP),  				      &iph->saddr, htons(PORT_ISAKMP), p);  }  static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, -		   const struct ip_vs_iphdr *iph, unsigned int proto_off, +ah_esp_conn_in_get(int af, const struct sk_buff *skb, +		   const struct ip_vs_iphdr *iph,  		   int inverse)  {  	struct ip_vs_conn *cp;  	struct ip_vs_conn_param p; +	struct net *net = skb_net(skb); -	ah_esp_conn_fill_param_proto(af, iph, inverse, &p); +	ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);  	cp = ip_vs_conn_in_get(&p);  	if (!cp) {  		/* @@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,  		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "  			      "%s%s %s->%s\n",  			      inverse ? "ICMP+" : "", -			      pp->name, +			      ip_vs_proto_get(iph->protocol)->name,  			      IP_VS_DBG_ADDR(af, &iph->saddr),  			      IP_VS_DBG_ADDR(af, &iph->daddr));  	} @@ -83,21 +85,19 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,  static struct ip_vs_conn *  ah_esp_conn_out_get(int af, const struct sk_buff *skb, -		    struct ip_vs_protocol *pp, -		    const struct ip_vs_iphdr *iph, -		    unsigned int proto_off, -		    int inverse) +		    const struct ip_vs_iphdr *iph, int inverse)  {  	struct ip_vs_conn *cp;  	struct ip_vs_conn_param p; +	struct net *net = skb_net(skb); -	ah_esp_conn_fill_param_proto(af, iph, inverse, &p); +	ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);  	cp = ip_vs_conn_out_get(&p);  	if (!cp) {  		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "  			      "%s%s %s->%s\n",  			      inverse ? 
"ICMP+" : "", -			      pp->name, +			      ip_vs_proto_get(iph->protocol)->name,  			      IP_VS_DBG_ADDR(af, &iph->saddr),  			      IP_VS_DBG_ADDR(af, &iph->daddr));  	} @@ -107,8 +107,9 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,  static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, -		     int *verdict, struct ip_vs_conn **cpp) +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +		     int *verdict, struct ip_vs_conn **cpp, +		     struct ip_vs_iphdr *iph)  {  	/*  	 * AH/ESP is only related traffic. Pass the packet to IP stack. @@ -117,26 +118,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,  	return 0;  } -static void ah_esp_init(struct ip_vs_protocol *pp) -{ -	/* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ -	/* nothing to do now */ -} - -  #ifdef CONFIG_IP_VS_PROTO_AH  struct ip_vs_protocol ip_vs_protocol_ah = {  	.name =			"AH",  	.protocol =		IPPROTO_AH,  	.num_states =		1,  	.dont_defrag =		1, -	.init =			ah_esp_init, -	.exit =			ah_esp_exit, +	.init =			NULL, +	.exit =			NULL,  	.conn_schedule =	ah_esp_conn_schedule,  	.conn_in_get =		ah_esp_conn_in_get,  	.conn_out_get =		ah_esp_conn_out_get, @@ -149,7 +138,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {  	.app_conn_bind =	NULL,  	.debug_packet =		ip_vs_tcpudp_debug_packet,  	.timeout_change =	NULL,		/* ISAKMP */ -	.set_state_timeout =	NULL,  };  #endif @@ -159,8 +147,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {  	.protocol =		IPPROTO_ESP,  	.num_states =		1,  	.dont_defrag =		1, -	.init =			ah_esp_init, -	.exit =			ah_esp_exit, +	.init =			NULL, +	.exit =			NULL,  	.conn_schedule =	ah_esp_conn_schedule,  	.conn_in_get =		ah_esp_conn_in_get,  	.conn_out_get =		ah_esp_conn_out_get, diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1ea96bcd342..2f7ea756404 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -9,36 +9,43 @@  #include <net/ip_vs.h>  static int -sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, -		   int *verdict, struct ip_vs_conn **cpp) +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +		   int *verdict, struct ip_vs_conn **cpp, +		   struct ip_vs_iphdr *iph)  { +	struct net *net;  	struct ip_vs_service *svc; +	struct netns_ipvs *ipvs;  	sctp_chunkhdr_t _schunkh, *sch;  	sctp_sctphdr_t *sh, _sctph; -	struct ip_vs_iphdr iph; -	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - -	sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph); -	if (sh == NULL) +	sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); +	if (sh == NULL) { +		*verdict = NF_DROP;  		return 0; +	} -	sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t), +	sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),  				 sizeof(_schunkh), &_schunkh); -	if (sch == NULL) +	if (sch == NULL) { +		*verdict = NF_DROP;  		return 0; +	} -	if ((sch->type == SCTP_CID_INIT) && -	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, -				     &iph.daddr, sh->dest))) { +	net = skb_net(skb); +	ipvs = net_ipvs(net); +	rcu_read_lock(); +	if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && +	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, +				      &iph->daddr, sh->dest))) {  		int ignored; -		if (ip_vs_todrop()) { +		if (ip_vs_todrop(ipvs)) {  			/*  			 * It seems that we are very loaded.  			 
* We have to drop this packet :(  			 */ -			ip_vs_service_put(svc); +			rcu_read_unlock();  			*verdict = NF_DROP;  			return 0;  		} @@ -46,101 +53,119 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,  		 * Let the virtual server select a real server for the  		 * incoming connection, and create a connection entry.  		 */ -		*cpp = ip_vs_schedule(svc, skb, pp, &ignored); -		if (!*cpp && !ignored) { -			*verdict = ip_vs_leave(svc, skb, pp); +		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); +		if (!*cpp && ignored <= 0) { +			if (!ignored) +				*verdict = ip_vs_leave(svc, skb, pd, iph); +			else +				*verdict = NF_DROP; +			rcu_read_unlock();  			return 0;  		} -		ip_vs_service_put(svc);  	} - +	rcu_read_unlock(); +	/* NF_ACCEPT */  	return 1;  } +static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph, +			  unsigned int sctphoff) +{ +	sctph->checksum = sctp_compute_cksum(skb, sctphoff); +	skb->ip_summed = CHECKSUM_UNNECESSARY; +} +  static int -sctp_snat_handler(struct sk_buff *skb, -		  struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, +		  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  {  	sctp_sctphdr_t *sctph; -	unsigned int sctphoff; -	struct sk_buff *iter; -	__be32 crc32; +	unsigned int sctphoff = iph->len; +	bool payload_csum = false;  #ifdef CONFIG_IP_VS_IPV6 -	if (cp->af == AF_INET6) -		sctphoff = sizeof(struct ipv6hdr); -	else +	if (cp->af == AF_INET6 && iph->fragoffs) +		return 1;  #endif -		sctphoff = ip_hdrlen(skb);  	/* csum_check requires unshared skb */  	if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))  		return 0;  	if (unlikely(cp->app != NULL)) { +		int ret; +  		/* Some checks before mangling */  		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))  			return 0;  		/* Call application helper if needed */ -		if (!ip_vs_app_pkt_out(cp, skb)) +		ret = ip_vs_app_pkt_out(cp, skb); +		if (ret == 0)  			return 0; +		/* ret=2: csum update is needed after payload mangling */ +		if (ret == 2) +			payload_csum = true;  	}  	sctph = (void *) skb_network_header(skb) + sctphoff; -	sctph->source = cp->vport; -	/* Calculate the checksum */ -	crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); -	skb_walk_frags(skb, iter) -		crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), -				          crc32); -	crc32 = sctp_end_cksum(crc32); -	sctph->checksum = crc32; +	/* Only update csum if we really have to */ +	if (sctph->source != cp->vport || payload_csum || +	    skb->ip_summed == CHECKSUM_PARTIAL) { +		sctph->source = cp->vport; +		sctp_nat_csum(skb, sctph, sctphoff); +	} else { +		skb->ip_summed = CHECKSUM_UNNECESSARY; +	}  	return 1;  }  static int -sctp_dnat_handler(struct sk_buff *skb, -		  struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, +		  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  {  	sctp_sctphdr_t *sctph; -	unsigned int sctphoff; -	struct sk_buff *iter; -	__be32 crc32; +	unsigned int sctphoff = iph->len; +	bool payload_csum = false;  #ifdef CONFIG_IP_VS_IPV6 -	if (cp->af == AF_INET6) -		sctphoff = sizeof(struct ipv6hdr); -	else +	if (cp->af == AF_INET6 && iph->fragoffs) +		return 1;  #endif -		sctphoff = ip_hdrlen(skb);  	/* csum_check requires unshared skb */  	if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))  		return 0;  	if (unlikely(cp->app != NULL)) { +		int ret; +  		/* Some checks before mangling */  		if (pp->csum_check && 
!pp->csum_check(cp->af, skb, pp))  			return 0;  		/* Call application helper if needed */ -		if (!ip_vs_app_pkt_in(cp, skb)) +		ret = ip_vs_app_pkt_in(cp, skb); +		if (ret == 0)  			return 0; +		/* ret=2: csum update is needed after payload mangling */ +		if (ret == 2) +			payload_csum = true;  	}  	sctph = (void *) skb_network_header(skb) + sctphoff; -	sctph->dest = cp->dport; -	/* Calculate the checksum */ -	crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); -	skb_walk_frags(skb, iter) -		crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), -					  crc32); -	crc32 = sctp_end_cksum(crc32); -	sctph->checksum = crc32; +	/* Only update csum if we really have to */ +	if (sctph->dest != cp->dport || payload_csum || +	    (skb->ip_summed == CHECKSUM_PARTIAL && +	     !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { +		sctph->dest = cp->dport; +		sctp_nat_csum(skb, sctph, sctphoff); +	} else if (skb->ip_summed != CHECKSUM_PARTIAL) { +		skb->ip_summed = CHECKSUM_UNNECESSARY; +	}  	return 1;  } @@ -150,10 +175,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)  {  	unsigned int sctphoff;  	struct sctphdr *sh, _sctph; -	struct sk_buff *iter; -	__le32 cmp; -	__le32 val; -	__u32 tmp; +	__le32 cmp, val;  #ifdef CONFIG_IP_VS_IPV6  	if (af == AF_INET6) @@ -167,13 +189,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)  		return 0;  	cmp = sh->checksum; - -	tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); -	skb_walk_frags(skb, iter) -		tmp = sctp_update_cksum((__u8 *) iter->data, -					skb_headlen(iter), tmp); - -	val = sctp_end_cksum(tmp); +	val = sctp_compute_cksum(skb, sctphoff);  	if (val != cmp) {  		/* CRC failure, dump it. */ @@ -184,710 +200,159 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)  	return 1;  } -struct ipvs_sctp_nextstate { -	int next_state; -};  enum ipvs_sctp_event_t { -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_DATA_SER, -	IP_VS_SCTP_EVE_INIT_CLI, -	IP_VS_SCTP_EVE_INIT_SER, -	IP_VS_SCTP_EVE_INIT_ACK_CLI, -	IP_VS_SCTP_EVE_INIT_ACK_SER, -	IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, -	IP_VS_SCTP_EVE_COOKIE_ECHO_SER, -	IP_VS_SCTP_EVE_COOKIE_ACK_CLI, -	IP_VS_SCTP_EVE_COOKIE_ACK_SER, -	IP_VS_SCTP_EVE_ABORT_CLI, -	IP_VS_SCTP_EVE__ABORT_SER, -	IP_VS_SCTP_EVE_SHUT_CLI, -	IP_VS_SCTP_EVE_SHUT_SER, -	IP_VS_SCTP_EVE_SHUT_ACK_CLI, -	IP_VS_SCTP_EVE_SHUT_ACK_SER, -	IP_VS_SCTP_EVE_SHUT_COM_CLI, -	IP_VS_SCTP_EVE_SHUT_COM_SER, -	IP_VS_SCTP_EVE_LAST +	IP_VS_SCTP_DATA = 0,		/* DATA, SACK, HEARTBEATs */ +	IP_VS_SCTP_INIT, +	IP_VS_SCTP_INIT_ACK, +	IP_VS_SCTP_COOKIE_ECHO, +	IP_VS_SCTP_COOKIE_ACK, +	IP_VS_SCTP_SHUTDOWN, +	IP_VS_SCTP_SHUTDOWN_ACK, +	IP_VS_SCTP_SHUTDOWN_COMPLETE, +	IP_VS_SCTP_ERROR, +	IP_VS_SCTP_ABORT, +	IP_VS_SCTP_EVENT_LAST  }; -static enum ipvs_sctp_event_t sctp_events[255] = { -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_INIT_CLI, -	IP_VS_SCTP_EVE_INIT_ACK_CLI, -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_ABORT_CLI, -	IP_VS_SCTP_EVE_SHUT_CLI, -	IP_VS_SCTP_EVE_SHUT_ACK_CLI, -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, -	IP_VS_SCTP_EVE_COOKIE_ACK_CLI, -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_DATA_CLI, -	IP_VS_SCTP_EVE_SHUT_COM_CLI, +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { +	[SCTP_CID_DATA]			= IP_VS_SCTP_DATA, +	[SCTP_CID_INIT]			= IP_VS_SCTP_INIT, +	[SCTP_CID_INIT_ACK]		= IP_VS_SCTP_INIT_ACK, +	[SCTP_CID_SACK]			= IP_VS_SCTP_DATA, +	[SCTP_CID_HEARTBEAT]		= IP_VS_SCTP_DATA, +	
[SCTP_CID_HEARTBEAT_ACK]	= IP_VS_SCTP_DATA, +	[SCTP_CID_ABORT]		= IP_VS_SCTP_ABORT, +	[SCTP_CID_SHUTDOWN]		= IP_VS_SCTP_SHUTDOWN, +	[SCTP_CID_SHUTDOWN_ACK]		= IP_VS_SCTP_SHUTDOWN_ACK, +	[SCTP_CID_ERROR]		= IP_VS_SCTP_ERROR, +	[SCTP_CID_COOKIE_ECHO]		= IP_VS_SCTP_COOKIE_ECHO, +	[SCTP_CID_COOKIE_ACK]		= IP_VS_SCTP_COOKIE_ACK, +	[SCTP_CID_ECN_ECNE]		= IP_VS_SCTP_DATA, +	[SCTP_CID_ECN_CWR]		= IP_VS_SCTP_DATA, +	[SCTP_CID_SHUTDOWN_COMPLETE]	= IP_VS_SCTP_SHUTDOWN_COMPLETE,  }; -static struct ipvs_sctp_nextstate - sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = { -	/* -	 * STATE : IP_VS_SCTP_S_NONE -	 */ -	/*next state *//*event */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }, -	 }, -	/* -	 * STATE : IP_VS_SCTP_S_INIT_CLI -	 * Cient sent INIT and is waiting for reply from server(In ECHO_WAIT) -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_ECHO_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_INIT_SER -	 * Server sent INIT and waiting for INIT ACK from the client -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 
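/* The sctp_events[] table maps an SCTP chunk type (SCTP_CID_*) straight to
 * an IPVS event, and set_sctp_state() below treats out-of-range chunk types
 * as data. The same lookup expressed as a helper (the function name is
 * illustrative; the logic mirrors this patch):
 */
static int sctp_chunk_to_event(__u8 chunk_type)
{
	if (chunk_type < sizeof(sctp_events))
		return sctp_events[chunk_type];
	return IP_VS_SCTP_DATA;	/* unknown chunk types count as data */
}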
{IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_INIT_ACK_CLI -	 * Client sent INIT ACK and waiting for ECHO from the server -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK has been resent by the client, let us stay is in -	  * the same state -	  */ -	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 /* -	  * INIT_ACK sent by the server, close the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * ECHO by client, it should not happen, close the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 /* -	  * ECHO by server, this is what we are expecting, move to ECHO_SER -	  */ -	 {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, it should not happen, close the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 /* -	  * Unexpected COOKIE ACK from server, staty in the same state -	  */ -	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_INIT_ACK_SER -	 * Server sent INIT ACK and waiting for ECHO from the client -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). 
-	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * Unexpected INIT_ACK by the client, let us close the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 /* -	  * INIT_ACK resent by the server, let us move to same state -	  */ -	 {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client send the ECHO, this is what we are expecting, -	  * move to ECHO_CLI -	  */ -	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 /* -	  * ECHO received from the server, Not sure what to do, -	  * let us close it -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, let us stay in the same state -	  */ -	 {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 /* -	  * COOKIE ACK from server, hmm... this should not happen, lets close -	  * the connection. -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_ECHO_CLI -	 * Cient  sent ECHO and waiting COOKEI ACK from the Server -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK has been by the client, let us close the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. Stay in the same state -	  */ -	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client resent the ECHO, let us stay in the same state -	  */ -	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 /* -	  * ECHO received from the server, Not sure what to do, -	  * let us close it -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, this shoud not happen, let's close the -	  * connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 /* -	  * COOKIE ACK from server, this is what we are awaiting,lets move to -	  * ESTABLISHED. 
-	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_ECHO_SER -	 * Server sent ECHO and waiting COOKEI ACK from the client -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. Stay in the same state -	  */ -	 {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 /* -	  * INIT_ACK has been by the server, let us close the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client sent the ECHO, not sure what to do, let's close the -	  * connection. -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 /* -	  * ECHO resent by the server, stay in the same state -	  */ -	 {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, this is what we are expecting, let's move -	  * to ESTABLISHED. -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 /* -	  * COOKIE ACK from server, this should not happen, lets close the -	  * connection. -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_ESTABLISHED -	 * Association established -	 */ -	{{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). 
-	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. Stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the -	  * peer and peer shall move to the ESTABISHED. if it doesn't handle -	  * it will send ERROR chunk. So, stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, not sure what to do stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 /* -	  * SHUTDOWN from the client, move to SHUDDOWN_CLI -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 /* -	  * SHUTDOWN from the server, move to SHUTDOWN_SER -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 /* -	  * client sent SHUDTDOWN_ACK, this should not happen, let's close -	  * the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_SHUT_CLI -	 * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server -	 */ -	/* -	 * We recieved the data chuck, keep the state unchanged. I assume -	 * that still data chuncks  can be received by both the peers in -	 * SHUDOWN state -	 */ - -	{{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. Stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the -	  * peer and peer shall move to the ESTABISHED. if it doesn't handle -	  * it will send ERROR chunk. 
So, stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, not sure what to do stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 /* -	  * SHUTDOWN resent from the client, move to SHUDDOWN_CLI -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 /* -	  * SHUTDOWN from the server, move to SHUTDOWN_SER -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 /* -	  * client sent SHUDTDOWN_ACK, this should not happen, let's close -	  * the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 /* -	  * Server sent SHUTDOWN ACK, this is what we are expecting, let's move -	  * to SHUDOWN_ACK_SER -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 /* -	  * SHUTDOWN COM from client, this should not happen, let's close the -	  * connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_SHUT_SER -	 * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client -	 */ -	/* -	 * We recieved the data chuck, keep the state unchanged. I assume -	 * that still data chuncks  can be received by both the peers in -	 * SHUDOWN state -	 */ - -	{{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. Stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the -	  * peer and peer shall move to the ESTABISHED. if it doesn't handle -	  * it will send ERROR chunk. 
So, stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, not sure what to do stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 /* -	  * SHUTDOWN resent from the client, move to SHUDDOWN_CLI -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 /* -	  * SHUTDOWN resent from the server, move to SHUTDOWN_SER -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 /* -	  * client sent SHUDTDOWN_ACK, this is what we are expecting, let's -	  * move to SHUT_ACK_CLI -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 /* -	  * Server sent SHUTDOWN ACK, this should not happen, let's close the -	  * connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 /* -	  * SHUTDOWN COM from client, this should not happen, let's close the -	  * connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, - -	/* -	 * State : IP_VS_SCTP_S_SHUT_ACK_CLI -	 * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server -	 */ -	/* -	 * We recieved the data chuck, keep the state unchanged. I assume -	 * that still data chuncks  can be received by both the peers in -	 * SHUDOWN state -	 */ - -	{{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. Stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the -	  * peer and peer shall move to the ESTABISHED. if it doesn't handle -	  * it will send ERROR chunk. 
So, stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, not sure what to do stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 /* -	  * SHUTDOWN sent from the client, move to SHUDDOWN_CLI -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 /* -	  * SHUTDOWN sent from the server, move to SHUTDOWN_SER -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 /* -	  * client resent SHUDTDOWN_ACK, let's stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 /* -	  * Server sent SHUTDOWN ACK, this should not happen, let's close the -	  * connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 /* -	  * SHUTDOWN COM from client, this should not happen, let's close the -	  * connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 /* -	  * SHUTDOWN COMPLETE from server this is what we are expecting. -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, - -	/* -	 * State : IP_VS_SCTP_S_SHUT_ACK_SER -	 * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client -	 */ -	/* -	 * We recieved the data chuck, keep the state unchanged. I assume -	 * that still data chuncks  can be received by both the peers in -	 * SHUDOWN state -	 */ +/* SCTP States: + * See RFC 2960, 4. SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. + * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ -	{{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 /* -	  * We have got an INIT from client. From the spec.“Upon receipt of -	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with -	  * an INIT ACK using the same parameters it sent in its  original -	  * INIT chunk (including its Initiate Tag, unchanged”). -	  */ -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 /* -	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, -	  * “If an INIT ACK is received by an endpoint in any state other -	  * than the COOKIE-WAIT state, the endpoint should discard the -	  * INIT ACK chunk”. 
Stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 /* -	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the -	  * peer and peer shall move to the ESTABISHED. if it doesn't handle -	  * it will send ERROR chunk. So, stay in the same state -	  */ -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 /* -	  * COOKIE ACK from client, not sure what to do stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 /* -	  * SHUTDOWN sent from the client, move to SHUDDOWN_CLI -	  */ -	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 /* -	  * SHUTDOWN sent from the server, move to SHUTDOWN_SER -	  */ -	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 /* -	  * client sent SHUDTDOWN_ACK, this should not happen let's close -	  * the connection. -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 /* -	  * Server resent SHUTDOWN ACK, stay in the same state -	  */ -	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 /* -	  * SHUTDOWN COM from client, this what we are expecting, let's close -	  * the connection -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 /* -	  * SHUTDOWN COMPLETE from server this should not happen. -	  */ -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 }, -	/* -	 * State : IP_VS_SCTP_S_CLOSED -	 */ -	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, -	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, -	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, -	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } -	 } +#define sNO IP_VS_SCTP_S_NONE +#define sI1 IP_VS_SCTP_S_INIT1 +#define sIN IP_VS_SCTP_S_INIT +#define sCS IP_VS_SCTP_S_COOKIE_SENT +#define sCR IP_VS_SCTP_S_COOKIE_REPLIED +#define sCW IP_VS_SCTP_S_COOKIE_WAIT +#define sCO IP_VS_SCTP_S_COOKIE +#define sCE IP_VS_SCTP_S_COOKIE_ECHOED +#define sES IP_VS_SCTP_S_ESTABLISHED +#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT +#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ IP_VS_SCTP_S_REJECTED +#define sCL IP_VS_SCTP_S_CLOSED + +static const __u8 sctp_states +	[IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { +	{ /* INPUT */ +/*        sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, 
sRJ, sCL*/ +/* d   */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i   */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s   */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab  */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +	}, +	{ /* OUTPUT */ +/*        sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d   */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i   */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, +/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s   */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, +/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab  */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +	}, +	{ /* INPUT-ONLY */ +/*        sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d   */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i   */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s   */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab  */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +	},  }; -/* - *      Timeout table[state] - */ -static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { -	[IP_VS_SCTP_S_NONE]         =     2 * HZ, -	[IP_VS_SCTP_S_INIT_CLI]     =     1 * 60 * HZ, -	[IP_VS_SCTP_S_INIT_SER]     =     1 * 60 * HZ, -	[IP_VS_SCTP_S_INIT_ACK_CLI] =     1 * 60 * HZ, -	[IP_VS_SCTP_S_INIT_ACK_SER] =     1 * 60 * HZ, -	[IP_VS_SCTP_S_ECHO_CLI]     =     1 * 60 * HZ, -	[IP_VS_SCTP_S_ECHO_SER]     =     1 * 60 * HZ, -	[IP_VS_SCTP_S_ESTABLISHED]  =    15 * 60 * HZ, -	[IP_VS_SCTP_S_SHUT_CLI]     =     1 * 60 * HZ, -	[IP_VS_SCTP_S_SHUT_SER]     =     1 * 60 * HZ, -	[IP_VS_SCTP_S_SHUT_ACK_CLI] =     1 * 60 * HZ, -	[IP_VS_SCTP_S_SHUT_ACK_SER] =     1 * 60 * HZ, -	[IP_VS_SCTP_S_CLOSED]       =    10 * HZ, -	[IP_VS_SCTP_S_LAST]         =     2 * HZ, +#define IP_VS_SCTP_MAX_RTO	((60 + 1) * HZ) + +/* Timeout table[state] */ +static const int 
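/* The replacement state machine is a dense three-dimensional lookup:
 * direction, then event, then current state. A usage sketch with two
 * transitions read off the INPUT table above:
 */
static inline __u8 sctp_next_state(int dir, int event, int state)
{
	return sctp_states[dir][event][state];
}
/* sctp_next_state(IP_VS_DIR_INPUT, IP_VS_SCTP_ABORT, IP_VS_SCTP_S_ESTABLISHED)
 *	== IP_VS_SCTP_S_CLOSED: a client ABORT tears the association down;
 * sctp_next_state(IP_VS_DIR_INPUT, IP_VS_SCTP_INIT, IP_VS_SCTP_S_NONE)
 *	== IP_VS_SCTP_S_INIT1: the very first INIT enters the short-timeout
 *	INIT1 state, while a retransmitted INIT moves on to INIT.
 */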
sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { +	[IP_VS_SCTP_S_NONE]			= 2 * HZ, +	[IP_VS_SCTP_S_INIT1]			= (0 + 3 + 1) * HZ, +	[IP_VS_SCTP_S_INIT]			= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_COOKIE_SENT]		= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_COOKIE_REPLIED]		= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_COOKIE_WAIT]		= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_COOKIE]			= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_COOKIE_ECHOED]		= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_ESTABLISHED]		= 15 * 60 * HZ, +	[IP_VS_SCTP_S_SHUTDOWN_SENT]		= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_SHUTDOWN_RECEIVED]	= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_SHUTDOWN_ACK_SENT]	= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_REJECTED]			= (0 + 3 + 1) * HZ, +	[IP_VS_SCTP_S_CLOSED]			= IP_VS_SCTP_MAX_RTO, +	[IP_VS_SCTP_S_LAST]			= 2 * HZ,  };  static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { -	[IP_VS_SCTP_S_NONE]         =    "NONE", -	[IP_VS_SCTP_S_INIT_CLI]     =    "INIT_CLI", -	[IP_VS_SCTP_S_INIT_SER]     =    "INIT_SER", -	[IP_VS_SCTP_S_INIT_ACK_CLI] =    "INIT_ACK_CLI", -	[IP_VS_SCTP_S_INIT_ACK_SER] =    "INIT_ACK_SER", -	[IP_VS_SCTP_S_ECHO_CLI]     =    "COOKIE_ECHO_CLI", -	[IP_VS_SCTP_S_ECHO_SER]     =    "COOKIE_ECHO_SER", -	[IP_VS_SCTP_S_ESTABLISHED]  =    "ESTABISHED", -	[IP_VS_SCTP_S_SHUT_CLI]     =    "SHUTDOWN_CLI", -	[IP_VS_SCTP_S_SHUT_SER]     =    "SHUTDOWN_SER", -	[IP_VS_SCTP_S_SHUT_ACK_CLI] =    "SHUTDOWN_ACK_CLI", -	[IP_VS_SCTP_S_SHUT_ACK_SER] =    "SHUTDOWN_ACK_SER", -	[IP_VS_SCTP_S_CLOSED]       =    "CLOSED", -	[IP_VS_SCTP_S_LAST]         =    "BUG!" +	[IP_VS_SCTP_S_NONE]			= "NONE", +	[IP_VS_SCTP_S_INIT1]			= "INIT1", +	[IP_VS_SCTP_S_INIT]			= "INIT", +	[IP_VS_SCTP_S_COOKIE_SENT]		= "C-SENT", +	[IP_VS_SCTP_S_COOKIE_REPLIED]		= "C-REPLIED", +	[IP_VS_SCTP_S_COOKIE_WAIT]		= "C-WAIT", +	[IP_VS_SCTP_S_COOKIE]			= "COOKIE", +	[IP_VS_SCTP_S_COOKIE_ECHOED]		= "C-ECHOED", +	[IP_VS_SCTP_S_ESTABLISHED]		= "ESTABLISHED", +	[IP_VS_SCTP_S_SHUTDOWN_SENT]		= "S-SENT", +	[IP_VS_SCTP_S_SHUTDOWN_RECEIVED]	= "S-RECEIVED", +	[IP_VS_SCTP_S_SHUTDOWN_ACK_SENT]	= "S-ACK-SENT", +	[IP_VS_SCTP_S_REJECTED]			= "REJECTED", +	[IP_VS_SCTP_S_CLOSED]			= "CLOSED", +	[IP_VS_SCTP_S_LAST]			= "BUG!",  }; @@ -900,26 +365,14 @@ static const char *sctp_state_name(int state)  	return "?";  } -static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ -} - -static int -sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - -return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST, -				sctp_state_name_table, sname, to); -} - -static inline int -set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +static inline void +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,  		int direction, const struct sk_buff *skb)  {  	sctp_chunkhdr_t _sctpch, *sch;  	unsigned char chunk_type;  	int event, next_state; -	int ihl; +	int ihl, cofs;  #ifdef CONFIG_IP_VS_IPV6  	ihl = cp->af == AF_INET ? 
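/* Worked example for the jiffies values above, assuming HZ = 100:
 *	IP_VS_SCTP_MAX_RTO = (60 + 1) * HZ = 6100 jiffies = 61 s, which reads
 *		as RTO.Max from RFC 2960 (60 s) plus one second of slack;
 *	[IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ = 400 jiffies = 4 s, which
 *		reads as RTO.Initial (3 s) plus the same slack, so a dropped
 *		first INIT expires quickly. The RTO reading is an
 *		interpretation, not spelled out in the patch itself.
 */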
ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -927,10 +380,10 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,  	ihl = ip_hdrlen(skb);  #endif -	sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), -				sizeof(_sctpch), &_sctpch); +	cofs = ihl + sizeof(sctp_sctphdr_t); +	sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);  	if (sch == NULL) -		return 0; +		return;  	chunk_type = sch->type;  	/* @@ -946,32 +399,37 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,  	 */  	if ((sch->type == SCTP_CID_COOKIE_ECHO) ||  	    (sch->type == SCTP_CID_COOKIE_ACK)) { -		sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + -				sch->length), sizeof(_sctpch), &_sctpch); -		if (sch) { -			if (sch->type == SCTP_CID_ABORT) +		int clen = ntohs(sch->length); + +		if (clen >= sizeof(sctp_chunkhdr_t)) { +			sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), +						 sizeof(_sctpch), &_sctpch); +			if (sch && sch->type == SCTP_CID_ABORT)  				chunk_type = sch->type;  		}  	} -	event = sctp_events[chunk_type]; +	event = (chunk_type < sizeof(sctp_events)) ? +		sctp_events[chunk_type] : IP_VS_SCTP_DATA; -	/* -	 *  If the direction is IP_VS_DIR_OUTPUT, this event is from server -	 */ -	if (direction == IP_VS_DIR_OUTPUT) -		event++; -	/* -	 * get next state +	/* Update direction to INPUT_ONLY if necessary +	 * or delete NO_OUTPUT flag if output packet detected  	 */ -	next_state = sctp_states_table[cp->state][event].next_state; +	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { +		if (direction == IP_VS_DIR_OUTPUT) +			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; +		else +			direction = IP_VS_DIR_INPUT_ONLY; +	} + +	next_state = sctp_states[direction][event][cp->state];  	if (next_state != cp->state) {  		struct ip_vs_dest *dest = cp->dest;  		IP_VS_DBG_BUF(8, "%s %s  %s:%d->"  				"%s:%d state: %s->%s conn->refcnt:%d\n", -				pp->name, +				pd->pp->name,  				((direction == IP_VS_DIR_OUTPUT) ?  				 "output " : "input "),  				IP_VS_DBG_ADDR(cp->af, &cp->daddr), @@ -995,75 +453,62 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,  			}  		}  	} - -	 cp->timeout = pp->timeout_table[cp->state = next_state]; - -	 return 1; +	if (likely(pd)) +		cp->timeout = pd->timeout_table[cp->state = next_state]; +	else	/* What to do ? 
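/* SCTP chunks are padded to 4-byte boundaries (RFC 2960, 3.2), which is why
 * the COOKIE_ECHO/COOKIE_ACK probe above advances by ALIGN(clen, 4) to peek
 * at a bundled ABORT. The same step as a helper (name and shape are
 * illustrative only):
 */
static sctp_chunkhdr_t *sctp_next_chunk(const struct sk_buff *skb, int cofs,
					const sctp_chunkhdr_t *sch,
					sctp_chunkhdr_t *buf)
{
	int clen = ntohs(sch->length);

	if (clen < sizeof(sctp_chunkhdr_t))
		return NULL;	/* malformed chunk, do not walk further */
	return skb_header_pointer(skb, cofs + ALIGN(clen, 4),
				  sizeof(*buf), buf);
}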
*/ +		cp->timeout = sctp_timeouts[cp->state = next_state];  } -static int +static void  sctp_state_transition(struct ip_vs_conn *cp, int direction, -		const struct sk_buff *skb, struct ip_vs_protocol *pp) +		const struct sk_buff *skb, struct ip_vs_proto_data *pd)  { -	int ret = 0; - -	spin_lock(&cp->lock); -	ret = set_sctp_state(pp, cp, direction, skb); -	spin_unlock(&cp->lock); - -	return ret; +	spin_lock_bh(&cp->lock); +	set_sctp_state(pd, cp, direction, skb); +	spin_unlock_bh(&cp->lock);  } -/* - *      Hash table for SCTP application incarnations - */ -#define SCTP_APP_TAB_BITS        4 -#define SCTP_APP_TAB_SIZE        (1 << SCTP_APP_TAB_BITS) -#define SCTP_APP_TAB_MASK        (SCTP_APP_TAB_SIZE - 1) - -static struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(sctp_app_lock); -  static inline __u16 sctp_app_hashkey(__be16 port)  {  	return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)  		& SCTP_APP_TAB_MASK;  } -static int sctp_register_app(struct ip_vs_app *inc) +static int sctp_register_app(struct net *net, struct ip_vs_app *inc)  {  	struct ip_vs_app *i;  	__u16 hash;  	__be16 port = inc->port;  	int ret = 0; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);  	hash = sctp_app_hashkey(port); -	spin_lock_bh(&sctp_app_lock); -	list_for_each_entry(i, &sctp_apps[hash], p_list) { +	list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {  		if (i->port == port) {  			ret = -EEXIST;  			goto out;  		}  	} -	list_add(&inc->p_list, &sctp_apps[hash]); -	atomic_inc(&ip_vs_protocol_sctp.appcnt); +	list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); +	atomic_inc(&pd->appcnt);  out: -	spin_unlock_bh(&sctp_app_lock);  	return ret;  } -static void sctp_unregister_app(struct ip_vs_app *inc) +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)  { -	spin_lock_bh(&sctp_app_lock); -	atomic_dec(&ip_vs_protocol_sctp.appcnt); -	list_del(&inc->p_list); -	spin_unlock_bh(&sctp_app_lock); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + +	atomic_dec(&pd->appcnt); +	list_del_rcu(&inc->p_list);  }  static int sctp_app_conn_bind(struct ip_vs_conn *cp)  { +	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));  	int hash;  	struct ip_vs_app *inc;  	int result = 0; @@ -1074,12 +519,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)  	/* Lookup application incarnations and bind the right one */  	hash = sctp_app_hashkey(cp->vport); -	spin_lock(&sctp_app_lock); -	list_for_each_entry(inc, &sctp_apps[hash], p_list) { +	rcu_read_lock(); +	list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {  		if (inc->port == cp->vport) {  			if (unlikely(!ip_vs_app_inc_get(inc)))  				break; -			spin_unlock(&sctp_app_lock); +			rcu_read_unlock();  			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"  					"%s:%u to app %s on port %u\n", @@ -1095,43 +540,52 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)  			goto out;  		}  	} -	spin_unlock(&sctp_app_lock); +	rcu_read_unlock();  out:  	return result;  } -static void ip_vs_sctp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + *   timeouts is netns related now. 
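/* The application hash above folds the upper bits of the port onto the low
 * SCTP_APP_TAB_BITS (4) bits. Worked example with a hypothetical port value
 * 0x0815:
 *	(0x0815 >> 4) ^ 0x0815 = 0x0081 ^ 0x0815 = 0x0894
 *	0x0894 & SCTP_APP_TAB_MASK (0x000f) = 4
 * so such an application would land in bucket 4 of ipvs->sctp_apps[].
 */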
+ * --------------------------------------------- + */ +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)  { -	IP_VS_INIT_HASH_TABLE(sctp_apps); -	pp->timeout_table = sctp_timeouts; +	struct netns_ipvs *ipvs = net_ipvs(net); + +	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); +	pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, +							sizeof(sctp_timeouts)); +	if (!pd->timeout_table) +		return -ENOMEM; +	return 0;  } - -static void ip_vs_sctp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)  { - +	kfree(pd->timeout_table);  }  struct ip_vs_protocol ip_vs_protocol_sctp = { -	.name = "SCTP", -	.protocol = IPPROTO_SCTP, -	.num_states = IP_VS_SCTP_S_LAST, -	.dont_defrag = 0, -	.appcnt = ATOMIC_INIT(0), -	.init = ip_vs_sctp_init, -	.exit = ip_vs_sctp_exit, -	.register_app = sctp_register_app, +	.name		= "SCTP", +	.protocol	= IPPROTO_SCTP, +	.num_states	= IP_VS_SCTP_S_LAST, +	.dont_defrag	= 0, +	.init		= NULL, +	.exit		= NULL, +	.init_netns	= __ip_vs_sctp_init, +	.exit_netns	= __ip_vs_sctp_exit, +	.register_app	= sctp_register_app,  	.unregister_app = sctp_unregister_app, -	.conn_schedule = sctp_conn_schedule, -	.conn_in_get = ip_vs_conn_in_get_proto, -	.conn_out_get = ip_vs_conn_out_get_proto, -	.snat_handler = sctp_snat_handler, -	.dnat_handler = sctp_dnat_handler, -	.csum_check = sctp_csum_check, -	.state_name = sctp_state_name, +	.conn_schedule	= sctp_conn_schedule, +	.conn_in_get	= ip_vs_conn_in_get_proto, +	.conn_out_get	= ip_vs_conn_out_get_proto, +	.snat_handler	= sctp_snat_handler, +	.dnat_handler	= sctp_dnat_handler, +	.csum_check	= sctp_csum_check, +	.state_name	= sctp_state_name,  	.state_transition = sctp_state_transition, -	.app_conn_bind = sctp_app_conn_bind, -	.debug_packet = ip_vs_tcpudp_debug_packet, -	.timeout_change = sctp_timeout_change, -	.set_state_timeout = sctp_set_state_timeout, +	.app_conn_bind	= sctp_app_conn_bind, +	.debug_packet	= ip_vs_tcpudp_debug_packet, +	.timeout_change	= NULL,  }; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index f6c5200e214..e3a697234a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -9,8 +9,12 @@   *              as published by the Free Software Foundation; either version   *              2 of the License, or (at your option) any later version.   * - * Changes: + * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>   * + *              Network name space (netns) aware. 
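/* ip_vs_create_timeout_table() above gives each netns a private, writable
 * copy of the compile-time defaults. A plausible minimal implementation,
 * assuming it is a plain duplication (the helper body is not part of this
 * hunk):
 */
int *ip_vs_create_timeout_table(int *table, int size)
{
	return kmemdup(table, size, GFP_KERNEL);
}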
+ *              Global data moved to netns i.e struct netns_ipvs + *              tcp_timeouts table has copy per netns in a hash table per + *              protocol ip_vs_proto_data and is handled by netns   */  #define KMSG_COMPONENT "IPVS" @@ -28,33 +32,35 @@  #include <net/ip_vs.h>  static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, -		  int *verdict, struct ip_vs_conn **cpp) +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +		  int *verdict, struct ip_vs_conn **cpp, +		  struct ip_vs_iphdr *iph)  { +	struct net *net;  	struct ip_vs_service *svc;  	struct tcphdr _tcph, *th; -	struct ip_vs_iphdr iph; +	struct netns_ipvs *ipvs; -	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - -	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); +	th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);  	if (th == NULL) {  		*verdict = NF_DROP;  		return 0;  	} - +	net = skb_net(skb); +	ipvs = net_ipvs(net);  	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ -	if (th->syn && -	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, -				     th->dest))) { +	rcu_read_lock(); +	if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && +	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, +				      &iph->daddr, th->dest))) {  		int ignored; -		if (ip_vs_todrop()) { +		if (ip_vs_todrop(ipvs)) {  			/*  			 * It seems that we are very loaded.  			 * We have to drop this packet :(  			 */ -			ip_vs_service_put(svc); +			rcu_read_unlock();  			*verdict = NF_DROP;  			return 0;  		} @@ -63,13 +69,18 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,  		 * Let the virtual server select a real server for the  		 * incoming connection, and create a connection entry.  		 
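/* The new scheduling condition deserves a gloss: with the sloppy_tcp sysctl
 * enabled, a connection entry may be created from a packet other than a SYN
 * (plausibly so a director can take over established flows), but never from
 * an RST. In isolation:
 *
 *	bool may_schedule = (th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst;
 */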
*/ -		*cpp = ip_vs_schedule(svc, skb, pp, &ignored); -		if (!*cpp && !ignored) { -			*verdict = ip_vs_leave(svc, skb, pp); +		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); +		if (!*cpp && ignored <= 0) { +			if (!ignored) +				*verdict = ip_vs_leave(svc, skb, pd, iph); +			else +				*verdict = NF_DROP; +			rcu_read_unlock();  			return 0;  		} -		ip_vs_service_put(svc);  	} +	rcu_read_unlock(); +	/* NF_ACCEPT */  	return 1;  } @@ -117,20 +128,18 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,  static int -tcp_snat_handler(struct sk_buff *skb, -		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, +		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  {  	struct tcphdr *tcph; -	unsigned int tcphoff; +	unsigned int tcphoff = iph->len;  	int oldlen;  	int payload_csum = 0;  #ifdef CONFIG_IP_VS_IPV6 -	if (cp->af == AF_INET6) -		tcphoff = sizeof(struct ipv6hdr); -	else +	if (cp->af == AF_INET6 && iph->fragoffs) +		return 1;  #endif -		tcphoff = ip_hdrlen(skb);  	oldlen = skb->len - tcphoff;  	/* csum_check requires unshared skb */ @@ -197,20 +206,18 @@ tcp_snat_handler(struct sk_buff *skb,  static int -tcp_dnat_handler(struct sk_buff *skb, -		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, +		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  {  	struct tcphdr *tcph; -	unsigned int tcphoff; +	unsigned int tcphoff = iph->len;  	int oldlen;  	int payload_csum = 0;  #ifdef CONFIG_IP_VS_IPV6 -	if (cp->af == AF_INET6) -		tcphoff = sizeof(struct ipv6hdr); -	else +	if (cp->af == AF_INET6 && iph->fragoffs) +		return 1;  #endif -		tcphoff = ip_hdrlen(skb);  	oldlen = skb->len - tcphoff;  	/* csum_check requires unshared skb */ @@ -338,7 +345,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {  /*   *	Timeout table[state]   */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {  	[IP_VS_TCP_S_NONE]		=	2*HZ,  	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,  	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ, @@ -396,7 +403,7 @@ static struct tcp_states_t tcp_states [] = {  /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/  /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},  /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},  /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},  /*	OUTPUT */ @@ -410,7 +417,7 @@ static struct tcp_states_t tcp_states [] = {  /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/  /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},  /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},  /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},  }; @@ -419,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = {  /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/  /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},  /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},  /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL 
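/* The sNO-column change in the ack rows above is what lets a freshly
 * created entry (e.g. under sloppy TCP) reach ESTABLISHED from a bare ACK
 * instead of collapsing to CLOSED. With the indexing used by
 * set_tcp_state(), input row offset plus index 2 for ACK per
 * tcp_state_idx():
 *
 *	new_state = tcp_states[TCP_DIR_INPUT + 2].next_state[IP_VS_TCP_S_NONE];
 *	now IP_VS_TCP_S_ESTABLISHED, previously IP_VS_TCP_S_CLOSED
 */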
}},  /*	OUTPUT */ @@ -433,14 +440,11 @@ static struct tcp_states_t tcp_states_dos [] = {  /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/  /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},  /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},  /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},  }; -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)  {  	int on = (flags & 1);		/* secure_tcp */ @@ -450,14 +454,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)  	** for most if not for all of the applications. Something  	** like "capabilities" (flags) for each object.  	*/ -	tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ -	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, -				       tcp_state_name_table, sname, to); +	pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);  }  static inline int tcp_state_idx(struct tcphdr *th) @@ -474,7 +471,7 @@ static inline int tcp_state_idx(struct tcphdr *th)  }  static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,  	      int direction, struct tcphdr *th)  {  	int state_idx; @@ -497,7 +494,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,  		goto tcp_state_out;  	} -	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; +	new_state = +		pd->tcp_state_table[state_off+state_idx].next_state[cp->state];    tcp_state_out:  	if (new_state != cp->state) { @@ -505,7 +503,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,  		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"  			      "%s:%d state: %s->%s conn->refcnt:%d\n", -			      pp->name, +			      pd->pp->name,  			      ((state_off == TCP_DIR_OUTPUT) ?  			       "output " : "input "),  			      th->syn ? 'S' : '.', @@ -535,17 +533,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,  		}  	} -	cp->timeout = pp->timeout_table[cp->state = new_state]; +	if (likely(pd)) +		cp->timeout = pd->timeout_table[cp->state = new_state]; +	else	/* What to do ? 
*/ +		cp->timeout = tcp_timeouts[cp->state = new_state];  } -  /*   *	Handle state transitions   */ -static int +static void  tcp_state_transition(struct ip_vs_conn *cp, int direction,  		     const struct sk_buff *skb, -		     struct ip_vs_protocol *pp) +		     struct ip_vs_proto_data *pd)  {  	struct tcphdr _tcph, *th; @@ -557,26 +557,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,  	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);  	if (th == NULL) -		return 0; +		return; -	spin_lock(&cp->lock); -	set_tcp_state(pp, cp, direction, th); -	spin_unlock(&cp->lock); - -	return 1; +	spin_lock_bh(&cp->lock); +	set_tcp_state(pd, cp, direction, th); +	spin_unlock_bh(&cp->lock);  } - -/* - *	Hash table for TCP application incarnations - */ -#define	TCP_APP_TAB_BITS	4 -#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS) -#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); -  static inline __u16 tcp_app_hashkey(__be16 port)  {  	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) @@ -584,44 +571,45 @@ static inline __u16 tcp_app_hashkey(__be16 port)  } -static int tcp_register_app(struct ip_vs_app *inc) +static int tcp_register_app(struct net *net, struct ip_vs_app *inc)  {  	struct ip_vs_app *i;  	__u16 hash;  	__be16 port = inc->port;  	int ret = 0; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);  	hash = tcp_app_hashkey(port); -	spin_lock_bh(&tcp_app_lock); -	list_for_each_entry(i, &tcp_apps[hash], p_list) { +	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {  		if (i->port == port) {  			ret = -EEXIST;  			goto out;  		}  	} -	list_add(&inc->p_list, &tcp_apps[hash]); -	atomic_inc(&ip_vs_protocol_tcp.appcnt); +	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); +	atomic_inc(&pd->appcnt);    out: -	spin_unlock_bh(&tcp_app_lock);  	return ret;  }  static void -tcp_unregister_app(struct ip_vs_app *inc) +tcp_unregister_app(struct net *net, struct ip_vs_app *inc)  { -	spin_lock_bh(&tcp_app_lock); -	atomic_dec(&ip_vs_protocol_tcp.appcnt); -	list_del(&inc->p_list); -	spin_unlock_bh(&tcp_app_lock); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + +	atomic_dec(&pd->appcnt); +	list_del_rcu(&inc->p_list);  }  static int  tcp_app_conn_bind(struct ip_vs_conn *cp)  { +	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));  	int hash;  	struct ip_vs_app *inc;  	int result = 0; @@ -633,12 +621,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)  	/* Lookup application incarnations and bind the right one */  	hash = tcp_app_hashkey(cp->vport); -	spin_lock(&tcp_app_lock); -	list_for_each_entry(inc, &tcp_apps[hash], p_list) { +	rcu_read_lock(); +	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {  		if (inc->port == cp->vport) {  			if (unlikely(!ip_vs_app_inc_get(inc)))  				break; -			spin_unlock(&tcp_app_lock); +			rcu_read_unlock();  			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"  				      "%s:%u to app %s on port %u\n", @@ -655,7 +643,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)  			goto out;  		}  	} -	spin_unlock(&tcp_app_lock); +	rcu_read_unlock();    out:  	return result; @@ -665,24 +653,37 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)  /*   *	Set LISTEN timeout. 
(ip_vs_conn_put will setup timer)   */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)  { -	spin_lock(&cp->lock); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + +	spin_lock_bh(&cp->lock);  	cp->state = IP_VS_TCP_S_LISTEN; -	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; -	spin_unlock(&cp->lock); +	cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] +			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]); +	spin_unlock_bh(&cp->lock);  } - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + *   timeouts is netns related now. + * --------------------------------------------- + */ +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)  { -	IP_VS_INIT_HASH_TABLE(tcp_apps); -	pp->timeout_table = tcp_timeouts; +	struct netns_ipvs *ipvs = net_ipvs(net); + +	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); +	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, +							sizeof(tcp_timeouts)); +	if (!pd->timeout_table) +		return -ENOMEM; +	pd->tcp_state_table =  tcp_states; +	return 0;  } - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)  { +	kfree(pd->timeout_table);  } @@ -691,9 +692,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {  	.protocol =		IPPROTO_TCP,  	.num_states =		IP_VS_TCP_S_LAST,  	.dont_defrag =		0, -	.appcnt =		ATOMIC_INIT(0), -	.init =			ip_vs_tcp_init, -	.exit =			ip_vs_tcp_exit, +	.init =			NULL, +	.exit =			NULL, +	.init_netns =		__ip_vs_tcp_init, +	.exit_netns =		__ip_vs_tcp_exit,  	.register_app =		tcp_register_app,  	.unregister_app =	tcp_unregister_app,  	.conn_schedule =	tcp_conn_schedule, @@ -707,5 +709,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {  	.app_conn_bind =	tcp_app_conn_bind,  	.debug_packet =		ip_vs_tcpudp_debug_packet,  	.timeout_change =	tcp_timeout_change, -	.set_state_timeout =	tcp_set_state_timeout,  }; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 9d106a06bb0..b62a3c0ff9b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,7 +9,8 @@   *              as published by the Free Software Foundation; either version   *              2 of the License, or (at your option) any later version.   * - * Changes: + * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com> + *              Network name space (netns) aware.   
*   */ @@ -28,32 +29,33 @@  #include <net/ip6_checksum.h>  static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, -		  int *verdict, struct ip_vs_conn **cpp) +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +		  int *verdict, struct ip_vs_conn **cpp, +		  struct ip_vs_iphdr *iph)  { +	struct net *net;  	struct ip_vs_service *svc;  	struct udphdr _udph, *uh; -	struct ip_vs_iphdr iph; -	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - -	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); +	/* IPv6 fragments, only first fragment will hit this */ +	uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);  	if (uh == NULL) {  		*verdict = NF_DROP;  		return 0;  	} - -	svc = ip_vs_service_get(af, skb->mark, iph.protocol, -				&iph.daddr, uh->dest); +	net = skb_net(skb); +	rcu_read_lock(); +	svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, +				 &iph->daddr, uh->dest);  	if (svc) {  		int ignored; -		if (ip_vs_todrop()) { +		if (ip_vs_todrop(net_ipvs(net))) {  			/*  			 * It seems that we are very loaded.  			 * We have to drop this packet :(  			 */ -			ip_vs_service_put(svc); +			rcu_read_unlock();  			*verdict = NF_DROP;  			return 0;  		} @@ -62,13 +64,18 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,  		 * Let the virtual server select a real server for the  		 * incoming connection, and create a connection entry.  		 */ -		*cpp = ip_vs_schedule(svc, skb, pp, &ignored); -		if (!*cpp && !ignored) { -			*verdict = ip_vs_leave(svc, skb, pp); +		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); +		if (!*cpp && ignored <= 0) { +			if (!ignored) +				*verdict = ip_vs_leave(svc, skb, pd, iph); +			else +				*verdict = NF_DROP; +			rcu_read_unlock();  			return 0;  		} -		ip_vs_service_put(svc);  	} +	rcu_read_unlock(); +	/* NF_ACCEPT */  	return 1;  } @@ -117,20 +124,18 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,  static int -udp_snat_handler(struct sk_buff *skb, -		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, +		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  {  	struct udphdr *udph; -	unsigned int udphoff; +	unsigned int udphoff = iph->len;  	int oldlen;  	int payload_csum = 0;  #ifdef CONFIG_IP_VS_IPV6 -	if (cp->af == AF_INET6) -		udphoff = sizeof(struct ipv6hdr); -	else +	if (cp->af == AF_INET6 && iph->fragoffs) +		return 1;  #endif -		udphoff = ip_hdrlen(skb);  	oldlen = skb->len - udphoff;  	/* csum_check requires unshared skb */ @@ -202,20 +207,18 @@ udp_snat_handler(struct sk_buff *skb,  static int -udp_dnat_handler(struct sk_buff *skb, -		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, +		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)  {  	struct udphdr *udph; -	unsigned int udphoff; +	unsigned int udphoff = iph->len;  	int oldlen;  	int payload_csum = 0;  #ifdef CONFIG_IP_VS_IPV6 -	if (cp->af == AF_INET6) -		udphoff = sizeof(struct ipv6hdr); -	else +	if (cp->af == AF_INET6 && iph->fragoffs) +		return 1;  #endif -		udphoff = ip_hdrlen(skb);  	oldlen = skb->len - udphoff;  	/* csum_check requires unshared skb */ @@ -338,19 +341,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)  	return 1;  } - -/* - *	Note: the caller guarantees that only one of register_app, - *	unregister_app or app_conn_bind is called each time. 
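The TCP and UDP app-table hunks are the same conversion applied twice: the file-scope tcp_apps[]/udp_apps[] arrays and their dedicated spinlocks go away, the tables move into struct netns_ipvs, writers rely on the caller's existing serialization, and the reader path (app_conn_bind) switches to RCU, which is also why the removed guarantee quoted above no longer needs to cover readers. A condensed sketch of the resulting lookup pattern, using the UDP names; the lookup_app() wrapper itself is illustrative, not part of the patch:

	/* Find the app instance registered for a port. Runs under RCU
	 * and takes a reference before publishing the result, mirroring
	 * udp_app_conn_bind() in this file.
	 */
	static struct ip_vs_app *lookup_app(struct netns_ipvs *ipvs, __be16 port)
	{
		struct ip_vs_app *inc;
		unsigned int hash = udp_app_hashkey(port);

		rcu_read_lock();
		list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
			if (inc->port != port)
				continue;
			if (!ip_vs_app_inc_get(inc))
				break;		/* app is going away */
			rcu_read_unlock();
			return inc;		/* reference held */
		}
		rcu_read_unlock();
		return NULL;
	}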
- */ - -#define	UDP_APP_TAB_BITS	4 -#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS) -#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); -  static inline __u16 udp_app_hashkey(__be16 port)  {  	return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) @@ -358,44 +348,44 @@ static inline __u16 udp_app_hashkey(__be16 port)  } -static int udp_register_app(struct ip_vs_app *inc) +static int udp_register_app(struct net *net, struct ip_vs_app *inc)  {  	struct ip_vs_app *i;  	__u16 hash;  	__be16 port = inc->port;  	int ret = 0; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);  	hash = udp_app_hashkey(port); - -	spin_lock_bh(&udp_app_lock); -	list_for_each_entry(i, &udp_apps[hash], p_list) { +	list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {  		if (i->port == port) {  			ret = -EEXIST;  			goto out;  		}  	} -	list_add(&inc->p_list, &udp_apps[hash]); -	atomic_inc(&ip_vs_protocol_udp.appcnt); +	list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); +	atomic_inc(&pd->appcnt);    out: -	spin_unlock_bh(&udp_app_lock);  	return ret;  }  static void -udp_unregister_app(struct ip_vs_app *inc) +udp_unregister_app(struct net *net, struct ip_vs_app *inc)  { -	spin_lock_bh(&udp_app_lock); -	atomic_dec(&ip_vs_protocol_udp.appcnt); -	list_del(&inc->p_list); -	spin_unlock_bh(&udp_app_lock); +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + +	atomic_dec(&pd->appcnt); +	list_del_rcu(&inc->p_list);  }  static int udp_app_conn_bind(struct ip_vs_conn *cp)  { +	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));  	int hash;  	struct ip_vs_app *inc;  	int result = 0; @@ -407,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)  	/* Lookup application incarnations and bind the right one */  	hash = udp_app_hashkey(cp->vport); -	spin_lock(&udp_app_lock); -	list_for_each_entry(inc, &udp_apps[hash], p_list) { +	rcu_read_lock(); +	list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {  		if (inc->port == cp->vport) {  			if (unlikely(!ip_vs_app_inc_get(inc)))  				break; -			spin_unlock(&udp_app_lock); +			rcu_read_unlock();  			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"  				      "%s:%u to app %s on port %u\n", @@ -429,14 +419,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)  			goto out;  		}  	} -	spin_unlock(&udp_app_lock); +	rcu_read_unlock();    out:  	return result;  } -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {  	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,  	[IP_VS_UDP_S_LAST]		=	2*HZ,  }; @@ -446,14 +436,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {  	[IP_VS_UDP_S_LAST]		=	"BUG!",  }; - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ -	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, -				       udp_state_name_table, sname, to); -} -  static const char * udp_state_name(int state)  {  	if (state >= IP_VS_UDP_S_LAST) @@ -461,23 +443,34 @@ static const char * udp_state_name(int state)  	return udp_state_name_table[state] ? 
udp_state_name_table[state] : "?";  } -static int +static void  udp_state_transition(struct ip_vs_conn *cp, int direction,  		     const struct sk_buff *skb, -		     struct ip_vs_protocol *pp) +		     struct ip_vs_proto_data *pd)  { -	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; -	return 1; +	if (unlikely(!pd)) { +		pr_err("UDP no ns data\n"); +		return; +	} + +	cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];  } -static void udp_init(struct ip_vs_protocol *pp) +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)  { -	IP_VS_INIT_HASH_TABLE(udp_apps); -	pp->timeout_table = udp_timeouts; +	struct netns_ipvs *ipvs = net_ipvs(net); + +	ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); +	pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, +							sizeof(udp_timeouts)); +	if (!pd->timeout_table) +		return -ENOMEM; +	return 0;  } -static void udp_exit(struct ip_vs_protocol *pp) +static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)  { +	kfree(pd->timeout_table);  } @@ -486,8 +479,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = {  	.protocol =		IPPROTO_UDP,  	.num_states =		IP_VS_UDP_S_LAST,  	.dont_defrag =		0, -	.init =			udp_init, -	.exit =			udp_exit, +	.init =			NULL, +	.exit =			NULL, +	.init_netns =		__udp_init, +	.exit_netns =		__udp_exit,  	.conn_schedule =	udp_conn_schedule,  	.conn_in_get =		ip_vs_conn_in_get_proto,  	.conn_out_get =		ip_vs_conn_out_get_proto, @@ -501,5 +496,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = {  	.app_conn_bind =	udp_app_conn_bind,  	.debug_packet =		ip_vs_tcpudp_debug_packet,  	.timeout_change =	NULL, -	.set_state_timeout =	udp_set_state_timeout,  }; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index e210f37d8ea..176b87c35e3 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)  } -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)  { -	svc->sched_data = &svc->destinations; +	struct list_head *p; + +	spin_lock_bh(&svc->sched_lock); +	p = (struct list_head *) svc->sched_data; +	/* dest is already unlinked, so p->prev is not valid but +	 * p->next is valid, use it to reach previous entry. 
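Why p->next->prev is trustworthy in ip_vs_rr_del_dest(): destinations are unlinked with list_del_rcu(), which relinks the neighbours but deliberately leaves the removed entry's next pointer intact so concurrent traversals can run to completion. p->next therefore still lands on a live node, and that node's prev field now names whatever preceded the deleted destination, which is exactly where the round-robin cursor should resume.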
+	 */ +	if (p == &dest->n_list) +		svc->sched_data = p->next->prev; +	spin_unlock_bh(&svc->sched_lock);  	return 0;  } @@ -46,38 +55,44 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)   * Round-Robin Scheduling   */  static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		  struct ip_vs_iphdr *iph)  { -	struct list_head *p, *q; -	struct ip_vs_dest *dest; +	struct list_head *p; +	struct ip_vs_dest *dest, *last; +	int pass = 0;  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); -	write_lock(&svc->sched_lock); -	p = (struct list_head *)svc->sched_data; -	p = p->next; -	q = p; +	spin_lock_bh(&svc->sched_lock); +	p = (struct list_head *) svc->sched_data; +	last = dest = list_entry(p, struct ip_vs_dest, n_list); +  	do { -		/* skip list head */ -		if (q == &svc->destinations) { -			q = q->next; -			continue; +		list_for_each_entry_continue_rcu(dest, +						 &svc->destinations, +						 n_list) { +			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && +			    atomic_read(&dest->weight) > 0) +				/* HIT */ +				goto out; +			if (dest == last) +				goto stop;  		} - -		dest = list_entry(q, struct ip_vs_dest, n_list); -		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && -		    atomic_read(&dest->weight) > 0) -			/* HIT */ -			goto out; -		q = q->next; -	} while (q != p); -	write_unlock(&svc->sched_lock); -	IP_VS_ERR_RL("RR: no destination available\n"); +		pass++; +		/* Previous dest could be unlinked, do not loop forever. +		 * If we stay at head there is no need for 2nd pass. +		 */ +	} while (pass < 2 && p != &svc->destinations); + +stop: +	spin_unlock_bh(&svc->sched_lock); +	ip_vs_scheduler_err(svc, "no destination available");  	return NULL;    out: -	svc->sched_data = q; -	write_unlock(&svc->sched_lock); +	svc->sched_data = &dest->n_list; +	spin_unlock_bh(&svc->sched_lock);  	IP_VS_DBG_BUF(6, "RR: server %s:%u "  		      "activeconns %d refcnt %d weight %d\n",  		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), @@ -94,7 +109,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {  	.module =		THIS_MODULE,  	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),  	.init_service =		ip_vs_rr_init_svc, -	.update_service =	ip_vs_rr_update_svc, +	.add_dest =		NULL, +	.del_dest =		ip_vs_rr_del_dest,  	.schedule =		ip_vs_rr_schedule,  }; @@ -106,6 +122,7 @@ static int __init ip_vs_rr_init(void)  static void __exit ip_vs_rr_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +	synchronize_rcu();  }  module_init(ip_vs_rr_init); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 076ebe00435..4dbcda6258b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -29,13 +29,14 @@  #include <net/ip_vs.h> +EXPORT_SYMBOL(ip_vs_scheduler_err);  /*   *  IPVS scheduler list   */  static LIST_HEAD(ip_vs_schedulers); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_sched_lock); +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex);  /* @@ -46,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,  {  	int ret; -	svc->scheduler = scheduler; -  	if (scheduler->init_service) {  		ret = scheduler->init_service(svc);  		if (ret) { @@ -55,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,  			return ret;  		}  	} - +	rcu_assign_pointer(svc->scheduler, scheduler);  	return 0;  } @@ -63,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,  /*   *  
Unbind a service with its scheduler   */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, +			    struct ip_vs_scheduler *sched)  { -	struct ip_vs_scheduler *sched = svc->scheduler; +	struct ip_vs_scheduler *cur_sched; -	if (!sched) -		return 0; - -	if (sched->done_service) { -		if (sched->done_service(svc) != 0) { -			pr_err("%s(): done error\n", __func__); -			return -EINVAL; -		} -	} +	cur_sched = rcu_dereference_protected(svc->scheduler, 1); +	/* This check proves that old 'sched' was installed */ +	if (!cur_sched) +		return; -	svc->scheduler = NULL; -	return 0; +	if (sched->done_service) +		sched->done_service(svc); +	/* svc->scheduler can not be set to NULL */  } @@ -91,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)  	IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); -	spin_lock_bh(&ip_vs_sched_lock); +	mutex_lock(&ip_vs_sched_mutex);  	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {  		/* @@ -105,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)  		}  		if (strcmp(sched_name, sched->name)==0) {  			/* HIT */ -			spin_unlock_bh(&ip_vs_sched_lock); +			mutex_unlock(&ip_vs_sched_mutex);  			return sched;  		}  		if (sched->module)  			module_put(sched->module);  	} -	spin_unlock_bh(&ip_vs_sched_lock); +	mutex_unlock(&ip_vs_sched_mutex);  	return NULL;  } @@ -146,6 +142,30 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)  		module_put(scheduler->module);  } +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ +	struct ip_vs_scheduler *sched; + +	sched = rcu_dereference(svc->scheduler); +	if (svc->fwmark) { +		IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", +			     sched->name, svc->fwmark, svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 +	} else if (svc->af == AF_INET6) { +		IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", +			     sched->name, ip_vs_proto_name(svc->protocol), +			     &svc->addr.in6, ntohs(svc->port), msg); +#endif +	} else { +		IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", +			     sched->name, ip_vs_proto_name(svc->protocol), +			     &svc->addr.ip, ntohs(svc->port), msg); +	} +}  /*   *  Register a scheduler in the scheduler list @@ -167,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)  	/* increase the module use count */  	ip_vs_use_count_inc(); -	spin_lock_bh(&ip_vs_sched_lock); +	mutex_lock(&ip_vs_sched_mutex);  	if (!list_empty(&scheduler->n_list)) { -		spin_unlock_bh(&ip_vs_sched_lock); +		mutex_unlock(&ip_vs_sched_mutex);  		ip_vs_use_count_dec();  		pr_err("%s(): [%s] scheduler already linked\n",  		       __func__, scheduler->name); @@ -183,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)  	 */  	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {  		if (strcmp(scheduler->name, sched->name) == 0) { -			spin_unlock_bh(&ip_vs_sched_lock); +			mutex_unlock(&ip_vs_sched_mutex);  			ip_vs_use_count_dec();  			pr_err("%s(): [%s] scheduler already existed "  			       "in the system\n", __func__, scheduler->name); @@ -194,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)  	 *	Add it into the d-linked scheduler list  	 */  	list_add(&scheduler->n_list, &ip_vs_schedulers); -	spin_unlock_bh(&ip_vs_sched_lock); +	mutex_unlock(&ip_vs_sched_mutex);  	pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -212,9 +232,9 @@ int 
unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)  		return -EINVAL;  	} -	spin_lock_bh(&ip_vs_sched_lock); +	mutex_lock(&ip_vs_sched_mutex);  	if (list_empty(&scheduler->n_list)) { -		spin_unlock_bh(&ip_vs_sched_lock); +		mutex_unlock(&ip_vs_sched_mutex);  		pr_err("%s(): [%s] scheduler is not in the list. failed\n",  		       __func__, scheduler->name);  		return -EINVAL; @@ -224,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)  	 *	Remove it from the d-linked scheduler list  	 */  	list_del(&scheduler->n_list); -	spin_unlock_bh(&ip_vs_sched_lock); +	mutex_unlock(&ip_vs_sched_mutex);  	/* decrease the module use count */  	ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 1ab75a9dc40..e446b9fa742 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -44,7 +44,7 @@  #include <net/ip_vs.h> -static inline unsigned int +static inline int  ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)  {  	/* @@ -59,10 +59,11 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)   *	Weighted Least Connection scheduling   */  static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		   struct ip_vs_iphdr *iph)  {  	struct ip_vs_dest *dest, *least; -	unsigned int loh, doh; +	int loh, doh;  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -79,7 +80,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  	 * new connections.  	 */ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&  		    atomic_read(&dest->weight) > 0) {  			least = dest; @@ -87,19 +88,19 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  			goto nextstage;  		}  	} -	IP_VS_ERR_RL("SED: no destination available\n"); +	ip_vs_scheduler_err(svc, "no destination available");  	return NULL;  	/*  	 *    Find the destination with the least load.  	 */    nextstage: -	list_for_each_entry_continue(dest, &svc->destinations, n_list) { +	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue;  		doh = ip_vs_sed_dest_overhead(dest); -		if (loh * atomic_read(&dest->weight) > -		    doh * atomic_read(&least->weight)) { +		if ((__s64)loh * atomic_read(&dest->weight) > +		    (__s64)doh * atomic_read(&least->weight)) {  			least = dest;  			loh = doh;  		} @@ -134,6 +135,7 @@ static int __init ip_vs_sed_init(void)  static void __exit ip_vs_sed_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +	synchronize_rcu();  }  module_init(ip_vs_sed_init); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index e6cc174fbc0..cc65b2f42cd 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -30,6 +30,11 @@   * server is dead or overloaded, the load balancer can bypass the cache   * server and send requests to the original server directly.   * + * The weight destination attribute can be used to control the + * distribution of connections to the destinations in servernode. The + * greater the weight, the more connections the destination + * will receive. 
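Concretely, the weighting is implemented by ip_vs_sh_reassign() further down: each destination keeps claiming consecutive buckets until d_count reaches its weight, then the walk advances and wraps around the destination list. As an illustration, two destinations A (weight 3) and B (weight 1) fill the 256-entry table with the repeating pattern A,A,A,B, i.e. exactly 192 buckets for A and 64 for B, so A sees about three quarters of the hashed source space.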
+ *   */  #define KMSG_COMPONENT "IPVS" @@ -43,12 +48,16 @@  #include <net/ip_vs.h> +#include <net/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> +  /*   *      IPVS SH bucket   */  struct ip_vs_sh_bucket { -	struct ip_vs_dest       *dest;          /* real server (cache) */ +	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */  };  /* @@ -61,11 +70,24 @@ struct ip_vs_sh_bucket {  #define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)  #define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1) +struct ip_vs_sh_state { +	struct rcu_head			rcu_head; +	struct ip_vs_sh_bucket		buckets[IP_VS_SH_TAB_SIZE]; +}; + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ +	return atomic_read(&dest->weight) <= 0 || +	       dest->flags & IP_VS_DEST_F_OVERLOAD; +}  /*   *	Returns hash value for IPVS SH entry   */ -static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, +		 __be16 port, unsigned int offset)  {  	__be32 addr_fold = addr->ip; @@ -74,7 +96,8 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)  		addr_fold = addr->ip6[0]^addr->ip6[1]^  			    addr->ip6[2]^addr->ip6[3];  #endif -	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; +	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & +		IP_VS_SH_TAB_MASK;  } @@ -82,38 +105,102 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)   *      Get ip_vs_dest associated with supplied parameters.   */  static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, -	     const union nf_inet_addr *addr) +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, +	     const union nf_inet_addr *addr, __be16 port)  { -	return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; +	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); +	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + +	return (!dest || is_unavailable(dest)) ? NULL : dest;  } +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. 
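Two properties of this fallback are easy to miss. It is deterministic: each retry feeds ihash plus a rolling offset back into ip_vs_sh_hashkey(), so a given client address/port always probes the same bucket sequence and keeps landing on the same substitute server while the table is unchanged. And the loop bails out on the first empty bucket instead of scanning all IP_VS_SH_TAB_SIZE entries, which is safe because ip_vs_sh_reassign() leaves the buckets either all populated or all NULL; one empty bucket means there is nothing left to find.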
+ */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, +		      const union nf_inet_addr *addr, __be16 port) +{ +	unsigned int offset, roffset; +	unsigned int hash, ihash; +	struct ip_vs_dest *dest; + +	/* first try the dest it's supposed to go to */ +	ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); +	dest = rcu_dereference(s->buckets[ihash].dest); +	if (!dest) +		return NULL; +	if (!is_unavailable(dest)) +		return dest; + +	IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", +		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + +	/* if the original dest is unavailable, loop around the table +	 * starting from ihash to find a new dest +	 */ +	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { +		roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; +		hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); +		dest = rcu_dereference(s->buckets[hash].dest); +		if (!dest) +			break; +		if (!is_unavailable(dest)) +			return dest; +		IP_VS_DBG_BUF(6, "SH: selected unavailable " +			      "server %s:%d (offset %d), reselecting", +			      IP_VS_DBG_ADDR(svc->af, &dest->addr), +			      ntohs(dest->port), roffset); +	} + +	return NULL; +} +  /*   *      Assign all the hash buckets of the specified table with the service.   */  static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)  {  	int i;  	struct ip_vs_sh_bucket *b;  	struct list_head *p;  	struct ip_vs_dest *dest; +	int d_count; +	bool empty; -	b = tbl; +	b = &s->buckets[0];  	p = &svc->destinations; +	empty = list_empty(p); +	d_count = 0;  	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { -		if (list_empty(p)) { -			b->dest = NULL; -		} else { +		dest = rcu_dereference_protected(b->dest, 1); +		if (dest) +			ip_vs_dest_put(dest); +		if (empty) +			RCU_INIT_POINTER(b->dest, NULL); +		else {  			if (p == &svc->destinations)  				p = p->next;  			dest = list_entry(p, struct ip_vs_dest, n_list); -			atomic_inc(&dest->refcnt); -			b->dest = dest; +			ip_vs_dest_hold(dest); +			RCU_INIT_POINTER(b->dest, dest); + +			IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", +				      i, IP_VS_DBG_ADDR(svc->af, &dest->addr), +				      atomic_read(&dest->weight)); + +			/* Don't move to next dest until filling weight */ +			if (++d_count >= atomic_read(&dest->weight)) { +				p = p->next; +				d_count = 0; +			} -			p = p->next;  		}  		b++;  	} @@ -124,16 +211,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)  /*   *      Flush all the hash buckets of the specified table.   
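The bucket table now takes part in destination reference counting instead of touching refcnt by hand: every pointer published into a bucket is pinned with ip_vs_dest_hold(), and ip_vs_dest_put() is issued whenever a bucket is rewritten or flushed. Together with the __rcu annotation on the dest field, this is what lets the schedule path dereference buckets under rcu_read_lock() with no per-packet lock, while being sure the destination it found is not freed underneath it.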
*/ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +static void ip_vs_sh_flush(struct ip_vs_sh_state *s)  {  	int i;  	struct ip_vs_sh_bucket *b; +	struct ip_vs_dest *dest; -	b = tbl; +	b = &s->buckets[0];  	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { -		if (b->dest) { -			atomic_dec(&b->dest->refcnt); -			b->dest = NULL; +		dest = rcu_dereference_protected(b->dest, 1); +		if (dest) { +			ip_vs_dest_put(dest); +			RCU_INIT_POINTER(b->dest, NULL);  		}  		b++;  	} @@ -142,64 +231,84 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)  static int ip_vs_sh_init_svc(struct ip_vs_service *svc)  { -	struct ip_vs_sh_bucket *tbl; +	struct ip_vs_sh_state *s;  	/* allocate the SH table for this service */ -	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, -		      GFP_ATOMIC); -	if (tbl == NULL) { -		pr_err("%s(): no memory\n", __func__); +	s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); +	if (s == NULL)  		return -ENOMEM; -	} -	svc->sched_data = tbl; + +	svc->sched_data = s;  	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "  		  "current service\n",  		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); -	/* assign the hash buckets with the updated service */ -	ip_vs_sh_assign(tbl, svc); +	/* assign the hash buckets with current dests */ +	ip_vs_sh_reassign(s, svc);  	return 0;  } -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +static void ip_vs_sh_done_svc(struct ip_vs_service *svc)  { -	struct ip_vs_sh_bucket *tbl = svc->sched_data; +	struct ip_vs_sh_state *s = svc->sched_data;  	/* got to clean up hash buckets here */ -	ip_vs_sh_flush(tbl); +	ip_vs_sh_flush(s);  	/* release the table itself */ -	kfree(svc->sched_data); +	kfree_rcu(s, rcu_head);  	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",  		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - -	return 0;  } -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, +				 struct ip_vs_dest *dest)  { -	struct ip_vs_sh_bucket *tbl = svc->sched_data; - -	/* got to clean up hash buckets here */ -	ip_vs_sh_flush(tbl); +	struct ip_vs_sh_state *s = svc->sched_data;  	/* assign the hash buckets with the updated service */ -	ip_vs_sh_assign(tbl, svc); +	ip_vs_sh_reassign(s, svc);  	return 0;  } -/* - *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - *      consider that the server is overloaded here. 
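is_overloaded() is subsumed by the broader is_unavailable() introduced above, which also disqualifies servers whose weight has dropped to zero. The new behaviours are opt-in per service: IP_VS_SVC_F_SCHED_SH_PORT mixes the L4 source port into the hash via ip_vs_sh_get_port(), and IP_VS_SVC_F_SCHED_SH_FALLBACK enables the rehash walk shown earlier. Assuming an ipvsadm new enough to expose scheduler flags, a service using both might be configured along these lines (addresses are placeholders):

	ipvsadm -A -t 192.0.2.1:80 -s sh -b sh-fallback,sh-port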
- */ -static inline int is_overloaded(struct ip_vs_dest *dest) +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)  { -	return dest->flags & IP_VS_DEST_F_OVERLOAD; +	__be16 port; +	struct tcphdr _tcph, *th; +	struct udphdr _udph, *uh; +	sctp_sctphdr_t _sctph, *sh; + +	switch (iph->protocol) { +	case IPPROTO_TCP: +		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); +		if (unlikely(th == NULL)) +			return 0; +		port = th->source; +		break; +	case IPPROTO_UDP: +		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); +		if (unlikely(uh == NULL)) +			return 0; +		port = uh->source; +		break; +	case IPPROTO_SCTP: +		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); +		if (unlikely(sh == NULL)) +			return 0; +		port = sh->source; +		break; +	default: +		port = 0; +	} + +	return port;  } @@ -207,28 +316,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest)   *      Source Hashing scheduling   */  static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		  struct ip_vs_iphdr *iph)  {  	struct ip_vs_dest *dest; -	struct ip_vs_sh_bucket *tbl; -	struct ip_vs_iphdr iph; - -	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); +	struct ip_vs_sh_state *s; +	__be16 port = 0;  	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); -	tbl = (struct ip_vs_sh_bucket *)svc->sched_data; -	dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); -	if (!dest -	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE) -	    || atomic_read(&dest->weight) <= 0 -	    || is_overloaded(dest)) { -		IP_VS_ERR_RL("SH: no destination available\n"); +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) +		port = ip_vs_sh_get_port(skb, iph); + +	s = (struct ip_vs_sh_state *) svc->sched_data; + +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) +		dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); +	else +		dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + +	if (!dest) { +		ip_vs_scheduler_err(svc, "no destination available");  		return NULL;  	}  	IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", -		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), +		      IP_VS_DBG_ADDR(svc->af, &iph->saddr),  		      IP_VS_DBG_ADDR(svc->af, &dest->addr),  		      ntohs(dest->port)); @@ -247,7 +360,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =  	.n_list	 =		LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),  	.init_service =		ip_vs_sh_init_svc,  	.done_service =		ip_vs_sh_done_svc, -	.update_service =	ip_vs_sh_update_svc, +	.add_dest =		ip_vs_sh_dest_changed, +	.del_dest =		ip_vs_sh_dest_changed, +	.upd_dest =		ip_vs_sh_dest_changed,  	.schedule =		ip_vs_sh_schedule,  }; @@ -261,6 +376,7 @@ static int __init ip_vs_sh_init(void)  static void __exit ip_vs_sh_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +	synchronize_rcu();  } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index ab85aedea17..db801263ee9 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -5,6 +5,18 @@   *              high-performance and highly available server based on a   *              cluster of servers.   * + * Version 1,   is capable of handling both version 0 and 1 messages. + *              Version 0 is the plain old format. + *              Note Version 0 receivers will just drop Ver 1 messages. 
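This is the interoperability trick spelled out by the header layouts later in the patch: version 1 reuses the byte where version 0 kept nr_conns as a reserved field that must be zero. An old backup therefore parses an incoming v1 datagram as a v0 message announcing zero connections and simply ignores it, so mixed-version clusters degrade gracefully instead of misparsing each other's state.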
+ *              Version 1 is capable of handle IPv6, Persistence data, + *              time-outs, and firewall marks. + *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions  Message: is a complete datagram + *              Sync_conn: is a part of a Message + *              Param Data is an option to a Sync_conn. + *   * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>   *   * ip_vs_sync:  sync connection info from master load balancer to backups @@ -15,6 +27,8 @@   *	Alexandre Cassen	:	Added SyncID support for incoming sync   *					messages filtering.   *	Justin Ossevoort	:	Fix endian problem on sync message size. + *	Hans Schillstrom	:	Added Version 1: i.e. IPv6, + *					Persistence support, fwmark and time-out.   */  #define KMSG_COMPONENT "IPVS" @@ -35,6 +49,8 @@  #include <linux/wait.h>  #include <linux/kernel.h> +#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */ +  #include <net/ip.h>  #include <net/sock.h> @@ -43,11 +59,14 @@  #define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */  #define IP_VS_SYNC_PORT  8848          /* multicast port */ +#define SYNC_PROTO_VER  1		/* Protocol version in header */ +static struct lock_class_key __ipvs_sync_key;  /*   *	IPVS sync connection entry + *	Version 0, i.e. original version.   */ -struct ip_vs_sync_conn { +struct ip_vs_sync_conn_v0 {  	__u8			reserved;  	/* Protocol, addresses and port numbers */ @@ -71,51 +90,177 @@ struct ip_vs_sync_conn_options {  	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */  }; +/* +     Sync Connection format (sync_conn) + +       0                   1                   2                   3 +       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |    Type       |    Protocol   | Ver.  |        Size           | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                             Flags                             | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |            State              |         cport                 | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |            vport              |         dport                 | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                             fwmark                            | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                             timeout  (in sec.)                | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                              ...                              | +      |                        IP-Addresses  (v4 or v6)               | +      |                              ...                              | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +  Optional Parameters. +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      | Param. Type    | Param. Length |   Param. data                | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               | +      |                              ...                              | +      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                               | Param Type    | Param. 
Length | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                           Param  data                         | +      |         Last Param data should be padded for 32 bit alignment | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + *  Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { +	__u8			type; +	__u8			protocol;	/* Which protocol (TCP/UDP) */ +	__be16			ver_size;	/* Version msb 4 bits */ +	/* Flags and state transition */ +	__be32			flags;		/* status flags */ +	__be16			state;		/* state info 	*/ +	/* Protocol, addresses and port numbers */ +	__be16			cport; +	__be16			vport; +	__be16			dport; +	__be32			fwmark;		/* Firewall mark from skb */ +	__be32			timeout;	/* cp timeout */ +	__be32			caddr;		/* client address */ +	__be32			vaddr;		/* virtual address */ +	__be32			daddr;		/* destination address */ +	/* The sequence options start here */ +	/* PE data padded to 32bit alignment after seq. options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { +	__u8			type; +	__u8			protocol;	/* Which protocol (TCP/UDP) */ +	__be16			ver_size;	/* Version msb 4 bits */ +	/* Flags and state transition */ +	__be32			flags;		/* status flags */ +	__be16			state;		/* state info 	*/ +	/* Protocol, addresses and port numbers */ +	__be16			cport; +	__be16			vport; +	__be16			dport; +	__be32			fwmark;		/* Firewall mark from skb */ +	__be32			timeout;	/* cp timeout */ +	struct in6_addr		caddr;		/* client address */ +	struct in6_addr		vaddr;		/* virtual address */ +	struct in6_addr		daddr;		/* destination address */ +	/* The sequence options start here */ +	/* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { +	struct ip_vs_sync_v4	v4; +	struct ip_vs_sync_v6	v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6		0 +#define STYPE_F_INET6		(1 << STYPE_INET6) + +#define SVER_SHIFT		12		/* Shift to get version */ +#define SVER_MASK		0x0fff		/* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA	1 +#define IPVS_OPT_PE_DATA	2 +#define IPVS_OPT_PE_NAME	3 +#define IPVS_OPT_PARAM		7 + +#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1)) +  struct ip_vs_sync_thread_data { +	struct net *net;  	struct socket *sock;  	char *buf; +	int id;  }; -#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn)) +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))  #define FULL_CONN_SIZE  \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))  /* -  The master mulitcasts messages to the backup load balancers in the -  following format. +  The master mulitcasts messages (Datagrams) to the backup load balancers +  in the following format. + + Version 1: +  Note, first byte should be Zero, so ver 0 receivers will drop the packet.         
0                   1                   2                   3         0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -      |  Count Conns  |    SyncID     |            Size               | +      |      0        |    SyncID     |            Size               | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |  Count Conns  |    Version    |    Reserved, set to Zero      |        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+        |                                                               |        |                    IPVS Sync Connection (1)                   |        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+        |                            .                                  | -      |                            .                                  | +      ~                            .                                  ~        |                            .                                  |        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+        |                                                               |        |                    IPVS Sync Connection (n)                   |        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header +       0                   1                   2                   3 +       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |  Count Conns  |    SyncID     |            Size               | +      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +      |                    IPVS Sync Connection (1)                   |  */  #define SYNC_MESG_HEADER_LEN	4  #define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ -struct ip_vs_sync_mesg { +/* Version 0 header */ +struct ip_vs_sync_mesg_v0 {  	__u8                    nr_conns;  	__u8                    syncid; -	__u16                   size; +	__be16                  size;  	/* ip_vs_sync_conn entries start here */  }; -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; +/* Version 1 header */ +struct ip_vs_sync_mesg { +	__u8			reserved;	/* must be zero */ +	__u8			syncid; +	__be16			size; +	__u8			nr_conns; +	__s8			version;	/* SYNC_PROTO_VER  */ +	__u16			spare; +	/* ip_vs_sync_conn entries start here */ +};  struct ip_vs_sync_buff {  	struct list_head        list; @@ -127,70 +272,75 @@ struct ip_vs_sync_buff {  	unsigned char           *end;  }; +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ +	ho->init_seq       = get_unaligned_be32(&no->init_seq); +	ho->delta          = get_unaligned_be32(&no->delta); +	ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff   *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char 
ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - -/* multicast addr */ -static struct sockaddr_in mcast_addr = { -	.sin_family		= AF_INET, -	.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT), -	.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP), -}; - +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ +	put_unaligned_be32(ho->init_seq, &no->init_seq); +	put_unaligned_be32(ho->delta, &no->delta); +	put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} -static inline struct ip_vs_sync_buff *sb_dequeue(void) +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)  {  	struct ip_vs_sync_buff *sb; -	spin_lock_bh(&ip_vs_sync_lock); -	if (list_empty(&ip_vs_sync_queue)) { +	spin_lock_bh(&ipvs->sync_lock); +	if (list_empty(&ms->sync_queue)) {  		sb = NULL; +		__set_current_state(TASK_INTERRUPTIBLE);  	} else { -		sb = list_entry(ip_vs_sync_queue.next, -				struct ip_vs_sync_buff, +		sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,  				list);  		list_del(&sb->list); +		ms->sync_queue_len--; +		if (!ms->sync_queue_len) +			ms->sync_queue_delay = 0;  	} -	spin_unlock_bh(&ip_vs_sync_lock); +	spin_unlock_bh(&ipvs->sync_lock);  	return sb;  } -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs)  {  	struct ip_vs_sync_buff *sb;  	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))  		return NULL; -	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { +	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); +	if (!sb->mesg) {  		kfree(sb);  		return NULL;  	} +	sb->mesg->reserved = 0;  /* old nr_conns i.e. 
must be zero now */ +	sb->mesg->version = SYNC_PROTO_VER; +	sb->mesg->syncid = ipvs->master_syncid; +	sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));  	sb->mesg->nr_conns = 0; -	sb->mesg->syncid = ip_vs_master_syncid; -	sb->mesg->size = 4; -	sb->head = (unsigned char *)sb->mesg + 4; -	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; +	sb->mesg->spare = 0; +	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); +	sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; +  	sb->firstuse = jiffies;  	return sb;  } @@ -201,14 +351,24 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)  	kfree(sb);  } -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +static inline void sb_queue_tail(struct netns_ipvs *ipvs, +				 struct ipvs_master_sync_state *ms)  { -	spin_lock(&ip_vs_sync_lock); -	if (ip_vs_sync_state & IP_VS_STATE_MASTER) -		list_add_tail(&sb->list, &ip_vs_sync_queue); -	else +	struct ip_vs_sync_buff *sb = ms->sync_buff; + +	spin_lock(&ipvs->sync_lock); +	if (ipvs->sync_state & IP_VS_STATE_MASTER && +	    ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { +		if (!ms->sync_queue_len) +			schedule_delayed_work(&ms->master_wakeup_work, +					      max(IPVS_SYNC_SEND_DELAY, 1)); +		ms->sync_queue_len++; +		list_add_tail(&sb->list, &ms->sync_queue); +		if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) +			wake_up_process(ms->master_thread); +	} else  		ip_vs_sync_buff_release(sb); -	spin_unlock(&ip_vs_sync_lock); +	spin_unlock(&ipvs->sync_lock);  }  /* @@ -216,47 +376,209 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)   *	than the specified time or the specified time is zero.   */  static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, +		   unsigned long time)  {  	struct ip_vs_sync_buff *sb; -	spin_lock_bh(&curr_sb_lock); -	if (curr_sb && (time == 0 || -			time_before(jiffies - curr_sb->firstuse, time))) { -		sb = curr_sb; -		curr_sb = NULL; +	spin_lock_bh(&ipvs->sync_buff_lock); +	sb = ms->sync_buff; +	if (sb && time_after_eq(jiffies - sb->firstuse, time)) { +		ms->sync_buff = NULL; +		__set_current_state(TASK_RUNNING);  	} else  		sb = NULL; -	spin_unlock_bh(&curr_sb_lock); +	spin_unlock_bh(&ipvs->sync_buff_lock); +	return sb; +} + +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) +{ +	return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; +} + +/* + * Create a new sync buffer for Version 0 proto. 
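select_master_thread_id() above is what keeps multiple master sync threads coherent: a connection is mapped to a thread by its ip_vs_conn address, so all updates for one connection stay ordered within a single thread's stream. The shift by 1 + ilog2(sizeof(*cp)) discards the low address bits, which carry almost no entropy across same-sized slab objects, and the mask with threads_mask (thread count minus one, a power of two) turns the remainder into a cheap, stable index.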
+ */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +{ +	struct ip_vs_sync_buff *sb; +	struct ip_vs_sync_mesg_v0 *mesg; + +	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) +		return NULL; + +	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); +	if (!sb->mesg) { +		kfree(sb); +		return NULL; +	} +	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; +	mesg->nr_conns = 0; +	mesg->syncid = ipvs->master_syncid; +	mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); +	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); +	sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; +	sb->firstuse = jiffies;  	return sb;  } +/* Check if connection is controlled by persistence */ +static inline bool in_persistence(struct ip_vs_conn *cp) +{ +	for (cp = cp->control; cp; cp = cp->control) { +		if (cp->flags & IP_VS_CONN_F_TEMPLATE) +			return true; +	} +	return false; +} + +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + *	sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + *	for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + *	with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, +				  struct ip_vs_conn *cp, int pkts) +{ +	unsigned long orig = ACCESS_ONCE(cp->sync_endtime); +	unsigned long now = jiffies; +	unsigned long n = (now + cp->timeout) & ~3UL; +	unsigned int sync_refresh_period; +	int sync_period; +	int force; + +	/* Check if we sync in current state */ +	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) +		force = 0; +	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) +		return 0; +	else if (likely(cp->protocol == IPPROTO_TCP)) { +		if (!((1 << cp->state) & +		      ((1 << IP_VS_TCP_S_ESTABLISHED) | +		       (1 << IP_VS_TCP_S_FIN_WAIT) | +		       (1 << IP_VS_TCP_S_CLOSE) | +		       (1 << IP_VS_TCP_S_CLOSE_WAIT) | +		       (1 << IP_VS_TCP_S_TIME_WAIT)))) +			return 0; +		force = cp->state != cp->old_state; +		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) +			goto set; +	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) { +		if (!((1 << cp->state) & +		      ((1 << IP_VS_SCTP_S_ESTABLISHED) | +		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | +		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | +		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | +		       (1 << IP_VS_SCTP_S_CLOSED)))) +			return 0; +		force = cp->state != cp->old_state; +		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) +			goto set; +	} else { +		/* UDP or another protocol with single state */ +		force = 0; +	} + +	sync_refresh_period = sysctl_sync_refresh_period(ipvs); +	if (sync_refresh_period > 0) { +		long diff = n - orig; +		long min_diff = max(cp->timeout >> 1, 10UL * HZ); + +		/* Avoid sync if difference is below sync_refresh_period +		 * and below the half timeout. 
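Note the bit-packing in ip_vs_sync_conn_needed(): the prospective deadline is computed as (now + cp->timeout) & ~3UL, deliberately zeroing the two low bits of the jiffies value so they can carry a retry counter (orig & 3, bounded by sysctl_sync_retries). Because deadline and counter share one word, the final cmpxchg() on cp->sync_endtime publishes both atomically and lock-free; if another CPU got there first, the cmpxchg fails and this packet skips the sync unless a state change forces it.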
+		 */ +		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { +			int retries = orig & 3; + +			if (retries >= sysctl_sync_retries(ipvs)) +				return 0; +			if (time_before(now, orig - cp->timeout + +					(sync_refresh_period >> 3))) +				return 0; +			n |= retries + 1; +		} +	} +	sync_period = sysctl_sync_period(ipvs); +	if (sync_period > 0) { +		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && +		    pkts % sync_period != sysctl_sync_threshold(ipvs)) +			return 0; +	} else if (sync_refresh_period <= 0 && +		   pkts != sysctl_sync_threshold(ipvs)) +		return 0; + +set: +	cp->old_state = cp->state; +	n = cmpxchg(&cp->sync_endtime, orig, n); +	return n == orig || force; +}  /* + *      Version 0 , could be switched in by sys_ctl.   *      Add an ip_vs_conn information into the current sync_buff. - *      Called by ip_vs_in.   */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) +static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, +			       int pkts)  { -	struct ip_vs_sync_mesg *m; -	struct ip_vs_sync_conn *s; +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_sync_mesg_v0 *m; +	struct ip_vs_sync_conn_v0 *s; +	struct ip_vs_sync_buff *buff; +	struct ipvs_master_sync_state *ms; +	int id;  	int len; -	spin_lock(&curr_sb_lock); -	if (!curr_sb) { -		if (!(curr_sb=ip_vs_sync_buff_create())) { -			spin_unlock(&curr_sb_lock); +	if (unlikely(cp->af != AF_INET)) +		return; +	/* Do not sync ONE PACKET */ +	if (cp->flags & IP_VS_CONN_F_ONE_PACKET) +		return; + +	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) +		return; + +	spin_lock_bh(&ipvs->sync_buff_lock); +	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { +		spin_unlock_bh(&ipvs->sync_buff_lock); +		return; +	} + +	id = select_master_thread_id(ipvs, cp); +	ms = &ipvs->ms[id]; +	buff = ms->sync_buff; +	if (buff) { +		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; +		/* Send buffer if it is for v1 */ +		if (!m->nr_conns) { +			sb_queue_tail(ipvs, ms); +			ms->sync_buff = NULL; +			buff = NULL; +		} +	} +	if (!buff) { +		buff = ip_vs_sync_buff_create_v0(ipvs); +		if (!buff) { +			spin_unlock_bh(&ipvs->sync_buff_lock);  			pr_err("ip_vs_sync_buff_create failed.\n");  			return;  		} +		ms->sync_buff = buff;  	}  	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :  		SIMPLE_CONN_SIZE; -	m = curr_sb->mesg; -	s = (struct ip_vs_sync_conn *)curr_sb->head; +	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; +	s = (struct ip_vs_sync_conn_v0 *) buff->head;  	/* copy members */ +	s->reserved = 0;  	s->protocol = cp->protocol;  	s->cport = cp->cport;  	s->vport = cp->vport; @@ -273,84 +595,364 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)  	}  	m->nr_conns++; -	m->size += len; -	curr_sb->head += len; +	m->size = htons(ntohs(m->size) + len); +	buff->head += len;  	/* check if there is a space for next one */ -	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { -		sb_queue_tail(curr_sb); -		curr_sb = NULL; +	if (buff->head + FULL_CONN_SIZE > buff->end) { +		sb_queue_tail(ipvs, ms); +		ms->sync_buff = NULL;  	} -	spin_unlock(&curr_sb_lock); +	spin_unlock_bh(&ipvs->sync_buff_lock);  	/* synchronize its controller if it has */ -	if (cp->control) -		ip_vs_sync_conn(cp->control); +	cp = cp->control; +	if (cp) { +		if (cp->flags & IP_VS_CONN_F_TEMPLATE) +			pkts = atomic_add_return(1, &cp->in_pkts); +		else +			pkts = sysctl_sync_threshold(ipvs); +		ip_vs_sync_conn(net, cp->control, pkts); +	}  } +/* + *      Add an ip_vs_conn information into the current sync_buff. + *      Called by ip_vs_in. 
+ *      Sending Version 1 messages + */ +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_sync_mesg *m; +	union ip_vs_sync_conn *s; +	struct ip_vs_sync_buff *buff; +	struct ipvs_master_sync_state *ms; +	int id; +	__u8 *p; +	unsigned int len, pe_name_len, pad; + +	/* Handle old version of the protocol */ +	if (sysctl_sync_ver(ipvs) == 0) { +		ip_vs_sync_conn_v0(net, cp, pkts); +		return; +	} +	/* Do not sync ONE PACKET */ +	if (cp->flags & IP_VS_CONN_F_ONE_PACKET) +		goto control; +sloop: +	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) +		goto control; + +	/* Sanity checks */ +	pe_name_len = 0; +	if (cp->pe_data_len) { +		if (!cp->pe_data || !cp->dest) { +			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); +			return; +		} +		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); +	} + +	spin_lock_bh(&ipvs->sync_buff_lock); +	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { +		spin_unlock_bh(&ipvs->sync_buff_lock); +		return; +	} + +	id = select_master_thread_id(ipvs, cp); +	ms = &ipvs->ms[id]; + +#ifdef CONFIG_IP_VS_IPV6 +	if (cp->af == AF_INET6) +		len = sizeof(struct ip_vs_sync_v6); +	else +#endif +		len = sizeof(struct ip_vs_sync_v4); + +	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) +		len += sizeof(struct ip_vs_sync_conn_options) + 2; + +	if (cp->pe_data_len) +		len += cp->pe_data_len + 2;	/* + Param hdr field */ +	if (pe_name_len) +		len += pe_name_len + 2; + +	/* check if there is a space for this one  */ +	pad = 0; +	buff = ms->sync_buff; +	if (buff) { +		m = buff->mesg; +		pad = (4 - (size_t) buff->head) & 3; +		/* Send buffer if it is for v0 */ +		if (buff->head + len + pad > buff->end || m->reserved) { +			sb_queue_tail(ipvs, ms); +			ms->sync_buff = NULL; +			buff = NULL; +			pad = 0; +		} +	} + +	if (!buff) { +		buff = ip_vs_sync_buff_create(ipvs); +		if (!buff) { +			spin_unlock_bh(&ipvs->sync_buff_lock); +			pr_err("ip_vs_sync_buff_create failed.\n"); +			return; +		} +		ms->sync_buff = buff; +		m = buff->mesg; +	} + +	p = buff->head; +	buff->head += pad + len; +	m->size = htons(ntohs(m->size) + pad + len); +	/* Add ev. padding from prev. sync_conn */ +	while (pad--) +		*(p++) = 0; + +	s = (union ip_vs_sync_conn *)p; + +	/* Set message type  & copy members */ +	s->v4.type = (cp->af == AF_INET6 ? 
STYPE_F_INET6 : 0); +	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */ +	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); +	s->v4.state = htons(cp->state); +	s->v4.protocol = cp->protocol; +	s->v4.cport = cp->cport; +	s->v4.vport = cp->vport; +	s->v4.dport = cp->dport; +	s->v4.fwmark = htonl(cp->fwmark); +	s->v4.timeout = htonl(cp->timeout / HZ); +	m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 +	if (cp->af == AF_INET6) { +		p += sizeof(struct ip_vs_sync_v6); +		s->v6.caddr = cp->caddr.in6; +		s->v6.vaddr = cp->vaddr.in6; +		s->v6.daddr = cp->daddr.in6; +	} else +#endif +	{ +		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */ +		s->v4.caddr = cp->caddr.ip; +		s->v4.vaddr = cp->vaddr.ip; +		s->v4.daddr = cp->daddr.ip; +	} +	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { +		*(p++) = IPVS_OPT_SEQ_DATA; +		*(p++) = sizeof(struct ip_vs_sync_conn_options); +		hton_seq((struct ip_vs_seq *)p, &cp->in_seq); +		p += sizeof(struct ip_vs_seq); +		hton_seq((struct ip_vs_seq *)p, &cp->out_seq); +		p += sizeof(struct ip_vs_seq); +	} +	/* Handle pe data */ +	if (cp->pe_data_len && cp->pe_data) { +		*(p++) = IPVS_OPT_PE_DATA; +		*(p++) = cp->pe_data_len; +		memcpy(p, cp->pe_data, cp->pe_data_len); +		p += cp->pe_data_len; +		if (pe_name_len) { +			/* Add PE_NAME */ +			*(p++) = IPVS_OPT_PE_NAME; +			*(p++) = pe_name_len; +			memcpy(p, cp->pe->name, pe_name_len); +			p += pe_name_len; +		} +	} + +	spin_unlock_bh(&ipvs->sync_buff_lock); + +control: +	/* synchronize its controller if it has */ +	cp = cp->control; +	if (!cp) +		return; +	if (cp->flags & IP_VS_CONN_F_TEMPLATE) +		pkts = atomic_add_return(1, &cp->in_pkts); +	else +		pkts = sysctl_sync_threshold(ipvs); +	goto sloop; +} + +/* + *  fill_param used by version 1 + */  static inline int -ip_vs_conn_fill_param_sync(int af, int protocol, -			   const union nf_inet_addr *caddr, __be16 cport, -			   const union nf_inet_addr *vaddr, __be16 vport, -			   struct ip_vs_conn_param *p) +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, +			   struct ip_vs_conn_param *p, +			   __u8 *pe_data, unsigned int pe_data_len, +			   __u8 *pe_name, unsigned int pe_name_len)  { -	/* XXX: Need to take into account persistence engine */ -	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); +#ifdef CONFIG_IP_VS_IPV6 +	if (af == AF_INET6) +		ip_vs_conn_fill_param(net, af, sc->v6.protocol, +				      (const union nf_inet_addr *)&sc->v6.caddr, +				      sc->v6.cport, +				      (const union nf_inet_addr *)&sc->v6.vaddr, +				      sc->v6.vport, p); +	else +#endif +		ip_vs_conn_fill_param(net, af, sc->v4.protocol, +				      (const union nf_inet_addr *)&sc->v4.caddr, +				      sc->v4.cport, +				      (const union nf_inet_addr *)&sc->v4.vaddr, +				      sc->v4.vport, p); +	/* Handle pe data */ +	if (pe_data_len) { +		if (pe_name_len) { +			char buff[IP_VS_PENAME_MAXLEN+1]; + +			memcpy(buff, pe_name, pe_name_len); +			buff[pe_name_len]=0; +			p->pe = __ip_vs_pe_getbyname(buff); +			if (!p->pe) { +				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", +					     buff); +				return 1; +			} +		} else { +			IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); +			return 1; +		} + +		p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); +		if (!p->pe_data) { +			if (p->pe->module) +				module_put(p->pe->module); +			return -ENOMEM; +		} +		p->pe_data_len = pe_data_len; +	}  	return 0;  }  /* - *      Process received multicast message and create the corresponding - *      ip_vs_conn entries. + *  Connection Add / Update. 
 /* - *      Process received multicast message and create the corresponding - *      ip_vs_conn entries. + *  Connection Add / Update. + *  Common for version 0 and 1 reception of backup sync_conns. + *  Param: ... + *         timeout is in sec.   */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) +static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, +			    unsigned int flags, unsigned int state, +			    unsigned int protocol, unsigned int type, +			    const union nf_inet_addr *daddr, __be16 dport, +			    unsigned long timeout, __u32 fwmark, +			    struct ip_vs_sync_conn_options *opt)  { -	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; -	struct ip_vs_sync_conn *s; -	struct ip_vs_sync_conn_options *opt; -	struct ip_vs_conn *cp; -	struct ip_vs_protocol *pp;  	struct ip_vs_dest *dest; -	struct ip_vs_conn_param param; -	char *p; -	int i; +	struct ip_vs_conn *cp; +	struct netns_ipvs *ipvs = net_ipvs(net); -	if (buflen < sizeof(struct ip_vs_sync_mesg)) { -		IP_VS_ERR_RL("sync message header too short\n"); -		return; -	} +	if (!(flags & IP_VS_CONN_F_TEMPLATE)) +		cp = ip_vs_conn_in_get(param); +	else +		cp = ip_vs_ct_in_get(param); -	/* Convert size back to host byte order */ -	m->size = ntohs(m->size); +	if (cp) { +		/* Free pe_data */ +		kfree(param->pe_data); -	if (buflen != m->size) { -		IP_VS_ERR_RL("bogus sync message size\n"); -		return; +		dest = cp->dest; +		spin_lock_bh(&cp->lock); +		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && +		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { +			if (flags & IP_VS_CONN_F_INACTIVE) { +				atomic_dec(&dest->activeconns); +				atomic_inc(&dest->inactconns); +			} else { +				atomic_inc(&dest->activeconns); +				atomic_dec(&dest->inactconns); +			} +		} +		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; +		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; +		cp->flags = flags; +		spin_unlock_bh(&cp->lock); +		if (!dest) +			ip_vs_try_bind_dest(cp); +	} else { +		/* +		 * Find the appropriate destination for the connection. +		 * If it is not found the connection will remain unbound +		 * but still handled. +		 */ +		rcu_read_lock(); +		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, +				       param->vport, protocol, fwmark, flags); + +		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); +		rcu_read_unlock(); +		if (!cp) { +			if (param->pe_data) +				kfree(param->pe_data); +			IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); +			return; +		} +	} -	/* SyncID sanity check */ -	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { -		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", -			  m->syncid); -		return; +	if (opt) +		memcpy(&cp->in_seq, opt, sizeof(*opt)); +	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); +	cp->state = state; +	cp->old_state = cp->state; +	/* +	 * For Ver 0 message style: +	 *  - it is not possible to recover the right timeout for templates, +	 *  - we cannot find the right fwmark virtual service. If needed, +	 *    we can do it for non-fwmark persistent services. +	 * Ver 1 message style has no such problem. 
+	 */ +	if (timeout) { +		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) +			timeout = MAX_SCHEDULE_TIMEOUT / HZ; +		cp->timeout = timeout*HZ; +	} else { +		struct ip_vs_proto_data *pd; + +		pd = ip_vs_proto_data_get(net, protocol); +		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) +			cp->timeout = pd->timeout_table[state]; +		else +			cp->timeout = (3*60*HZ);  	} +	ip_vs_conn_put(cp); +} -	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); +/* + *  Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(struct net *net, const char *buffer, +				     const size_t buflen) +{ +	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; +	struct ip_vs_sync_conn_v0 *s; +	struct ip_vs_sync_conn_options *opt; +	struct ip_vs_protocol *pp; +	struct ip_vs_conn_param param; +	char *p; +	int i; + +	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);  	for (i=0; i<m->nr_conns; i++) { -		unsigned flags, state; +		unsigned int flags, state;  		if (p + SIMPLE_CONN_SIZE > buffer+buflen) { -			IP_VS_ERR_RL("bogus conn in sync message\n"); +			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");  			return;  		} -		s = (struct ip_vs_sync_conn *) p; +		s = (struct ip_vs_sync_conn_v0 *) p;  		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;  		flags &= ~IP_VS_CONN_F_HASHED;  		if (flags & IP_VS_CONN_F_SEQ_MASK) {  			opt = (struct ip_vs_sync_conn_options *)&s[1];  			p += FULL_CONN_SIZE;  			if (p > buffer+buflen) { -				IP_VS_ERR_RL("bogus conn options in sync message\n"); +				IP_VS_ERR_RL("BACKUP v0, Dropping buffer, bogus conn options\n");  				return;  			}  		} else { @@ -362,123 +964,311 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)  		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {  			pp = ip_vs_proto_get(s->protocol);  			if (!pp) { -				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", +				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",  					s->protocol);  				continue;  			}  			if (state >= pp->num_states) { -				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", +				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",  					pp->name, state);  				continue;  			}  		} else {  			/* protocol in templates is not used for state/timeout */ -			pp = NULL;  			if (state > 0) { -				IP_VS_DBG(2, "Invalid template state %u in sync msg\n", +				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",  					state);  				state = 0;  			}  		} -		{ -			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, -					      (union nf_inet_addr *)&s->caddr, -					      s->cport, -					      (union nf_inet_addr *)&s->vaddr, -					      s->vport, &param)) { -				pr_err("ip_vs_conn_fill_param_sync failed"); -				return; +		ip_vs_conn_fill_param(net, AF_INET, s->protocol, +				      (const union nf_inet_addr *)&s->caddr, +				      s->cport, +				      (const union nf_inet_addr *)&s->vaddr, +				      s->vport, &param); + +		/* Send timeout as Zero */ +		ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET, +				(union nf_inet_addr *)&s->daddr, s->dport, +				0, 0, opt); +	} +}
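Version 0 records are fixed-size structures packed back to back, so the receive loop above needs only pointer arithmetic plus an end-of-buffer check before each record is touched. Stripped to its core, the walk looks like this (a sketch with a hypothetical record type, not the real ip_vs_sync_conn_v0):

#include <stddef.h>

struct v0_rec {
	unsigned char data[24];	/* stand-in for a fixed-size sync record */
};

/* Visit each record; returns how many records fit inside the buffer. */
static int walk_v0(const unsigned char *buf, size_t buflen, int nr_conns)
{
	const unsigned char *p = buf;
	int i;

	for (i = 0; i < nr_conns; i++) {
		if (p + sizeof(struct v0_rec) > buf + buflen)
			break;		/* truncated message: stop early */
		/* decode ((const struct v0_rec *)p) here */
		p += sizeof(struct v0_rec);
	}
	return i;
}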
+ +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, +				    __u32 *opt_flags, +				    struct ip_vs_sync_conn_options *opt) +{ +	struct ip_vs_sync_conn_options *topt; + +	topt = (struct ip_vs_sync_conn_options *)p; + +	if (plen != sizeof(struct ip_vs_sync_conn_options)) { +		IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); +		return -EINVAL; +	} +	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { +		IP_VS_DBG(2, "BACKUP, conn options found twice\n"); +		return -EINVAL; +	} +	ntoh_seq(&topt->in_seq, &opt->in_seq); +	ntoh_seq(&topt->out_seq, &opt->out_seq); +	*opt_flags |= IPVS_OPT_F_SEQ_DATA; +	return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, +			  __u8 **data, unsigned int maxlen, +			  __u32 *opt_flags, __u32 flag) +{ +	if (plen > maxlen) { +		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); +		return -EINVAL; +	} +	if (*opt_flags & flag) { +		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); +		return -EINVAL; +	} +	*data_len = plen; +	*data = p; +	*opt_flags |= flag; +	return 0; +} +/* + *   Process a Version 1 sync. connection + */ +static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +{ +	struct ip_vs_sync_conn_options opt; +	union ip_vs_sync_conn *s; +	struct ip_vs_protocol *pp; +	struct ip_vs_conn_param param; +	__u32 flags; +	unsigned int af, state, pe_data_len = 0, pe_name_len = 0; +	__u8 *pe_data = NULL, *pe_name = NULL; +	__u32 opt_flags = 0; +	int retc = 0; + +	s = (union ip_vs_sync_conn *) p; + +	if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 +		af = AF_INET6; +		p += sizeof(struct ip_vs_sync_v6); +#else +		IP_VS_DBG(3, "BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); +		retc = 10; +		goto out; +#endif +	} else if (!s->v4.type) { +		af = AF_INET; +		p += sizeof(struct ip_vs_sync_v4); +	} else { +		return -10; +	} +	if (p > msg_end) +		return -20; + +	/* Process optional params; check Type & Len. */ +	while (p < msg_end) { +		int ptype; +		int plen; + +		if (p+2 > msg_end) +			return -30; +		ptype = *(p++); +		plen  = *(p++); + +		if (!plen || ((p + plen) > msg_end)) +			return -40; +		/* Handle seq option; p = param data */ +		switch (ptype & ~IPVS_OPT_F_PARAM) { +		case IPVS_OPT_SEQ_DATA: +			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) +				return -50; +			break; + +		case IPVS_OPT_PE_DATA: +			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, +					   IP_VS_PEDATA_MAXLEN, &opt_flags, +					   IPVS_OPT_F_PE_DATA)) +				return -60; +			break; + +		case IPVS_OPT_PE_NAME: +			if (ip_vs_proc_str(p, plen, &pe_name_len, &pe_name, +					   IP_VS_PENAME_MAXLEN, &opt_flags, +					   IPVS_OPT_F_PE_NAME)) +				return -70; +			break; + +		default: +			/* Is param data mandatory? */ +			if (!(ptype & IPVS_OPT_F_PARAM)) { +				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", +					  ptype & ~IPVS_OPT_F_PARAM); +				retc = 20; +				goto out;  			} -			if (!(flags & IP_VS_CONN_F_TEMPLATE)) -				cp = ip_vs_conn_in_get(&param); -			else -				cp = ip_vs_ct_in_get(&param);  		} -		if (!cp) { -			/* -			 * Find the appropriate destination for the connection. -			 * If it is not found the connection will remain unbound -			 * but still handled. 
-			 */ -			dest = ip_vs_find_dest(AF_INET, -					       (union nf_inet_addr *)&s->daddr, -					       s->dport, -					       (union nf_inet_addr *)&s->vaddr, -					       s->vport, -					       s->protocol); -			/*  Set the approprite ativity flag */ -			if (s->protocol == IPPROTO_TCP) { -				if (state != IP_VS_TCP_S_ESTABLISHED) -					flags |= IP_VS_CONN_F_INACTIVE; -				else -					flags &= ~IP_VS_CONN_F_INACTIVE; -			} else if (s->protocol == IPPROTO_SCTP) { -				if (state != IP_VS_SCTP_S_ESTABLISHED) -					flags |= IP_VS_CONN_F_INACTIVE; -				else -					flags &= ~IP_VS_CONN_F_INACTIVE; +		p += plen;  /* Next option */ +	} + +	/* Get flags and mask off unsupported */ +	flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; +	flags |= IP_VS_CONN_F_SYNC; +	state = ntohs(s->v4.state); + +	if (!(flags & IP_VS_CONN_F_TEMPLATE)) { +		pp = ip_vs_proto_get(s->v4.protocol); +		if (!pp) { +			IP_VS_DBG(3, "BACKUP, Unsupported protocol %u\n", +				s->v4.protocol); +			retc = 30; +			goto out; +		} +		if (state >= pp->num_states) { +			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", +				pp->name, state); +			retc = 40; +			goto out; +		} +	} else { +		/* protocol in templates is not used for state/timeout */ +		if (state > 0) { +			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", +				state); +			state = 0; +		} +	} +	if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data, +				       pe_data_len, pe_name, pe_name_len)) { +		retc = 50; +		goto out; +	} +	/* If only IPv4 is compiled in, just silently skip IPv6 */ +	if (af == AF_INET) +		ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af, +				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport, +				ntohl(s->v4.timeout), ntohl(s->v4.fwmark), +				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) +				); +#ifdef CONFIG_IP_VS_IPV6 +	else +		ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af, +				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport, +				ntohl(s->v6.timeout), ntohl(s->v6.fwmark), +				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) +				); +#endif +	return 0; +	/* Error exit */ +out: +	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); +	return retc; + +}
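ip_vs_proc_sync_conn() treats everything after the fixed part of a record as type-length-value options: two guaranteed header bytes, then plen payload bytes, each bounded against msg_end before use. The bare skeleton of that loop (a hedged sketch; the dispatch is left as a stub):

/* Walk [p, end) as {type, len, value[len]} options; returns 0 on success,
 * -1 on a truncated or empty option, mirroring the checks in the loop above.
 */
static int walk_tlv(const unsigned char *p, const unsigned char *end)
{
	while (p < end) {
		unsigned int type, len;

		if (p + 2 > end)
			return -1;	/* no room for the 2-byte option header */
		type = *p++;
		len = *p++;
		if (!len || p + len > end)
			return -1;	/* empty or truncated payload */
		(void)type;		/* a real parser dispatches on type here */
		p += len;		/* advance to the next option */
	}
	return 0;
}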
+/* + *      Process received multicast message and create the corresponding + *      ip_vs_conn entries. + *      Handles Version 0 & 1 + */ +static void ip_vs_process_message(struct net *net, __u8 *buffer, +				  const size_t buflen) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; +	__u8 *p, *msg_end; +	int i, nr_conns; + +	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { +		IP_VS_DBG(2, "BACKUP, message header too short\n"); +		return; +	} + +	if (buflen != ntohs(m2->size)) { +		IP_VS_DBG(2, "BACKUP, bogus message size\n"); +		return; +	} +	/* SyncID sanity check */ +	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { +		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); +		return; +	} +	/* Handle version 1 message */ +	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) +	    && (m2->spare == 0)) { + +		msg_end = buffer + sizeof(struct ip_vs_sync_mesg); +		nr_conns = m2->nr_conns; + +		for (i=0; i<nr_conns; i++) { +			union ip_vs_sync_conn *s; +			unsigned int size; +			int retc; + +			p = msg_end; +			if (p + sizeof(s->v4) > buffer+buflen) { +				IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); +				return;  			} -			cp = ip_vs_conn_new(&param, -					    (union nf_inet_addr *)&s->daddr, -					    s->dport, flags, dest); -			if (dest) -				atomic_dec(&dest->refcnt); -			if (!cp) { -				pr_err("ip_vs_conn_new failed\n"); +			s = (union ip_vs_sync_conn *)p; +			size = ntohs(s->v4.ver_size) & SVER_MASK; +			msg_end = p + size; +			/* Basic sanity checks */ +			if (msg_end > buffer+buflen) { +				IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");  				return;  			} -		} else if (!cp->dest) { -			dest = ip_vs_try_bind_dest(cp); -			if (dest) -				atomic_dec(&dest->refcnt); -		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && -			   (cp->state != state)) { -			/* update active/inactive flag for the connection */ -			dest = cp->dest; -			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && -				(state != IP_VS_TCP_S_ESTABLISHED)) { -				atomic_dec(&dest->activeconns); -				atomic_inc(&dest->inactconns); -				cp->flags |= IP_VS_CONN_F_INACTIVE; -			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && -				(state == IP_VS_TCP_S_ESTABLISHED)) { -				atomic_inc(&dest->activeconns); -				atomic_dec(&dest->inactconns); -				cp->flags &= ~IP_VS_CONN_F_INACTIVE; +			if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { +				IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", +					      ntohs(s->v4.ver_size) >> SVER_SHIFT); +				return;  			} -		} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && -			   (cp->state != state)) { -			dest = cp->dest; -			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && -			     (state != IP_VS_SCTP_S_ESTABLISHED)) { -			    atomic_dec(&dest->activeconns); -			    atomic_inc(&dest->inactconns); -			    cp->flags &= ~IP_VS_CONN_F_INACTIVE; +			/* Process a single sync_conn */ +			retc = ip_vs_proc_sync_conn(net, p, msg_end); +			if (retc < 0) { +				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", +					     retc); +				return;  			} +			/* Make sure we have 32 bit alignment */ +			msg_end = p + ((size + 3) & ~3);  		} - -		if (opt) -			memcpy(&cp->in_seq, opt, sizeof(*opt)); -		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); -		cp->state = state; -		cp->old_state = cp->state; -		/* -		 * We can not recover the right timeout for templates -		 * in all cases, we can not find the right fwmark -		 * virtual service. If needed, we can do it for -		 * non-fwmark persistent services. 
-		 */ -		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) -			cp->timeout = pp->timeout_table[state]; -		else -			cp->timeout = (3*60*HZ); -		ip_vs_conn_put(cp); +	} else { +		/* Old type of message */ +		ip_vs_process_message_v0(net, buffer, buflen); +		return;  	}  }  /* + *      Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ +	/* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ +	/* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ +	lock_sock(sk); +	if (mode) { +		val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, +			      sysctl_wmem_max); +		sk->sk_sndbuf = val * 2; +		sk->sk_userlocks |= SOCK_SNDBUF_LOCK; +	} else { +		val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, +			      sysctl_rmem_max); +		sk->sk_rcvbuf = val * 2; +		sk->sk_userlocks |= SOCK_RCVBUF_LOCK; +	} +	release_sock(sk); +} + +/*   *      Setup loopback of outgoing multicasts on a sending socket   */  static void set_mcast_loop(struct sock *sk, u_char loop) @@ -511,8 +1301,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)  {  	struct net_device *dev;  	struct inet_sock *inet = inet_sk(sk); +	struct net *net = sock_net(sk); -	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) +	dev = __dev_get_by_name(net, ifname); +	if (!dev)  		return -ENODEV;  	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -531,30 +1323,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)   *	Set the maximum length of sync message according to the   *	specified interface's MTU.   */ -static int set_sync_mesg_maxlen(int sync_state) +static int set_sync_mesg_maxlen(struct net *net, int sync_state)  { +	struct netns_ipvs *ipvs = net_ipvs(net);  	struct net_device *dev;  	int num;  	if (sync_state == IP_VS_STATE_MASTER) { -		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) +		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); +		if (!dev)  			return -ENODEV;  		num = (dev->mtu - sizeof(struct iphdr) -  		       sizeof(struct udphdr) -  		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; -		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + +		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +  			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);  		IP_VS_DBG(7, "setting the maximum length of sync sending " -			  "message %d.\n", sync_send_mesg_maxlen); +			  "message %d.\n", ipvs->send_mesg_maxlen);  	} else if (sync_state == IP_VS_STATE_BACKUP) { -		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) +		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); +		if (!dev)  			return -ENODEV; -		sync_recv_mesg_maxlen = dev->mtu - +		ipvs->recv_mesg_maxlen = dev->mtu -  			sizeof(struct iphdr) - sizeof(struct udphdr);  		IP_VS_DBG(7, "setting the maximum length of sync receiving " -			  "message %d.\n", sync_recv_mesg_maxlen); +			  "message %d.\n", ipvs->recv_mesg_maxlen);  	}  	return 0;
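set_sync_mesg_maxlen() above derives the sync packet size from the egress MTU: on the master it packs as many SIMPLE_CONN_SIZE entries as fit after the IP and UDP headers, the sync header, and 20 spare bytes, capped at MAX_CONNS_PER_SYNCBUFF. As a standalone calculation (the constants are illustrative stand-ins, not necessarily the kernel's values):

#define SYNC_HDR_LEN	4	/* stand-in for SYNC_MESG_HEADER_LEN */
#define CONN_SIZE	24	/* stand-in for SIMPLE_CONN_SIZE */
#define MAX_CONNS	255	/* stand-in for MAX_CONNS_PER_SYNCBUFF */

/* Maximum sync message length for a given interface MTU. */
static int sync_send_maxlen(int mtu)
{
	/* iphdr (20) + udphdr (8) + sync header + 20 spare bytes */
	int num = (mtu - 20 - 8 - SYNC_HDR_LEN - 20) / CONN_SIZE;

	if (num > MAX_CONNS)
		num = MAX_CONNS;
	return SYNC_HDR_LEN + CONN_SIZE * num;
}

/* e.g. mtu = 1500 -> num = 60 entries -> 1444-byte messages */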
@@ -569,6 +1364,7 @@ static int set_sync_mesg_maxlen(int sync_state)  static int  join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)  { +	struct net *net = sock_net(sk);  	struct ip_mreqn mreq;  	struct net_device *dev;  	int ret; @@ -576,7 +1372,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)  	memset(&mreq, 0, sizeof(mreq));  	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); -	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) +	dev = __dev_get_by_name(net, ifname); +	if (!dev)  		return -ENODEV;  	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)  		return -EINVAL; @@ -593,11 +1390,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)  static int bind_mcastif_addr(struct socket *sock, char *ifname)  { +	struct net *net = sock_net(sock->sk);  	struct net_device *dev;  	__be32 addr;  	struct sockaddr_in sin; -	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) +	dev = __dev_get_by_name(net, ifname); +	if (!dev)  		return -ENODEV;  	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); @@ -619,19 +1418,31 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)  /*   *      Set up sending multicast socket over UDP   */ -static struct socket * make_send_sock(void) +static struct socket *make_send_sock(struct net *net, int id)  { +	struct netns_ipvs *ipvs = net_ipvs(net); +	/* multicast addr */ +	struct sockaddr_in mcast_addr = { +		.sin_family		= AF_INET, +		.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT + id), +		.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP), +	};  	struct socket *sock;  	int result; -	/* First create a socket */ +	/* First create a socket; move it to the right namespace later */  	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);  	if (result < 0) {  		pr_err("Error during creation of socket; terminating\n");  		return ERR_PTR(result);  	} - -	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); +	/* +	 * Kernel sockets that are part of a namespace should not hold a +	 * reference to the namespace, so that the namespace can be stopped. +	 * After sk_change_net, the socket must be released using +	 * sk_release_kernel. +	 */ +	sk_change_net(sock->sk, net); +	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);  	if (result < 0) {  		pr_err("Error setting outbound mcast interface\n");  		goto error; @@ -639,8 +1450,11 @@ static struct socket * make_send_sock(void)  	set_mcast_loop(sock->sk, 0);  	set_mcast_ttl(sock->sk, 1); +	result = sysctl_sync_sock_size(ipvs); +	if (result > 0) +		set_sock_size(sock->sk, 1, result); -	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); +	result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);  	if (result < 0) {  		pr_err("Error binding address of the mcast interface\n");  		goto error; @@ -655,8 +1469,8 @@ static struct socket * make_send_sock(void)  	return sock; -  error: -	sock_release(sock); +error: +	sk_release_kernel(sock->sk);  	return ERR_PTR(result);  } @@ -664,8 +1478,15 @@ static struct socket * make_send_sock(void)  /*   *      Set up receiving multicast socket over UDP   */ -static struct socket * make_receive_sock(void) +static struct socket *make_receive_sock(struct net *net, int id)  { +	struct netns_ipvs *ipvs = net_ipvs(net); +	/* multicast addr */ +	struct sockaddr_in mcast_addr = { +		.sin_family		= AF_INET, +		.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT + id), +		.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP), +	};  	struct socket *sock;  	int result; @@ -675,9 +1496,17 @@ static struct socket * make_receive_sock(void)  		pr_err("Error during creation of socket; terminating\n");  		return ERR_PTR(result);  	} - +	/* +	 * Kernel sockets that are part of a namespace should not hold a +	 * reference to the namespace, so that the namespace can be stopped. +	 * After sk_change_net, the socket must be released using +	 * sk_release_kernel.
+	 */ +	sk_change_net(sock->sk, net);  	/* it is equivalent to the REUSEADDR option in user-space */ -	sock->sk->sk_reuse = 1; +	sock->sk->sk_reuse = SK_CAN_REUSE; +	result = sysctl_sync_sock_size(ipvs); +	if (result > 0) +		set_sock_size(sock->sk, 0, result);  	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,  			sizeof(struct sockaddr)); @@ -689,7 +1518,7 @@ static struct socket * make_receive_sock(void)  	/* join the multicast group */  	result = join_mcast_group(sock->sk,  			(struct in_addr *) &mcast_addr.sin_addr, -			ip_vs_backup_mcast_ifn); +			ipvs->backup_mcast_ifn);  	if (result < 0) {  		pr_err("Error joining to the multicast group\n");  		goto error; @@ -697,8 +1526,8 @@ static struct socket * make_receive_sock(void)  	return sock; -  error: -	sock_release(sock); +error: +	sk_release_kernel(sock->sk);  	return ERR_PTR(result);  } @@ -720,18 +1549,19 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)  	return len;  } -static void +static int  ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)  {  	int msize; +	int ret; -	msize = msg->size; - -	/* Put size in network byte order */ -	msg->size = htons(msg->size); +	msize = ntohs(msg->size); -	if (ip_vs_send_async(sock, (char *)msg, msize) != msize) -		pr_err("ip_vs_send_async error\n"); +	ret = ip_vs_send_async(sock, (char *)msg, msize); +	if (ret >= 0 || ret == -EAGAIN) +		return ret; +	pr_err("ip_vs_send_async error %d\n", ret); +	return 0;  }  static int @@ -747,53 +1577,95 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)  	iov.iov_base     = buffer;  	iov.iov_len      = (size_t)buflen; -	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); +	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);  	if (len < 0) -		return -1; +		return len;  	LeaveFunction(7);  	return len;  } +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ +	struct ipvs_master_sync_state *ms = +		container_of(work, struct ipvs_master_sync_state, +			     master_wakeup_work.work); +	struct netns_ipvs *ipvs = ms->ipvs; + +	spin_lock_bh(&ipvs->sync_lock); +	if (ms->sync_queue_len && +	    ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { +		ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; +		wake_up_process(ms->master_thread); +	} +	spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ +	struct ip_vs_sync_buff *sb; + +	sb = sb_dequeue(ipvs, ms); +	if (sb) +		return sb; +	/* Do not delay entries in buffer for more than 2 seconds */ +	return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +}  static int sync_thread_master(void *data)  {  	struct ip_vs_sync_thread_data *tinfo = data; +	struct netns_ipvs *ipvs = net_ipvs(tinfo->net); +	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; +	struct sock *sk = tinfo->sock->sk;  	struct ip_vs_sync_buff *sb;  	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " -		"syncid = %d\n", -		ip_vs_master_mcast_ifn, ip_vs_master_syncid); - -	while (!kthread_should_stop()) { -		while ((sb = sb_dequeue())) { -			ip_vs_send_sync_msg(tinfo->sock, sb->mesg); -			ip_vs_sync_buff_release(sb); +		"syncid = %d, id = %d\n", +		ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + +	for (;;) { +		sb = next_sync_buff(ipvs, ms); +		if (unlikely(kthread_should_stop())) +			break; +		if (!sb) { +			
schedule_timeout(IPVS_SYNC_CHECK_PERIOD); +			continue;  		} - -		/* check if entries stay in curr_sb for 2 seconds */ -		sb = get_curr_sync_buff(2 * HZ); -		if (sb) { -			ip_vs_send_sync_msg(tinfo->sock, sb->mesg); -			ip_vs_sync_buff_release(sb); +		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { +			/* (Ab)use interruptible sleep to avoid increasing +			 * the load avg. +			 */ +			__wait_event_interruptible(*sk_sleep(sk), +						   sock_writeable(sk) || +						   kthread_should_stop()); +			if (unlikely(kthread_should_stop())) +				goto done;  		} - -		schedule_timeout_interruptible(HZ); +		ip_vs_sync_buff_release(sb);  	} +done: +	__set_current_state(TASK_RUNNING); +	if (sb) +		ip_vs_sync_buff_release(sb); +  	/* clean up the sync_buff queue */ -	while ((sb=sb_dequeue())) { +	while ((sb = sb_dequeue(ipvs, ms)))  		ip_vs_sync_buff_release(sb); -	} +	__set_current_state(TASK_RUNNING);  	/* clean up the current sync_buff */ -	if ((sb = get_curr_sync_buff(0))) { +	sb = get_curr_sync_buff(ipvs, ms, 0); +	if (sb)  		ip_vs_sync_buff_release(sb); -	}  	/* release the sending multicast socket */ -	sock_release(tinfo->sock); +	sk_release_kernel(tinfo->sock->sk);  	kfree(tinfo);  	return 0; @@ -803,11 +1675,12 @@ static int sync_thread_master(void *data)  static int sync_thread_backup(void *data)  {  	struct ip_vs_sync_thread_data *tinfo = data; +	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);  	int len;  	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " -		"syncid = %d\n", -		ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); +		"syncid = %d, id = %d\n", +		ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);  	while (!kthread_should_stop()) {  		wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -817,22 +1690,19 @@ static int sync_thread_backup(void *data)  		/* do we have data now? 
*/  		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {  			len = ip_vs_receive(tinfo->sock, tinfo->buf, -					sync_recv_mesg_maxlen); +					ipvs->recv_mesg_maxlen);  			if (len <= 0) { -				pr_err("receiving message error\n"); +				if (len != -EAGAIN) +					pr_err("receiving message error\n");  				break;  			} -			/* disable bottom half, because it accesses the data -			   shared by softirq while getting/creating conns */ -			local_bh_disable(); -			ip_vs_process_message(tinfo->buf, len); -			local_bh_enable(); +			ip_vs_process_message(tinfo->net, tinfo->buf, len);  		}  	}  	/* release the sending multicast socket */ -	sock_release(tinfo->sock); +	sk_release_kernel(tinfo->sock->sk);  	kfree(tinfo->buf);  	kfree(tinfo); @@ -840,128 +1710,239 @@ static int sync_thread_backup(void *data)  } -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)  {  	struct ip_vs_sync_thread_data *tinfo; -	struct task_struct **realtask, *task; +	struct task_struct **array = NULL, *task;  	struct socket *sock; -	char *name, *buf = NULL; +	struct netns_ipvs *ipvs = net_ipvs(net); +	char *name;  	int (*threadfn)(void *data); +	int id, count;  	int result = -ENOMEM;  	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));  	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", -		  sizeof(struct ip_vs_sync_conn)); +		  sizeof(struct ip_vs_sync_conn_v0)); + +	if (!ipvs->sync_state) { +		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); +		ipvs->threads_mask = count - 1; +	} else +		count = ipvs->threads_mask + 1;  	if (state == IP_VS_STATE_MASTER) { -		if (sync_master_thread) +		if (ipvs->ms)  			return -EEXIST; -		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, -			sizeof(ip_vs_master_mcast_ifn)); -		ip_vs_master_syncid = syncid; -		realtask = &sync_master_thread; -		name = "ipvs_syncmaster"; +		strlcpy(ipvs->master_mcast_ifn, mcast_ifn, +			sizeof(ipvs->master_mcast_ifn)); +		ipvs->master_syncid = syncid; +		name = "ipvs-m:%d:%d";  		threadfn = sync_thread_master; -		sock = make_send_sock();  	} else if (state == IP_VS_STATE_BACKUP) { -		if (sync_backup_thread) +		if (ipvs->backup_threads)  			return -EEXIST; -		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, -			sizeof(ip_vs_backup_mcast_ifn)); -		ip_vs_backup_syncid = syncid; -		realtask = &sync_backup_thread; -		name = "ipvs_syncbackup"; +		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, +			sizeof(ipvs->backup_mcast_ifn)); +		ipvs->backup_syncid = syncid; +		name = "ipvs-b:%d:%d";  		threadfn = sync_thread_backup; -		sock = make_receive_sock();  	} else {  		return -EINVAL;  	} -	if (IS_ERR(sock)) { -		result = PTR_ERR(sock); -		goto out; +	if (state == IP_VS_STATE_MASTER) { +		struct ipvs_master_sync_state *ms; + +		ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); +		if (!ipvs->ms) +			goto out; +		ms = ipvs->ms; +		for (id = 0; id < count; id++, ms++) { +			INIT_LIST_HEAD(&ms->sync_queue); +			ms->sync_queue_len = 0; +			ms->sync_queue_delay = 0; +			INIT_DELAYED_WORK(&ms->master_wakeup_work, +					  master_wakeup_work_handler); +			ms->ipvs = ipvs; +		} +	} else { +		array = kzalloc(count * sizeof(struct task_struct *), +				GFP_KERNEL); +		if (!array) +			goto out;  	} +	set_sync_mesg_maxlen(net, state); -	set_sync_mesg_maxlen(state); -	if (state == IP_VS_STATE_BACKUP) { -		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); -		if (!buf) +	tinfo = NULL; +	for (id = 0; id < count; id++) { +		if (state == IP_VS_STATE_MASTER) 
+			sock = make_send_sock(net, id); +		else +			sock = make_receive_sock(net, id); +		if (IS_ERR(sock)) { +			result = PTR_ERR(sock); +			goto outtinfo; +		} +		tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); +		if (!tinfo)  			goto outsocket; -	} - -	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); -	if (!tinfo) -		goto outbuf; - -	tinfo->sock = sock; -	tinfo->buf = buf; +		tinfo->net = net; +		tinfo->sock = sock; +		if (state == IP_VS_STATE_BACKUP) { +			tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, +					     GFP_KERNEL); +			if (!tinfo->buf) +				goto outtinfo; +		} else { +			tinfo->buf = NULL; +		} +		tinfo->id = id; -	task = kthread_run(threadfn, tinfo, name); -	if (IS_ERR(task)) { -		result = PTR_ERR(task); -		goto outtinfo; +		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); +		if (IS_ERR(task)) { +			result = PTR_ERR(task); +			goto outtinfo; +		} +		tinfo = NULL; +		if (state == IP_VS_STATE_MASTER) +			ipvs->ms[id].master_thread = task; +		else +			array[id] = task;  	}  	/* mark as active */ -	*realtask = task; -	ip_vs_sync_state |= state; + +	if (state == IP_VS_STATE_BACKUP) +		ipvs->backup_threads = array; +	spin_lock_bh(&ipvs->sync_buff_lock); +	ipvs->sync_state |= state; +	spin_unlock_bh(&ipvs->sync_buff_lock);  	/* increase the module use count */  	ip_vs_use_count_inc();  	return 0; -outtinfo: -	kfree(tinfo); -outbuf: -	kfree(buf);  outsocket: -	sock_release(sock); +	sk_release_kernel(sock->sk); + +outtinfo: +	if (tinfo) { +		sk_release_kernel(tinfo->sock->sk); +		kfree(tinfo->buf); +		kfree(tinfo); +	} +	count = id; +	while (count-- > 0) { +		if (state == IP_VS_STATE_MASTER) +			kthread_stop(ipvs->ms[count].master_thread); +		else +			kthread_stop(array[count]); +	} +	kfree(array); +  out: +	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { +		kfree(ipvs->ms); +		ipvs->ms = NULL; +	}  	return result;  } -int stop_sync_thread(int state) +int stop_sync_thread(struct net *net, int state)  { +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct task_struct **array; +	int id; +	int retc = -EINVAL; +  	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));  	if (state == IP_VS_STATE_MASTER) { -		if (!sync_master_thread) +		if (!ipvs->ms)  			return -ESRCH; -		pr_info("stopping master sync thread %d ...\n", -			task_pid_nr(sync_master_thread)); -  		/*  		 * The lock synchronizes with sb_queue_tail(), so that we don't  		 * add sync buffers to the queue, when we are already in  		 * progress of stopping the master sync daemon.  		 
*/ -		spin_lock_bh(&ip_vs_sync_lock); -		ip_vs_sync_state &= ~IP_VS_STATE_MASTER; -		spin_unlock_bh(&ip_vs_sync_lock); -		kthread_stop(sync_master_thread); -		sync_master_thread = NULL; +		spin_lock_bh(&ipvs->sync_buff_lock); +		spin_lock(&ipvs->sync_lock); +		ipvs->sync_state &= ~IP_VS_STATE_MASTER; +		spin_unlock(&ipvs->sync_lock); +		spin_unlock_bh(&ipvs->sync_buff_lock); + +		retc = 0; +		for (id = ipvs->threads_mask; id >= 0; id--) { +			struct ipvs_master_sync_state *ms = &ipvs->ms[id]; +			int ret; + +			pr_info("stopping master sync thread %d ...\n", +				task_pid_nr(ms->master_thread)); +			cancel_delayed_work_sync(&ms->master_wakeup_work); +			ret = kthread_stop(ms->master_thread); +			if (retc >= 0) +				retc = ret; +		} +		kfree(ipvs->ms); +		ipvs->ms = NULL;  	} else if (state == IP_VS_STATE_BACKUP) { -		if (!sync_backup_thread) +		if (!ipvs->backup_threads)  			return -ESRCH; -		pr_info("stopping backup sync thread %d ...\n", -			task_pid_nr(sync_backup_thread)); - -		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; -		kthread_stop(sync_backup_thread); -		sync_backup_thread = NULL; -	} else { -		return -EINVAL; +		ipvs->sync_state &= ~IP_VS_STATE_BACKUP; +		array = ipvs->backup_threads; +		retc = 0; +		for (id = ipvs->threads_mask; id >= 0; id--) { +			int ret; + +			pr_info("stopping backup sync thread %d ...\n", +				task_pid_nr(array[id])); +			ret = kthread_stop(array[id]); +			if (retc >= 0) +				retc = ret; +		} +		kfree(array); +		ipvs->backup_threads = NULL;  	}  	/* decrease the module use count */  	ip_vs_use_count_dec(); +	return retc; +} + +/* + * Initialize data struct for each netns + */ +int __net_init ip_vs_sync_net_init(struct net *net) +{ +	struct netns_ipvs *ipvs = net_ipvs(net); + +	__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); +	spin_lock_init(&ipvs->sync_lock); +	spin_lock_init(&ipvs->sync_buff_lock);  	return 0;  } + +void ip_vs_sync_net_cleanup(struct net *net) +{ +	int retc; +	struct netns_ipvs *ipvs = net_ipvs(net); + +	mutex_lock(&ipvs->sync_mutex); +	retc = stop_sync_thread(net, IP_VS_STATE_MASTER); +	if (retc && retc != -ESRCH) +		pr_err("Failed to stop Master Daemon\n"); + +	retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); +	if (retc && retc != -ESRCH) +		pr_err("Failed to stop Backup Daemon\n"); +	mutex_unlock(&ipvs->sync_mutex); +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index bbddfdb10db..b5b4650d50a 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -27,30 +27,15 @@  #include <net/ip_vs.h> - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ -	/* -	 * We think the overhead of processing active connections is 256 -	 * times higher than that of inactive connections in average. 
(This -	 * 256 times might not be accurate, we will change it later) We -	 * use the following formula to estimate the overhead now: -	 *		  dest->activeconns*256 + dest->inactconns -	 */ -	return (atomic_read(&dest->activeconns) << 8) + -		atomic_read(&dest->inactconns); -} - -  /*   *	Weighted Least Connection scheduling   */  static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		   struct ip_vs_iphdr *iph)  {  	struct ip_vs_dest *dest, *least; -	unsigned int loh, doh; +	int loh, doh;  	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); @@ -67,27 +52,27 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)  	 * new connections.  	 */ -	list_for_each_entry(dest, &svc->destinations, n_list) { +	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {  		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&  		    atomic_read(&dest->weight) > 0) {  			least = dest; -			loh = ip_vs_wlc_dest_overhead(least); +			loh = ip_vs_dest_conn_overhead(least);  			goto nextstage;  		}  	} -	IP_VS_ERR_RL("WLC: no destination available\n"); +	ip_vs_scheduler_err(svc, "no destination available");  	return NULL;  	/*  	 *    Find the destination with the least load.  	 */    nextstage: -	list_for_each_entry_continue(dest, &svc->destinations, n_list) { +	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {  		if (dest->flags & IP_VS_DEST_F_OVERLOAD)  			continue; -		doh = ip_vs_wlc_dest_overhead(dest); -		if (loh * atomic_read(&dest->weight) > -		    doh * atomic_read(&least->weight)) { +		doh = ip_vs_dest_conn_overhead(dest); +		if ((__s64)loh * atomic_read(&dest->weight) > +		    (__s64)doh * atomic_read(&least->weight)) {  			least = dest;  			loh = doh;  		} @@ -122,6 +107,7 @@ static int __init ip_vs_wlc_init(void)  static void __exit ip_vs_wlc_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +	synchronize_rcu();  }  module_init(ip_vs_wlc_init); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 30db633f88f..0546cd572d6 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -29,14 +29,45 @@  #include <net/ip_vs.h> +/* The WRR algorithm depends on some calculations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As a result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced with di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di, but we run in parallel with + * weight changes, so it is possible for a dest weight to drop + * below di, which is bad if it is the only available dest. + * + * So we modify how mw is calculated: it is reduced by (di - 1), + * so that the last cw is 1 and catches such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... + * last pass: cw = 1 + * + */ + 
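The schedule this comment describes is easy to check numerically: di is the gcd of all weights, mw is the maximum weight reduced by (di - 1), and cw walks down from mw in di-sized steps until it reaches 1. A standalone sketch (the example weights are mine):

#include <stdio.h>

static int gcd(int a, int b)
{
	while (b) {
		int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	int di = gcd(gcd(4, 6), 2);	/* weights 4, 6, 2 -> di = 2 */
	int mw = 6 - (di - 1);		/* max weight - (di - 1) = 5 */
	int cw;

	for (cw = mw; cw > 0; cw -= di)
		printf("pass: cw = %d\n", cw);	/* prints 5, 3, 1 */
	return 0;
}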
 /*   * current destination pointer for weighted round-robin scheduling   */  struct ip_vs_wrr_mark { -	struct list_head *cl;	/* current list head */ +	struct ip_vs_dest *cl;	/* current dest or head */  	int cw;			/* current weight */  	int mw;			/* maximum weight */  	int di;			/* decreasing interval */ +	struct rcu_head		rcu_head;  }; @@ -84,41 +115,45 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)  	/*  	 *    Allocate the mark variable for WRR scheduling  	 */ -	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); -	if (mark == NULL) { -		pr_err("%s(): no memory\n", __func__); +	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); +	if (mark == NULL)  		return -ENOMEM; -	} -	mark->cl = &svc->destinations; -	mark->cw = 0; -	mark->mw = ip_vs_wrr_max_weight(svc); + +	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);  	mark->di = ip_vs_wrr_gcd_weight(svc); +	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); +	mark->cw = mark->mw;  	svc->sched_data = mark;  	return 0;  } -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)  { +	struct ip_vs_wrr_mark *mark = svc->sched_data; +  	/*  	 *    Release the mark variable  	 */ -	kfree(svc->sched_data); - -	return 0; +	kfree_rcu(mark, rcu_head);  } -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, +				  struct ip_vs_dest *dest)  {  	struct ip_vs_wrr_mark *mark = svc->sched_data; -	mark->cl = &svc->destinations; -	mark->mw = ip_vs_wrr_max_weight(svc); +	spin_lock_bh(&svc->sched_lock); +	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);  	mark->di = ip_vs_wrr_gcd_weight(svc); -	if (mark->cw > mark->mw) -		mark->cw = 0; +	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); +	if (mark->cw > mark->mw || !mark->cw) +		mark->cw = mark->mw; +	else if (mark->di > 1) +		mark->cw = (mark->cw / mark->di) * mark->di + 1; +	spin_unlock_bh(&svc->sched_lock);  	return 0;  } @@ -127,80 +162,82 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)   *    Weighted Round-Robin Scheduling   */  static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, +		   struct ip_vs_iphdr *iph)  { -	struct ip_vs_dest *dest; +	struct ip_vs_dest *dest, *last, *stop = NULL;  	struct ip_vs_wrr_mark *mark = svc->sched_data; -	struct list_head *p; +	bool last_pass = false, restarted = false;  	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); -	/* -	 * This loop will always terminate, because mark->cw in (0, max_weight] -	 * and at least one server has its weight equal to max_weight. -	 */ -	write_lock(&svc->sched_lock); -	p = mark->cl; +	spin_lock_bh(&svc->sched_lock); +	dest = mark->cl; +	/* No available dests? */ +	if (mark->mw == 0) +		goto err_noavail; +	last = dest; +	/* Stop only after all dests were checked for weight >= 1 (last pass) */  	while (1) { -		if (mark->cl == &svc->destinations) { -			/* it is at the head of the destination list */ - -			if (mark->cl == mark->cl->next) { -				/* no dest entry */ -				IP_VS_ERR_RL("WRR: no destination available: " -					     "no destinations present\n"); -				dest = NULL; -				goto out; -			} - -			mark->cl = svc->destinations.next; -			mark->cw -= mark->di; -			if (mark->cw <= 0) { -				mark->cw = mark->mw; -				/* -				 * Still zero, which means no available servers. 
-				 */ -				if (mark->cw == 0) { -					mark->cl = &svc->destinations; -					IP_VS_ERR_RL("WRR: no destination " -						     "available\n"); -					dest = NULL; -					goto out; -				} -			} -		} else -			mark->cl = mark->cl->next; - -		if (mark->cl != &svc->destinations) { -			/* not at the head of the list */ -			dest = list_entry(mark->cl, struct ip_vs_dest, n_list); +		list_for_each_entry_continue_rcu(dest, +						 &svc->destinations, +						 n_list) {  			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && -			    atomic_read(&dest->weight) >= mark->cw) { -				/* got it */ -				break; -			} +			    atomic_read(&dest->weight) >= mark->cw) +				goto found; +			if (dest == stop) +				goto err_over;  		} - -		if (mark->cl == p && mark->cw == mark->di) { -			/* back to the start, and no dest is found. -			   It is only possible when all dests are OVERLOADED */ -			dest = NULL; -			IP_VS_ERR_RL("WRR: no destination available: " -				     "all destinations are overloaded\n"); -			goto out; +		mark->cw -= mark->di; +		if (mark->cw <= 0) { +			mark->cw = mark->mw; +			/* Stop if we tried last pass from first dest: +			 * 1. last_pass: we started checks when cw > di but +			 *	then all dests were checked for w >= 1 +			 * 2. last was head: the first and only traversal +			 *	was for weight >= 1, for all dests. +			 */ +			if (last_pass || +			    &last->n_list == &svc->destinations) +				goto err_over; +			restarted = true; +		} +		last_pass = mark->cw <= mark->di; +		if (last_pass && restarted && +		    &last->n_list != &svc->destinations) { +			/* First traversal was for w >= 1 but only +			 * for dests after 'last', now do the same +			 * for all dests up to 'last'. +			 */ +			stop = last;  		}  	} +found:  	IP_VS_DBG_BUF(6, "WRR: server %s:%u "  		      "activeconns %d refcnt %d weight %d\n",  		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),  		      atomic_read(&dest->activeconns),  		      atomic_read(&dest->refcnt),  		      atomic_read(&dest->weight)); +	mark->cl = dest;    out: -	write_unlock(&svc->sched_lock); +	spin_unlock_bh(&svc->sched_lock);  	return dest; + +err_noavail: +	mark->cl = dest; +	dest = NULL; +	ip_vs_scheduler_err(svc, "no destination available"); +	goto out; + +err_over: +	mark->cl = dest; +	dest = NULL; +	ip_vs_scheduler_err(svc, "no destination available: " +			    "all destinations are overloaded"); +	goto out;  } @@ -211,7 +248,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {  	.n_list =		LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),  	.init_service =		ip_vs_wrr_init_svc,  	.done_service =		ip_vs_wrr_done_svc, -	.update_service =	ip_vs_wrr_update_svc, +	.add_dest =		ip_vs_wrr_dest_changed, +	.del_dest =		ip_vs_wrr_dest_changed, +	.upd_dest =		ip_vs_wrr_dest_changed,  	.schedule =		ip_vs_wrr_schedule,  }; @@ -223,6 +262,7 @@ static int __init ip_vs_wrr_init(void)  static void __exit ip_vs_wrr_cleanup(void)  {  	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +	synchronize_rcu();  }  module_init(ip_vs_wrr_init); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 5325a3fbe4a..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -17,6 +17,8 @@   * - not all connections have destination server, for example,   * connections in backup server when fwmark is used   * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback   * LOCAL_OUT rules:   * - 
skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)   * - skb->pkt_type is not set yet @@ -43,158 +45,259 @@  #include <net/ip_vs.h> +enum { +	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */ +	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */ +	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to +				      * local +				      */ +	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */ +	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */ +	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */ +}; + +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ +	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ +	kfree(dest_dst); +}  /*   *      Destination cache to speed up outgoing route lookup   */  static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, -		u32 dst_cookie) +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, +		struct dst_entry *dst, u32 dst_cookie)  { -	struct dst_entry *old_dst; +	struct ip_vs_dest_dst *old; -	old_dst = dest->dst_cache; -	dest->dst_cache = dst; -	dest->dst_rtos = rtos; -	dest->dst_cookie = dst_cookie; -	dst_release(old_dst); +	old = rcu_dereference_protected(dest->dest_dst, +					lockdep_is_held(&dest->dst_lock)); + +	if (dest_dst) { +		dest_dst->dst_cache = dst; +		dest_dst->dst_cookie = dst_cookie; +	} +	rcu_assign_pointer(dest->dest_dst, dest_dst); + +	if (old) +		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);  } -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest)  { -	struct dst_entry *dst = dest->dst_cache; +	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); +	struct dst_entry *dst; -	if (!dst) +	if (!dest_dst)  		return NULL; -	if ((dst->obsolete || rtos != dest->dst_rtos) && -	    dst->ops->check(dst, dest->dst_cookie) == NULL) { -		dest->dst_cache = NULL; -		dst_release(dst); +	dst = dest_dst->dst_cache; +	if (dst->obsolete && +	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)  		return NULL; +	return dest_dst; +}
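The dest cache above is the standard RCU publish pattern: readers fetch dest->dest_dst with rcu_dereference() and take no reference, while the writer swaps the pointer under dest->dst_lock with rcu_assign_pointer() and frees the old entry only after a grace period. A condensed, kernel-style sketch with generic names (obj, cache_entry and cache_entry_rcu_free are placeholders, not IPVS symbols):

struct cache_entry {
	struct rcu_head rcu_head;
	/* cached routing data would live here */
};

struct obj {
	spinlock_t lock;
	struct cache_entry __rcu *cache;
};

static void cache_entry_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct cache_entry, rcu_head));
}

/* Writer: publish a replacement; readers under rcu_read_lock() stay safe. */
static void cache_set(struct obj *o, struct cache_entry *new)
{
	struct cache_entry *old;

	spin_lock_bh(&o->lock);
	old = rcu_dereference_protected(o->cache, lockdep_is_held(&o->lock));
	rcu_assign_pointer(o->cache, new);
	spin_unlock_bh(&o->lock);
	if (old)
		call_rcu(&old->rcu_head, cache_entry_rcu_free);
}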
+ +static inline bool +__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) +{ +	if (IP6CB(skb)->frag_max_size) { +		/* frag_max_size tells us that this packet has been +		 * defragmented by the netfilter IPv6 conntrack module. +		 */ +		if (IP6CB(skb)->frag_max_size > mtu) +			return true; /* largest fragment violates MTU */ +	} +	else if (skb->len > mtu && !skb_is_gso(skb)) { +		return true; /* packet size violates MTU */ +	} +	return false; +} -/* - * Get route to destination or remote server - * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, - *	    &4=Allow redirect from remote daddr to local - */ -static struct rtable * +/* Get route to daddr, update *saddr, optionally bind route to saddr */ +static struct rtable *do_output_route4(struct net *net, __be32 daddr, +				       int rt_mode, __be32 *saddr) +{ +	struct flowi4 fl4; +	struct rtable *rt; +	int loop = 0; + +	memset(&fl4, 0, sizeof(fl4)); +	fl4.daddr = daddr; +	fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; +	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? +			   FLOWI_FLAG_KNOWN_NH : 0; + +retry: +	rt = ip_route_output_key(net, &fl4); +	if (IS_ERR(rt)) { +		/* Invalid saddr ? */ +		if (PTR_ERR(rt) == -EINVAL && *saddr && +		    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { +			*saddr = 0; +			flowi4_update_output(&fl4, 0, 0, daddr, 0); +			goto retry; +		} +		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); +		return NULL; +	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { +		ip_rt_put(rt); +		*saddr = fl4.saddr; +		flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); +		loop++; +		goto retry; +	} +	*saddr = fl4.saddr; +	return rt; +} + +/* Get route to destination or remote server */ +static int  __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, -		   __be32 daddr, u32 rtos, int rt_mode) +		   __be32 daddr, int rt_mode, __be32 *ret_saddr)  {  	struct net *net = dev_net(skb_dst(skb)->dev); +	struct netns_ipvs *ipvs = net_ipvs(net); +	struct ip_vs_dest_dst *dest_dst;  	struct rtable *rt;			/* Route to the other host */  	struct rtable *ort;			/* Original route */ -	int local; +	struct iphdr *iph; +	__be16 df; +	int mtu; +	int local, noref = 1;  	if (dest) { -		spin_lock(&dest->dst_lock); -		if (!(rt = (struct rtable *) -		      __ip_vs_dst_check(dest, rtos))) { -			struct flowi fl = { -				.fl4_dst = dest->addr.ip, -				.fl4_tos = rtos, -			}; - -			if (ip_route_output_key(net, &rt, &fl)) { -				spin_unlock(&dest->dst_lock); -				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", -					     &dest->addr.ip); -				return NULL; +		dest_dst = __ip_vs_dst_check(dest); +		if (likely(dest_dst)) +			rt = (struct rtable *) dest_dst->dst_cache; +		else { +			dest_dst = ip_vs_dest_dst_alloc(); +			spin_lock_bh(&dest->dst_lock); +			if (!dest_dst) { +				__ip_vs_dst_set(dest, NULL, NULL, 0); +				spin_unlock_bh(&dest->dst_lock); +				goto err_unreach;  			} -			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); -			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", -				  &dest->addr.ip, -				  atomic_read(&rt->dst.__refcnt), rtos); +			rt = do_output_route4(net, dest->addr.ip, rt_mode, +					      &dest_dst->dst_saddr.ip); +			if (!rt) { +				__ip_vs_dst_set(dest, NULL, NULL, 0); +				spin_unlock_bh(&dest->dst_lock); +				ip_vs_dest_dst_free(dest_dst); +				goto err_unreach; +			} +			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); +			spin_unlock_bh(&dest->dst_lock); +			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", +				  &dest->addr.ip, &dest_dst->dst_saddr.ip, +				  atomic_read(&rt->dst.__refcnt));  		} -		spin_unlock(&dest->dst_lock); +		daddr = dest->addr.ip; +		if (ret_saddr) +			*ret_saddr = dest_dst->dst_saddr.ip;  	} else { -		struct flowi fl = { -			.fl4_dst = daddr, -			.fl4_tos = rtos, -		}; - -		if (ip_route_output_key(net, &rt, &fl)) { -			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", -				     &daddr); -			return NULL; -		} +		__be32 saddr = htonl(INADDR_ANY); + +		noref = 0; + +		/* For such unconfigured boxes avoid many route lookups +		 * for performance reasons because we do not remember saddr +		 */ +		rt_mode &= ~IP_VS_RT_MODE_CONNECT; +		rt = do_output_route4(net, daddr, rt_mode, &saddr); +		if (!rt) +			goto err_unreach; +		if (ret_saddr) +			*ret_saddr = saddr;  	} -	local = rt->rt_flags & RTCF_LOCAL; -	if (!((local ? 1 : 2) & rt_mode)) { +	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; +	if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & +	      rt_mode)) {  		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",  			     (rt->rt_flags & RTCF_LOCAL) ? 
-			     "local":"non-local", &rt->rt_dst); -		ip_rt_put(rt); -		return NULL; +			     "local":"non-local", &daddr); +		goto err_put;  	} -	if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) && -					 ort->rt_flags & RTCF_LOCAL)) { -		IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " -			     "requires NAT method, dest: %pI4\n", -			     &ip_hdr(skb)->daddr, &rt->rt_dst); -		ip_rt_put(rt); -		return NULL; -	} -	if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { -		IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " -			     "to non-local address, dest: %pI4\n", -			     &ip_hdr(skb)->saddr, &rt->rt_dst); -		ip_rt_put(rt); -		return NULL; +	iph = ip_hdr(skb); +	if (likely(!local)) { +		if (unlikely(ipv4_is_loopback(iph->saddr))) { +			IP_VS_DBG_RL("Stopping traffic from loopback address " +				     "%pI4 to non-local address, dest: %pI4\n", +				     &iph->saddr, &daddr); +			goto err_put; +		} +	} else { +		ort = skb_rtable(skb); +		if (!(rt_mode & IP_VS_RT_MODE_RDR) && +		    !(ort->rt_flags & RTCF_LOCAL)) { +			IP_VS_DBG_RL("Redirect from non-local address %pI4 to " +				     "local requires NAT method, dest: %pI4\n", +				     &iph->daddr, &daddr); +			goto err_put; +		} +		/* skb to local stack, preserve old route */ +		if (!noref) +			ip_rt_put(rt); +		return local;  	} -	return rt; -} - -/* Reroute packet to local IPv4 stack after DNAT */ -static int -__ip_vs_reroute_locally(struct sk_buff *skb) -{ -	struct rtable *rt = skb_rtable(skb); -	struct net_device *dev = rt->dst.dev; -	struct net *net = dev_net(dev); -	struct iphdr *iph = ip_hdr(skb); - -	if (rt_is_input_route(rt)) { -		unsigned long orefdst = skb->_skb_refdst; - -		if (ip_route_input(skb, iph->daddr, iph->saddr, -				   iph->tos, skb->dev)) -			return 0; -		refdst_drop(orefdst); +	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { +		mtu = dst_mtu(&rt->dst); +		df = iph->frag_off & htons(IP_DF);  	} else { -		struct flowi fl = { -			.fl4_dst = iph->daddr, -			.fl4_src = iph->saddr, -			.fl4_tos = RT_TOS(iph->tos), -			.mark = skb->mark, -		}; -		struct rtable *rt; - -		if (ip_route_output_key(net, &rt, &fl)) -			return 0; -		if (!(rt->rt_flags & RTCF_LOCAL)) { -			ip_rt_put(rt); -			return 0; +		struct sock *sk = skb->sk; + +		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); +		if (mtu < 68) { +			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); +			goto err_put;  		} -		/* Drop old route. */ -		skb_dst_drop(skb); -		skb_dst_set(skb, &rt->dst); +		ort = skb_rtable(skb); +		if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) +			ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); +		/* MTU check allowed? */ +		df = sysctl_pmtu_disc(ipvs) ? 
iph->frag_off & htons(IP_DF) : 0;  	} -	return 1; + +	/* MTU checking */ +	if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { +		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +		IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); +		goto err_put; +	} + +	skb_dst_drop(skb); +	if (noref) { +		if (!local) +			skb_dst_set_noref_force(skb, &rt->dst); +		else +			skb_dst_set(skb, dst_clone(&rt->dst)); +	} else +		skb_dst_set(skb, &rt->dst); + +	return local; + +err_put: +	if (!noref) +		ip_rt_put(rt); +	return -1; + +err_unreach: +	dst_link_failure(skb); +	return -1;  }  #ifdef CONFIG_IP_VS_IPV6  static inline int __ip_vs_is_local_route6(struct rt6_info *rt)  { -	return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; +	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;  }  static struct dst_entry * @@ -202,22 +305,27 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,  			struct in6_addr *ret_saddr, int do_xfrm)  {  	struct dst_entry *dst; -	struct flowi fl = { -		.fl6_dst = *daddr, +	struct flowi6 fl6 = { +		.daddr = *daddr,  	}; -	dst = ip6_route_output(net, NULL, &fl); +	dst = ip6_route_output(net, NULL, &fl6);  	if (dst->error)  		goto out_err;  	if (!ret_saddr)  		return dst; -	if (ipv6_addr_any(&fl.fl6_src) && +	if (ipv6_addr_any(&fl6.saddr) &&  	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, -			       &fl.fl6_dst, 0, &fl.fl6_src) < 0) +			       &fl6.daddr, 0, &fl6.saddr) < 0)  		goto out_err; -	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0) -		goto out_err; -	ipv6_addr_copy(ret_saddr, &fl.fl6_src); +	if (do_xfrm) { +		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); +		if (IS_ERR(dst)) { +			dst = NULL; +			goto out_err; +		} +	} +	*ret_saddr = fl6.saddr;  	return dst;  out_err: @@ -228,133 +336,197 @@ out_err:  /*   * Get route to destination or remote server - * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, - *	    &4=Allow redirect from remote daddr to local   */ -static struct rt6_info * +static int  __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,  		      struct in6_addr *daddr, struct in6_addr *ret_saddr, -		      int do_xfrm, int rt_mode) +		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)  {  	struct net *net = dev_net(skb_dst(skb)->dev); +	struct ip_vs_dest_dst *dest_dst;  	struct rt6_info *rt;			/* Route to the other host */  	struct rt6_info *ort;			/* Original route */  	struct dst_entry *dst; -	int local; +	int mtu; +	int local, noref = 1;  	if (dest) { -		spin_lock(&dest->dst_lock); -		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); -		if (!rt) { +		dest_dst = __ip_vs_dst_check(dest); +		if (likely(dest_dst)) +			rt = (struct rt6_info *) dest_dst->dst_cache; +		else {  			u32 cookie; +			dest_dst = ip_vs_dest_dst_alloc(); +			spin_lock_bh(&dest->dst_lock); +			if (!dest_dst) { +				__ip_vs_dst_set(dest, NULL, NULL, 0); +				spin_unlock_bh(&dest->dst_lock); +				goto err_unreach; +			}  			dst = __ip_vs_route_output_v6(net, &dest->addr.in6, -						      &dest->dst_saddr, +						      &dest_dst->dst_saddr.in6,  						      do_xfrm);  			if (!dst) { -				spin_unlock(&dest->dst_lock); -				return NULL; +				__ip_vs_dst_set(dest, NULL, NULL, 0); +				spin_unlock_bh(&dest->dst_lock); +				ip_vs_dest_dst_free(dest_dst); +				goto err_unreach;  			}  			rt = (struct rt6_info *) dst;  			cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; -			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); +			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); +			spin_unlock_bh(&dest->dst_lock);  			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", -				  &dest->addr.in6, &dest->dst_saddr, +				  &dest->addr.in6, &dest_dst->dst_saddr.in6,  				  atomic_read(&rt->dst.__refcnt));  		}  		if (ret_saddr) -			ipv6_addr_copy(ret_saddr, &dest->dst_saddr); -		spin_unlock(&dest->dst_lock); +			*ret_saddr = dest_dst->dst_saddr.in6;  	} else { +		noref = 0;  		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);  		if (!dst) -			return NULL; +			goto err_unreach;  		rt = (struct rt6_info *) dst;  	}  	local = __ip_vs_is_local_route6(rt); -	if (!((local ? 1 : 2) & rt_mode)) { -		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", +	if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & +	      rt_mode)) { +		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",  			     local ? "local":"non-local", daddr); -		dst_release(&rt->dst); -		return NULL; +		goto err_put;  	} -	if (local && !(rt_mode & 4) && -	    !((ort = (struct rt6_info *) skb_dst(skb)) && -	      __ip_vs_is_local_route6(ort))) { -		IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " -			     "requires NAT method, dest: %pI6\n", -			     &ipv6_hdr(skb)->daddr, daddr); -		dst_release(&rt->dst); -		return NULL; +	if (likely(!local)) { +		if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && +			     ipv6_addr_type(&ipv6_hdr(skb)->saddr) & +					    IPV6_ADDR_LOOPBACK)) { +			IP_VS_DBG_RL("Stopping traffic from loopback address " +				     "%pI6c to non-local address, " +				     "dest: %pI6c\n", +				     &ipv6_hdr(skb)->saddr, daddr); +			goto err_put; +		} +	} else { +		ort = (struct rt6_info *) skb_dst(skb); +		if (!(rt_mode & IP_VS_RT_MODE_RDR) && +		    !__ip_vs_is_local_route6(ort)) { +			IP_VS_DBG_RL("Redirect from non-local address %pI6c " +				     "to local requires NAT method, " +				     "dest: %pI6c\n", +				     &ipv6_hdr(skb)->daddr, daddr); +			goto err_put; +		} +		/* skb to local stack, preserve old route */ +		if (!noref) +			dst_release(&rt->dst); +		return local;  	} -	if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && -		     ipv6_addr_type(&ipv6_hdr(skb)->saddr) & -				    IPV6_ADDR_LOOPBACK)) { -		IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " -			     "to non-local address, dest: %pI6\n", -			     &ipv6_hdr(skb)->saddr, daddr); -		dst_release(&rt->dst); -		return NULL; + +	/* MTU checking */ +	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) +		mtu = dst_mtu(&rt->dst); +	else { +		struct sock *sk = skb->sk; + +		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); +		if (mtu < IPV6_MIN_MTU) { +			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, +				     IPV6_MIN_MTU); +			goto err_put; +		} +		ort = (struct rt6_info *) skb_dst(skb); +		if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) +			ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);  	} -	return rt; +	if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { +		if (!skb->dev) +			skb->dev = net->loopback_dev; +		/* only send ICMP too big on first fragment */ +		if (!ipvsh->fragoffs) +			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +		IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); +		goto err_put; +	} + +	skb_dst_drop(skb); +	if (noref) { +		if (!local) +			skb_dst_set_noref_force(skb, &rt->dst); +		else +			skb_dst_set(skb, dst_clone(&rt->dst)); +	} else +		skb_dst_set(skb, 
&rt->dst); + +	return local; + +err_put: +	if (!noref) +		dst_release(&rt->dst); +	return -1; + +err_unreach: +	dst_link_failure(skb); +	return -1;  }  #endif -/* - *	Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, +					    struct ip_vs_conn *cp)  { -	struct dst_entry *old_dst; +	int ret = NF_ACCEPT; + +	skb->ipvs_property = 1; +	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) +		ret = ip_vs_confirm_conntrack(skb); +	if (ret == NF_ACCEPT) { +		nf_reset(skb); +		skb_forward_csum(skb); +	} +	return ret; +} -	old_dst = dest->dst_cache; -	dest->dst_cache = NULL; -	dst_release(old_dst); +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, +					 struct ip_vs_conn *cp, int local) +{ +	int ret = NF_STOLEN; + +	skb->ipvs_property = 1; +	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) +		ip_vs_notrack(skb); +	else +		ip_vs_update_conntrack(skb, cp, 1); +	if (!local) { +		skb_forward_csum(skb); +		NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, +			dst_output); +	} else +		ret = NF_ACCEPT; +	return ret;  } -#define IP_VS_XMIT_TUNNEL(skb, cp)				\ -({								\ -	int __ret = NF_ACCEPT;					\ -								\ -	(skb)->ipvs_property = 1;				\ -	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\ -		__ret = ip_vs_confirm_conntrack(skb, cp);	\ -	if (__ret == NF_ACCEPT) {				\ -		nf_reset(skb);					\ -		skb_forward_csum(skb);				\ -	}							\ -	__ret;							\ -}) - -#define IP_VS_XMIT_NAT(pf, skb, cp, local)		\ -do {							\ -	(skb)->ipvs_property = 1;			\ -	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\ -		ip_vs_notrack(skb);			\ -	else						\ -		ip_vs_update_conntrack(skb, cp, 1);	\ -	if (local)					\ -		return NF_ACCEPT;			\ -	skb_forward_csum(skb);				\ -	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\ -		skb_dst(skb)->dev, dst_output);		\ -} while (0) - -#define IP_VS_XMIT(pf, skb, cp, local)			\ -do {							\ -	(skb)->ipvs_property = 1;			\ -	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\ -		ip_vs_notrack(skb);			\ -	if (local)					\ -		return NF_ACCEPT;			\ -	skb_forward_csum(skb);				\ -	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\ -		skb_dst(skb)->dev, dst_output);		\ -} while (0) +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, +				     struct ip_vs_conn *cp, int local) +{ +	int ret = NF_STOLEN; + +	skb->ipvs_property = 1; +	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) +		ip_vs_notrack(skb); +	if (!local) { +		skb_forward_csum(skb); +		NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, +			dst_output); +	} else +		ret = NF_ACCEPT; +	return ret; +}  /* @@ -362,10 +534,10 @@ do {							\   */  int  ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -		struct ip_vs_protocol *pp) +		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  {  	/* we do not touch skb and do not need pskb ptr */ -	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); +	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);  } @@ -376,53 +548,31 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,   */  int  ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -		  struct ip_vs_protocol *pp) +		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  { -	struct rtable *rt;			/* Route to the other host */  	struct iphdr  *iph = ip_hdr(skb); -	int 
   mtu;  	EnterFunction(10); -	if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, -				      RT_TOS(iph->tos), 2))) -		goto tx_error_icmp; - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { -		ip_rt_put(rt); -		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); +	rcu_read_lock(); +	if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, +			       NULL) < 0)  		goto tx_error; -	} -	/* -	 * Call ip_send_check because we are not sure it is called -	 * after ip_defrag. Is copy-on-write needed? -	 */ -	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { -		ip_rt_put(rt); -		return NF_STOLEN; -	} -	ip_send_check(ip_hdr(skb)); - -	/* drop old route */ -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); +	ip_send_check(iph);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); +	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; - tx_error_icmp: -	dst_link_failure(skb);   tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN;  } @@ -430,57 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  #ifdef CONFIG_IP_VS_IPV6  int  ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, -		     struct ip_vs_protocol *pp) +		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  { -	struct rt6_info *rt;			/* Route to the other host */ -	struct ipv6hdr  *iph = ipv6_hdr(skb); -	int    mtu; -  	EnterFunction(10); -	if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2))) -		goto tx_error_icmp; - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if (skb->len > mtu) { -		if (!skb->dev) { -			struct net *net = dev_net(skb_dst(skb)->dev); - -			skb->dev = net->loopback_dev; -		} -		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -		dst_release(&rt->dst); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); +	rcu_read_lock(); +	if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, +				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)  		goto tx_error; -	} - -	/* -	 * Call ip_send_check because we are not sure it is called -	 * after ip_defrag. Is copy-on-write needed? 
-	 */ -	skb = skb_share_check(skb, GFP_ATOMIC); -	if (unlikely(skb == NULL)) { -		dst_release(&rt->dst); -		return NF_STOLEN; -	} - -	/* drop old route */ -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); +	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; - tx_error_icmp: -	dst_link_failure(skb);   tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN;  } @@ -492,92 +612,71 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,   */  int  ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -	       struct ip_vs_protocol *pp) +	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  {  	struct rtable *rt;		/* Route to the other host */ -	int mtu; -	struct iphdr *iph = ip_hdr(skb); -	int local; +	int local, rc, was_input;  	EnterFunction(10); +	rcu_read_lock();  	/* check if it is a connection of no-client-port */  	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {  		__be16 _pt, *p; -		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + +		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);  		if (p == NULL)  			goto tx_error;  		ip_vs_conn_fill_cport(cp, *p);  		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));  	} -	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, -				      RT_TOS(iph->tos), 1|2|4))) -		goto tx_error_icmp; -	local = rt->rt_flags & RTCF_LOCAL; +	was_input = rt_is_input_route(skb_rtable(skb)); +	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, +				   IP_VS_RT_MODE_LOCAL | +				   IP_VS_RT_MODE_NON_LOCAL | +				   IP_VS_RT_MODE_RDR, NULL); +	if (local < 0) +		goto tx_error; +	rt = skb_rtable(skb);  	/*  	 * Avoid duplicate tuple in reply direction for NAT traffic  	 * to local address when connection is sync-ed  	 */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  	if (cp->flags & IP_VS_CONN_F_SYNC && local) {  		enum ip_conntrack_info ctinfo; -		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); +		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);  		if (ct && !nf_ct_is_untracked(ct)) {  			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,  					 "ip_vs_nat_xmit(): "  					 "stopping DNAT to local address"); -			goto tx_error_put; +			goto tx_error;  		}  	}  #endif  	/* From world but DNAT to loopback address? 
*/ -	if (local && ipv4_is_loopback(rt->rt_dst) && -	    rt_is_input_route(skb_rtable(skb))) { +	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {  		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "  				 "stopping DNAT to loopback address"); -		goto tx_error_put; -	} - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { -		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); -		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, -				 "ip_vs_nat_xmit(): frag needed for"); -		goto tx_error_put; +		goto tx_error;  	}  	/* copy-on-write the packet before mangling it */  	if (!skb_make_writable(skb, sizeof(struct iphdr))) -		goto tx_error_put; +		goto tx_error;  	if (skb_cow(skb, rt->dst.dev->hard_header_len)) -		goto tx_error_put; +		goto tx_error;  	/* mangle the packet */ -	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) -		goto tx_error_put; +	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) +		goto tx_error;  	ip_hdr(skb)->daddr = cp->daddr.ip;  	ip_send_check(ip_hdr(skb)); -	if (!local) { -		/* drop old route */ -		skb_dst_drop(skb); -		skb_dst_set(skb, &rt->dst); -	} else { -		ip_rt_put(rt); -		/* -		 * Some IPv4 replies get local address from routes, -		 * not from iph, so while we DNAT after routing -		 * we need this second input/output route. -		 */ -		if (!__ip_vs_reroute_locally(skb)) -			goto tx_error; -	} -  	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");  	/* FIXME: when application helper enlarges the packet and the length @@ -585,64 +684,64 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	   MTU problem. */  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); +	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); +	rcu_read_unlock();  	LeaveFunction(10); -	return NF_STOLEN; +	return rc; -  tx_error_icmp: -	dst_link_failure(skb);    tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -  tx_error_put: -	ip_rt_put(rt); -	goto tx_error;  }  #ifdef CONFIG_IP_VS_IPV6  int  ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, -		  struct ip_vs_protocol *pp) +		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  {  	struct rt6_info *rt;		/* Route to the other host */ -	int mtu; -	int local; +	int local, rc;  	EnterFunction(10); +	rcu_read_lock();  	/* check if it is a connection of no-client-port */ -	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { +	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {  		__be16 _pt, *p; -		p = skb_header_pointer(skb, sizeof(struct ipv6hdr), -				       sizeof(_pt), &_pt); +		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);  		if (p == NULL)  			goto tx_error;  		ip_vs_conn_fill_cport(cp, *p);  		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));  	} -	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, -					 0, 1|2|4))) -		goto tx_error_icmp; -	local = __ip_vs_is_local_route6(rt); +	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, +				      ipvsh, 0, +				      IP_VS_RT_MODE_LOCAL | +				      IP_VS_RT_MODE_NON_LOCAL | +				      IP_VS_RT_MODE_RDR); +	if (local < 0) +		goto tx_error; +	rt = (struct rt6_info *) skb_dst(skb);  	/*  	 * Avoid duplicate tuple in reply direction for NAT traffic  	 * to local address when connection is sync-ed  	 */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 
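The hunk below swaps the old two-macro test, defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE), for IS_ENABLED(), which evaluates true for both built-in (=y) and modular (=m) options. A minimal sketch of the idiom, using a hypothetical CONFIG_FOO rather than a real Kconfig symbol:

#include <linux/kconfig.h>

#if IS_ENABLED(CONFIG_FOO)		/* CONFIG_FOO=y or CONFIG_FOO=m */
static void foo_hook(void)
{
	/* real implementation */
}
#else
static inline void foo_hook(void)
{
	/* compiled out when the option is off */
}
#endif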
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)  	if (cp->flags & IP_VS_CONN_F_SYNC && local) {  		enum ip_conntrack_info ctinfo; -		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); +		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);  		if (ct && !nf_ct_is_untracked(ct)) {  			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,  					 "ip_vs_nat_xmit_v6(): "  					 "stopping DNAT to local address"); -			goto tx_error_put; +			goto tx_error;  		}  	}  #endif @@ -653,43 +752,20 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,  				 "ip_vs_nat_xmit_v6(): "  				 "stopping DNAT to loopback address"); -		goto tx_error_put; -	} - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if (skb->len > mtu) { -		if (!skb->dev) { -			struct net *net = dev_net(skb_dst(skb)->dev); - -			skb->dev = net->loopback_dev; -		} -		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, -				 "ip_vs_nat_xmit_v6(): frag needed for"); -		goto tx_error_put; +		goto tx_error;  	}  	/* copy-on-write the packet before mangling it */  	if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) -		goto tx_error_put; +		goto tx_error;  	if (skb_cow(skb, rt->dst.dev->hard_header_len)) -		goto tx_error_put; +		goto tx_error;  	/* mangle the packet */ -	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) +	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))  		goto tx_error; -	ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); - -	if (!local || !skb->dev) { -		/* drop the old route when skb is not shared */ -		skb_dst_drop(skb); -		skb_dst_set(skb, &rt->dst); -	} else { -		/* destined to loopback, do we need to change route? */ -		dst_release(&rt->dst); -	} +	ipv6_hdr(skb)->daddr = cp->daddr.in6;  	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); @@ -698,22 +774,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	   MTU problem. 
*/  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); +	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); +	rcu_read_unlock();  	LeaveFunction(10); -	return NF_STOLEN; +	return rc; -tx_error_icmp: -	dst_link_failure(skb);  tx_error:  	LeaveFunction(10);  	kfree_skb(skb); +	rcu_read_unlock();  	return NF_STOLEN; -tx_error_put: -	dst_release(&rt->dst); -	goto tx_error;  }  #endif @@ -739,63 +812,52 @@ tx_error_put:   */  int  ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -		  struct ip_vs_protocol *pp) +		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  { +	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));  	struct rtable *rt;			/* Route to the other host */ +	__be32 saddr;				/* Source for tunnel */  	struct net_device *tdev;		/* Device to other host */  	struct iphdr  *old_iph = ip_hdr(skb);  	u8     tos = old_iph->tos; -	__be16 df = old_iph->frag_off; +	__be16 df;  	struct iphdr  *iph;			/* Our new IP header */  	unsigned int max_headroom;		/* The extra header space needed */ -	int    mtu; -	int ret; +	int ret, local;  	EnterFunction(10); -	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, -				      RT_TOS(tos), 1|2))) -		goto tx_error_icmp; -	if (rt->rt_flags & RTCF_LOCAL) { -		ip_rt_put(rt); -		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); +	rcu_read_lock(); +	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, +				   IP_VS_RT_MODE_LOCAL | +				   IP_VS_RT_MODE_NON_LOCAL | +				   IP_VS_RT_MODE_CONNECT | +				   IP_VS_RT_MODE_TUNNEL, &saddr); +	if (local < 0) +		goto tx_error; +	if (local) { +		rcu_read_unlock(); +		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);  	} +	rt = skb_rtable(skb);  	tdev = rt->dst.dev; -	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); -	if (mtu < 68) { -		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); -		goto tx_error_put; -	} -	if (skb_dst(skb)) -		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - -	df |= (old_iph->frag_off & htons(IP_DF)); - -	if ((old_iph->frag_off & htons(IP_DF)) -	    && mtu < ntohs(old_iph->tot_len)) { -		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); -		goto tx_error_put; -	} +	/* Copy DF, reset fragment offset and MF */ +	df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;  	/*  	 * Okay, now see if we can stuff it in the buffer as-is.  	 */  	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); -	if (skb_headroom(skb) < max_headroom -	    || skb_cloned(skb) || skb_shared(skb)) { +	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {  		struct sk_buff *new_skb =  			skb_realloc_headroom(skb, max_headroom); -		if (!new_skb) { -			ip_rt_put(rt); -			kfree_skb(skb); -			IP_VS_ERR_RL("%s(): no memory\n", __func__); -			return NF_STOLEN; -		} -		kfree_skb(skb); + +		if (!new_skb) +			goto tx_error; +		consume_skb(skb);  		skb = new_skb;  		old_iph = ip_hdr(skb);  	} @@ -809,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	skb_reset_network_header(skb);  	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); -	/* drop old route */ -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); -  	/*  	 *	Push down and install the IPIP header.  	 
*/ @@ -822,39 +880,36 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	iph->frag_off		=	df;  	iph->protocol		=	IPPROTO_IPIP;  	iph->tos		=	tos; -	iph->daddr		=	rt->rt_dst; -	iph->saddr		=	rt->rt_src; +	iph->daddr		=	cp->daddr.ip; +	iph->saddr		=	saddr;  	iph->ttl		=	old_iph->ttl; -	ip_select_ident(iph, &rt->dst, NULL); +	ip_select_ident(skb, NULL);  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	ret = IP_VS_XMIT_TUNNEL(skb, cp); +	ret = ip_vs_tunnel_xmit_prepare(skb, cp);  	if (ret == NF_ACCEPT)  		ip_local_out(skb);  	else if (ret == NF_DROP)  		kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -  tx_error_icmp: -	dst_link_failure(skb);    tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -tx_error_put: -	ip_rt_put(rt); -	goto tx_error;  }  #ifdef CONFIG_IP_VS_IPV6  int  ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, -		     struct ip_vs_protocol *pp) +		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  {  	struct rt6_info *rt;		/* Route to the other host */  	struct in6_addr saddr;		/* Source for tunnel */ @@ -862,57 +917,38 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	struct ipv6hdr  *old_iph = ipv6_hdr(skb);  	struct ipv6hdr  *iph;		/* Our new IP header */  	unsigned int max_headroom;	/* The extra header space needed */ -	int    mtu; -	int ret; +	int ret, local;  	EnterFunction(10); -	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, -					 &saddr, 1, 1|2))) -		goto tx_error_icmp; -	if (__ip_vs_is_local_route6(rt)) { -		dst_release(&rt->dst); -		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); +	rcu_read_lock(); +	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, +				      &saddr, ipvsh, 1, +				      IP_VS_RT_MODE_LOCAL | +				      IP_VS_RT_MODE_NON_LOCAL | +				      IP_VS_RT_MODE_TUNNEL); +	if (local < 0) +		goto tx_error; +	if (local) { +		rcu_read_unlock(); +		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);  	} +	rt = (struct rt6_info *) skb_dst(skb);  	tdev = rt->dst.dev; -	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); -	if (mtu < IPV6_MIN_MTU) { -		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, -			     IPV6_MIN_MTU); -		goto tx_error_put; -	} -	if (skb_dst(skb)) -		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - -	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { -		if (!skb->dev) { -			struct net *net = dev_net(skb_dst(skb)->dev); - -			skb->dev = net->loopback_dev; -		} -		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); -		goto tx_error_put; -	} -  	/*  	 * Okay, now see if we can stuff it in the buffer as-is.  	 
*/  	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); -	if (skb_headroom(skb) < max_headroom -	    || skb_cloned(skb) || skb_shared(skb)) { +	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {  		struct sk_buff *new_skb =  			skb_realloc_headroom(skb, max_headroom); -		if (!new_skb) { -			dst_release(&rt->dst); -			kfree_skb(skb); -			IP_VS_ERR_RL("%s(): no memory\n", __func__); -			return NF_STOLEN; -		} -		kfree_skb(skb); + +		if (!new_skb) +			goto tx_error; +		consume_skb(skb);  		skb = new_skb;  		old_iph = ipv6_hdr(skb);  	} @@ -923,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	skb_reset_network_header(skb);  	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); -	/* drop old route */ -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); -  	/*  	 *	Push down and install the IPIP header.  	 */ @@ -937,32 +969,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));  	iph->priority		=	old_iph->priority;  	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); -	ipv6_addr_copy(&iph->daddr, &cp->daddr.in6); -	ipv6_addr_copy(&iph->saddr, &saddr); +	iph->daddr = cp->daddr.in6; +	iph->saddr = saddr;  	iph->hop_limit		=	old_iph->hop_limit;  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	ret = IP_VS_XMIT_TUNNEL(skb, cp); +	ret = ip_vs_tunnel_xmit_prepare(skb, cp);  	if (ret == NF_ACCEPT)  		ip6_local_out(skb);  	else if (ret == NF_DROP)  		kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -tx_error_icmp: -	dst_link_failure(skb);  tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -tx_error_put: -	dst_release(&rt->dst); -	goto tx_error;  }  #endif @@ -973,57 +1002,38 @@ tx_error_put:   */  int  ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -	      struct ip_vs_protocol *pp) +	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  { -	struct rtable *rt;			/* Route to the other host */ -	struct iphdr  *iph = ip_hdr(skb); -	int    mtu; +	int local;  	EnterFunction(10); -	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, -				      RT_TOS(iph->tos), 1|2))) -		goto tx_error_icmp; -	if (rt->rt_flags & RTCF_LOCAL) { -		ip_rt_put(rt); -		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); -	} - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { -		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); -		ip_rt_put(rt); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); +	rcu_read_lock(); +	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, +				   IP_VS_RT_MODE_LOCAL | +				   IP_VS_RT_MODE_NON_LOCAL | +				   IP_VS_RT_MODE_KNOWN_NH, NULL); +	if (local < 0)  		goto tx_error; +	if (local) { +		rcu_read_unlock(); +		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);  	} -	/* -	 * Call ip_send_check because we are not sure it is called -	 * after ip_defrag. Is copy-on-write needed? 
-	 */ -	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { -		ip_rt_put(rt); -		return NF_STOLEN; -	}  	ip_send_check(ip_hdr(skb)); -	/* drop old route */ -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); -  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); +	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -  tx_error_icmp: -	dst_link_failure(skb);    tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN;  } @@ -1031,61 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  #ifdef CONFIG_IP_VS_IPV6  int  ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, -		 struct ip_vs_protocol *pp) +		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)  { -	struct rt6_info *rt;			/* Route to the other host */ -	int    mtu; +	int local;  	EnterFunction(10); -	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, -					 0, 1|2))) -		goto tx_error_icmp; -	if (__ip_vs_is_local_route6(rt)) { -		dst_release(&rt->dst); -		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); -	} - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if (skb->len > mtu) { -		if (!skb->dev) { -			struct net *net = dev_net(skb_dst(skb)->dev); - -			skb->dev = net->loopback_dev; -		} -		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -		dst_release(&rt->dst); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); +	rcu_read_lock(); +	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, +				      ipvsh, 0, +				      IP_VS_RT_MODE_LOCAL | +				      IP_VS_RT_MODE_NON_LOCAL); +	if (local < 0)  		goto tx_error; +	if (local) { +		rcu_read_unlock(); +		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);  	} -	/* -	 * Call ip_send_check because we are not sure it is called -	 * after ip_defrag. Is copy-on-write needed? 
-	 */ -	skb = skb_share_check(skb, GFP_ATOMIC); -	if (unlikely(skb == NULL)) { -		dst_release(&rt->dst); -		return NF_STOLEN; -	} - -	/* drop old route */ -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); -  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); +	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN; -tx_error_icmp: -	dst_link_failure(skb);  tx_error:  	kfree_skb(skb); +	rcu_read_unlock();  	LeaveFunction(10);  	return NF_STOLEN;  } @@ -1098,12 +1083,13 @@ tx_error:   */  int  ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -		struct ip_vs_protocol *pp, int offset) +		struct ip_vs_protocol *pp, int offset, unsigned int hooknum, +		struct ip_vs_iphdr *iph)  {  	struct rtable	*rt;	/* Route to the other host */ -	int mtu;  	int rc;  	int local; +	int rt_mode, was_input;  	EnterFunction(10); @@ -1112,7 +1098,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	   translate address/port back */  	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {  		if (cp->packet_xmit) -			rc = cp->packet_xmit(skb, cp, pp); +			rc = cp->packet_xmit(skb, cp, pp, iph);  		else  			rc = NF_ACCEPT;  		/* do not touch skb anymore */ @@ -1123,101 +1109,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,  	/*  	 * mangle and send the packet here (only for VS/NAT)  	 */ - -	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, -				      RT_TOS(ip_hdr(skb)->tos), 1|2|4))) -		goto tx_error_icmp; -	local = rt->rt_flags & RTCF_LOCAL; +	was_input = rt_is_input_route(skb_rtable(skb)); + +	/* LOCALNODE from FORWARD hook is not supported */ +	rt_mode = (hooknum != NF_INET_FORWARD) ? +		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | +		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; +	rcu_read_lock(); +	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); +	if (local < 0) +		goto tx_error; +	rt = skb_rtable(skb);  	/*  	 * Avoid duplicate tuple in reply direction for NAT traffic  	 * to local address when connection is sync-ed  	 */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  	if (cp->flags & IP_VS_CONN_F_SYNC && local) {  		enum ip_conntrack_info ctinfo; -		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); +		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);  		if (ct && !nf_ct_is_untracked(ct)) {  			IP_VS_DBG(10, "%s(): "  				  "stopping DNAT to local address %pI4\n",  				  __func__, &cp->daddr.ip); -			goto tx_error_put; +			goto tx_error;  		}  	}  #endif  	/* From world but DNAT to loopback address? 
*/ -	if (local && ipv4_is_loopback(rt->rt_dst) && -	    rt_is_input_route(skb_rtable(skb))) { +	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {  		IP_VS_DBG(1, "%s(): "  			  "stopping DNAT to loopback %pI4\n",  			  __func__, &cp->daddr.ip); -		goto tx_error_put; -	} - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { -		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); -		goto tx_error_put; +		goto tx_error;  	}  	/* copy-on-write the packet before mangling it */  	if (!skb_make_writable(skb, offset)) -		goto tx_error_put; +		goto tx_error;  	if (skb_cow(skb, rt->dst.dev->hard_header_len)) -		goto tx_error_put; +		goto tx_error;  	ip_vs_nat_icmp(skb, pp, cp, 0); -	if (!local) { -		/* drop the old route when skb is not shared */ -		skb_dst_drop(skb); -		skb_dst_set(skb, &rt->dst); -	} else { -		ip_rt_put(rt); -		/* -		 * Some IPv4 replies get local address from routes, -		 * not from iph, so while we DNAT after routing -		 * we need this second input/output route. -		 */ -		if (!__ip_vs_reroute_locally(skb)) -			goto tx_error; -	} -  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); - -	rc = NF_STOLEN; +	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); +	rcu_read_unlock();  	goto out; -  tx_error_icmp: -	dst_link_failure(skb);    tx_error: -	dev_kfree_skb(skb); +	kfree_skb(skb); +	rcu_read_unlock();  	rc = NF_STOLEN;    out:  	LeaveFunction(10);  	return rc; -  tx_error_put: -	ip_rt_put(rt); -	goto tx_error;  }  #ifdef CONFIG_IP_VS_IPV6  int  ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, -		struct ip_vs_protocol *pp, int offset) +		struct ip_vs_protocol *pp, int offset, unsigned int hooknum, +		struct ip_vs_iphdr *ipvsh)  {  	struct rt6_info	*rt;	/* Route to the other host */ -	int mtu;  	int rc;  	int local; +	int rt_mode;  	EnterFunction(10); @@ -1226,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	   translate address/port back */  	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {  		if (cp->packet_xmit) -			rc = cp->packet_xmit(skb, cp, pp); +			rc = cp->packet_xmit(skb, cp, pp, ipvsh);  		else  			rc = NF_ACCEPT;  		/* do not touch skb anymore */ @@ -1238,25 +1202,30 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  	 * mangle and send the packet here (only for VS/NAT)  	 */ -	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, -					 0, 1|2|4))) -		goto tx_error_icmp; - -	local = __ip_vs_is_local_route6(rt); +	/* LOCALNODE from FORWARD hook is not supported */ +	rt_mode = (hooknum != NF_INET_FORWARD) ? 
+		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | +		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; +	rcu_read_lock(); +	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, +				      ipvsh, 0, rt_mode); +	if (local < 0) +		goto tx_error; +	rt = (struct rt6_info *) skb_dst(skb);  	/*  	 * Avoid duplicate tuple in reply direction for NAT traffic  	 * to local address when connection is sync-ed  	 */ -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  	if (cp->flags & IP_VS_CONN_F_SYNC && local) {  		enum ip_conntrack_info ctinfo; -		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); +		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);  		if (ct && !nf_ct_is_untracked(ct)) {  			IP_VS_DBG(10, "%s(): "  				  "stopping DNAT to local address %pI6\n",  				  __func__, &cp->daddr.in6); -			goto tx_error_put; +			goto tx_error;  		}  	}  #endif @@ -1267,58 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,  		IP_VS_DBG(1, "%s(): "  			  "stopping DNAT to loopback %pI6\n",  			  __func__, &cp->daddr.in6); -		goto tx_error_put; -	} - -	/* MTU checking */ -	mtu = dst_mtu(&rt->dst); -	if (skb->len > mtu) { -		if (!skb->dev) { -			struct net *net = dev_net(skb_dst(skb)->dev); - -			skb->dev = net->loopback_dev; -		} -		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -		IP_VS_DBG_RL("%s(): frag needed\n", __func__); -		goto tx_error_put; +		goto tx_error;  	}  	/* copy-on-write the packet before mangling it */  	if (!skb_make_writable(skb, offset)) -		goto tx_error_put; +		goto tx_error;  	if (skb_cow(skb, rt->dst.dev->hard_header_len)) -		goto tx_error_put; +		goto tx_error;  	ip_vs_nat_icmp_v6(skb, pp, cp, 0); -	if (!local || !skb->dev) { -		/* drop the old route when skb is not shared */ -		skb_dst_drop(skb); -		skb_dst_set(skb, &rt->dst); -	} else { -		/* destined to loopback, do we need to change route? 
*/ -		dst_release(&rt->dst); -	} -  	/* Another hack: avoid icmp_send in ip_fragment */ -	skb->local_df = 1; +	skb->ignore_df = 1; -	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); - -	rc = NF_STOLEN; +	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); +	rcu_read_unlock();  	goto out; -tx_error_icmp: -	dst_link_failure(skb);  tx_error: -	dev_kfree_skb(skb); +	kfree_skb(skb); +	rcu_read_unlock();  	rc = NF_STOLEN;  out:  	LeaveFunction(10);  	return rc; -tx_error_put: -	dst_release(&rt->dst); -	goto tx_error;  }  #endif diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 5178c691ecb..a4b5e2a435a 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -12,12 +12,13 @@  #include <linux/slab.h>  #include <linux/kernel.h>  #include <linux/moduleparam.h> +#include <linux/export.h>  #include <net/netfilter/nf_conntrack.h>  #include <net/netfilter/nf_conntrack_extend.h>  #include <net/netfilter/nf_conntrack_acct.h> -static int nf_ct_acct __read_mostly; +static bool nf_ct_acct __read_mostly;  module_param_named(acct, nf_ct_acct, bool, 0644);  MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); @@ -38,21 +39,23 @@ static struct ctl_table acct_sysctl_table[] = {  unsigned int  seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)  { -	struct nf_conn_counter *acct; +	struct nf_conn_acct *acct; +	struct nf_conn_counter *counter;  	acct = nf_conn_acct_find(ct);  	if (!acct)  		return 0; +	counter = acct->counter;  	return seq_printf(s, "packets=%llu bytes=%llu ", -			  (unsigned long long)acct[dir].packets, -			  (unsigned long long)acct[dir].bytes); +			  (unsigned long long)atomic64_read(&counter[dir].packets), +			  (unsigned long long)atomic64_read(&counter[dir].bytes));  };  EXPORT_SYMBOL_GPL(seq_print_acct);  static struct nf_ct_ext_type acct_extend __read_mostly = { -	.len	= sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]), -	.align	= __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]), +	.len	= sizeof(struct nf_conn_acct), +	.align	= __alignof__(struct nf_conn_acct),  	.id	= NF_CT_EXT_ACCT,  }; @@ -68,8 +71,12 @@ static int nf_conntrack_acct_init_sysctl(struct net *net)  	table[0].data = &net->ct.sysctl_acct; -	net->ct.acct_sysctl_header = register_net_sysctl_table(net, -			nf_net_netfilter_sysctl_path, table); +	/* Don't export sysctls to unprivileged users */ +	if (net->user_ns != &init_user_ns) +		table[0].procname = NULL; + +	net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter", +							 table);  	if (!net->ct.acct_sysctl_header) {  		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");  		goto out_register; @@ -101,36 +108,26 @@ static void nf_conntrack_acct_fini_sysctl(struct net *net)  }  #endif -int nf_conntrack_acct_init(struct net *net) +int nf_conntrack_acct_pernet_init(struct net *net)  { -	int ret; -  	net->ct.sysctl_acct = nf_ct_acct; +	return nf_conntrack_acct_init_sysctl(net); +} -	if (net_eq(net, &init_net)) { -		ret = nf_ct_extend_register(&acct_extend); -		if (ret < 0) { -			printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); -			goto out_extend_register; -		} -	} +void nf_conntrack_acct_pernet_fini(struct net *net) +{ +	nf_conntrack_acct_fini_sysctl(net); +} -	ret = nf_conntrack_acct_init_sysctl(net); +int nf_conntrack_acct_init(void) +{ +	int ret = nf_ct_extend_register(&acct_extend);  	if (ret < 0) -		goto out_sysctl; - -	return 0; - -out_sysctl: -	if (net_eq(net, &init_net)) -		nf_ct_extend_unregister(&acct_extend); 
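This hunk splits the old all-in-one nf_conntrack_acct_init(struct net *) into a per-namespace part (sysctl registration) and a global part (extension registration), so the extension is registered exactly once instead of being guarded by net_eq(net, &init_net). A sketch of the same split using the generic pernet machinery; the my_* names are illustrative, not from the patch:

#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init my_pernet_init(struct net *net)
{
	/* per-namespace state only: sysctls, per-net defaults */
	return 0;
}

static void __net_exit my_pernet_exit(struct net *net)
{
	/* tear down the per-namespace state */
}

static struct pernet_operations my_net_ops = {
	.init = my_pernet_init,
	.exit = my_pernet_exit,
};

static int __init my_module_init(void)
{
	/* one-time, global work (e.g. nf_ct_extend_register())
	 * belongs here, not in the pernet callbacks */
	return register_pernet_subsys(&my_net_ops);
}
module_init(my_module_init);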
-out_extend_register: +		pr_err("nf_conntrack_acct: Unable to register extension\n");  	return ret;  } -void nf_conntrack_acct_fini(struct net *net) +void nf_conntrack_acct_fini(void)  { -	nf_conntrack_acct_fini_sysctl(net); -	if (net_eq(net, &init_net)) -		nf_ct_extend_unregister(&acct_extend); +	nf_ct_extend_unregister(&acct_extend);  } diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 13fd2c55e32..b8b95f4027c 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -2,6 +2,7 @@   *   * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>   * based on HW's ip_conntrack_irc.c as well as other modules + * (C) 2006 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public License @@ -40,6 +41,7 @@ MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");  unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb,  				   enum ip_conntrack_info ctinfo, +				   unsigned int protoff,  				   unsigned int matchoff,  				   unsigned int matchlen,  				   struct nf_conntrack_expect *exp) @@ -107,8 +109,7 @@ static int amanda_help(struct sk_buff *skb,  	/* No data? */  	dataoff = protoff + sizeof(struct udphdr);  	if (dataoff >= skb->len) { -		if (net_ratelimit()) -			printk(KERN_ERR "amanda_help: skblen = %u\n", skb->len); +		net_err_ratelimited("amanda_help: skblen = %u\n", skb->len);  		return NF_ACCEPT;  	} @@ -145,6 +146,7 @@ static int amanda_help(struct sk_buff *skb,  		exp = nf_ct_expect_alloc(ct);  		if (exp == NULL) { +			nf_ct_helper_log(skb, ct, "cannot alloc expectation");  			ret = NF_DROP;  			goto out;  		} @@ -156,10 +158,12 @@ static int amanda_help(struct sk_buff *skb,  		nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);  		if (nf_nat_amanda && ct->status & IPS_NAT_MASK) -			ret = nf_nat_amanda(skb, ctinfo, off - dataoff, -					    len, exp); -		else if (nf_ct_expect_related(exp) != 0) +			ret = nf_nat_amanda(skb, ctinfo, protoff, +					    off - dataoff, len, exp); +		else if (nf_ct_expect_related(exp) != 0) { +			nf_ct_helper_log(skb, ct, "cannot add expectation");  			ret = NF_DROP; +		}  		nf_ct_expect_put(exp);  	} diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c new file mode 100644 index 00000000000..4e99cca6161 --- /dev/null +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -0,0 +1,82 @@ +/* + *      broadcast connection tracking helper + * + *      (c) 2005 Patrick McHardy <kaber@trash.net> + * + *      This program is free software; you can redistribute it and/or + *      modify it under the terms of the GNU General Public License + *      as published by the Free Software Foundation; either version + *      2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <net/route.h> +#include <linux/inetdevice.h> +#include <linux/skbuff.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +int nf_conntrack_broadcast_help(struct sk_buff *skb, +				unsigned int protoff, +				struct nf_conn *ct, +				enum ip_conntrack_info ctinfo, +				unsigned int timeout) +{ +	struct nf_conntrack_expect *exp; +	struct iphdr *iph = ip_hdr(skb); +	struct rtable *rt = skb_rtable(skb); +	struct in_device *in_dev; +	struct nf_conn_help *help = nfct_help(ct); +	__be32 mask = 0; + +	/* we're only interested in locally generated packets */ +	if (skb->sk == NULL) +		goto out; +	if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) +		goto out; +	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) +		goto out; + +	rcu_read_lock(); +	in_dev = __in_dev_get_rcu(rt->dst.dev); +	if (in_dev != NULL) { +		for_primary_ifa(in_dev) { +			if (ifa->ifa_broadcast == iph->daddr) { +				mask = ifa->ifa_mask; +				break; +			} +		} endfor_ifa(in_dev); +	} +	rcu_read_unlock(); + +	if (mask == 0) +		goto out; + +	exp = nf_ct_expect_alloc(ct); +	if (exp == NULL) +		goto out; + +	exp->tuple                = ct->tuplehash[IP_CT_DIR_REPLY].tuple; +	exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + +	exp->mask.src.u3.ip       = mask; +	exp->mask.src.u.udp.port  = htons(0xFFFF); + +	exp->expectfn             = NULL; +	exp->flags                = NF_CT_EXPECT_PERMANENT; +	exp->class		  = NF_CT_EXPECT_CLASS_DEFAULT; +	exp->helper               = NULL; + +	nf_ct_expect_related(exp); +	nf_ct_expect_put(exp); + +	nf_ct_refresh(ct, skb, timeout * HZ); +out: +	return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 27a5ea6b6a0..1f4f954c4b4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -5,6 +5,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>   * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -38,13 +39,19 @@  #include <net/netfilter/nf_conntrack_l4proto.h>  #include <net/netfilter/nf_conntrack_expect.h>  #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h>  #include <net/netfilter/nf_conntrack_core.h>  #include <net/netfilter/nf_conntrack_extend.h>  #include <net/netfilter/nf_conntrack_acct.h>  #include <net/netfilter/nf_conntrack_ecache.h>  #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_synproxy.h>  #include <net/netfilter/nf_nat.h>  #include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h>  #define NF_CONNTRACK_VERSION	"0.5.0" @@ -53,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,  				      const struct nlattr *attr) __read_mostly;  EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); -DEFINE_SPINLOCK(nf_conntrack_lock); -EXPORT_SYMBOL_GPL(nf_conntrack_lock); +__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 
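From here nf_conntrack_core.c trades the single global nf_conntrack_lock for an array of bucket spinlocks; nf_conntrack_double_lock() below takes the two buckets an entry hashes to in index order, so concurrent callers cannot deadlock by grabbing the same pair in opposite order. A condensed sketch of that ordering discipline, with LOCKS standing in for CONNTRACK_LOCKS:

#include <linux/kernel.h>
#include <linux/spinlock.h>

#define LOCKS 1024
static spinlock_t locks[LOCKS];

static void lock_two_buckets(unsigned int h1, unsigned int h2)
{
	h1 %= LOCKS;
	h2 %= LOCKS;
	if (h1 > h2)
		swap(h1, h2);	/* always lock the lower index first */
	spin_lock(&locks[h1]);
	if (h1 != h2)		/* both tuples in one bucket: one lock */
		spin_lock_nested(&locks[h2], SINGLE_DEPTH_NESTING);
}

The spin_lock_nested() annotation tells lockdep that the second acquisition is an intentional, ordered nesting within the same lock class rather than a recursion bug.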
+EXPORT_SYMBOL_GPL(nf_conntrack_locks); + +__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); + +static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) +{ +	h1 %= CONNTRACK_LOCKS; +	h2 %= CONNTRACK_LOCKS; +	spin_unlock(&nf_conntrack_locks[h1]); +	if (h1 != h2) +		spin_unlock(&nf_conntrack_locks[h2]); +} + +/* return true if we need to recompute hashes (in case hash table was resized) */ +static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, +				     unsigned int h2, unsigned int sequence) +{ +	h1 %= CONNTRACK_LOCKS; +	h2 %= CONNTRACK_LOCKS; +	if (h1 <= h2) { +		spin_lock(&nf_conntrack_locks[h1]); +		if (h1 != h2) +			spin_lock_nested(&nf_conntrack_locks[h2], +					 SINGLE_DEPTH_NESTING); +	} else { +		spin_lock(&nf_conntrack_locks[h2]); +		spin_lock_nested(&nf_conntrack_locks[h1], +				 SINGLE_DEPTH_NESTING); +	} +	if (read_seqcount_retry(&net->ct.generation, sequence)) { +		nf_conntrack_double_unlock(h1, h2); +		return true; +	} +	return false; +} + +static void nf_conntrack_all_lock(void) +{ +	int i; + +	for (i = 0; i < CONNTRACK_LOCKS; i++) +		spin_lock_nested(&nf_conntrack_locks[i], i); +} + +static void nf_conntrack_all_unlock(void) +{ +	int i; + +	for (i = 0; i < CONNTRACK_LOCKS; i++) +		spin_unlock(&nf_conntrack_locks[i]); +}  unsigned int nf_conntrack_htable_size __read_mostly;  EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -65,7 +123,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);  DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);  EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -static unsigned int nf_conntrack_hash_rnd __read_mostly; +unsigned int nf_conntrack_hash_rnd __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);  static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)  { @@ -184,6 +243,50 @@ clean_from_lists(struct nf_conn *ct)  	nf_ct_remove_expectations(ct);  } +/* must be called with local_bh_disable */ +static void nf_ct_add_to_dying_list(struct nf_conn *ct) +{ +	struct ct_pcpu *pcpu; + +	/* add this conntrack to the (per cpu) dying list */ +	ct->cpu = smp_processor_id(); +	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + +	spin_lock(&pcpu->lock); +	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, +			     &pcpu->dying); +	spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) +{ +	struct ct_pcpu *pcpu; + +	/* add this conntrack to the (per cpu) unconfirmed list */ +	ct->cpu = smp_processor_id(); +	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + +	spin_lock(&pcpu->lock); +	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, +			     &pcpu->unconfirmed); +	spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) +{ +	struct ct_pcpu *pcpu; + +	/* We overload first tuple to link into unconfirmed or dying list.*/ +	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + +	spin_lock(&pcpu->lock); +	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); +	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); +	spin_unlock(&pcpu->lock); +} +  static void  destroy_conntrack(struct nf_conntrack *nfct)  { @@ -195,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct)  	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);  	NF_CT_ASSERT(!timer_pending(&ct->timeout)); -	/* To make sure we don't get 
any weird locking issues here: -	 * destroy_conntrack() MUST NOT be called with a write lock -	 * to nf_conntrack_lock!!! -HW */  	rcu_read_lock();  	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));  	if (l4proto && l4proto->destroy) @@ -205,21 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct)  	rcu_read_unlock(); -	spin_lock_bh(&nf_conntrack_lock); +	local_bh_disable();  	/* Expectations will have been removed in clean_from_lists,  	 * except TFTP can create an expectation on the first packet,  	 * before connection is in the list, so we need to clean here, -	 * too. */ +	 * too. +	 */  	nf_ct_remove_expectations(ct); -	/* We overload first tuple to link into unconfirmed list. */ -	if (!nf_ct_is_confirmed(ct)) { -		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); -		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); -	} +	nf_ct_del_from_dying_or_unconfirmed_list(ct);  	NF_CT_STAT_INC(net, delete); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	if (ct->master)  		nf_ct_put(ct->master); @@ -228,79 +325,114 @@ destroy_conntrack(struct nf_conntrack *nfct)  	nf_conntrack_free(ct);  } -void nf_ct_delete_from_lists(struct nf_conn *ct) +static void nf_ct_delete_from_lists(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); +	unsigned int hash, reply_hash; +	u16 zone = nf_ct_zone(ct); +	unsigned int sequence;  	nf_ct_helper_destroy(ct); -	spin_lock_bh(&nf_conntrack_lock); -	/* Inside lock so preempt is disabled on module removal path. -	 * Otherwise we can get spurious warnings. */ -	NF_CT_STAT_INC(net, delete_list); + +	local_bh_disable(); +	do { +		sequence = read_seqcount_begin(&net->ct.generation); +		hash = hash_conntrack(net, zone, +				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +		reply_hash = hash_conntrack(net, zone, +					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); +  	clean_from_lists(ct); -	spin_unlock_bh(&nf_conntrack_lock); +	nf_conntrack_double_unlock(hash, reply_hash); + +	nf_ct_add_to_dying_list(ct); + +	NF_CT_STAT_INC(net, delete_list); +	local_bh_enable();  } -EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);  static void death_by_event(unsigned long ul_conntrack)  {  	struct nf_conn *ct = (void *)ul_conntrack;  	struct net *net = nf_ct_net(ct); +	struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + +	BUG_ON(ecache == NULL);  	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {  		/* bad luck, let's retry again */ -		ct->timeout.expires = jiffies + -			(random32() % net->ct.sysctl_events_retry_timeout); -		add_timer(&ct->timeout); +		ecache->timeout.expires = jiffies + +			(prandom_u32() % net->ct.sysctl_events_retry_timeout); +		add_timer(&ecache->timeout);  		return;  	}  	/* we've got the event delivered, now it's dying */  	set_bit(IPS_DYING_BIT, &ct->status); -	spin_lock(&nf_conntrack_lock); -	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); -	spin_unlock(&nf_conntrack_lock);  	nf_ct_put(ct);  } -void nf_ct_insert_dying_list(struct nf_conn *ct) +static void nf_ct_dying_timeout(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); +	struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + +	BUG_ON(ecache == NULL); -	/* add this conntrack to the dying list */ -	spin_lock_bh(&nf_conntrack_lock); -	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, -			     &net->ct.dying); -	spin_unlock_bh(&nf_conntrack_lock);  	/* set a new timer to retry event delivery */ -	setup_timer(&ct->timeout, 
death_by_event, (unsigned long)ct); -	ct->timeout.expires = jiffies + -		(random32() % net->ct.sysctl_events_retry_timeout); -	add_timer(&ct->timeout); +	setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); +	ecache->timeout.expires = jiffies + +		(prandom_u32() % net->ct.sysctl_events_retry_timeout); +	add_timer(&ecache->timeout);  } -EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); -static void death_by_timeout(unsigned long ul_conntrack) +bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)  { -	struct nf_conn *ct = (void *)ul_conntrack; +	struct nf_conn_tstamp *tstamp; + +	tstamp = nf_conn_tstamp_find(ct); +	if (tstamp && tstamp->stop == 0) +		tstamp->stop = ktime_to_ns(ktime_get_real()); -	if (!test_bit(IPS_DYING_BIT, &ct->status) && -	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { +	if (!nf_ct_is_dying(ct) && +	    unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, +	    portid, report) < 0)) {  		/* destroy event was not delivered */  		nf_ct_delete_from_lists(ct); -		nf_ct_insert_dying_list(ct); -		return; +		nf_ct_dying_timeout(ct); +		return false;  	}  	set_bit(IPS_DYING_BIT, &ct->status);  	nf_ct_delete_from_lists(ct);  	nf_ct_put(ct); +	return true; +} +EXPORT_SYMBOL_GPL(nf_ct_delete); + +static void death_by_timeout(unsigned long ul_conntrack) +{ +	nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); +} + +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, +			const struct nf_conntrack_tuple *tuple, +			u16 zone) +{ +	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + +	/* A conntrack can be recreated with the equal tuple, +	 * so we need to check that the conntrack is confirmed +	 */ +	return nf_ct_tuple_equal(tuple, &h->tuple) && +		nf_ct_zone(ct) == zone && +		nf_ct_is_confirmed(ct);  }  /*   * Warning :   * - Caller must take a reference on returned object   *   and recheck nf_ct_tuple_equal(tuple, &h->tuple) - * OR - * - Caller must lock nf_conntrack_lock before calling this function   */  static struct nf_conntrack_tuple_hash *  ____nf_conntrack_find(struct net *net, u16 zone, @@ -316,8 +448,7 @@ ____nf_conntrack_find(struct net *net, u16 zone,  	local_bh_disable();  begin:  	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { -		if (nf_ct_tuple_equal(tuple, &h->tuple) && -		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { +		if (nf_ct_key_equal(h, tuple, zone)) {  			NF_CT_STAT_INC(net, found);  			local_bh_enable();  			return h; @@ -338,15 +469,6 @@ begin:  	return NULL;  } -struct nf_conntrack_tuple_hash * -__nf_conntrack_find(struct net *net, u16 zone, -		    const struct nf_conntrack_tuple *tuple) -{ -	return ____nf_conntrack_find(net, zone, tuple, -				     hash_conntrack_raw(tuple, zone)); -} -EXPORT_SYMBOL_GPL(__nf_conntrack_find); -  /* Find a connection corresponding to a tuple. 
*/  static struct nf_conntrack_tuple_hash *  __nf_conntrack_find_get(struct net *net, u16 zone, @@ -364,8 +486,7 @@ begin:  			     !atomic_inc_not_zero(&ct->ct_general.use)))  			h = NULL;  		else { -			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || -				     nf_ct_zone(ct) != zone)) { +			if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {  				nf_ct_put(ct);  				goto begin;  			} @@ -387,42 +508,103 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get);  static void __nf_conntrack_hash_insert(struct nf_conn *ct,  				       unsigned int hash, -				       unsigned int repl_hash) +				       unsigned int reply_hash)  {  	struct net *net = nf_ct_net(ct);  	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,  			   &net->ct.hash[hash]);  	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, -			   &net->ct.hash[repl_hash]); +			   &net->ct.hash[reply_hash]);  } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); -	unsigned int hash, repl_hash; +	unsigned int hash, reply_hash; +	struct nf_conntrack_tuple_hash *h; +	struct hlist_nulls_node *n;  	u16 zone; +	unsigned int sequence;  	zone = nf_ct_zone(ct); -	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); -	__nf_conntrack_hash_insert(ct, hash, repl_hash); +	local_bh_disable(); +	do { +		sequence = read_seqcount_begin(&net->ct.generation); +		hash = hash_conntrack(net, zone, +				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +		reply_hash = hash_conntrack(net, zone, +					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + +	/* See if there's one in the list already, including reverse */ +	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) +		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, +				      &h->tuple) && +		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) +			goto out; +	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) +		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, +				      &h->tuple) && +		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) +			goto out; + +	add_timer(&ct->timeout); +	smp_wmb(); +	/* The caller holds a reference to this object */ +	atomic_set(&ct->ct_general.use, 2); +	__nf_conntrack_hash_insert(ct, hash, reply_hash); +	nf_conntrack_double_unlock(hash, reply_hash); +	NF_CT_STAT_INC(net, insert); +	local_bh_enable(); +	return 0; + +out: +	nf_conntrack_double_unlock(hash, reply_hash); +	NF_CT_STAT_INC(net, insert_failed); +	local_bh_enable(); +	return -EEXIST;  } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); + +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ +	struct ct_pcpu *pcpu; + +	__set_bit(IPS_TEMPLATE_BIT, &tmpl->status); +	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status); +	nf_conntrack_get(&tmpl->ct_general); + +	/* add this conntrack to the (per cpu) tmpl list */ +	local_bh_disable(); +	tmpl->cpu = smp_processor_id(); +	pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); + +	spin_lock(&pcpu->lock); +	/* Overload tuple linked list to put us in template list. 
*/ +	hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, +				 &pcpu->tmpl); +	spin_unlock_bh(&pcpu->lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);  /* Confirm a connection given skb; places it in hash table */  int  __nf_conntrack_confirm(struct sk_buff *skb)  { -	unsigned int hash, repl_hash; +	unsigned int hash, reply_hash;  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct;  	struct nf_conn_help *help; +	struct nf_conn_tstamp *tstamp;  	struct hlist_nulls_node *n;  	enum ip_conntrack_info ctinfo;  	struct net *net;  	u16 zone; +	unsigned int sequence;  	ct = nf_ct_get(skb, &ctinfo);  	net = nf_ct_net(ct); @@ -435,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)  		return NF_ACCEPT;  	zone = nf_ct_zone(ct); -	/* reuse the hash saved before */ -	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; -	hash = hash_bucket(hash, net); -	repl_hash = hash_conntrack(net, zone, -				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +	local_bh_disable(); + +	do { +		sequence = read_seqcount_begin(&net->ct.generation); +		/* reuse the hash saved before */ +		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; +		hash = hash_bucket(hash, net); +		reply_hash = hash_conntrack(net, zone, +					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + +	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));  	/* We're not in hash table, and we refuse to set up related -	   connections for unconfirmed conns.  But packet copies and -	   REJECT will give spurious warnings here. */ +	 * connections for unconfirmed conns.  But packet copies and +	 * REJECT will give spurious warnings here. +	 */  	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ -	/* No external references means noone else could have -	   confirmed us. */ +	/* No external references means no one else could have +	 * confirmed us. +	 */  	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));  	pr_debug("Confirming conntrack %p\n", ct); - -	spin_lock_bh(&nf_conntrack_lock); -  	/* We have to check the DYING flag inside the lock to prevent  	   a race against nf_ct_get_next_corpse() possibly called from  	   user context, else we insert an already 'dead' hash, blocking  	   further use of that particular connection -JM */  	if (unlikely(nf_ct_is_dying(ct))) { -		spin_unlock_bh(&nf_conntrack_lock); +		nf_conntrack_double_unlock(hash, reply_hash); +		local_bh_enable();  		return NF_ACCEPT;  	} @@ -471,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb)  				      &h->tuple) &&  		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))  			goto out; -	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) +	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)  		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,  				      &h->tuple) &&  		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))  			goto out; -	/* Remove from unconfirmed list */ -	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); +	nf_ct_del_from_dying_or_unconfirmed_list(ct);  	/* Timer relative to confirmation time, not original  	   setting time, otherwise we'd get timer wrap in @@ -486,16 +673,25 @@ __nf_conntrack_confirm(struct sk_buff *skb)  	ct->timeout.expires += jiffies;  	add_timer(&ct->timeout);  	atomic_inc(&ct->ct_general.use); -	set_bit(IPS_CONFIRMED_BIT, &ct->status); +	ct->status |= IPS_CONFIRMED; +	/* set conntrack timestamp, if enabled. 
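nf_conntrack_tmpl_insert() above is the first user of the new per-cpu lists: the global unconfirmed/dying lists become one struct ct_pcpu per CPU, each with its own spinlock, and the entry records tmpl->cpu so the eventual removal can relock the same list even if the task has migrated. A reduced sketch of the per-cpu list shape (NCPU and the node layout are illustrative):

#include <pthread.h>

#define NCPU 8		/* illustrative; the kernel sizes this per_cpu */

struct node {
	struct node *next;
	int cpu;	/* which per-cpu list this entry was queued on */
};

struct pcpu_list {
	pthread_mutex_t lock;
	struct node *head;
};

static struct pcpu_list lists[NCPU];

/* Adding an unconfirmed entry touches only the local CPU's lock, so
 * fresh flows on different CPUs never contend with each other. */
static void add_unconfirmed(struct node *n, int this_cpu)
{
	struct pcpu_list *l = &lists[this_cpu];

	n->cpu = this_cpu;
	pthread_mutex_lock(&l->lock);
	n->next = l->head;
	l->head = n;
	pthread_mutex_unlock(&l->lock);
}

Removal later uses n->cpu to find the right lock, which is why the diff stores smp_processor_id() in the conntrack before linking it.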
*/ +	tstamp = nf_conn_tstamp_find(ct); +	if (tstamp) { +		if (skb->tstamp.tv64 == 0) +			__net_timestamp(skb); + +		tstamp->start = ktime_to_ns(skb->tstamp); +	}  	/* Since the lookup is lockless, hash insertion must be done after  	 * starting the timer and setting the CONFIRMED bit. The RCU barriers  	 * guarantee that no other CPU can find the conntrack before the above  	 * stores are visible.  	 */ -	__nf_conntrack_hash_insert(ct, hash, repl_hash); +	__nf_conntrack_hash_insert(ct, hash, reply_hash); +	nf_conntrack_double_unlock(hash, reply_hash);  	NF_CT_STAT_INC(net, insert); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	help = nfct_help(ct);  	if (help && help->helper) @@ -506,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)  	return NF_ACCEPT;  out: +	nf_conntrack_double_unlock(hash, reply_hash);  	NF_CT_STAT_INC(net, insert_failed); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	return NF_DROP;  }  EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -550,52 +747,77 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);  /* There's a small race here where we may free a just-assured     connection.  Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int _hash)  {  	/* Use oldest entry, which is roughly LRU */  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct = NULL, *tmp;  	struct hlist_nulls_node *n; -	unsigned int i, cnt = 0; +	unsigned int i = 0, cnt = 0;  	int dropped = 0; +	unsigned int hash, sequence; +	spinlock_t *lockp; -	rcu_read_lock(); -	for (i = 0; i < net->ct.htable_size; i++) { +	local_bh_disable(); +restart: +	sequence = read_seqcount_begin(&net->ct.generation); +	hash = hash_bucket(_hash, net); +	for (; i < net->ct.htable_size; i++) { +		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; +		spin_lock(lockp); +		if (read_seqcount_retry(&net->ct.generation, sequence)) { +			spin_unlock(lockp); +			goto restart; +		}  		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],  					 hnnode) {  			tmp = nf_ct_tuplehash_to_ctrack(h); -			if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) +			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && +			    !nf_ct_is_dying(tmp) && +			    atomic_inc_not_zero(&tmp->ct_general.use)) {  				ct = tmp; +				break; +			}  			cnt++;  		} -		if (ct != NULL) { -			if (likely(!nf_ct_is_dying(ct) && -				   atomic_inc_not_zero(&ct->ct_general.use))) -				break; -			else -				ct = NULL; -		} +		hash = (hash + 1) % net->ct.htable_size; +		spin_unlock(lockp); -		if (cnt >= NF_CT_EVICTION_RANGE) +		if (ct || cnt >= NF_CT_EVICTION_RANGE)  			break; -		hash = (hash + 1) % net->ct.htable_size;  	} -	rcu_read_unlock(); +	local_bh_enable();  	if (!ct)  		return dropped;  	if (del_timer(&ct->timeout)) { -		death_by_timeout((unsigned long)ct); -		dropped = 1; -		NF_CT_STAT_INC_ATOMIC(net, early_drop); +		if (nf_ct_delete(ct, 0, 0)) { +			dropped = 1; +			NF_CT_STAT_INC_ATOMIC(net, early_drop); +		}  	}  	nf_ct_put(ct);  	return dropped;  } +void init_nf_conntrack_hash_rnd(void) +{ +	unsigned int rand; + +	/* +	 * Why not initialize nf_conntrack_rnd in a "init()" function ? +	 * Because there isn't enough entropy when system initializing, +	 * and we initialize it as late as possible. 
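The reworked early_drop() above walks a bounded range of buckets looking for a non-assured victim, taking only one bucket lock at a time and restarting if the generation moved. A condensed userspace sketch of the victim search (locking, types and naming are stand-ins; the real code additionally grabs a reference with atomic_inc_not_zero() before dropping the bucket lock):

#include <pthread.h>
#include <stdbool.h>

#define EVICTION_RANGE	8	/* illustrative; kernel: NF_CT_EVICTION_RANGE */

struct node  { struct node *next; bool assured; bool dying; };
struct table { unsigned size; struct node **bucket; };

static struct node *find_victim(struct table *t, pthread_mutex_t *locks,
				unsigned nlocks, unsigned hash)
{
	unsigned cnt = 0;

	for (unsigned i = 0; i < t->size; i++) {
		pthread_mutex_t *lp = &locks[hash % nlocks];
		struct node *victim = NULL;

		pthread_mutex_lock(lp);
		for (struct node *n = t->bucket[hash % t->size]; n; n = n->next) {
			if (!n->assured && !n->dying) {
				victim = n;
				break;
			}
			cnt++;		/* keep the search bounded */
		}
		pthread_mutex_unlock(lp);

		if (victim || cnt >= EVICTION_RANGE)
			return victim;
		hash++;			/* try the next bucket */
	}
	return NULL;
}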
+	 */ +	do { +		get_random_bytes(&rand, sizeof(rand)); +	} while (!rand); +	cmpxchg(&nf_conntrack_hash_rnd, 0, rand); +} +  static struct nf_conn *  __nf_conntrack_alloc(struct net *net, u16 zone,  		     const struct nf_conntrack_tuple *orig, @@ -605,18 +827,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  	struct nf_conn *ct;  	if (unlikely(!nf_conntrack_hash_rnd)) { -		unsigned int rand; - -		/* -		 * Why not initialize nf_conntrack_rnd in a "init()" function ? -		 * Because there isn't enough entropy when system initializing, -		 * and we initialize it as late as possible. -		 */ -		do { -			get_random_bytes(&rand, sizeof(rand)); -		} while (!rand); -		cmpxchg(&nf_conntrack_hash_rnd, 0, rand); - +		init_nf_conntrack_hash_rnd();  		/* recompute the hash as nf_conntrack_hash_rnd is initialized */  		hash = hash_conntrack_raw(orig, zone);  	} @@ -626,12 +837,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  	if (nf_conntrack_max &&  	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { -		if (!early_drop(net, hash_bucket(hash, net))) { +		if (!early_drop(net, hash)) {  			atomic_dec(&net->ct.count); -			if (net_ratelimit()) -				printk(KERN_WARNING -				       "nf_conntrack: table full, dropping" -				       " packet.\n"); +			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");  			return ERR_PTR(-ENOMEM);  		}  	} @@ -642,7 +850,6 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  	 */  	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);  	if (ct == NULL) { -		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");  		atomic_dec(&net->ct.count);  		return ERR_PTR(-ENOMEM);  	} @@ -651,7 +858,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  	 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.  	 */  	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, -	       sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); +	       offsetof(struct nf_conn, proto) - +	       offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));  	spin_lock_init(&ct->lock);  	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;  	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; @@ -671,15 +879,15 @@ __nf_conntrack_alloc(struct net *net, u16 zone,  		nf_ct_zone->id = zone;  	}  #endif -	/* -	 * changes to lookup keys must be done before setting refcnt to 1 +	/* Because we use RCU lookups, we set ct_general.use to zero before +	 * this is inserted in any list.  	 */ -	smp_wmb(); -	atomic_set(&ct->ct_general.use, 1); +	atomic_set(&ct->ct_general.use, 0);  	return ct;  #ifdef CONFIG_NF_CONNTRACK_ZONES  out_free: +	atomic_dec(&net->ct.count);  	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);  	return ERR_PTR(-ENOMEM);  #endif @@ -698,13 +906,20 @@ void nf_conntrack_free(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); +	/* A freed object has refcnt == 0, that's +	 * the golden rule for SLAB_DESTROY_BY_RCU +	 */ +	NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); +  	nf_ct_ext_destroy(ct); -	atomic_dec(&net->ct.count);  	nf_ct_ext_free(ct);  	kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +	smp_mb__before_atomic(); +	atomic_dec(&net->ct.count);  }  EXPORT_SYMBOL_GPL(nf_conntrack_free); +  /* Allocate a new conntrack: we return -ENOMEM if classification     failed due to stress.  Otherwise it really is unclassifiable. 
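init_nf_conntrack_hash_rnd(), now factored out above, defers seeding until first use because boot-time entropy is poor, and publishes the seed with cmpxchg() so that concurrent first users all agree on one value. A userspace analogue with a C11 compare-exchange, where getrandom() stands in for get_random_bytes():

#include <stdatomic.h>
#include <sys/random.h>

static atomic_uint hash_rnd;	/* 0 doubles as "not yet seeded" */

static unsigned int get_hash_rnd(void)
{
	unsigned int r = atomic_load(&hash_rnd);
	unsigned int expected = 0;

	if (r)
		return r;
	do {
		if (getrandom(&r, sizeof(r), 0) != sizeof(r))
			r = 0;			/* retry on short read */
	} while (!r);				/* 0 is the "unset" sentinel */

	/* Exactly one caller wins the publish; losers adopt the winner's
	 * value, so every CPU hashes with the same seed. */
	if (!atomic_compare_exchange_strong(&hash_rnd, &expected, r))
		r = expected;
	return r;
}

The same deferred seed is reused by the expectation hash later in this series, replacing its private nf_ct_expect_hash_rnd.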
*/  static struct nf_conntrack_tuple_hash * @@ -719,8 +934,10 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,  	struct nf_conn_help *help;  	struct nf_conntrack_tuple repl_tuple;  	struct nf_conntrack_ecache *ecache; -	struct nf_conntrack_expect *exp; +	struct nf_conntrack_expect *exp = NULL;  	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; +	struct nf_conn_timeout *timeout_ext; +	unsigned int *timeouts;  	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {  		pr_debug("Can't invert tuple.\n"); @@ -729,56 +946,76 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,  	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,  				  hash); -	if (IS_ERR(ct)) { -		pr_debug("Can't allocate conntrack.\n"); +	if (IS_ERR(ct))  		return (struct nf_conntrack_tuple_hash *)ct; + +	if (tmpl && nfct_synproxy(tmpl)) { +		nfct_seqadj_ext_add(ct); +		nfct_synproxy_ext_add(ct);  	} -	if (!l4proto->new(ct, skb, dataoff)) { +	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; +	if (timeout_ext) +		timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); +	else +		timeouts = l4proto->get_timeouts(net); + +	if (!l4proto->new(ct, skb, dataoff, timeouts)) {  		nf_conntrack_free(ct);  		pr_debug("init conntrack: can't track with proto module\n");  		return NULL;  	} +	if (timeout_ext) +		nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC); +  	nf_ct_acct_ext_add(ct, GFP_ATOMIC); +	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); +	nf_ct_labels_ext_add(ct);  	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;  	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,  				 ecache ? ecache->expmask : 0,  			     GFP_ATOMIC); -	spin_lock_bh(&nf_conntrack_lock); -	exp = nf_ct_find_expectation(net, zone, tuple); -	if (exp) { -		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", -			 ct, exp); -		/* Welcome, Mr. Bond.  We've been expecting you... */ -		__set_bit(IPS_EXPECTED_BIT, &ct->status); -		ct->master = exp->master; -		if (exp->helper) { -			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); -			if (help) -				rcu_assign_pointer(help->helper, exp->helper); -		} +	local_bh_disable(); +	if (net->ct.expect_count) { +		spin_lock(&nf_conntrack_expect_lock); +		exp = nf_ct_find_expectation(net, zone, tuple); +		if (exp) { +			pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", +				 ct, exp); +			/* Welcome, Mr. Bond.  We've been expecting you... */ +			__set_bit(IPS_EXPECTED_BIT, &ct->status); +			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */ +			ct->master = exp->master; +			if (exp->helper) { +				help = nf_ct_helper_ext_add(ct, exp->helper, +							    GFP_ATOMIC); +				if (help) +					rcu_assign_pointer(help->helper, exp->helper); +			}  #ifdef CONFIG_NF_CONNTRACK_MARK -		ct->mark = exp->master->mark; +			ct->mark = exp->master->mark;  #endif  #ifdef CONFIG_NF_CONNTRACK_SECMARK -		ct->secmark = exp->master->secmark; +			ct->secmark = exp->master->secmark;  #endif -		nf_conntrack_get(&ct->master->ct_general); -		NF_CT_STAT_INC(net, expect_new); -	} else { +			NF_CT_STAT_INC(net, expect_new); +		} +		spin_unlock(&nf_conntrack_expect_lock); +	} +	if (!exp) {  		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);  		NF_CT_STAT_INC(net, new);  	} -	/* Overload tuple linked list to put us in unconfirmed list. 
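The timeout handling added to init_conntrack() above picks the timeout table for a new flow once, at creation time: a policy attached via the template's timeout extension (for example by the CT target) wins over the l4proto's built-in table. The selection in isolation, with invented names and made-up values:

/* Per-flow timeout selection: template policy overrides protocol
 * defaults. The numbers below are illustrative only. */
struct timeout_policy {
	unsigned int state_timeout[4];	/* seconds, indexed by proto state */
};

static const struct timeout_policy proto_defaults = {
	.state_timeout = { 30, 60, 432000, 120 },
};

static const unsigned int *
pick_timeouts(const struct timeout_policy *tmpl_policy)
{
	if (tmpl_policy)
		return tmpl_policy->state_timeout;	/* CT-target policy */
	return proto_defaults.state_timeout;		/* l4proto defaults */
}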
*/ -	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, -		       &net->ct.unconfirmed); +	/* Now it is inserted into the unconfirmed list, bump refcount */ +	nf_conntrack_get(&ct->ct_general); +	nf_ct_add_to_unconfirmed_list(ct); -	spin_unlock_bh(&nf_conntrack_lock); +	local_bh_enable();  	if (exp) {  		if (exp->expectfn) @@ -829,7 +1066,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,  	/* It exists; we have (non-exclusive) reference. */  	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { -		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; +		*ctinfo = IP_CT_ESTABLISHED_REPLY;  		/* Please set reply bit if this packet OK */  		*set_reply = 1;  	} else { @@ -860,6 +1097,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,  	enum ip_conntrack_info ctinfo;  	struct nf_conntrack_l3proto *l3proto;  	struct nf_conntrack_l4proto *l4proto; +	unsigned int *timeouts;  	unsigned int dataoff;  	u_int8_t protonum;  	int set_reply = 0; @@ -880,7 +1118,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,  	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),  				   &dataoff, &protonum);  	if (ret <= 0) { -		pr_debug("not prepared to track yet or error occured\n"); +		pr_debug("not prepared to track yet or error occurred\n");  		NF_CT_STAT_INC_ATOMIC(net, error);  		NF_CT_STAT_INC_ATOMIC(net, invalid);  		ret = -ret; @@ -901,6 +1139,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,  			ret = -ret;  			goto out;  		} +		/* ICMP[v6] protocol trackers may assign one conntrack. */ +		if (skb->nfct) +			goto out;  	}  	ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, @@ -921,7 +1162,10 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,  	NF_CT_ASSERT(skb->nfct); -	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum); +	/* Decide what timeout policy we want to apply to this flow. */ +	timeouts = nf_ct_timeout_lookup(net, ct, l4proto); + +	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);  	if (ret <= 0) {  		/* Invalid: inverse of the return code tells  		 * the netfilter core what to do */ @@ -938,8 +1182,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,  	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))  		nf_conntrack_event_cache(IPCT_REPLY, ct);  out: -	if (tmpl) -		nf_ct_put(tmpl); +	if (tmpl) { +		/* Special case: we have to repeat this hook, assign the +		 * template again to this packet. We assume that this packet +		 * has no conntrack assigned. This is used by nf_ct_tcp. 
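The IP_CT_ESTABLISHED + IP_CT_IS_REPLY arithmetic replaced above becomes the named constant IP_CT_ESTABLISHED_REPLY. A simplified sketch of the direction handling in resolve_normal_ct(), ignoring RELATED and expected connections, which the real function also covers:

#include <stdbool.h>

enum dir      { DIR_ORIGINAL, DIR_REPLY };
enum ct_state { CT_NEW, CT_ESTABLISHED, CT_ESTABLISHED_REPLY };

/* A hit on the REPLY-direction tuple is return traffic: the flow is
 * established and the caller should set the seen-reply bit. */
static enum ct_state classify(enum dir d, bool seen_reply, int *set_reply)
{
	if (d == DIR_REPLY) {
		*set_reply = 1;
		return CT_ESTABLISHED_REPLY;
	}
	*set_reply = 0;
	return seen_reply ? CT_ESTABLISHED : CT_NEW;
}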
*/ +		if (ret == NF_REPEAT) +			skb->nfct = (struct nf_conntrack *)tmpl; +		else +			nf_ct_put(tmpl); +	}  	return ret;  } @@ -1012,14 +1263,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,  acct:  	if (do_acct) { -		struct nf_conn_counter *acct; +		struct nf_conn_acct *acct;  		acct = nf_conn_acct_find(ct);  		if (acct) { -			spin_lock_bh(&ct->lock); -			acct[CTINFO2DIR(ctinfo)].packets++; -			acct[CTINFO2DIR(ctinfo)].bytes += skb->len; -			spin_unlock_bh(&ct->lock); +			struct nf_conn_counter *counter = acct->counter; + +			atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); +			atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);  		}  	}  } @@ -1031,15 +1282,15 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,  		       int do_acct)  {  	if (do_acct) { -		struct nf_conn_counter *acct; +		struct nf_conn_acct *acct;  		acct = nf_conn_acct_find(ct);  		if (acct) { -			spin_lock_bh(&ct->lock); -			acct[CTINFO2DIR(ctinfo)].packets++; -			acct[CTINFO2DIR(ctinfo)].bytes += -				skb->len - skb_network_offset(skb); -			spin_unlock_bh(&ct->lock); +			struct nf_conn_counter *counter = acct->counter; + +			atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); +			atomic64_add(skb->len - skb_network_offset(skb), +				     &counter[CTINFO2DIR(ctinfo)].bytes);  		}  	} @@ -1059,7 +1310,7 @@ static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {  };  #endif -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  #include <linux/netfilter/nfnetlink.h>  #include <linux/netfilter/nfnetlink_conntrack.h> @@ -1071,8 +1322,9 @@ static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {  int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,  			       const struct nf_conntrack_tuple *tuple)  { -	NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port); -	NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port); +	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || +	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -1107,7 +1359,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);  #endif  /* Used by ipt_REJECT and ip6t_REJECT. 
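__nf_ct_refresh_acct() and __nf_ct_kill_acct() above trade a per-conntrack spinlock for two atomic 64-bit adds per packet. The shape in portable C11; relaxed ordering is enough here because the counters are only ever summed, never used to order other memory accesses:

#include <stdatomic.h>
#include <stdint.h>

enum { DIR_ORIGINAL, DIR_REPLY, DIR_MAX };

struct flow_counter {
	atomic_uint_least64_t packets;
	atomic_uint_least64_t bytes;
};

static struct flow_counter acct[DIR_MAX];

/* Lock-free per-direction accounting: two relaxed atomic adds replace
 * a spinlock-protected read-modify-write on the hot path. */
static void account(int dir, uint32_t len)
{
	atomic_fetch_add_explicit(&acct[dir].packets, 1,
				  memory_order_relaxed);
	atomic_fetch_add_explicit(&acct[dir].bytes, len,
				  memory_order_relaxed);
}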
*/ -static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)  {  	struct nf_conn *ct;  	enum ip_conntrack_info ctinfo; @@ -1115,7 +1367,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)  	/* This ICMP is in reverse direction to the packet which caused it */  	ct = nf_ct_get(skb, &ctinfo);  	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) -		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; +		ctinfo = IP_CT_RELATED_REPLY;  	else  		ctinfo = IP_CT_RELATED; @@ -1133,31 +1385,48 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct;  	struct hlist_nulls_node *n; +	int cpu; +	spinlock_t *lockp; -	spin_lock_bh(&nf_conntrack_lock);  	for (; *bucket < net->ct.htable_size; (*bucket)++) { -		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { +		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; +		local_bh_disable(); +		spin_lock(lockp); +		if (*bucket < net->ct.htable_size) { +			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { +				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) +					continue; +				ct = nf_ct_tuplehash_to_ctrack(h); +				if (iter(ct, data)) +					goto found; +			} +		} +		spin_unlock(lockp); +		local_bh_enable(); +	} + +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_bh(&pcpu->lock); +		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {  			ct = nf_ct_tuplehash_to_ctrack(h);  			if (iter(ct, data)) -				goto found; +				set_bit(IPS_DYING_BIT, &ct->status);  		} +		spin_unlock_bh(&pcpu->lock);  	} -	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { -		ct = nf_ct_tuplehash_to_ctrack(h); -		if (iter(ct, data)) -			set_bit(IPS_DYING_BIT, &ct->status); -	} -	spin_unlock_bh(&nf_conntrack_lock);  	return NULL;  found:  	atomic_inc(&ct->ct_general.use); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock(lockp); +	local_bh_enable();  	return ct;  }  void nf_ct_iterate_cleanup(struct net *net,  			   int (*iter)(struct nf_conn *i, void *data), -			   void *data) +			   void *data, u32 portid, int report)  {  	struct nf_conn *ct;  	unsigned int bucket = 0; @@ -1165,7 +1434,8 @@ void nf_ct_iterate_cleanup(struct net *net,  	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {  		/* Time to push up daises... */  		if (del_timer(&ct->timeout)) -			death_by_timeout((unsigned long)ct); +			nf_ct_delete(ct, portid, report); +  		/* ... else the timer will get him soon. */  		nf_ct_put(ct); @@ -1173,33 +1443,14 @@ void nf_ct_iterate_cleanup(struct net *net,  }  EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); -struct __nf_ct_flush_report { -	u32 pid; -	int report; -}; - -static int kill_report(struct nf_conn *i, void *data) -{ -	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; - -	/* If we fail to deliver the event, death_by_timeout() will retry */ -	if (nf_conntrack_event_report(IPCT_DESTROY, i, -				      fr->pid, fr->report) < 0) -		return 1; - -	/* Avoid the delivery of the destroy event in death_by_timeout(). 
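get_next_corpse() above now iterates with one per-bucket lock at a time instead of the global nf_conntrack_lock, and it re-checks the table bound under the lock because a concurrent resize may have shrunk the table while no lock was held. The scan skeleton, reduced to userspace (types and locking are stand-ins):

#include <pthread.h>
#include <stdbool.h>

#define NLOCKS 1024

struct node  { struct node *next; };
struct table { unsigned size; struct node **bucket; };

static pthread_mutex_t locks[NLOCKS];

static struct node *scan(struct table *t, bool (*iter)(struct node *))
{
	for (unsigned b = 0; b < t->size; b++) {
		pthread_mutex_t *lp = &locks[b % NLOCKS];

		pthread_mutex_lock(lp);
		if (b < t->size) {	/* bound may have moved under us */
			for (struct node *n = t->bucket[b]; n; n = n->next) {
				if (iter(n)) {
					pthread_mutex_unlock(lp);
					return n; /* real code takes a ref first */
				}
			}
		}
		pthread_mutex_unlock(lp);
	}
	return NULL;
}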
*/ -	set_bit(IPS_DYING_BIT, &i->status); -	return 1; -} -  static int kill_all(struct nf_conn *i, void *data)  {  	return 1;  } -void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size) +void nf_ct_free_hashtable(void *hash, unsigned int size)  { -	if (vmalloced) +	if (is_vmalloc_addr(hash))  		vfree(hash);  	else  		free_pages((unsigned long)hash, @@ -1207,13 +1458,9 @@ void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)  }  EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report)  { -	struct __nf_ct_flush_report fr = { -		.pid 	= pid, -		.report = report, -	}; -	nf_ct_iterate_cleanup(net, kill_report, &fr); +	nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report);  }  EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); @@ -1222,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net)  	struct nf_conntrack_tuple_hash *h;  	struct nf_conn *ct;  	struct hlist_nulls_node *n; +	int cpu; -	spin_lock_bh(&nf_conntrack_lock); -	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { -		ct = nf_ct_tuplehash_to_ctrack(h); -		/* never fails to remove them, no listeners at this point */ -		nf_ct_kill(ct); +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_bh(&pcpu->lock); +		hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { +			ct = nf_ct_tuplehash_to_ctrack(h); +			/* never fails to remove them, no listeners at this point */ +			nf_ct_kill(ct); +		} +		spin_unlock_bh(&pcpu->lock);  	} -	spin_unlock_bh(&nf_conntrack_lock);  }  static int untrack_refs(void) @@ -1244,76 +1496,96 @@ static int untrack_refs(void)  	return cnt;  } -static void nf_conntrack_cleanup_init_net(void) +void nf_conntrack_cleanup_start(void)  { +	RCU_INIT_POINTER(ip_ct_attach, NULL); +} + +void nf_conntrack_cleanup_end(void) +{ +	RCU_INIT_POINTER(nf_ct_destroy, NULL);  	while (untrack_refs() > 0)  		schedule(); -	nf_conntrack_helper_fini(); -	nf_conntrack_proto_fini();  #ifdef CONFIG_NF_CONNTRACK_ZONES  	nf_ct_extend_unregister(&nf_ct_zone_extend);  #endif +	nf_conntrack_proto_fini(); +	nf_conntrack_seqadj_fini(); +	nf_conntrack_labels_fini(); +	nf_conntrack_helper_fini(); +	nf_conntrack_timeout_fini(); +	nf_conntrack_ecache_fini(); +	nf_conntrack_tstamp_fini(); +	nf_conntrack_acct_fini(); +	nf_conntrack_expect_fini();  } -static void nf_conntrack_cleanup_net(struct net *net) +/* + * Mishearing the voices in his head, our hero wonders how he's + * supposed to kill the mall. + */ +void nf_conntrack_cleanup_net(struct net *net)  { - i_see_dead_people: -	nf_ct_iterate_cleanup(net, kill_all, NULL); -	nf_ct_release_dying_list(net); -	if (atomic_read(&net->ct.count) != 0) { -		schedule(); -		goto i_see_dead_people; -	} +	LIST_HEAD(single); -	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, -			     net->ct.htable_size); -	nf_conntrack_ecache_fini(net); -	nf_conntrack_acct_fini(net); -	nf_conntrack_expect_fini(net); -	kmem_cache_destroy(net->ct.nf_conntrack_cachep); -	kfree(net->ct.slabname); -	free_percpu(net->ct.stat); +	list_add(&net->exit_list, &single); +	nf_conntrack_cleanup_net_list(&single);  } -/* Mishearing the voices in his head, our hero wonders how he's -   supposed to kill the mall. 
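nf_conntrack_cleanup_net_list() above batches teardown across every namespace on the exit list: one sweep kills everything it can, and the i_see_dead_people loop then yields until every per-net count has drained, since entries may still be pinned by packets in flight. The control flow, reduced to a userspace sketch:

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Each namespace keeps a live-entry count; teardown sweeps all of
 * them, then yields until every count has drained to zero. */
static void cleanup_all(atomic_int *counts, int n, void (*kill_all)(int))
{
	bool busy;

	do {
		busy = false;
		for (int i = 0; i < n; i++) {
			kill_all(i);
			if (atomic_load(&counts[i]) != 0)
				busy = true;
		}
		if (busy)
			sched_yield();	/* kernel: schedule() */
	} while (busy);
}

Doing one pass over the whole list before sleeping is the point of the batching: N namespaces share one drain wait instead of paying it N times.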
*/ -void nf_conntrack_cleanup(struct net *net) +void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)  { -	if (net_eq(net, &init_net)) -		rcu_assign_pointer(ip_ct_attach, NULL); +	int busy; +	struct net *net; -	/* This makes sure all current packets have passed through -	   netfilter framework.  Roll on, two-stage module -	   delete... */ +	/* +	 * This makes sure all current packets have passed through +	 *  netfilter framework.  Roll on, two-stage module +	 *  delete... +	 */  	synchronize_net(); +i_see_dead_people: +	busy = 0; +	list_for_each_entry(net, net_exit_list, exit_list) { +		nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); +		nf_ct_release_dying_list(net); +		if (atomic_read(&net->ct.count) != 0) +			busy = 1; +	} +	if (busy) { +		schedule(); +		goto i_see_dead_people; +	} -	nf_conntrack_cleanup_net(net); - -	if (net_eq(net, &init_net)) { -		rcu_assign_pointer(nf_ct_destroy, NULL); -		nf_conntrack_cleanup_init_net(); +	list_for_each_entry(net, net_exit_list, exit_list) { +		nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); +		nf_conntrack_proto_pernet_fini(net); +		nf_conntrack_helper_pernet_fini(net); +		nf_conntrack_ecache_pernet_fini(net); +		nf_conntrack_tstamp_pernet_fini(net); +		nf_conntrack_acct_pernet_fini(net); +		nf_conntrack_expect_pernet_fini(net); +		kmem_cache_destroy(net->ct.nf_conntrack_cachep); +		kfree(net->ct.slabname); +		free_percpu(net->ct.stat); +		free_percpu(net->ct.pcpu_lists);  	}  } -void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) +void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)  {  	struct hlist_nulls_head *hash;  	unsigned int nr_slots, i;  	size_t sz; -	*vmalloced = 0; -  	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));  	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));  	sz = nr_slots * sizeof(struct hlist_nulls_head);  	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,  					get_order(sz));  	if (!hash) { -		*vmalloced = 1;  		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); -		hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, -				 PAGE_KERNEL); +		hash = vzalloc(sz);  	}  	if (hash && nulls) @@ -1326,7 +1598,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);  int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)  { -	int i, bucket, vmalloced, old_vmalloced; +	int i, bucket, rc;  	unsigned int hashsize, old_size;  	struct hlist_nulls_head *hash, *old_hash;  	struct nf_conntrack_tuple_hash *h; @@ -1339,20 +1611,26 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)  	if (!nf_conntrack_htable_size)  		return param_set_uint(val, kp); -	hashsize = simple_strtoul(val, NULL, 0); +	rc = kstrtouint(val, 0, &hashsize); +	if (rc) +		return rc;  	if (!hashsize)  		return -EINVAL; -	hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1); +	hash = nf_ct_alloc_hashtable(&hashsize, 1);  	if (!hash)  		return -ENOMEM; +	local_bh_disable(); +	nf_conntrack_all_lock(); +	write_seqcount_begin(&init_net.ct.generation); +  	/* Lookups in the old hash might happen in parallel, which means we  	 * might get false negatives during connection lookup. New connections  	 * created because of a false negative won't make it into the hash -	 * though since that required taking the lock. +	 * though since that required taking the locks.  	 
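The resize path in nf_conntrack_set_hashsize() above now takes every bucket lock, bumps the generation seqcount so the optimistic insert paths shown earlier retry, and only then migrates entries; as the comment notes, lockless lookups may see false negatives during the move, which is tolerated. A sketch of that sequence under the same assumptions as the earlier insert sketch (new_bucket must arrive zeroed, as vzalloc/__GFP_ZERO guarantee above):

#include <pthread.h>
#include <stdatomic.h>

#define NLOCKS 1024

struct node  { struct node *next; unsigned key_hash; };
struct table { unsigned size; struct node **bucket; };

static pthread_mutex_t locks[NLOCKS];
static atomic_uint generation;

static void resize(struct table *t, struct node **new_bucket,
		   unsigned new_size)
{
	for (unsigned i = 0; i < NLOCKS; i++)
		pthread_mutex_lock(&locks[i]);	/* "all locks" */
	atomic_fetch_add(&generation, 1);	/* inserters will retry */

	for (unsigned b = 0; b < t->size; b++) {
		while (t->bucket[b]) {
			struct node *n = t->bucket[b];
			unsigned h = n->key_hash % new_size;

			t->bucket[b] = n->next;
			n->next = new_bucket[h];
			new_bucket[h] = n;
		}
	}
	t->bucket = new_bucket;
	t->size = new_size;

	for (unsigned i = NLOCKS; i-- > 0; )
		pthread_mutex_unlock(&locks[i]);
}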
*/ -	spin_lock_bh(&nf_conntrack_lock); +  	for (i = 0; i < init_net.ct.htable_size; i++) {  		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {  			h = hlist_nulls_entry(init_net.ct.hash[i].first, @@ -1365,15 +1643,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)  		}  	}  	old_size = init_net.ct.htable_size; -	old_vmalloced = init_net.ct.hash_vmalloc;  	old_hash = init_net.ct.hash;  	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; -	init_net.ct.hash_vmalloc = vmalloced;  	init_net.ct.hash = hash; -	spin_unlock_bh(&nf_conntrack_lock); -	nf_ct_free_hashtable(old_hash, old_vmalloced, old_size); +	write_seqcount_end(&init_net.ct.generation); +	nf_conntrack_all_unlock(); +	local_bh_enable(); + +	nf_ct_free_hashtable(old_hash, old_size);  	return 0;  }  EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); @@ -1390,10 +1669,13 @@ void nf_ct_untracked_status_or(unsigned long bits)  }  EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); -static int nf_conntrack_init_init_net(void) +int nf_conntrack_init_start(void)  {  	int max_factor = 8; -	int ret, cpu; +	int i, ret, cpu; + +	for (i = 0; i < CONNTRACK_LOCKS; i++) +		spin_lock_init(&nf_conntrack_locks[i]);  	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB  	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1418,19 +1700,47 @@ static int nf_conntrack_init_init_net(void)  	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,  	       nf_conntrack_max); -	ret = nf_conntrack_proto_init(); +	ret = nf_conntrack_expect_init();  	if (ret < 0) -		goto err_proto; +		goto err_expect; + +	ret = nf_conntrack_acct_init(); +	if (ret < 0) +		goto err_acct; + +	ret = nf_conntrack_tstamp_init(); +	if (ret < 0) +		goto err_tstamp; + +	ret = nf_conntrack_ecache_init(); +	if (ret < 0) +		goto err_ecache; + +	ret = nf_conntrack_timeout_init(); +	if (ret < 0) +		goto err_timeout;  	ret = nf_conntrack_helper_init();  	if (ret < 0)  		goto err_helper; +	ret = nf_conntrack_labels_init(); +	if (ret < 0) +		goto err_labels; + +	ret = nf_conntrack_seqadj_init(); +	if (ret < 0) +		goto err_seqadj; +  #ifdef CONFIG_NF_CONNTRACK_ZONES  	ret = nf_ct_extend_register(&nf_ct_zone_extend);  	if (ret < 0)  		goto err_extend;  #endif +	ret = nf_conntrack_proto_init(); +	if (ret < 0) +		goto err_proto; +  	/* Set up fake conntrack: to never be deleted, not in any hashes */  	for_each_possible_cpu(cpu) {  		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); @@ -1441,118 +1751,127 @@ static int nf_conntrack_init_init_net(void)  	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);  	return 0; +err_proto:  #ifdef CONFIG_NF_CONNTRACK_ZONES +	nf_ct_extend_unregister(&nf_ct_zone_extend);  err_extend: -	nf_conntrack_helper_fini();  #endif +	nf_conntrack_seqadj_fini(); +err_seqadj: +	nf_conntrack_labels_fini(); +err_labels: +	nf_conntrack_helper_fini();  err_helper: -	nf_conntrack_proto_fini(); -err_proto: +	nf_conntrack_timeout_fini(); +err_timeout: +	nf_conntrack_ecache_fini(); +err_ecache: +	nf_conntrack_tstamp_fini(); +err_tstamp: +	nf_conntrack_acct_fini(); +err_acct: +	nf_conntrack_expect_fini(); +err_expect:  	return ret;  } +void nf_conntrack_init_end(void) +{ +	/* For use by REJECT target */ +	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); +	RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); +} +  /*   * We need to use special "null" values, not used in hash table   */  #define UNCONFIRMED_NULLS_VAL	((1<<30)+0)  #define DYING_NULLS_VAL		((1<<30)+1) +#define TEMPLATE_NULLS_VAL	((1<<30)+2) -static int 
nf_conntrack_init_net(struct net *net) +int nf_conntrack_init_net(struct net *net)  { -	int ret; +	int ret = -ENOMEM; +	int cpu;  	atomic_set(&net->ct.count, 0); -	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); -	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); -	net->ct.stat = alloc_percpu(struct ip_conntrack_stat); -	if (!net->ct.stat) { -		ret = -ENOMEM; +	seqcount_init(&net->ct.generation); + +	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); +	if (!net->ct.pcpu_lists)  		goto err_stat; + +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_init(&pcpu->lock); +		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); +		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); +		INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);  	} +	net->ct.stat = alloc_percpu(struct ip_conntrack_stat); +	if (!net->ct.stat) +		goto err_pcpu_lists; +  	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); -	if (!net->ct.slabname) { -		ret = -ENOMEM; +	if (!net->ct.slabname)  		goto err_slabname; -	}  	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,  							sizeof(struct nf_conn), 0,  							SLAB_DESTROY_BY_RCU, NULL);  	if (!net->ct.nf_conntrack_cachep) {  		printk(KERN_ERR "Unable to create nf_conn slab cache\n"); -		ret = -ENOMEM;  		goto err_cache;  	}  	net->ct.htable_size = nf_conntrack_htable_size; -	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, -					     &net->ct.hash_vmalloc, 1); +	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);  	if (!net->ct.hash) { -		ret = -ENOMEM;  		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");  		goto err_hash;  	} -	ret = nf_conntrack_expect_init(net); +	ret = nf_conntrack_expect_pernet_init(net);  	if (ret < 0)  		goto err_expect; -	ret = nf_conntrack_acct_init(net); +	ret = nf_conntrack_acct_pernet_init(net);  	if (ret < 0)  		goto err_acct; -	ret = nf_conntrack_ecache_init(net); +	ret = nf_conntrack_tstamp_pernet_init(net); +	if (ret < 0) +		goto err_tstamp; +	ret = nf_conntrack_ecache_pernet_init(net);  	if (ret < 0)  		goto err_ecache; - +	ret = nf_conntrack_helper_pernet_init(net); +	if (ret < 0) +		goto err_helper; +	ret = nf_conntrack_proto_pernet_init(net); +	if (ret < 0) +		goto err_proto;  	return 0; +err_proto: +	nf_conntrack_helper_pernet_fini(net); +err_helper: +	nf_conntrack_ecache_pernet_fini(net);  err_ecache: -	nf_conntrack_acct_fini(net); +	nf_conntrack_tstamp_pernet_fini(net); +err_tstamp: +	nf_conntrack_acct_pernet_fini(net);  err_acct: -	nf_conntrack_expect_fini(net); +	nf_conntrack_expect_pernet_fini(net);  err_expect: -	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, -			     net->ct.htable_size); +	nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);  err_hash:  	kmem_cache_destroy(net->ct.nf_conntrack_cachep);  err_cache:  	kfree(net->ct.slabname);  err_slabname:  	free_percpu(net->ct.stat); +err_pcpu_lists: +	free_percpu(net->ct.pcpu_lists);  err_stat:  	return ret;  } - -s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, -			enum ip_conntrack_dir dir, -			u32 seq); -EXPORT_SYMBOL_GPL(nf_ct_nat_offset); - -int nf_conntrack_init(struct net *net) -{ -	int ret; - -	if (net_eq(net, &init_net)) { -		ret = nf_conntrack_init_init_net(); -		if (ret < 0) -			goto out_init_net; -	} -	ret = nf_conntrack_init_net(net); -	if (ret < 0) -		goto out_net; - -	if (net_eq(net, &init_net)) { -		/* For use by REJECT target */ -		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); -		
rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); - -		/* Howto get NAT offsets */ -		rcu_assign_pointer(nf_ct_nat_offset, NULL); -	} -	return 0; - -out_net: -	if (net_eq(net, &init_net)) -		nf_conntrack_cleanup_init_net(); -out_init_net: -	return ret; -} diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 5702de35e2b..1df17614656 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -1,8 +1,10 @@  /* Event cache for netfilter. */ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> +/* + * (C) 2005 Harald Welte <laforge@gnumonks.org> + * (C) 2005 Patrick McHardy <kaber@trash.net> + * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -19,6 +21,7 @@  #include <linux/kernel.h>  #include <linux/netdevice.h>  #include <linux/slab.h> +#include <linux/export.h>  #include <net/netfilter/nf_conntrack.h>  #include <net/netfilter/nf_conntrack_core.h> @@ -26,22 +29,19 @@  static DEFINE_MUTEX(nf_ct_ecache_mutex); -struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); - -struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly; -EXPORT_SYMBOL_GPL(nf_expect_event_cb); -  /* deliver cached events and clear cache entry - must be called with locally   * disabled softirqs */  void nf_ct_deliver_cached_events(struct nf_conn *ct)  { -	unsigned long events; +	struct net *net = nf_ct_net(ct); +	unsigned long events, missed;  	struct nf_ct_event_notifier *notify;  	struct nf_conntrack_ecache *e; +	struct nf_ct_event item; +	int ret;  	rcu_read_lock(); -	notify = rcu_dereference(nf_conntrack_event_cb); +	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);  	if (notify == NULL)  		goto out_unlock; @@ -51,49 +51,53 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)  	events = xchg(&e->cache, 0); -	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) { -		struct nf_ct_event item = { -			.ct	= ct, -			.pid	= 0, -			.report	= 0 -		}; -		int ret; -		/* We make a copy of the missed event cache without taking -		 * the lock, thus we may send missed events twice. However, -		 * this does not harm and it happens very rarely. */ -		unsigned long missed = e->missed; - -		ret = notify->fcn(events | missed, &item); -		if (unlikely(ret < 0 || missed)) { -			spin_lock_bh(&ct->lock); -			if (ret < 0) -				e->missed |= events; -			else -				e->missed &= ~missed; -			spin_unlock_bh(&ct->lock); -		}  -	} +	if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) +		goto out_unlock; + +	/* We make a copy of the missed event cache without taking +	 * the lock, thus we may send missed events twice. However, +	 * this does not harm and it happens very rarely. 
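The restructured nf_ct_deliver_cached_events() above keeps the same "missed mask" protocol as before, just with early exits instead of nesting: pending events are grabbed atomically with xchg(), delivered together with anything previously missed, and on failure folded into the missed mask for a later retry. A userspace sketch of the protocol; note the kernel updates the missed mask under ct->lock, which this sketch glosses over:

#include <stdatomic.h>

/* Deliver-and-clear with a "missed" mask: duplicates are possible
 * (the missed copy is taken without a lock) but harmless. */
static void deliver(atomic_ulong *cache, unsigned long *missed,
		    int (*notify)(unsigned long))
{
	unsigned long events = atomic_exchange(cache, 0);
	unsigned long m = *missed;	/* racy copy; see comment above */

	if (!(events | m))
		return;
	if (notify(events | m) < 0)
		*missed |= events;	/* failed: redeliver these later */
	else
		*missed &= ~m;		/* resent successfully: clear them */
}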
*/ +	missed = e->missed; + +	if (!((events | missed) & e->ctmask)) +		goto out_unlock; + +	item.ct = ct; +	item.portid = 0; +	item.report = 0; + +	ret = notify->fcn(events | missed, &item); + +	if (likely(ret >= 0 && !missed)) +		goto out_unlock; + +	spin_lock_bh(&ct->lock); +	if (ret < 0) +		e->missed |= events; +	else +		e->missed &= ~missed; +	spin_unlock_bh(&ct->lock);  out_unlock:  	rcu_read_unlock();  }  EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); -int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new) +int nf_conntrack_register_notifier(struct net *net, +				   struct nf_ct_event_notifier *new)  { -	int ret = 0; +	int ret;  	struct nf_ct_event_notifier *notify;  	mutex_lock(&nf_ct_ecache_mutex); -	notify = rcu_dereference_protected(nf_conntrack_event_cb, +	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,  					   lockdep_is_held(&nf_ct_ecache_mutex));  	if (notify != NULL) {  		ret = -EBUSY;  		goto out_unlock;  	} -	rcu_assign_pointer(nf_conntrack_event_cb, new); -	mutex_unlock(&nf_ct_ecache_mutex); -	return ret; +	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); +	ret = 0;  out_unlock:  	mutex_unlock(&nf_ct_ecache_mutex); @@ -101,34 +105,35 @@ out_unlock:  }  EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new) +void nf_conntrack_unregister_notifier(struct net *net, +				      struct nf_ct_event_notifier *new)  {  	struct nf_ct_event_notifier *notify;  	mutex_lock(&nf_ct_ecache_mutex); -	notify = rcu_dereference_protected(nf_conntrack_event_cb, +	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,  					   lockdep_is_held(&nf_ct_ecache_mutex));  	BUG_ON(notify != new); -	rcu_assign_pointer(nf_conntrack_event_cb, NULL); +	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);  	mutex_unlock(&nf_ct_ecache_mutex);  }  EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new) +int nf_ct_expect_register_notifier(struct net *net, +				   struct nf_exp_event_notifier *new)  { -	int ret = 0; +	int ret;  	struct nf_exp_event_notifier *notify;  	mutex_lock(&nf_ct_ecache_mutex); -	notify = rcu_dereference_protected(nf_expect_event_cb, +	notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,  					   lockdep_is_held(&nf_ct_ecache_mutex));  	if (notify != NULL) {  		ret = -EBUSY;  		goto out_unlock;  	} -	rcu_assign_pointer(nf_expect_event_cb, new); -	mutex_unlock(&nf_ct_ecache_mutex); -	return ret; +	rcu_assign_pointer(net->ct.nf_expect_event_cb, new); +	ret = 0;  out_unlock:  	mutex_unlock(&nf_ct_ecache_mutex); @@ -136,15 +141,16 @@ out_unlock:  }  EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); -void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new) +void nf_ct_expect_unregister_notifier(struct net *net, +				      struct nf_exp_event_notifier *new)  {  	struct nf_exp_event_notifier *notify;  	mutex_lock(&nf_ct_ecache_mutex); -	notify = rcu_dereference_protected(nf_expect_event_cb, +	notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,  					   lockdep_is_held(&nf_ct_ecache_mutex));  	BUG_ON(notify != new); -	rcu_assign_pointer(nf_expect_event_cb, NULL); +	RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);  	mutex_unlock(&nf_ct_ecache_mutex);  }  EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); @@ -192,9 +198,12 @@ static int nf_conntrack_event_init_sysctl(struct net *net)  	table[0].data = &net->ct.sysctl_events;  	table[1].data = 
&net->ct.sysctl_events_retry_timeout; +	/* Don't export sysctls to unprivileged users */ +	if (net->user_ns != &init_user_ns) +		table[0].procname = NULL; +  	net->ct.event_sysctl_header = -		register_net_sysctl_table(net, -					  nf_net_netfilter_sysctl_path, table); +		register_net_sysctl(net, "net/netfilter", table);  	if (!net->ct.event_sysctl_header) {  		printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");  		goto out_register; @@ -226,38 +235,27 @@ static void nf_conntrack_event_fini_sysctl(struct net *net)  }  #endif /* CONFIG_SYSCTL */ -int nf_conntrack_ecache_init(struct net *net) +int nf_conntrack_ecache_pernet_init(struct net *net)  { -	int ret; -  	net->ct.sysctl_events = nf_ct_events;  	net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; +	return nf_conntrack_event_init_sysctl(net); +} -	if (net_eq(net, &init_net)) { -		ret = nf_ct_extend_register(&event_extend); -		if (ret < 0) { -			printk(KERN_ERR "nf_ct_event: Unable to register " -					"event extension.\n"); -			goto out_extend_register; -		} -	} +void nf_conntrack_ecache_pernet_fini(struct net *net) +{ +	nf_conntrack_event_fini_sysctl(net); +} -	ret = nf_conntrack_event_init_sysctl(net); +int nf_conntrack_ecache_init(void) +{ +	int ret = nf_ct_extend_register(&event_extend);  	if (ret < 0) -		goto out_sysctl; - -	return 0; - -out_sysctl: -	if (net_eq(net, &init_net)) -		nf_ct_extend_unregister(&event_extend); -out_extend_register: +		pr_err("nf_ct_event: Unable to register event extension.\n");  	return ret;  } -void nf_conntrack_ecache_fini(struct net *net) +void nf_conntrack_ecache_fini(void)  { -	nf_conntrack_event_fini_sysctl(net); -	if (net_eq(net, &init_net)) -		nf_ct_extend_unregister(&event_extend); +	nf_ct_extend_unregister(&event_extend);  } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 46e8966912b..f87e8f68ad4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -3,6 +3,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>   * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (c) 2005-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -20,6 +21,8 @@  #include <linux/percpu.h>  #include <linux/kernel.h>  #include <linux/jhash.h> +#include <linux/moduleparam.h> +#include <linux/export.h>  #include <net/net_namespace.h>  #include <net/netfilter/nf_conntrack.h> @@ -32,31 +35,27 @@  unsigned int nf_ct_expect_hsize __read_mostly;  EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); -static unsigned int nf_ct_expect_hash_rnd __read_mostly;  unsigned int nf_ct_expect_max __read_mostly; -static int nf_ct_expect_hash_rnd_initted __read_mostly;  static struct kmem_cache *nf_ct_expect_cachep __read_mostly; -static HLIST_HEAD(nf_ct_userspace_expect_list); -  /* nf_conntrack_expect helper functions */  void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, -				u32 pid, int report) +				u32 portid, int report)  {  	struct nf_conn_help *master_help = nfct_help(exp->master);  	struct net *net = nf_ct_exp_net(exp); +	NF_CT_ASSERT(master_help);  	NF_CT_ASSERT(!timer_pending(&exp->timeout));  	hlist_del_rcu(&exp->hnode);  	net->ct.expect_count--;  	hlist_del(&exp->lnode); -	if (!(exp->flags & NF_CT_EXPECT_USERSPACE)) -		master_help->expecting[exp->class]--; +	master_help->expecting[exp->class]--; -	
nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); +	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);  	nf_ct_expect_put(exp);  	NF_CT_STAT_INC(net, expect_delete); @@ -67,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)  {  	struct nf_conntrack_expect *exp = (void *)ul_expect; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	nf_ct_unlink_expect(exp); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	nf_ct_expect_put(exp);  } @@ -77,15 +76,13 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple  {  	unsigned int hash; -	if (unlikely(!nf_ct_expect_hash_rnd_initted)) { -		get_random_bytes(&nf_ct_expect_hash_rnd, -				 sizeof(nf_ct_expect_hash_rnd)); -		nf_ct_expect_hash_rnd_initted = 1; +	if (unlikely(!nf_conntrack_hash_rnd)) { +		init_nf_conntrack_hash_rnd();  	}  	hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),  		      (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | -		       (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd); +		       (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);  	return ((u64)hash * nf_ct_expect_hsize) >> 32;  } @@ -94,14 +91,13 @@ __nf_ct_expect_find(struct net *net, u16 zone,  		    const struct nf_conntrack_tuple *tuple)  {  	struct nf_conntrack_expect *i; -	struct hlist_node *n;  	unsigned int h;  	if (!net->ct.expect_count)  		return NULL;  	h = nf_ct_expect_dst_hash(tuple); -	hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) { +	hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {  		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&  		    nf_ct_zone(i->master) == zone)  			return i; @@ -134,14 +130,13 @@ nf_ct_find_expectation(struct net *net, u16 zone,  		       const struct nf_conntrack_tuple *tuple)  {  	struct nf_conntrack_expect *i, *exp = NULL; -	struct hlist_node *n;  	unsigned int h;  	if (!net->ct.expect_count)  		return NULL;  	h = nf_ct_expect_dst_hash(tuple); -	hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { +	hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {  		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&  		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&  		    nf_ct_zone(i->master) == zone) { @@ -160,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone,  	if (!nf_ct_is_confirmed(exp->master))  		return NULL; +	/* Avoid race with other CPUs, that for exp->master ct, is +	 * about to invoke ->destroy(), or nf_ct_delete() via timeout +	 * or early_drop(). +	 * +	 * The atomic_inc_not_zero() check tells:  If that fails, we +	 * know that the ct is being destroyed.  If it succeeds, we +	 * can be sure the ct cannot disappear underneath. +	 */ +	if (unlikely(nf_ct_is_dying(exp->master) || +		     !atomic_inc_not_zero(&exp->master->ct_general.use))) +		return NULL; +  	if (exp->flags & NF_CT_EXPECT_PERMANENT) {  		atomic_inc(&exp->use);  		return exp; @@ -167,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone,  		nf_ct_unlink_expect(exp);  		return exp;  	} +	/* Undo exp->master refcnt increase, if del_timer() failed */ +	nf_ct_put(exp->master);  	return NULL;  } @@ -176,18 +185,20 @@ void nf_ct_remove_expectations(struct nf_conn *ct)  {  	struct nf_conn_help *help = nfct_help(ct);  	struct nf_conntrack_expect *exp; -	struct hlist_node *n, *next; +	struct hlist_node *next;  	/* Optimization: most connection never expect any others. 
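Two details of nf_ct_expect_dst_hash() above are worth noting: it now shares nf_conntrack_hash_rnd with the main table (one lazily seeded secret instead of two), and it maps the 32-bit hash onto the table with a multiply-shift rather than a modulo. The scaling trick in isolation:

#include <stdint.h>

/* Map a 32-bit hash uniformly onto [0, hsize) without a division:
 * this is the ((u64)hash * nf_ct_expect_hsize) >> 32 idiom from the
 * hunk above, with jhash2() replaced by an opaque input hash. */
static uint32_t bucket_of(uint32_t h, uint32_t hsize)
{
	return (uint32_t)(((uint64_t)h * hsize) >> 32);
}

The multiply-shift is both cheaper than a modulo and unbiased for any table size, which is why it shows up in several netfilter hash functions.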
*/  	if (!help)  		return; -	hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { +	spin_lock_bh(&nf_conntrack_expect_lock); +	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {  		if (del_timer(&exp->timeout)) {  			nf_ct_unlink_expect(exp);  			nf_ct_expect_put(exp);  		}  	} +	spin_unlock_bh(&nf_conntrack_expect_lock);  }  EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); @@ -222,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,  /* Generally a bad idea to call this: could have matched already. */  void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)  { -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	if (del_timer(&exp->timeout)) {  		nf_ct_unlink_expect(exp);  		nf_ct_expect_put(exp);  	} -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  }  EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); @@ -298,6 +309,11 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,  		       sizeof(exp->tuple.dst.u3) - len);  	exp->tuple.dst.u.all = *dst; + +#ifdef CONFIG_NF_NAT_NEEDED +	memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); +	memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); +#endif  }  EXPORT_SYMBOL_GPL(nf_ct_expect_init); @@ -316,34 +332,34 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)  }  EXPORT_SYMBOL_GPL(nf_ct_expect_put); -static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) +static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)  {  	struct nf_conn_help *master_help = nfct_help(exp->master); +	struct nf_conntrack_helper *helper;  	struct net *net = nf_ct_exp_net(exp); -	const struct nf_conntrack_expect_policy *p;  	unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); -	atomic_inc(&exp->use); +	/* two references : one for hash insert, one for the timer */ +	atomic_add(2, &exp->use); -	if (master_help) { -		hlist_add_head(&exp->lnode, &master_help->expectations); -		master_help->expecting[exp->class]++; -	} else if (exp->flags & NF_CT_EXPECT_USERSPACE) -		hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list); +	hlist_add_head(&exp->lnode, &master_help->expectations); +	master_help->expecting[exp->class]++;  	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);  	net->ct.expect_count++;  	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,  		    (unsigned long)exp); -	if (master_help) { -		p = &master_help->helper->expect_policy[exp->class]; -		exp->timeout.expires = jiffies + p->timeout * HZ; +	helper = rcu_dereference_protected(master_help->helper, +					   lockdep_is_held(&nf_conntrack_expect_lock)); +	if (helper) { +		exp->timeout.expires = jiffies + +			helper->expect_policy[exp->class].timeout * HZ;  	}  	add_timer(&exp->timeout); -	atomic_inc(&exp->use);  	NF_CT_STAT_INC(net, expect_create); +	return 0;  }  /* Race with expectations being used means we could have none to find; OK. 
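nf_ct_expect_insert() above takes both references with a single atomic_add(2, &exp->use): once inserted, the expectation is reachable from the hash table and from its pending timer, and each owner drops its own reference independently. Reduced to a userspace sketch (the timer and list helpers are placeholders, not real API):

#include <stdatomic.h>
#include <stdlib.h>

struct expect {
	atomic_int use;
	/* tuple, timer and list linkage elided */
};

static void expect_put(struct expect *exp)
{
	if (atomic_fetch_sub(&exp->use, 1) == 1)
		free(exp);		/* last owner frees */
}

/* Two owners appear at once, so both references are taken up front;
 * the timer handler and the unlink path each call expect_put(). */
static void expect_insert(struct expect *exp)
{
	atomic_fetch_add(&exp->use, 2);
	/* hash_add(exp); timer_start(exp); -- placeholders */
}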
*/ @@ -352,9 +368,8 @@ static void evict_oldest_expect(struct nf_conn *master,  {  	struct nf_conn_help *master_help = nfct_help(master);  	struct nf_conntrack_expect *exp, *last = NULL; -	struct hlist_node *n; -	hlist_for_each_entry(exp, n, &master_help->expectations, lnode) { +	hlist_for_each_entry(exp, &master_help->expectations, lnode) {  		if (exp->class == new->class)  			last = exp;  	} @@ -365,44 +380,29 @@ static void evict_oldest_expect(struct nf_conn *master,  	}  } -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ -	struct nf_conn_help *master_help = nfct_help(i->master); -	const struct nf_conntrack_expect_policy *p; - -	if (!del_timer(&i->timeout)) -		return 0; - -	p = &master_help->helper->expect_policy[i->class]; -	i->timeout.expires = jiffies + p->timeout * HZ; -	add_timer(&i->timeout); -	return 1; -} -  static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)  {  	const struct nf_conntrack_expect_policy *p;  	struct nf_conntrack_expect *i;  	struct nf_conn *master = expect->master;  	struct nf_conn_help *master_help = nfct_help(master); +	struct nf_conntrack_helper *helper;  	struct net *net = nf_ct_exp_net(expect); -	struct hlist_node *n; +	struct hlist_node *next;  	unsigned int h;  	int ret = 1; -	/* Don't allow expectations created from kernel-space with no helper */ -	if (!(expect->flags & NF_CT_EXPECT_USERSPACE) && -	    (!master_help || (master_help && !master_help->helper))) { +	if (!master_help) {  		ret = -ESHUTDOWN;  		goto out;  	}  	h = nf_ct_expect_dst_hash(&expect->tuple); -	hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { +	hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {  		if (expect_matches(i, expect)) { -			/* Refresh timer: if it's dying, ignore.. */ -			if (refresh_timer(i)) { -				ret = 0; -				goto out; +			if (del_timer(&i->timeout)) { +				nf_ct_unlink_expect(i); +				nf_ct_expect_put(i); +				break;  			}  		} else if (expect_clash(i, expect)) {  			ret = -EBUSY; @@ -410,8 +410,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)  		}  	}  	/* Will be over limit? 
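In the reworked __nf_ct_expect_check() above, del_timer() doubles as an ownership claim: it returns nonzero for exactly one caller, so only that caller may unlink the matching expectation and drop its reference, while a concurrent timeout that fired first simply wins. The same claim, sketched with an atomic flag standing in for the timer's pending state (this is an analogy, not how del_timer() is implemented):

#include <stdatomic.h>
#include <stdbool.h>

struct expect2 {
	atomic_bool timer_pending;
	/* ... */
};

/* Atomically flip pending -> false; only the caller that actually
 * performed the flip owns teardown, everyone else backs off. */
static bool claim_teardown(struct expect2 *exp)
{
	bool expected = true;

	return atomic_compare_exchange_strong(&exp->timer_pending,
					      &expected, false);
}

This replaces the old refresh_timer() behavior: instead of extending a matching expectation, the new code tears it down and lets the fresh one take its place.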
*/ -	if (master_help) { -		p = &master_help->helper->expect_policy[expect->class]; +	helper = rcu_dereference_protected(master_help->helper, +					   lockdep_is_held(&nf_conntrack_expect_lock)); +	if (helper) { +		p = &helper->expect_policy[expect->class];  		if (p->max_expected &&  		    master_help->expecting[expect->class] >= p->max_expected) {  			evict_oldest_expect(master, expect); @@ -424,52 +426,36 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)  	}  	if (net->ct.expect_count >= nf_ct_expect_max) { -		if (net_ratelimit()) -			printk(KERN_WARNING -			       "nf_conntrack: expectation table full\n"); +		net_warn_ratelimited("nf_conntrack: expectation table full\n");  		ret = -EMFILE;  	}  out:  	return ret;  } -int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  -				u32 pid, int report) +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, +				u32 portid, int report)  {  	int ret; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	ret = __nf_ct_expect_check(expect);  	if (ret <= 0)  		goto out; -	ret = 0; -	nf_ct_expect_insert(expect); -	spin_unlock_bh(&nf_conntrack_lock); -	nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); +	ret = nf_ct_expect_insert(expect); +	if (ret < 0) +		goto out; +	spin_unlock_bh(&nf_conntrack_expect_lock); +	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);  	return ret;  out: -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return ret;  }  EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); -void nf_ct_remove_userspace_expectations(void) -{ -	struct nf_conntrack_expect *exp; -	struct hlist_node *n, *next; - -	hlist_for_each_entry_safe(exp, n, next, -				  &nf_ct_userspace_expect_list, lnode) { -		if (del_timer(&exp->timeout)) { -			nf_ct_unlink_expect(exp); -			nf_ct_expect_put(exp); -		} -	} -} -EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations); - -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_NF_CONNTRACK_PROCFS  struct ct_expect_iter_state {  	struct seq_net_private p;  	unsigned int bucket; @@ -482,7 +468,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)  	struct hlist_node *n;  	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { -		n = rcu_dereference(net->ct.expect_hash[st->bucket].first); +		n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));  		if (n)  			return n;  	} @@ -495,11 +481,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,  	struct net *net = seq_file_net(seq);  	struct ct_expect_iter_state *st = seq->private; -	head = rcu_dereference(head->next); +	head = rcu_dereference(hlist_next_rcu(head));  	while (head == NULL) {  		if (++st->bucket >= nf_ct_expect_hsize)  			return NULL; -		head = rcu_dereference(net->ct.expect_hash[st->bucket].first); +		head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));  	}  	return head;  } @@ -597,79 +583,74 @@ static const struct file_operations exp_file_ops = {  	.llseek  = seq_lseek,  	.release = seq_release_net,  }; -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NF_CONNTRACK_PROCFS */  static int exp_proc_init(struct net *net)  { -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_NF_CONNTRACK_PROCFS  	struct proc_dir_entry *proc; -	proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops); +	proc = proc_create("nf_conntrack_expect", 0440, net->proc_net, +			   &exp_file_ops);  	if (!proc)  		return -ENOMEM; -#endif /* CONFIG_PROC_FS */ +#endif /* 
CONFIG_NF_CONNTRACK_PROCFS */  	return 0;  }  static void exp_proc_remove(struct net *net)  { -#ifdef CONFIG_PROC_FS -	proc_net_remove(net, "nf_conntrack_expect"); -#endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_NF_CONNTRACK_PROCFS +	remove_proc_entry("nf_conntrack_expect", net->proc_net); +#endif /* CONFIG_NF_CONNTRACK_PROCFS */  }  module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); -int nf_conntrack_expect_init(struct net *net) +int nf_conntrack_expect_pernet_init(struct net *net)  {  	int err = -ENOMEM; -	if (net_eq(net, &init_net)) { -		if (!nf_ct_expect_hsize) { -			nf_ct_expect_hsize = net->ct.htable_size / 256; -			if (!nf_ct_expect_hsize) -				nf_ct_expect_hsize = 1; -		} -		nf_ct_expect_max = nf_ct_expect_hsize * 4; -	} -  	net->ct.expect_count = 0; -	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, -						  &net->ct.expect_vmalloc, 0); +	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);  	if (net->ct.expect_hash == NULL)  		goto err1; -	if (net_eq(net, &init_net)) { -		nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", -					sizeof(struct nf_conntrack_expect), -					0, 0, NULL); -		if (!nf_ct_expect_cachep) -			goto err2; -	} -  	err = exp_proc_init(net);  	if (err < 0) -		goto err3; +		goto err2;  	return 0; - -err3: -	if (net_eq(net, &init_net)) -		kmem_cache_destroy(nf_ct_expect_cachep);  err2: -	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, -			     nf_ct_expect_hsize); +	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);  err1:  	return err;  } -void nf_conntrack_expect_fini(struct net *net) +void nf_conntrack_expect_pernet_fini(struct net *net)  {  	exp_proc_remove(net); -	if (net_eq(net, &init_net)) { -		rcu_barrier(); /* Wait for call_rcu() before destroy */ -		kmem_cache_destroy(nf_ct_expect_cachep); +	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +} + +int nf_conntrack_expect_init(void) +{ +	if (!nf_ct_expect_hsize) { +		nf_ct_expect_hsize = nf_conntrack_htable_size / 256; +		if (!nf_ct_expect_hsize) +			nf_ct_expect_hsize = 1;  	} -	nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, -			     nf_ct_expect_hsize); +	nf_ct_expect_max = nf_ct_expect_hsize * 4; +	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", +				sizeof(struct nf_conntrack_expect), +				0, 0, NULL); +	if (!nf_ct_expect_cachep) +		return -ENOMEM; +	return 0; +} + +void nf_conntrack_expect_fini(void) +{ +	rcu_barrier(); /* Wait for call_rcu() before destroy */ +	kmem_cache_destroy(nf_ct_expect_cachep);  } diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index bd82450c193..1a9545965c0 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -44,7 +44,8 @@ void __nf_ct_ext_destroy(struct nf_conn *ct)  EXPORT_SYMBOL(__nf_ct_ext_destroy);  static void * -nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp) +nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, +		 size_t var_alloc_len, gfp_t gfp)  {  	unsigned int off, len;  	struct nf_ct_ext_type *t; @@ -54,8 +55,8 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)  	t = rcu_dereference(nf_ct_ext_types[id]);  	BUG_ON(t == NULL);  	off = ALIGN(sizeof(struct nf_ct_ext), t->align); -	len = off + t->len; -	alloc_size = t->alloc_size; +	len = off + t->len + var_alloc_len; +	alloc_size = t->alloc_size + var_alloc_len;  	rcu_read_unlock();  	*ext = kzalloc(alloc_size, gfp); @@ -68,13 +69,8 @@ 
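[Editor's note] The rename to nf_conntrack_expect_pernet_init()/_fini() splits one-time global setup (hash sizing, the expectation slab cache) from per-namespace setup (hash table, procfs entry), removing the old net_eq(net, &init_net) special-casing. In-tree, the conntrack core calls these directly from its own init paths; the pernet_operations wrapper below is only a hypothetical sketch of the same split:

```c
/* Hypothetical module-style sketch of the global vs. per-namespace split;
 * the real callers are inside the conntrack core, not a pernet subsys. */
#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init demo_expect_net_init(struct net *net)
{
	return nf_conntrack_expect_pernet_init(net); /* per-ns hash + procfs */
}

static void __net_exit demo_expect_net_exit(struct net *net)
{
	nf_conntrack_expect_pernet_fini(net);
}

static struct pernet_operations demo_expect_net_ops = {
	.init = demo_expect_net_init,
	.exit = demo_expect_net_exit,
};

static int __init demo_init(void)
{
	int err = nf_conntrack_expect_init();        /* once: sizing + slab */

	if (err)
		return err;
	err = register_pernet_subsys(&demo_expect_net_ops);
	if (err)
		nf_conntrack_expect_fini();
	return err;
}

static void __exit demo_exit(void)
{
	unregister_pernet_subsys(&demo_expect_net_ops);
	nf_conntrack_expect_fini();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```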
nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)  	return (void *)(*ext) + off;  } -static void __nf_ct_ext_free_rcu(struct rcu_head *head) -{ -	struct nf_ct_ext *ext = container_of(head, struct nf_ct_ext, rcu); -	kfree(ext); -} - -void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) +void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, +			     size_t var_alloc_len, gfp_t gfp)  {  	struct nf_ct_ext *old, *new;  	int i, newlen, newoff; @@ -85,7 +81,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)  	old = ct->ext;  	if (!old) -		return nf_ct_ext_create(&ct->ext, id, gfp); +		return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp);  	if (__nf_ct_ext_exist(old, id))  		return NULL; @@ -95,7 +91,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)  	BUG_ON(t == NULL);  	newoff = ALIGN(old->len, t->align); -	newlen = newoff + t->len; +	newlen = newoff + t->len + var_alloc_len;  	rcu_read_unlock();  	new = __krealloc(old, newlen, gfp); @@ -114,7 +110,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)  					(void *)old + old->offset[i]);  			rcu_read_unlock();  		} -		call_rcu(&old->rcu, __nf_ct_ext_free_rcu); +		kfree_rcu(old, rcu);  		ct->ext = new;  	} @@ -123,7 +119,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)  	memset((void *)new + newoff, 0, newlen - newoff);  	return (void *)new + newoff;  } -EXPORT_SYMBOL(__nf_ct_ext_add); +EXPORT_SYMBOL(__nf_ct_ext_add_length);  static void update_alloc_size(struct nf_ct_ext_type *type)  { @@ -140,15 +136,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type)  	/* This assumes that extended areas in conntrack for the types  	   whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */  	for (i = min; i <= max; i++) { -		t1 = nf_ct_ext_types[i]; +		t1 = rcu_dereference_protected(nf_ct_ext_types[i], +				lockdep_is_held(&nf_ct_ext_type_mutex));  		if (!t1)  			continue; -		t1->alloc_size = sizeof(struct nf_ct_ext) -				 + ALIGN(sizeof(struct nf_ct_ext), t1->align) -				 + t1->len; +		t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + +				 t1->len;  		for (j = 0; j < NF_CT_EXT_NUM; j++) { -			t2 = nf_ct_ext_types[j]; +			t2 = rcu_dereference_protected(nf_ct_ext_types[j], +				lockdep_is_held(&nf_ct_ext_type_mutex));  			if (t2 == NULL || t2 == t1 ||  			    (t2->flags & NF_CT_EXT_F_PREALLOC) == 0)  				continue; @@ -186,7 +183,7 @@ EXPORT_SYMBOL_GPL(nf_ct_extend_register);  void nf_ct_extend_unregister(struct nf_ct_ext_type *type)  {  	mutex_lock(&nf_ct_ext_type_mutex); -	rcu_assign_pointer(nf_ct_ext_types[type->id], NULL); +	RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);  	update_alloc_size(type);  	mutex_unlock(&nf_ct_ext_type_mutex);  	rcu_barrier(); /* Wait for completion of call_rcu()'s */ diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e17cb7c7dd8..b8a0924064e 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -3,6 +3,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>   * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -42,21 +43,26 @@ static u_int16_t ports[MAX_PORTS];  static unsigned int 
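[Editor's note] __nf_ct_ext_free_rcu() existed only to kfree() the old extension block after a grace period; kfree_rcu() expresses that directly, needing nothing but the name of the embedded rcu_head. A minimal sketch of the substitution, with hypothetical demo_* types:

```c
/* kfree_rcu() locates the enclosing object from the rcu_head offset, so a
 * callback whose only job is kfree() can be dropped entirely. */
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct demo_ext {
	unsigned int len;
	struct rcu_head rcu;	/* same role as nf_ct_ext's rcu field */
};

static void demo_replace(struct demo_ext **slot, struct demo_ext *new)
{
	struct demo_ext *old = *slot;

	*slot = new;
	if (old)
		kfree_rcu(old, rcu); /* was: call_rcu(&old->rcu, free_cb) */
}
```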
ports_c;  module_param_array(ports, ushort, &ports_c, 0400); -static int loose; +static bool loose;  module_param(loose, bool, 0600);  unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,  				enum ip_conntrack_info ctinfo,  				enum nf_ct_ftp_type type, +				unsigned int protoff,  				unsigned int matchoff,  				unsigned int matchlen,  				struct nf_conntrack_expect *exp);  EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); -static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); -static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, +		      char, unsigned int *); +static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, +		       char, unsigned int *); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, +		    char, unsigned int *);  static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, -			     char); +			     char, unsigned int *);  static struct ftp_search {  	const char *pattern; @@ -64,7 +70,7 @@ static struct ftp_search {  	char skip;  	char term;  	enum nf_ct_ftp_type ftptype; -	int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); +	int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *);  } search[IP_CT_DIR_MAX][2] = {  	[IP_CT_DIR_ORIGINAL] = {  		{ @@ -88,10 +94,8 @@ static struct ftp_search {  		{  			.pattern	= "227 ",  			.plen		= sizeof("227 ") - 1, -			.skip		= '(', -			.term		= ')',  			.ftptype	= NF_CT_FTP_PASV, -			.getnum		= try_rfc959, +			.getnum		= try_rfc1123,  		},  		{  			.pattern	= "229 ", @@ -130,8 +134,9 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],  			i++;  		else {  			/* Unexpected character; true if it's the -			   terminator and we're finished. */ -			if (*data == term && i == array_size - 1) +			   terminator (or we don't care about one) +			   and we're finished. */ +			if ((*data == term || !term) && i == array_size - 1)  				return len;  			pr_debug("Char %u (got %u nums) `%u' unexpected\n", @@ -146,7 +151,8 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],  /* Returns 0, or length of numbers: 192,168,1,1,5,6 */  static int try_rfc959(const char *data, size_t dlen, -		      struct nf_conntrack_man *cmd, char term) +		      struct nf_conntrack_man *cmd, char term, +		      unsigned int *offset)  {  	int length;  	u_int32_t array[6]; @@ -161,6 +167,33 @@ static int try_rfc959(const char *data, size_t dlen,  	return length;  } +/* + * From RFC 1123: + * The format of the 227 reply to a PASV command is not + * well standardized.  In particular, an FTP client cannot + * assume that the parentheses shown on page 40 of RFC-959 + * will be present (and in fact, Figure 3 on page 43 omits + * them).  Therefore, a User-FTP program that interprets + * the PASV reply must scan the reply for the first digit + * of the host and port numbers. 
+ */ +static int try_rfc1123(const char *data, size_t dlen, +		       struct nf_conntrack_man *cmd, char term, +		       unsigned int *offset) +{ +	int i; +	for (i = 0; i < dlen; i++) +		if (isdigit(data[i])) +			break; + +	if (i == dlen) +		return 0; + +	*offset += i; + +	return try_rfc959(data + i, dlen - i, cmd, 0, offset); +} +  /* Grab port: number up to delimiter */  static int get_port(const char *data, int start, size_t dlen, char delim,  		    __be16 *port) @@ -189,7 +222,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim,  /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */  static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, -		    char term) +		    char term, unsigned int *offset)  {  	char delim;  	int length; @@ -237,7 +270,8 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,  /* Returns 0, or length of numbers: |||6446| */  static int try_epsv_response(const char *data, size_t dlen, -			     struct nf_conntrack_man *cmd, char term) +			     struct nf_conntrack_man *cmd, char term, +			     unsigned int *offset)  {  	char delim; @@ -259,9 +293,10 @@ static int find_pattern(const char *data, size_t dlen,  			unsigned int *numlen,  			struct nf_conntrack_man *cmd,  			int (*getnum)(const char *, size_t, -				      struct nf_conntrack_man *, char)) +				      struct nf_conntrack_man *, char, +				      unsigned int *))  { -	size_t i; +	size_t i = plen;  	pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);  	if (dlen == 0) @@ -291,16 +326,18 @@ static int find_pattern(const char *data, size_t dlen,  	pr_debug("Pattern matches!\n");  	/* Now we've found the constant string, try to skip  	   to the 'skip' character */ -	for (i = plen; data[i] != skip; i++) -		if (i == dlen - 1) return -1; +	if (skip) { +		for (i = plen; data[i] != skip; i++) +			if (i == dlen - 1) return -1; -	/* Skip over the last character */ -	i++; +		/* Skip over the last character */ +		i++; +	}  	pr_debug("Skipped up to `%c'!\n", skip);  	*numoff = i; -	*numlen = getnum(data + i, dlen - i, cmd, term); +	*numlen = getnum(data + i, dlen - i, cmd, term, numoff);  	if (!*numlen)  		return -1; @@ -358,7 +395,7 @@ static int help(struct sk_buff *skb,  	u32 seq;  	int dir = CTINFO2DIR(ctinfo);  	unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); -	struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info; +	struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);  	struct nf_conntrack_expect *exp;  	union nf_inet_addr *daddr;  	struct nf_conntrack_man cmd = {}; @@ -368,7 +405,7 @@ static int help(struct sk_buff *skb,  	/* Until there's been traffic both ways, don't look in packets. */  	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { +	    ctinfo != IP_CT_ESTABLISHED_REPLY) {  		pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);  		return NF_ACCEPT;  	} @@ -395,6 +432,12 @@ static int help(struct sk_buff *skb,  	/* Look up to see if we're just after a \n. */  	if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { +		/* We're picking up this, clear flags and let it continue */ +		if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) { +			ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP; +			goto skip_nl_seq; +		} +  		/* Now if this ends in \n, update ftp info. */  		pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",  			 ct_ftp_info->seq_aft_nl_num[dir] > 0 ? 
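[Editor's note] The new try_rfc1123() implements exactly the quoted advice: scan forward to the first digit, advance *offset by the amount skipped (so later NAT mangling points at the digits, not the prose), then reuse the RFC 959 number parser with no terminator. A simplified, standalone userspace rendering of the scan-and-parse logic; the kernel applies it only to the text after the matched "227 " pattern, which is why the scan cannot trip over the reply code itself:

```c
/* Simplified userspace demo of the RFC 1123 PASV parse: tolerate replies
 * both with and without RFC 959's parentheses. Not the kernel code. */
#include <stdio.h>
#include <ctype.h>
#include <string.h>

static int parse_pasv(const char *reply, unsigned int ip[4], unsigned int *port)
{
	size_t i, len = strlen(reply);
	unsigned int n[6];

	for (i = 0; i < len && !isdigit((unsigned char)reply[i]); i++)
		;			/* skip to the first digit */
	if (i == len ||
	    sscanf(reply + i, "%u,%u,%u,%u,%u,%u",
		   &n[0], &n[1], &n[2], &n[3], &n[4], &n[5]) != 6)
		return -1;
	memcpy(ip, n, 4 * sizeof(*ip));
	*port = n[4] * 256 + n[5];
	return 0;
}

int main(void)
{
	unsigned int ip[4], port;

	/* with and without the RFC 959 parentheses */
	if (!parse_pasv("Entering Passive Mode (192,168,1,9,19,137)", ip, &port) &&
	    !parse_pasv("Entering Passive Mode 192,168,1,9,19,137", ip, &port))
		printf("%u.%u.%u.%u:%u\n", ip[0], ip[1], ip[2], ip[3], port);
	return 0;
}
```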
"" : "(UNSET)", @@ -405,6 +448,7 @@ static int help(struct sk_buff *skb,  		goto out_update_nl;  	} +skip_nl_seq:  	/* Initialize IP/IPv6 addr to expected address (it's not mentioned  	   in EPSV responses) */  	cmd.l3num = nf_ct_l3num(ct); @@ -427,8 +471,8 @@ static int help(struct sk_buff *skb,  		   connection tracking, not packet filtering.  		   However, it is necessary for accurate tracking in  		   this case. */ -		pr_debug("conntrack_ftp: partial %s %u+%u\n", -			 search[dir][i].pattern,  ntohl(th->seq), datalen); +		nf_ct_helper_log(skb, ct, "partial matching of `%s'", +			         search[dir][i].pattern);  		ret = NF_DROP;  		goto out;  	} else if (found == 0) { /* No match */ @@ -442,6 +486,7 @@ static int help(struct sk_buff *skb,  	exp = nf_ct_expect_alloc(ct);  	if (exp == NULL) { +		nf_ct_helper_log(skb, ct, "cannot alloc expectation");  		ret = NF_DROP;  		goto out;  	} @@ -489,12 +534,13 @@ static int help(struct sk_buff *skb,  	nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);  	if (nf_nat_ftp && ct->status & IPS_NAT_MASK)  		ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype, -				 matchoff, matchlen, exp); +				 protoff, matchoff, matchlen, exp);  	else {  		/* Can't expect this?  Best to drop packet now. */ -		if (nf_ct_expect_related(exp) != 0) +		if (nf_ct_expect_related(exp) != 0) { +			nf_ct_helper_log(skb, ct, "cannot add expectation");  			ret = NF_DROP; -		else +		} else  			ret = NF_ACCEPT;  	} @@ -511,8 +557,20 @@ out_update_nl:  	return ret;  } +static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) +{ +	struct nf_ct_ftp_master *ftp = nfct_help_data(ct); + +	/* This conntrack has been injected from user-space, always pick up +	 * sequence tracking. Otherwise, the first FTP command after the +	 * failover breaks. 
+	 */ +	ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP; +	ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP; +	return 0; +} +  static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; -static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")] __read_mostly;  static const struct nf_conntrack_expect_policy ftp_exp_policy = {  	.max_expected	= 1, @@ -541,7 +599,6 @@ static void nf_conntrack_ftp_fini(void)  static int __init nf_conntrack_ftp_init(void)  {  	int i, j = -1, ret = 0; -	char *tmpname;  	ftp_buffer = kmalloc(65536, GFP_KERNEL);  	if (!ftp_buffer) @@ -556,17 +613,17 @@ static int __init nf_conntrack_ftp_init(void)  		ftp[i][0].tuple.src.l3num = PF_INET;  		ftp[i][1].tuple.src.l3num = PF_INET6;  		for (j = 0; j < 2; j++) { +			ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);  			ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);  			ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;  			ftp[i][j].expect_policy = &ftp_exp_policy;  			ftp[i][j].me = THIS_MODULE;  			ftp[i][j].help = help; -			tmpname = &ftp_names[i][j][0]; +			ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;  			if (ports[i] == FTP_PORT) -				sprintf(tmpname, "ftp"); +				sprintf(ftp[i][j].name, "ftp");  			else -				sprintf(tmpname, "ftp-%d", ports[i]); -			ftp[i][j].name = tmpname; +				sprintf(ftp[i][j].name, "ftp-%d", ports[i]);  			pr_debug("nf_ct_ftp: registering helper for pf: %d "  				 "port: %d\n", diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 867882313e4..bcd5ed6b713 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -631,7 +631,7 @@ static int decode_seqof(bitstr_t *bs, const struct field_t *f,  		CHECK_BOUND(bs, 2);  		count = *bs->cur++;  		count <<= 8; -		count = *bs->cur++; +		count += *bs->cur++;  		break;  	case SEMI:  		BYTE_ALIGN(bs); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index b969025cf82..3a3a60b126e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -2,6 +2,7 @@   * H.323 connection tracking helper   *   * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This source code is licensed under General Public License version 2.   
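[Editor's note] Buried in this range is a genuine decoder bug fix in decode_seqof() of nf_conntrack_h323_asn1.c: the high byte of the 16-bit SEMI count was computed and then overwritten ("count = ...") instead of accumulated ("count += ..."). A tiny userspace before/after demonstration:

```c
/* Standalone demo of the 16-bit big-endian read fixed in decode_seqof(). */
#include <stdio.h>

int main(void)
{
	const unsigned char buf[] = { 0x01, 0x02 };	/* big-endian 0x0102 */
	const unsigned char *cur = buf;
	unsigned int count;

	count = *cur++;
	count <<= 8;
	count = *cur;		/* old code: yields 2, high byte lost */
	printf("buggy: %u\n", count);

	cur = buf;
	count = *cur++;
	count <<= 8;
	count += *cur;		/* fixed: yields 0x0102 = 258 */
	printf("fixed: %u\n", count);
	return 0;
}
```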
* @@ -42,19 +43,19 @@ static int gkrouted_only __read_mostly = 1;  module_param(gkrouted_only, int, 0600);  MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); -static int callforward_filter __read_mostly = 1; +static bool callforward_filter __read_mostly = true;  module_param(callforward_filter, bool, 0600);  MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "  				     "if both endpoints are on different sides "  				     "(determined by routing information)");  /* Hooks for NAT */ -int (*set_h245_addr_hook) (struct sk_buff *skb, +int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,  			   unsigned char **data, int dataoff,  			   H245_TransportAddress *taddr,  			   union nf_inet_addr *addr, __be16 port)  			   __read_mostly; -int (*set_h225_addr_hook) (struct sk_buff *skb, +int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,  			   unsigned char **data, int dataoff,  			   TransportAddress *taddr,  			   union nf_inet_addr *addr, __be16 port) @@ -62,16 +63,17 @@ int (*set_h225_addr_hook) (struct sk_buff *skb,  int (*set_sig_addr_hook) (struct sk_buff *skb,  			  struct nf_conn *ct,  			  enum ip_conntrack_info ctinfo, -			  unsigned char **data, +			  unsigned int protoff, unsigned char **data,  			  TransportAddress *taddr, int count) __read_mostly;  int (*set_ras_addr_hook) (struct sk_buff *skb,  			  struct nf_conn *ct,  			  enum ip_conntrack_info ctinfo, -			  unsigned char **data, +			  unsigned int protoff, unsigned char **data,  			  TransportAddress *taddr, int count) __read_mostly;  int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,  			  struct nf_conn *ct,  			  enum ip_conntrack_info ctinfo, +			  unsigned int protoff,  			  unsigned char **data, int dataoff,  			  H245_TransportAddress *taddr,  			  __be16 port, __be16 rtp_port, @@ -80,24 +82,28 @@ int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,  int (*nat_t120_hook) (struct sk_buff *skb,  		      struct nf_conn *ct,  		      enum ip_conntrack_info ctinfo, +		      unsigned int protoff,  		      unsigned char **data, int dataoff,  		      H245_TransportAddress *taddr, __be16 port,  		      struct nf_conntrack_expect *exp) __read_mostly;  int (*nat_h245_hook) (struct sk_buff *skb,  		      struct nf_conn *ct,  		      enum ip_conntrack_info ctinfo, +		      unsigned int protoff,  		      unsigned char **data, int dataoff,  		      TransportAddress *taddr, __be16 port,  		      struct nf_conntrack_expect *exp) __read_mostly;  int (*nat_callforwarding_hook) (struct sk_buff *skb,  				struct nf_conn *ct,  				enum ip_conntrack_info ctinfo, +				unsigned int protoff,  				unsigned char **data, int dataoff,  				TransportAddress *taddr, __be16 port,  				struct nf_conntrack_expect *exp) __read_mostly;  int (*nat_q931_hook) (struct sk_buff *skb,  		      struct nf_conn *ct,  		      enum ip_conntrack_info ctinfo, +		      unsigned int protoff,  		      unsigned char **data, TransportAddress *taddr, int idx,  		      __be16 port, struct nf_conntrack_expect *exp)  		      __read_mostly; @@ -114,7 +120,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,  			 struct nf_conn *ct, enum ip_conntrack_info ctinfo,  			 unsigned char **data, int *datalen, int *dataoff)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	const struct tcphdr *th;  	struct tcphdr _tcph; @@ -251,6 +257,7 @@ static int get_h245_addr(struct nf_conn *ct, 
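[Editor's note] Every NAT hook in this file gains an "unsigned int protoff" argument: the caller computes the transport-header offset once and passes it down, so mangling works where the L4 header is not at a fixed offset, i.e. IPv6 with extension headers. A hypothetical helper sketching how such an offset would be derived per family (the helper itself is not in this patch):

```c
/* Hypothetical per-family transport-offset computation: trivial for IPv4,
 * an extension-header walk for IPv6. Assumes the network header is set. */
#include <net/ip.h>
#include <net/ipv6.h>

static int demo_transport_offset(const struct sk_buff *skb, u8 *proto)
{
	if (ip_hdr(skb)->version == 4) {
		*proto = ip_hdr(skb)->protocol;
		return skb_network_offset(skb) + ip_hdrlen(skb);
	} else {
		__be16 frag_off;
		u8 nexthdr = ipv6_hdr(skb)->nexthdr;

		*proto = nexthdr;
		/* returns -1 on malformed extension headers */
		return ipv6_skip_exthdr(skb, skb_network_offset(skb) +
					sizeof(struct ipv6hdr),
					&nexthdr, &frag_off);
	}
}
```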
const unsigned char *data,  /****************************************************************************/  static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  			   enum ip_conntrack_info ctinfo, +			   unsigned int protoff,  			   unsigned char **data, int dataoff,  			   H245_TransportAddress *taddr)  { @@ -270,9 +277,8 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  		return 0;  	/* RTP port is even */ -	port &= htons(~1); -	rtp_port = port; -	rtcp_port = htons(ntohs(port) + 1); +	rtp_port = port & ~htons(1); +	rtcp_port = port | htons(1);  	/* Create expect for RTP */  	if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL) @@ -296,9 +302,10 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  		   &ct->tuplehash[!dir].tuple.dst.u3,  		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&  		   (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) && +		   nf_ct_l3num(ct) == NFPROTO_IPV4 &&  		   ct->status & IPS_NAT_MASK) {  		/* NAT needed */ -		ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff, +		ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,  				   taddr, port, rtp_port, rtp_exp, rtcp_exp);  	} else {		/* Conntrack only */  		if (nf_ct_expect_related(rtp_exp) == 0) { @@ -325,6 +332,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  static int expect_t120(struct sk_buff *skb,  		       struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, int dataoff,  		       H245_TransportAddress *taddr)  { @@ -354,9 +362,10 @@ static int expect_t120(struct sk_buff *skb,  		   &ct->tuplehash[!dir].tuple.dst.u3,  		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&  	    (nat_t120 = rcu_dereference(nat_t120_hook)) && +	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&  	    ct->status & IPS_NAT_MASK) {  		/* NAT needed */ -		ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr, +		ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,  			       port, exp);  	} else {		/* Conntrack only */  		if (nf_ct_expect_related(exp) == 0) { @@ -375,6 +384,7 @@ static int expect_t120(struct sk_buff *skb,  static int process_h245_channel(struct sk_buff *skb,  				struct nf_conn *ct,  				enum ip_conntrack_info ctinfo, +				unsigned int protoff,  				unsigned char **data, int dataoff,  				H2250LogicalChannelParameters *channel)  { @@ -382,7 +392,7 @@ static int process_h245_channel(struct sk_buff *skb,  	if (channel->options & eH2250LogicalChannelParameters_mediaChannel) {  		/* RTP */ -		ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, +		ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,  				      &channel->mediaChannel);  		if (ret < 0)  			return -1; @@ -391,7 +401,7 @@ static int process_h245_channel(struct sk_buff *skb,  	if (channel->  	    options & eH2250LogicalChannelParameters_mediaControlChannel) {  		/* RTCP */ -		ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, +		ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,  				      &channel->mediaControlChannel);  		if (ret < 0)  			return -1; @@ -403,6 +413,7 @@ static int process_h245_channel(struct sk_buff *skb,  /****************************************************************************/  static int process_olc(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, int dataoff,  		       OpenLogicalChannel *olc)  { @@ -413,7 +424,8 @@ static int process_olc(struct sk_buff *skb, 
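[Editor's note] The expect_rtp_rtcp() hunk above replaces the mask-then-increment dance with direct bit operations in network byte order: RTP is the even port of the pair, RTCP the odd one. A standalone demo of the arithmetic:

```c
/* Even/odd RTP/RTCP port pairing done directly on network-byte-order
 * values, as in the patched expect_rtp_rtcp(). */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t port = htons(30001);		/* odd port from signalling  */
	uint16_t rtp  = port & ~htons(1);	/* clear low bit: 30000 (RTP) */
	uint16_t rtcp = port |  htons(1);	/* set low bit: 30001 (RTCP)  */

	printf("rtp=%u rtcp=%u\n", ntohs(rtp), ntohs(rtcp));
	return 0;
}
```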
struct nf_conn *ct,  	if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==  	    eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)  	{ -		ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, +		ret = process_h245_channel(skb, ct, ctinfo, +					   protoff, data, dataoff,  					   &olc->  					   forwardLogicalChannelParameters.  					   multiplexParameters. @@ -431,7 +443,8 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,  		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))  	{  		ret = -		    process_h245_channel(skb, ct, ctinfo, data, dataoff, +		    process_h245_channel(skb, ct, ctinfo, +					 protoff, data, dataoff,  					 &olc->  					 reverseLogicalChannelParameters.  					 multiplexParameters. @@ -449,7 +462,7 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,  	    t120.choice == eDataProtocolCapability_separateLANStack &&  	    olc->separateStack.networkAddress.choice ==  	    eNetworkAccessParameters_networkAddress_localAreaAddress) { -		ret = expect_t120(skb, ct, ctinfo, data, dataoff, +		ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff,  				  &olc->separateStack.networkAddress.  				  localAreaAddress);  		if (ret < 0) @@ -462,7 +475,7 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_olca(struct sk_buff *skb, struct nf_conn *ct,  			enum ip_conntrack_info ctinfo, -			unsigned char **data, int dataoff, +			unsigned int protoff, unsigned char **data, int dataoff,  			OpenLogicalChannelAck *olca)  {  	H2250LogicalChannelAckParameters *ack; @@ -478,7 +491,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,  		choice ==  		eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))  	{ -		ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, +		ret = process_h245_channel(skb, ct, ctinfo, +					   protoff, data, dataoff,  					   &olca->  					   reverseLogicalChannelParameters.  					   multiplexParameters. @@ -497,7 +511,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,  		if (ack->options &  		    eH2250LogicalChannelAckParameters_mediaChannel) {  			/* RTP */ -			ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, +			ret = expect_rtp_rtcp(skb, ct, ctinfo, +					      protoff, data, dataoff,  					      &ack->mediaChannel);  			if (ret < 0)  				return -1; @@ -506,7 +521,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,  		if (ack->options &  		    eH2250LogicalChannelAckParameters_mediaControlChannel) {  			/* RTCP */ -			ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, +			ret = expect_rtp_rtcp(skb, ct, ctinfo, +					      protoff, data, dataoff,  					      &ack->mediaControlChannel);  			if (ret < 0)  				return -1; @@ -516,7 +532,7 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,  	if ((olca->options & eOpenLogicalChannelAck_separateStack) &&  		olca->separateStack.networkAddress.choice ==  		eNetworkAccessParameters_networkAddress_localAreaAddress) { -		ret = expect_t120(skb, ct, ctinfo, data, dataoff, +		ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff,  				  &olca->separateStack.networkAddress.  				  
localAreaAddress);  		if (ret < 0) @@ -529,14 +545,15 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_h245(struct sk_buff *skb, struct nf_conn *ct,  			enum ip_conntrack_info ctinfo, -			unsigned char **data, int dataoff, +			unsigned int protoff, unsigned char **data, int dataoff,  			MultimediaSystemControlMessage *mscm)  {  	switch (mscm->choice) {  	case eMultimediaSystemControlMessage_request:  		if (mscm->request.choice ==  		    eRequestMessage_openLogicalChannel) { -			return process_olc(skb, ct, ctinfo, data, dataoff, +			return process_olc(skb, ct, ctinfo, +					   protoff, data, dataoff,  					   &mscm->request.openLogicalChannel);  		}  		pr_debug("nf_ct_h323: H.245 Request %d\n", @@ -545,7 +562,8 @@ static int process_h245(struct sk_buff *skb, struct nf_conn *ct,  	case eMultimediaSystemControlMessage_response:  		if (mscm->response.choice ==  		    eResponseMessage_openLogicalChannelAck) { -			return process_olca(skb, ct, ctinfo, data, dataoff, +			return process_olca(skb, ct, ctinfo, +					    protoff, data, dataoff,  					    &mscm->response.  					    openLogicalChannelAck);  		} @@ -571,10 +589,9 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,  	int ret;  	/* Until there's been traffic both ways, don't look in packets. */ -	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { +	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)  		return NF_ACCEPT; -	} +  	pr_debug("nf_ct_h245: skblen = %u\n", skb->len);  	spin_lock_bh(&nf_h323_lock); @@ -597,7 +614,8 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,  		}  		/* Process H.245 signal */ -		if (process_h245(skb, ct, ctinfo, &data, dataoff, &mscm) < 0) +		if (process_h245(skb, ct, ctinfo, protoff, +				 &data, dataoff, &mscm) < 0)  			goto drop;  	} @@ -606,8 +624,7 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,        drop:  	spin_unlock_bh(&nf_h323_lock); -	if (net_ratelimit()) -		pr_info("nf_ct_h245: packet dropped\n"); +	nf_ct_helper_log(skb, ct, "cannot process H.245 message");  	return NF_DROP;  } @@ -620,6 +637,7 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = {  static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {  	.name			= "H.245",  	.me			= THIS_MODULE, +	.data_len		= sizeof(struct nf_ct_h323_master),  	.tuple.src.l3num	= AF_UNSPEC,  	.tuple.dst.protonum	= IPPROTO_UDP,  	.help			= h245_help, @@ -661,7 +679,7 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,  /****************************************************************************/  static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, -		       unsigned char **data, int dataoff, +		       unsigned int protoff, unsigned char **data, int dataoff,  		       TransportAddress *taddr)  {  	int dir = CTINFO2DIR(ctinfo); @@ -690,9 +708,10 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,  		   &ct->tuplehash[!dir].tuple.dst.u3,  		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&  	    (nat_h245 = rcu_dereference(nat_h245_hook)) && +	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&  	    ct->status & IPS_NAT_MASK) {  		/* NAT needed */ -		ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr, +		ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,  			       port, exp);  	} else {		/* Conntrack only */  		if 
(nf_ct_expect_related(exp) == 0) { @@ -714,7 +733,6 @@ static int callforward_do_filter(const union nf_inet_addr *src,  				 u_int8_t family)  {  	const struct nf_afinfo *afinfo; -	struct flowi fl1, fl2;  	int ret = 0;  	/* rcu_read_lock()ed by nf_hook_slow() */ @@ -722,18 +740,22 @@ static int callforward_do_filter(const union nf_inet_addr *src,  	if (!afinfo)  		return 0; -	memset(&fl1, 0, sizeof(fl1)); -	memset(&fl2, 0, sizeof(fl2)); -  	switch (family) {  	case AF_INET: { +		struct flowi4 fl1, fl2;  		struct rtable *rt1, *rt2; -		fl1.fl4_dst = src->ip; -		fl2.fl4_dst = dst->ip; -		if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) { -			if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { -				if (rt1->rt_gateway == rt2->rt_gateway && +		memset(&fl1, 0, sizeof(fl1)); +		fl1.daddr = src->ip; + +		memset(&fl2, 0, sizeof(fl2)); +		fl2.daddr = dst->ip; +		if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, +				   flowi4_to_flowi(&fl1), false)) { +			if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, +					   flowi4_to_flowi(&fl2), false)) { +				if (rt_nexthop(rt1, fl1.daddr) == +				    rt_nexthop(rt2, fl2.daddr) &&  				    rt1->dst.dev  == rt2->dst.dev)  					ret = 1;  				dst_release(&rt2->dst); @@ -742,17 +764,22 @@ static int callforward_do_filter(const union nf_inet_addr *src,  		}  		break;  	} -#if defined(CONFIG_NF_CONNTRACK_IPV6) || \ -    defined(CONFIG_NF_CONNTRACK_IPV6_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)  	case AF_INET6: { +		struct flowi6 fl1, fl2;  		struct rt6_info *rt1, *rt2; -		memcpy(&fl1.fl6_dst, src, sizeof(fl1.fl6_dst)); -		memcpy(&fl2.fl6_dst, dst, sizeof(fl2.fl6_dst)); -		if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) { -			if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { -				if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, -					    sizeof(rt1->rt6i_gateway)) && +		memset(&fl1, 0, sizeof(fl1)); +		fl1.daddr = src->in6; + +		memset(&fl2, 0, sizeof(fl2)); +		fl2.daddr = dst->in6; +		if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, +				   flowi6_to_flowi(&fl1), false)) { +			if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, +					   flowi6_to_flowi(&fl2), false)) { +				if (ipv6_addr_equal(rt6_nexthop(rt1), +						    rt6_nexthop(rt2)) &&  				    rt1->dst.dev == rt2->dst.dev)  					ret = 1;  				dst_release(&rt2->dst); @@ -771,6 +798,7 @@ static int callforward_do_filter(const union nf_inet_addr *src,  static int expect_callforwarding(struct sk_buff *skb,  				 struct nf_conn *ct,  				 enum ip_conntrack_info ctinfo, +				 unsigned int protoff,  				 unsigned char **data, int dataoff,  				 TransportAddress *taddr)  { @@ -806,9 +834,11 @@ static int expect_callforwarding(struct sk_buff *skb,  		   &ct->tuplehash[!dir].tuple.dst.u3,  		   sizeof(ct->tuplehash[dir].tuple.src.u3)) &&  	    (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) && +	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&  	    ct->status & IPS_NAT_MASK) {  		/* Need NAT */ -		ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff, +		ret = nat_callforwarding(skb, ct, ctinfo, +					 protoff, data, dataoff,  					 taddr, port, exp);  	} else {		/* Conntrack only */  		if (nf_ct_expect_related(exp) == 0) { @@ -826,6 +856,7 @@ static int expect_callforwarding(struct sk_buff *skb,  /****************************************************************************/  static int process_setup(struct sk_buff *skb, struct nf_conn *ct,  			 enum ip_conntrack_info ctinfo, +			 unsigned int protoff,  			 unsigned char **data, int dataoff,  			 
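[Editor's note] callforward_do_filter() is rewritten above for the typed flow keys (flowi4/flowi6) and compares next hops through rt_nexthop()/rt6_nexthop() instead of poking rt_gateway directly. A hypothetical IPv4-only sketch of the "same egress path" test it performs, using plain ip_route_output_key() rather than the nf_afinfo->route() indirection the hunk goes through:

```c
/* Hypothetical sketch: route two destinations and report whether they
 * leave via the same next hop and device, as the callforward filter does. */
#include <linux/err.h>
#include <net/route.h>

static bool demo_same_path(struct net *net, __be32 a, __be32 b)
{
	struct flowi4 fl1 = { .daddr = a };
	struct flowi4 fl2 = { .daddr = b };
	struct rtable *rt1, *rt2;
	bool same = false;

	rt1 = ip_route_output_key(net, &fl1);
	if (IS_ERR(rt1))
		return false;
	rt2 = ip_route_output_key(net, &fl2);
	if (!IS_ERR(rt2)) {
		same = rt_nexthop(rt1, fl1.daddr) == rt_nexthop(rt2, fl2.daddr) &&
		       rt1->dst.dev == rt2->dst.dev;
		ip_rt_put(rt2);
	}
	ip_rt_put(rt1);
	return same;
}
```

Note also the nf_ct_l3num(ct) == NFPROTO_IPV4 guard added at every NAT call site in this file: H.323 NAT handling is IPv4-only at this point, so IPv6 flows deliberately fall through to the conntrack-only branch.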
Setup_UUIE *setup)  { @@ -839,7 +870,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_q931: Setup\n");  	if (setup->options & eSetup_UUIE_h245Address) { -		ret = expect_h245(skb, ct, ctinfo, data, dataoff, +		ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,  				  &setup->h245Address);  		if (ret < 0)  			return -1; @@ -847,14 +878,15 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,  	set_h225_addr = rcu_dereference(set_h225_addr_hook);  	if ((setup->options & eSetup_UUIE_destCallSignalAddress) && -	    (set_h225_addr) && ct->status & IPS_NAT_MASK && +	    (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK &&  	    get_h225_addr(ct, *data, &setup->destCallSignalAddress,  			  &addr, &port) &&  	    memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) {  		pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",  			 &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,  			 ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); -		ret = set_h225_addr(skb, data, dataoff, +		ret = set_h225_addr(skb, protoff, data, dataoff,  				    &setup->destCallSignalAddress,  				    &ct->tuplehash[!dir].tuple.src.u3,  				    ct->tuplehash[!dir].tuple.src.u.tcp.port); @@ -863,14 +895,15 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,  	}  	if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && -	    (set_h225_addr) && ct->status & IPS_NAT_MASK && +	    (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK &&  	    get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,  			  &addr, &port) &&  	    memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) {  		pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",  			 &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,  			 ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); -		ret = set_h225_addr(skb, data, dataoff, +		ret = set_h225_addr(skb, protoff, data, dataoff,  				    &setup->sourceCallSignalAddress,  				    &ct->tuplehash[!dir].tuple.dst.u3,  				    ct->tuplehash[!dir].tuple.dst.u.tcp.port); @@ -880,7 +913,8 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,  	if (setup->options & eSetup_UUIE_fastStart) {  		for (i = 0; i < setup->fastStart.count; i++) { -			ret = process_olc(skb, ct, ctinfo, data, dataoff, +			ret = process_olc(skb, ct, ctinfo, +					  protoff, data, dataoff,  					  &setup->fastStart.item[i]);  			if (ret < 0)  				return -1; @@ -894,6 +928,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,  static int process_callproceeding(struct sk_buff *skb,  				  struct nf_conn *ct,  				  enum ip_conntrack_info ctinfo, +				  unsigned int protoff,  				  unsigned char **data, int dataoff,  				  CallProceeding_UUIE *callproc)  { @@ -903,7 +938,7 @@ static int process_callproceeding(struct sk_buff *skb,  	pr_debug("nf_ct_q931: CallProceeding\n");  	if (callproc->options & eCallProceeding_UUIE_h245Address) { -		ret = expect_h245(skb, ct, ctinfo, data, dataoff, +		ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,  				  &callproc->h245Address);  		if (ret < 0)  			return -1; @@ -911,7 +946,8 @@ static int process_callproceeding(struct sk_buff *skb,  	if (callproc->options & eCallProceeding_UUIE_fastStart) {  		for (i = 0; i < callproc->fastStart.count; i++) { -			ret = process_olc(skb, ct, ctinfo, data, dataoff, +			ret = process_olc(skb, ct, ctinfo, +					  protoff, data, 
dataoff,  					  &callproc->fastStart.item[i]);  			if (ret < 0)  				return -1; @@ -924,6 +960,7 @@ static int process_callproceeding(struct sk_buff *skb,  /****************************************************************************/  static int process_connect(struct sk_buff *skb, struct nf_conn *ct,  			   enum ip_conntrack_info ctinfo, +			   unsigned int protoff,  			   unsigned char **data, int dataoff,  			   Connect_UUIE *connect)  { @@ -933,7 +970,7 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_q931: Connect\n");  	if (connect->options & eConnect_UUIE_h245Address) { -		ret = expect_h245(skb, ct, ctinfo, data, dataoff, +		ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,  				  &connect->h245Address);  		if (ret < 0)  			return -1; @@ -941,7 +978,8 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,  	if (connect->options & eConnect_UUIE_fastStart) {  		for (i = 0; i < connect->fastStart.count; i++) { -			ret = process_olc(skb, ct, ctinfo, data, dataoff, +			ret = process_olc(skb, ct, ctinfo, +					  protoff, data, dataoff,  					  &connect->fastStart.item[i]);  			if (ret < 0)  				return -1; @@ -954,6 +992,7 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,  			    enum ip_conntrack_info ctinfo, +			    unsigned int protoff,  			    unsigned char **data, int dataoff,  			    Alerting_UUIE *alert)  { @@ -963,7 +1002,7 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_q931: Alerting\n");  	if (alert->options & eAlerting_UUIE_h245Address) { -		ret = expect_h245(skb, ct, ctinfo, data, dataoff, +		ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,  				  &alert->h245Address);  		if (ret < 0)  			return -1; @@ -971,7 +1010,8 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,  	if (alert->options & eAlerting_UUIE_fastStart) {  		for (i = 0; i < alert->fastStart.count; i++) { -			ret = process_olc(skb, ct, ctinfo, data, dataoff, +			ret = process_olc(skb, ct, ctinfo, +					  protoff, data, dataoff,  					  &alert->fastStart.item[i]);  			if (ret < 0)  				return -1; @@ -984,6 +1024,7 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_facility(struct sk_buff *skb, struct nf_conn *ct,  			    enum ip_conntrack_info ctinfo, +			    unsigned int protoff,  			    unsigned char **data, int dataoff,  			    Facility_UUIE *facility)  { @@ -994,15 +1035,15 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,  	if (facility->reason.choice == eFacilityReason_callForwarded) {  		if (facility->options & eFacility_UUIE_alternativeAddress) -			return expect_callforwarding(skb, ct, ctinfo, data, -						     dataoff, +			return expect_callforwarding(skb, ct, ctinfo, +						     protoff, data, dataoff,  						     &facility->  						     alternativeAddress);  		return 0;  	}  	if (facility->options & eFacility_UUIE_h245Address) { -		ret = expect_h245(skb, ct, ctinfo, data, dataoff, +		ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,  				  &facility->h245Address);  		if (ret < 0)  			return -1; @@ -1010,7 +1051,8 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,  	if (facility->options & eFacility_UUIE_fastStart) {  		for (i = 0; 
i < facility->fastStart.count; i++) { -			ret = process_olc(skb, ct, ctinfo, data, dataoff, +			ret = process_olc(skb, ct, ctinfo, +					  protoff, data, dataoff,  					  &facility->fastStart.item[i]);  			if (ret < 0)  				return -1; @@ -1023,6 +1065,7 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_progress(struct sk_buff *skb, struct nf_conn *ct,  			    enum ip_conntrack_info ctinfo, +			    unsigned int protoff,  			    unsigned char **data, int dataoff,  			    Progress_UUIE *progress)  { @@ -1032,7 +1075,7 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_q931: Progress\n");  	if (progress->options & eProgress_UUIE_h245Address) { -		ret = expect_h245(skb, ct, ctinfo, data, dataoff, +		ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,  				  &progress->h245Address);  		if (ret < 0)  			return -1; @@ -1040,7 +1083,8 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,  	if (progress->options & eProgress_UUIE_fastStart) {  		for (i = 0; i < progress->fastStart.count; i++) { -			ret = process_olc(skb, ct, ctinfo, data, dataoff, +			ret = process_olc(skb, ct, ctinfo, +					  protoff, data, dataoff,  					  &progress->fastStart.item[i]);  			if (ret < 0)  				return -1; @@ -1053,7 +1097,8 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_q931(struct sk_buff *skb, struct nf_conn *ct,  			enum ip_conntrack_info ctinfo, -			unsigned char **data, int dataoff, Q931 *q931) +			unsigned int protoff, unsigned char **data, int dataoff, +			Q931 *q931)  {  	H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;  	int i; @@ -1061,28 +1106,29 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct,  	switch (pdu->h323_message_body.choice) {  	case eH323_UU_PDU_h323_message_body_setup: -		ret = process_setup(skb, ct, ctinfo, data, dataoff, +		ret = process_setup(skb, ct, ctinfo, protoff, data, dataoff,  				    &pdu->h323_message_body.setup);  		break;  	case eH323_UU_PDU_h323_message_body_callProceeding: -		ret = process_callproceeding(skb, ct, ctinfo, data, dataoff, +		ret = process_callproceeding(skb, ct, ctinfo, +					     protoff, data, dataoff,  					     &pdu->h323_message_body.  					     
callProceeding);  		break;  	case eH323_UU_PDU_h323_message_body_connect: -		ret = process_connect(skb, ct, ctinfo, data, dataoff, +		ret = process_connect(skb, ct, ctinfo, protoff, data, dataoff,  				      &pdu->h323_message_body.connect);  		break;  	case eH323_UU_PDU_h323_message_body_alerting: -		ret = process_alerting(skb, ct, ctinfo, data, dataoff, +		ret = process_alerting(skb, ct, ctinfo, protoff, data, dataoff,  				       &pdu->h323_message_body.alerting);  		break;  	case eH323_UU_PDU_h323_message_body_facility: -		ret = process_facility(skb, ct, ctinfo, data, dataoff, +		ret = process_facility(skb, ct, ctinfo, protoff, data, dataoff,  				       &pdu->h323_message_body.facility);  		break;  	case eH323_UU_PDU_h323_message_body_progress: -		ret = process_progress(skb, ct, ctinfo, data, dataoff, +		ret = process_progress(skb, ct, ctinfo, protoff, data, dataoff,  				       &pdu->h323_message_body.progress);  		break;  	default: @@ -1096,7 +1142,8 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct,  	if (pdu->options & eH323_UU_PDU_h245Control) {  		for (i = 0; i < pdu->h245Control.count; i++) { -			ret = process_h245(skb, ct, ctinfo, data, dataoff, +			ret = process_h245(skb, ct, ctinfo, +					   protoff, data, dataoff,  					   &pdu->h245Control.item[i]);  			if (ret < 0)  				return -1; @@ -1117,10 +1164,9 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,  	int ret;  	/* Until there's been traffic both ways, don't look in packets. */ -	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { +	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)  		return NF_ACCEPT; -	} +  	pr_debug("nf_ct_q931: skblen = %u\n", skb->len);  	spin_lock_bh(&nf_h323_lock); @@ -1142,7 +1188,8 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,  		}  		/* Process Q.931 signal */ -		if (process_q931(skb, ct, ctinfo, &data, dataoff, &q931) < 0) +		if (process_q931(skb, ct, ctinfo, protoff, +				 &data, dataoff, &q931) < 0)  			goto drop;  	} @@ -1151,8 +1198,7 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,        drop:  	spin_unlock_bh(&nf_h323_lock); -	if (net_ratelimit()) -		pr_info("nf_ct_q931: packet dropped\n"); +	nf_ct_helper_log(skb, ct, "cannot process Q.931 message");  	return NF_DROP;  } @@ -1167,6 +1213,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {  	{  		.name			= "Q.931",  		.me			= THIS_MODULE, +		.data_len		= sizeof(struct nf_ct_h323_master),  		.tuple.src.l3num	= AF_INET,  		.tuple.src.u.tcp.port	= cpu_to_be16(Q931_PORT),  		.tuple.dst.protonum	= IPPROTO_TCP, @@ -1225,7 +1272,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,  /****************************************************************************/  static int set_expect_timeout(struct nf_conntrack_expect *exp, -			      unsigned timeout) +			      unsigned int timeout)  {  	if (!exp || !del_timer(&exp->timeout))  		return 0; @@ -1239,10 +1286,10 @@ static int set_expect_timeout(struct nf_conntrack_expect *exp,  /****************************************************************************/  static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, -		       unsigned char **data, +		       unsigned int protoff, unsigned char **data,  		       TransportAddress *taddr, int count)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int 
dir = CTINFO2DIR(ctinfo);  	int ret = 0;  	int i; @@ -1274,8 +1321,10 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,  	exp->flags = NF_CT_EXPECT_PERMANENT;	/* Accept multiple calls */  	nat_q931 = rcu_dereference(nat_q931_hook); -	if (nat_q931 && ct->status & IPS_NAT_MASK) {	/* Need NAT */ -		ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp); +	if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) {	/* Need NAT */ +		ret = nat_q931(skb, ct, ctinfo, protoff, data, +			       taddr, i, port, exp);  	} else {		/* Conntrack only */  		if (nf_ct_expect_related(exp) == 0) {  			pr_debug("nf_ct_ras: expect Q.931 "); @@ -1295,6 +1344,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_grq(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, GatekeeperRequest *grq)  {  	typeof(set_ras_addr_hook) set_ras_addr; @@ -1302,8 +1352,9 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_ras: GRQ\n");  	set_ras_addr = rcu_dereference(set_ras_addr_hook); -	if (set_ras_addr && ct->status & IPS_NAT_MASK)	/* NATed */ -		return set_ras_addr(skb, ct, ctinfo, data, +	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK)	/* NATed */ +		return set_ras_addr(skb, ct, ctinfo, protoff, data,  				    &grq->rasAddress, 1);  	return 0;  } @@ -1311,6 +1362,7 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, GatekeeperConfirm *gcf)  {  	int dir = CTINFO2DIR(ctinfo); @@ -1355,23 +1407,25 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, RegistrationRequest *rrq)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int ret;  	typeof(set_ras_addr_hook) set_ras_addr;  	pr_debug("nf_ct_ras: RRQ\n"); -	ret = expect_q931(skb, ct, ctinfo, data, +	ret = expect_q931(skb, ct, ctinfo, protoff, data,  			  rrq->callSignalAddress.item,  			  rrq->callSignalAddress.count);  	if (ret < 0)  		return -1;  	set_ras_addr = rcu_dereference(set_ras_addr_hook); -	if (set_ras_addr && ct->status & IPS_NAT_MASK) { -		ret = set_ras_addr(skb, ct, ctinfo, data, +	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) { +		ret = set_ras_addr(skb, ct, ctinfo, protoff, data,  				   rrq->rasAddress.item,  				   rrq->rasAddress.count);  		if (ret < 0) @@ -1390,9 +1444,10 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, RegistrationConfirm *rcf)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct 
nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	int ret;  	struct nf_conntrack_expect *exp; @@ -1401,8 +1456,9 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_ras: RCF\n");  	set_sig_addr = rcu_dereference(set_sig_addr_hook); -	if (set_sig_addr && ct->status & IPS_NAT_MASK) { -		ret = set_sig_addr(skb, ct, ctinfo, data, +	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) { +		ret = set_sig_addr(skb, ct, ctinfo, protoff, data,  					rcf->callSignalAddress.item,  					rcf->callSignalAddress.count);  		if (ret < 0) @@ -1420,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  		nf_ct_refresh(ct, skb, info->timeout * HZ);  		/* Set expect timeout */ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,  				  info->sig_port[!dir]);  		if (exp) { @@ -1430,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  			nf_ct_dump_tuple(&exp->tuple);  			set_expect_timeout(exp, info->timeout);  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  	}  	return 0; @@ -1439,9 +1495,10 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_urq(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, UnregistrationRequest *urq)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	int ret;  	typeof(set_sig_addr_hook) set_sig_addr; @@ -1449,8 +1506,9 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_ras: URQ\n");  	set_sig_addr = rcu_dereference(set_sig_addr_hook); -	if (set_sig_addr && ct->status & IPS_NAT_MASK) { -		ret = set_sig_addr(skb, ct, ctinfo, data, +	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) { +		ret = set_sig_addr(skb, ct, ctinfo, protoff, data,  				   urq->callSignalAddress.item,  				   urq->callSignalAddress.count);  		if (ret < 0) @@ -1471,9 +1529,10 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_arq(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, AdmissionRequest *arq)  { -	const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	const struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	__be16 port;  	union nf_inet_addr addr; @@ -1487,9 +1546,10 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,  			  &addr, &port) &&  	    !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&  	    port == info->sig_port[dir] && +	    nf_ct_l3num(ct) == NFPROTO_IPV4 &&  	    set_h225_addr && ct->status & IPS_NAT_MASK) {  		/* Answering ARQ */ -		return set_h225_addr(skb, data, 0, +		return set_h225_addr(skb, protoff, data, 0,  				     &arq->destCallSignalAddress,  				     &ct->tuplehash[!dir].tuple.dst.u3,  				     info->sig_port[!dir]); @@ -1499,9 +1559,10 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,  	    get_h225_addr(ct, *data, 
&arq->srcCallSignalAddress,  			  &addr, &port) &&  	    !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && -	    set_h225_addr && ct->status & IPS_NAT_MASK) { +	    set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) {  		/* Calling ARQ */ -		return set_h225_addr(skb, data, 0, +		return set_h225_addr(skb, protoff, data, 0,  				     &arq->srcCallSignalAddress,  				     &ct->tuplehash[!dir].tuple.dst.u3,  				     port); @@ -1513,6 +1574,7 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_acf(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, AdmissionConfirm *acf)  {  	int dir = CTINFO2DIR(ctinfo); @@ -1531,8 +1593,9 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,  	if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {  		/* Answering ACF */  		set_sig_addr = rcu_dereference(set_sig_addr_hook); -		if (set_sig_addr && ct->status & IPS_NAT_MASK) -			return set_sig_addr(skb, ct, ctinfo, data, +		if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +		    ct->status & IPS_NAT_MASK) +			return set_sig_addr(skb, ct, ctinfo, protoff, data,  					    &acf->destCallSignalAddress, 1);  		return 0;  	} @@ -1560,6 +1623,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, LocationRequest *lrq)  {  	typeof(set_ras_addr_hook) set_ras_addr; @@ -1567,8 +1631,9 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_ras: LRQ\n");  	set_ras_addr = rcu_dereference(set_ras_addr_hook); -	if (set_ras_addr && ct->status & IPS_NAT_MASK) -		return set_ras_addr(skb, ct, ctinfo, data, +	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) +		return set_ras_addr(skb, ct, ctinfo, protoff, data,  				    &lrq->replyAddress, 1);  	return 0;  } @@ -1576,6 +1641,7 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, LocationConfirm *lcf)  {  	int dir = CTINFO2DIR(ctinfo); @@ -1615,6 +1681,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_irr(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, InfoRequestResponse *irr)  {  	int ret; @@ -1624,16 +1691,18 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,  	pr_debug("nf_ct_ras: IRR\n");  	set_ras_addr = rcu_dereference(set_ras_addr_hook); -	if (set_ras_addr && ct->status & IPS_NAT_MASK) { -		ret = set_ras_addr(skb, ct, ctinfo, data, +	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) { +		ret = set_ras_addr(skb, ct, ctinfo, protoff, data,  				   &irr->rasAddress, 1);  		if (ret < 0)  			return -1;  	}  	set_sig_addr = 
rcu_dereference(set_sig_addr_hook); -	if (set_sig_addr && ct->status & IPS_NAT_MASK) { -		ret = set_sig_addr(skb, ct, ctinfo, data, +	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && +	    ct->status & IPS_NAT_MASK) { +		ret = set_sig_addr(skb, ct, ctinfo, protoff, data,  					irr->callSignalAddress.item,  					irr->callSignalAddress.count);  		if (ret < 0) @@ -1646,38 +1715,39 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int process_ras(struct sk_buff *skb, struct nf_conn *ct,  		       enum ip_conntrack_info ctinfo, +		       unsigned int protoff,  		       unsigned char **data, RasMessage *ras)  {  	switch (ras->choice) {  	case eRasMessage_gatekeeperRequest: -		return process_grq(skb, ct, ctinfo, data, +		return process_grq(skb, ct, ctinfo, protoff, data,  				   &ras->gatekeeperRequest);  	case eRasMessage_gatekeeperConfirm: -		return process_gcf(skb, ct, ctinfo, data, +		return process_gcf(skb, ct, ctinfo, protoff, data,  				   &ras->gatekeeperConfirm);  	case eRasMessage_registrationRequest: -		return process_rrq(skb, ct, ctinfo, data, +		return process_rrq(skb, ct, ctinfo, protoff, data,  				   &ras->registrationRequest);  	case eRasMessage_registrationConfirm: -		return process_rcf(skb, ct, ctinfo, data, +		return process_rcf(skb, ct, ctinfo, protoff, data,  				   &ras->registrationConfirm);  	case eRasMessage_unregistrationRequest: -		return process_urq(skb, ct, ctinfo, data, +		return process_urq(skb, ct, ctinfo, protoff, data,  				   &ras->unregistrationRequest);  	case eRasMessage_admissionRequest: -		return process_arq(skb, ct, ctinfo, data, +		return process_arq(skb, ct, ctinfo, protoff, data,  				   &ras->admissionRequest);  	case eRasMessage_admissionConfirm: -		return process_acf(skb, ct, ctinfo, data, +		return process_acf(skb, ct, ctinfo, protoff, data,  				   &ras->admissionConfirm);  	case eRasMessage_locationRequest: -		return process_lrq(skb, ct, ctinfo, data, +		return process_lrq(skb, ct, ctinfo, protoff, data,  				   &ras->locationRequest);  	case eRasMessage_locationConfirm: -		return process_lcf(skb, ct, ctinfo, data, +		return process_lcf(skb, ct, ctinfo, protoff, data,  				   &ras->locationConfirm);  	case eRasMessage_infoRequestResponse: -		return process_irr(skb, ct, ctinfo, data, +		return process_irr(skb, ct, ctinfo, protoff, data,  				   &ras->infoRequestResponse);  	default:  		pr_debug("nf_ct_ras: RAS message %d\n", ras->choice); @@ -1717,7 +1787,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff,  	}  	/* Process RAS message */ -	if (process_ras(skb, ct, ctinfo, &data, &ras) < 0) +	if (process_ras(skb, ct, ctinfo, protoff, &data, &ras) < 0)  		goto drop;        accept: @@ -1726,8 +1796,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff,        drop:  	spin_unlock_bh(&nf_h323_lock); -	if (net_ratelimit()) -		pr_info("nf_ct_ras: packet dropped\n"); +	nf_ct_helper_log(skb, ct, "cannot process RAS message");  	return NF_DROP;  } @@ -1741,6 +1810,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {  	{  		.name			= "RAS",  		.me			= THIS_MODULE, +		.data_len		= sizeof(struct nf_ct_h323_master),  		.tuple.src.l3num	= AF_INET,  		.tuple.src.u.udp.port	= cpu_to_be16(RAS_PORT),  		.tuple.dst.protonum	= IPPROTO_UDP, @@ -1750,6 +1820,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {  	{  		.name			= "RAS",  		.me			= THIS_MODULE, +		.data_len		= 
sizeof(struct nf_ct_h323_master),  		.tuple.src.l3num	= AF_INET6,  		.tuple.src.u.udp.port	= cpu_to_be16(RAS_PORT),  		.tuple.dst.protonum	= IPPROTO_UDP, @@ -1828,4 +1899,6 @@ MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");  MODULE_DESCRIPTION("H.323 connection tracking helper");  MODULE_LICENSE("GPL");  MODULE_ALIAS("ip_conntrack_h323"); -MODULE_ALIAS_NFCT_HELPER("h323"); +MODULE_ALIAS_NFCT_HELPER("RAS"); +MODULE_ALIAS_NFCT_HELPER("Q.931"); +MODULE_ALIAS_NFCT_HELPER("H.245"); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 59e1a4cd4e8..5b3eae7d4c9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -3,6 +3,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>   * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -28,13 +29,80 @@  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_core.h>  #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_log.h>  static DEFINE_MUTEX(nf_ct_helper_mutex); -static struct hlist_head *nf_ct_helper_hash __read_mostly; -static unsigned int nf_ct_helper_hsize __read_mostly; +struct hlist_head *nf_ct_helper_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_helper_hash); +unsigned int nf_ct_helper_hsize __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);  static unsigned int nf_ct_helper_count __read_mostly; -static int nf_ct_helper_vmalloc; +static bool nf_ct_auto_assign_helper __read_mostly = true; +module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); +MODULE_PARM_DESC(nf_conntrack_helper, +		 "Enable automatic conntrack helper assignment (default 1)"); + +#ifdef CONFIG_SYSCTL +static struct ctl_table helper_sysctl_table[] = { +	{ +		.procname	= "nf_conntrack_helper", +		.data		= &init_net.ct.sysctl_auto_assign_helper, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{} +}; + +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ +	struct ctl_table *table; + +	table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table), +			GFP_KERNEL); +	if (!table) +		goto out; + +	table[0].data = &net->ct.sysctl_auto_assign_helper; + +	/* Don't export sysctls to unprivileged users */ +	if (net->user_ns != &init_user_ns) +		table[0].procname = NULL; + +	net->ct.helper_sysctl_header = +		register_net_sysctl(net, "net/netfilter", table); + +	if (!net->ct.helper_sysctl_header) { +		pr_err("nf_conntrack_helper: can't register to sysctl.\n"); +		goto out_register; +	} +	return 0; + +out_register: +	kfree(table); +out: +	return -ENOMEM; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ +	struct ctl_table *table; + +	table = net->ct.helper_sysctl_header->ctl_table_arg; +	unregister_net_sysctl_table(net->ct.helper_sysctl_header); +	kfree(table); +} +#else +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ +	return 0; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */  /* Stupid hash, but collision free for the default registrations of the   * helpers currently in the kernel. 
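The helper hunk above gives each network namespace its own "nf_conntrack_helper" sysctl by duplicating a template table, so every copy's .data slot can point at that namespace's field. Roughly, the pattern looks like the sketch below; example_table and example_net_init() are illustrative stand-ins for helper_sysctl_table and nf_conntrack_helper_init_sysctl():

	static struct ctl_table example_table[] = {
		{
			.procname	= "nf_conntrack_helper",
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }
	};

	static int example_net_init(struct net *net)
	{
		struct ctl_table *table;

		/* one private copy of the template per namespace ... */
		table = kmemdup(example_table, sizeof(example_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		/* ... so .data can point into this namespace's state */
		table[0].data = &net->ct.sysctl_auto_assign_helper;

		net->ct.helper_sysctl_header =
			register_net_sysctl(net, "net/netfilter", table);
		if (!net->ct.helper_sysctl_header) {
			kfree(table);
			return -ENOMEM;
		}
		return 0;
	}

On teardown the duplicated table has to be fetched back from the header (ctl_table_arg) and freed, which is what nf_conntrack_helper_fini_sysctl() above does.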
*/ @@ -49,14 +117,13 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)  {  	struct nf_conntrack_helper *helper;  	struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; -	struct hlist_node *n;  	unsigned int h;  	if (!nf_ct_helper_count)  		return NULL;  	h = helper_hash(tuple); -	hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { +	hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {  		if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))  			return helper;  	} @@ -67,11 +134,10 @@ struct nf_conntrack_helper *  __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)  {  	struct nf_conntrack_helper *h; -	struct hlist_node *n;  	unsigned int i;  	for (i = 0; i < nf_ct_helper_hsize; i++) { -		hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { +		hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) {  			if (!strcmp(h->name, name) &&  			    h->tuple.src.l3num == l3num &&  			    h->tuple.dst.protonum == protonum) @@ -101,11 +167,14 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)  }  EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); -struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) +struct nf_conn_help * +nf_ct_helper_ext_add(struct nf_conn *ct, +		     struct nf_conntrack_helper *helper, gfp_t gfp)  {  	struct nf_conn_help *help; -	help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); +	help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER, +				    helper->data_len, gfp);  	if (help)  		INIT_HLIST_HEAD(&help->expectations);  	else @@ -119,31 +188,60 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,  {  	struct nf_conntrack_helper *helper = NULL;  	struct nf_conn_help *help; +	struct net *net = nf_ct_net(ct);  	int ret = 0; +	/* We already got a helper explicitly attached. The function +	 * nf_conntrack_alter_reply - in case NAT is in use - asks for looking +	 * the helper up again. Since now the user is in full control of +	 * making consistent helper configurations, skip this automatic +	 * re-lookup, otherwise we'll lose the helper. +	 */ +	if (test_bit(IPS_HELPER_BIT, &ct->status)) +		return 0; +  	if (tmpl != NULL) {  		help = nfct_help(tmpl); -		if (help != NULL) +		if (help != NULL) {  			helper = help->helper; +			set_bit(IPS_HELPER_BIT, &ct->status); +		}  	}  	help = nfct_help(ct); -	if (helper == NULL) +	if (net->ct.sysctl_auto_assign_helper && helper == NULL) {  		helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); +		if (unlikely(!net->ct.auto_assign_helper_warned && helper)) { +			pr_info("nf_conntrack: automatic helper " +				"assignment is deprecated and it will " +				"be removed soon. Use the iptables CT target " +				"to attach helpers instead.\n"); +			net->ct.auto_assign_helper_warned = true; +		} +	} +  	if (helper == NULL) {  		if (help) -			rcu_assign_pointer(help->helper, NULL); +			RCU_INIT_POINTER(help->helper, NULL);  		goto out;  	}  	if (help == NULL) { -		help = nf_ct_helper_ext_add(ct, flags); +		help = nf_ct_helper_ext_add(ct, helper, flags);  		if (help == NULL) {  			ret = -ENOMEM;  			goto out;  		}  	} else { -		memset(&help->help, 0, sizeof(help->help)); +		/* We only allow helper re-assignment of the same sort since +		 * we cannot reallocate the helper extension area. 
+		 */ +		struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); + +		if (tmp && tmp->help != helper->help) { +			RCU_INIT_POINTER(help->helper, NULL); +			goto out; +		}  	}  	rcu_assign_pointer(help->helper, helper); @@ -152,15 +250,16 @@ out:  }  EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* appropiate ct lock protecting must be taken by caller */  static inline int unhelp(struct nf_conntrack_tuple_hash *i,  			 const struct nf_conntrack_helper *me)  {  	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);  	struct nf_conn_help *help = nfct_help(ct); -	if (help && help->helper == me) { +	if (help && rcu_dereference_raw(help->helper) == me) {  		nf_conntrack_event(IPCT_HELPER, ct); -		rcu_assign_pointer(help->helper, NULL); +		RCU_INIT_POINTER(help->helper, NULL);  	}  	return 0;  } @@ -179,8 +278,91 @@ void nf_ct_helper_destroy(struct nf_conn *ct)  	}  } +static LIST_HEAD(nf_ct_helper_expectfn_list); + +void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) +{ +	spin_lock_bh(&nf_conntrack_expect_lock); +	list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); +	spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); + +void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) +{ +	spin_lock_bh(&nf_conntrack_expect_lock); +	list_del_rcu(&n->head); +	spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_name(const char *name) +{ +	struct nf_ct_helper_expectfn *cur; +	bool found = false; + +	rcu_read_lock(); +	list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { +		if (!strcmp(cur->name, name)) { +			found = true; +			break; +		} +	} +	rcu_read_unlock(); +	return found ? cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_symbol(const void *symbol) +{ +	struct nf_ct_helper_expectfn *cur; +	bool found = false; + +	rcu_read_lock(); +	list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { +		if (cur->expectfn == symbol) { +			found = true; +			break; +		} +	} +	rcu_read_unlock(); +	return found ? cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); + +__printf(3, 4) +void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, +		      const char *fmt, ...) 
+{ +	const struct nf_conn_help *help; +	const struct nf_conntrack_helper *helper; +	struct va_format vaf; +	va_list args; + +	va_start(args, fmt); + +	vaf.fmt = fmt; +	vaf.va = &args; + +	/* Called from the helper function, this call never fails */ +	help = nfct_help(ct); + +	/* rcu_read_lock()ed by nf_hook_slow */ +	helper = rcu_dereference(help->helper); + +	nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, +		      "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); + +	va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_log); +  int nf_conntrack_helper_register(struct nf_conntrack_helper *me)  { +	int ret = 0; +	struct nf_conntrack_helper *cur;  	unsigned int h = helper_hash(&me->tuple);  	BUG_ON(me->expect_policy == NULL); @@ -188,11 +370,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)  	BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);  	mutex_lock(&nf_ct_helper_mutex); +	hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { +		if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 && +		    cur->tuple.src.l3num == me->tuple.src.l3num && +		    cur->tuple.dst.protonum == me->tuple.dst.protonum) { +			ret = -EEXIST; +			goto out; +		} +	}  	hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);  	nf_ct_helper_count++; +out:  	mutex_unlock(&nf_ct_helper_mutex); - -	return 0; +	return ret;  }  EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); @@ -201,30 +391,48 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,  {  	struct nf_conntrack_tuple_hash *h;  	struct nf_conntrack_expect *exp; -	const struct hlist_node *n, *next; +	const struct hlist_node *next;  	const struct hlist_nulls_node *nn;  	unsigned int i; +	int cpu;  	/* Get rid of expectations */ +	spin_lock_bh(&nf_conntrack_expect_lock);  	for (i = 0; i < nf_ct_expect_hsize; i++) { -		hlist_for_each_entry_safe(exp, n, next, +		hlist_for_each_entry_safe(exp, next,  					  &net->ct.expect_hash[i], hnode) {  			struct nf_conn_help *help = nfct_help(exp->master); -			if ((help->helper == me || exp->helper == me) && +			if ((rcu_dereference_protected( +					help->helper, +					lockdep_is_held(&nf_conntrack_expect_lock) +					) == me || exp->helper == me) &&  			    del_timer(&exp->timeout)) {  				nf_ct_unlink_expect(exp);  				nf_ct_expect_put(exp);  			}  		}  	} +	spin_unlock_bh(&nf_conntrack_expect_lock);  	/* Get rid of expecteds, set helpers to NULL. 
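nf_ct_helper_log() above forwards its variadic arguments to nf_log_packet() through the printk "%pV" extension and struct va_format, avoiding an intermediate buffer. The idiom in isolation, with a hypothetical example_log() wrapper:

	__printf(1, 2)
	static void example_log(const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* %pV expands fmt/args inside the outer format string */
		pr_info("example: %pV\n", &vaf);
		va_end(args);
	}

The same vaf-plus-outer-format pair is what nf_ct_helper_log() hands to nf_log_packet(), prefixed with the helper name.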
*/ -	hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) -		unhelp(h, me); -	for (i = 0; i < net->ct.htable_size; i++) { -		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) +	for_each_possible_cpu(cpu) { +		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + +		spin_lock_bh(&pcpu->lock); +		hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)  			unhelp(h, me); +		spin_unlock_bh(&pcpu->lock); +	} +	local_bh_disable(); +	for (i = 0; i < net->ct.htable_size; i++) { +		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); +		if (i < net->ct.htable_size) { +			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) +				unhelp(h, me); +		} +		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);  	} +	local_bh_enable();  }  void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) @@ -242,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)  	synchronize_rcu();  	rtnl_lock(); -	spin_lock_bh(&nf_conntrack_lock);  	for_each_net(net)  		__nf_conntrack_helper_unregister(me, net); -	spin_unlock_bh(&nf_conntrack_lock);  	rtnl_unlock();  }  EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); @@ -256,31 +462,41 @@ static struct nf_ct_ext_type helper_extend __read_mostly = {  	.id	= NF_CT_EXT_HELPER,  }; -int nf_conntrack_helper_init(void) +int nf_conntrack_helper_pernet_init(struct net *net)  { -	int err; +	net->ct.auto_assign_helper_warned = false; +	net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; +	return nf_conntrack_helper_init_sysctl(net); +} + +void nf_conntrack_helper_pernet_fini(struct net *net) +{ +	nf_conntrack_helper_fini_sysctl(net); +} +int nf_conntrack_helper_init(void) +{ +	int ret;  	nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ -	nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, -						  &nf_ct_helper_vmalloc, 0); +	nf_ct_helper_hash = +		nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);  	if (!nf_ct_helper_hash)  		return -ENOMEM; -	err = nf_ct_extend_register(&helper_extend); -	if (err < 0) -		goto err1; +	ret = nf_ct_extend_register(&helper_extend); +	if (ret < 0) { +		pr_err("nf_ct_helper: Unable to register helper extension.\n"); +		goto out_extend; +	}  	return 0; - -err1: -	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, -			     nf_ct_helper_hsize); -	return err; +out_extend: +	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); +	return ret;  }  void nf_conntrack_helper_fini(void)  {  	nf_ct_extend_unregister(&helper_extend); -	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, -			     nf_ct_helper_hsize); +	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);  } diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index b394aa31877..0fd2976db7e 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -1,6 +1,7 @@  /* IRC extension for IP connection tracking, Version 1.21   * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>   * based on RR's ip_conntrack_ftp.c + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public License @@ -33,6 +34,7 @@ static DEFINE_SPINLOCK(irc_buffer_lock);  unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,  				enum ip_conntrack_info ctinfo, +				unsigned int protoff,  				unsigned int matchoff,  				unsigned int matchlen,  				struct nf_conntrack_expect *exp) __read_mostly; @@ -125,8 +127,7 @@ static 
int help(struct sk_buff *skb, unsigned int protoff,  		return NF_ACCEPT;  	/* Until there's been traffic both ways, don't look in packets. */ -	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) +	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)  		return NF_ACCEPT;  	/* Not a full tcp header? */ @@ -186,16 +187,16 @@ static int help(struct sk_buff *skb, unsigned int protoff,  			tuple = &ct->tuplehash[dir].tuple;  			if (tuple->src.u3.ip != dcc_ip &&  			    tuple->dst.u3.ip != dcc_ip) { -				if (net_ratelimit()) -					printk(KERN_WARNING -						"Forged DCC command from %pI4: %pI4:%u\n", -						&tuple->src.u3.ip, -						&dcc_ip, dcc_port); +				net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", +						     &tuple->src.u3.ip, +						     &dcc_ip, dcc_port);  				continue;  			}  			exp = nf_ct_expect_alloc(ct);  			if (exp == NULL) { +				nf_ct_helper_log(skb, ct, +						 "cannot alloc expectation");  				ret = NF_DROP;  				goto out;  			} @@ -208,12 +209,15 @@ static int help(struct sk_buff *skb, unsigned int protoff,  			nf_nat_irc = rcu_dereference(nf_nat_irc_hook);  			if (nf_nat_irc && ct->status & IPS_NAT_MASK) -				ret = nf_nat_irc(skb, ctinfo, +				ret = nf_nat_irc(skb, ctinfo, protoff,  						 addr_beg_p - ib_ptr,  						 addr_end_p - addr_beg_p,  						 exp); -			else if (nf_ct_expect_related(exp) != 0) +			else if (nf_ct_expect_related(exp) != 0) { +				nf_ct_helper_log(skb, ct, +						 "cannot add expectation");  				ret = NF_DROP; +			}  			nf_ct_expect_put(exp);  			goto out;  		} @@ -224,7 +228,6 @@ static int help(struct sk_buff *skb, unsigned int protoff,  }  static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; -static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly;  static struct nf_conntrack_expect_policy irc_exp_policy;  static void nf_conntrack_irc_fini(void); @@ -232,7 +235,6 @@ static void nf_conntrack_irc_fini(void);  static int __init nf_conntrack_irc_init(void)  {  	int i, ret; -	char *tmpname;  	if (max_dcc_channels < 1) {  		printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n"); @@ -258,12 +260,10 @@ static int __init nf_conntrack_irc_init(void)  		irc[i].me = THIS_MODULE;  		irc[i].help = help; -		tmpname = &irc_names[i][0];  		if (ports[i] == IRC_PORT) -			sprintf(tmpname, "irc"); +			sprintf(irc[i].name, "irc");  		else -			sprintf(tmpname, "irc-%u", i); -		irc[i].name = tmpname; +			sprintf(irc[i].name, "irc-%u", i);  		ret = nf_conntrack_helper_register(&irc[i]);  		if (ret) { diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c new file mode 100644 index 00000000000..bb53f120e79 --- /dev/null +++ b/net/netfilter/nf_conntrack_labels.c @@ -0,0 +1,108 @@ +/* + * test/set flag bits stored in conntrack extension area. + * + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
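The new nf_conntrack_labels.c that begins here exposes a small per-conntrack bitmap behind two calls, nf_connlabel_set() and nf_connlabel_match(). A sketch of how a consumer might use them; example_tag_flow() and the bit number are hypothetical, the two API calls are the ones defined below:

	static bool example_tag_flow(struct nf_conn *ct)
	{
		/* -ENOSPC when the ct has no (or too narrow a) label area */
		if (nf_connlabel_set(ct, 3) < 0)
			return false;

		/* true once bit 3 is set and within the allocated width */
		return nf_connlabel_match(ct, 3);
	}

Note that the set path only generates an IPCT_LABEL event when the bit actually transitions, so tagging the same flow repeatedly stays cheap.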
+ */ + +#include <linux/export.h> +#include <linux/types.h> + +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +static unsigned int label_bits(const struct nf_conn_labels *l) +{ +	unsigned int longs = l->words; +	return longs * BITS_PER_LONG; +} + +bool nf_connlabel_match(const struct nf_conn *ct, u16 bit) +{ +	struct nf_conn_labels *labels = nf_ct_labels_find(ct); + +	if (!labels) +		return false; + +	return bit < label_bits(labels) && test_bit(bit, labels->bits); +} +EXPORT_SYMBOL_GPL(nf_connlabel_match); + +int nf_connlabel_set(struct nf_conn *ct, u16 bit) +{ +	struct nf_conn_labels *labels = nf_ct_labels_find(ct); + +	if (!labels || bit >= label_bits(labels)) +		return -ENOSPC; + +	if (test_bit(bit, labels->bits)) +		return 0; + +	if (!test_and_set_bit(bit, labels->bits)) +		nf_conntrack_event_cache(IPCT_LABEL, ct); + +	return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabel_set); + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) +static void replace_u32(u32 *address, u32 mask, u32 new) +{ +	u32 old, tmp; + +	do { +		old = *address; +		tmp = (old & mask) ^ new; +	} while (cmpxchg(address, old, tmp) != old); +} + +int nf_connlabels_replace(struct nf_conn *ct, +			  const u32 *data, +			  const u32 *mask, unsigned int words32) +{ +	struct nf_conn_labels *labels; +	unsigned int size, i; +	u32 *dst; + +	labels = nf_ct_labels_find(ct); +	if (!labels) +		return -ENOSPC; + +	size = labels->words * sizeof(long); +	if (size < (words32 * sizeof(u32))) +		words32 = size / sizeof(u32); + +	dst = (u32 *) labels->bits; +	if (words32) { +		for (i = 0; i < words32; i++) +			replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); +	} + +	size /= sizeof(u32); +	for (i = words32; i < size; i++) /* pad */ +		replace_u32(&dst[i], 0, 0); + +	nf_conntrack_event_cache(IPCT_LABEL, ct); +	return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_replace); +#endif + +static struct nf_ct_ext_type labels_extend __read_mostly = { +	.len    = sizeof(struct nf_conn_labels), +	.align  = __alignof__(struct nf_conn_labels), +	.id     = NF_CT_EXT_LABELS, +}; + +int nf_conntrack_labels_init(void) +{ +	return nf_ct_extend_register(&labels_extend); +} + +void nf_conntrack_labels_fini(void) +{ +	nf_ct_extend_unregister(&labels_extend); +} diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index aadde018a07..4c8f30a3d6d 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -18,14 +18,7 @@  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_addr.h>  #include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/route.h>  #include <net/netfilter/nf_conntrack.h>  #include <net/netfilter/nf_conntrack_helper.h> @@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");  MODULE_ALIAS_NFCT_HELPER("netbios_ns");  static unsigned int timeout __read_mostly = 3; -module_param(timeout, uint, 0400); +module_param(timeout, uint, S_IRUSR);  MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); -static int help(struct sk_buff *skb, unsigned int protoff, -		struct nf_conn *ct, enum ip_conntrack_info ctinfo) -{ -	struct nf_conntrack_expect *exp; -	struct iphdr *iph = ip_hdr(skb); -	struct rtable *rt = skb_rtable(skb); -	struct in_device *in_dev; -	__be32 mask = 0; - -	/* we're only interested in locally generated packets */ -	if (skb->sk == NULL) -		goto 
out; -	if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) -		goto out; -	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) -		goto out; - -	rcu_read_lock(); -	in_dev = __in_dev_get_rcu(rt->dst.dev); -	if (in_dev != NULL) { -		for_primary_ifa(in_dev) { -			if (ifa->ifa_broadcast == iph->daddr) { -				mask = ifa->ifa_mask; -				break; -			} -		} endfor_ifa(in_dev); -	} -	rcu_read_unlock(); - -	if (mask == 0) -		goto out; - -	exp = nf_ct_expect_alloc(ct); -	if (exp == NULL) -		goto out; - -	exp->tuple                = ct->tuplehash[IP_CT_DIR_REPLY].tuple; -	exp->tuple.src.u.udp.port = htons(NMBD_PORT); - -	exp->mask.src.u3.ip       = mask; -	exp->mask.src.u.udp.port  = htons(0xFFFF); - -	exp->expectfn             = NULL; -	exp->flags                = NF_CT_EXPECT_PERMANENT; -	exp->class		  = NF_CT_EXPECT_CLASS_DEFAULT; -	exp->helper               = NULL; - -	nf_ct_expect_related(exp); -	nf_ct_expect_put(exp); - -	nf_ct_refresh(ct, skb, timeout * HZ); -out: -	return NF_ACCEPT; -} -  static struct nf_conntrack_expect_policy exp_policy = {  	.max_expected	= 1,  }; +static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, +		   struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ +	return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); +} +  static struct nf_conntrack_helper helper __read_mostly = {  	.name			= "netbios-ns", -	.tuple.src.l3num	= AF_INET, +	.tuple.src.l3num	= NFPROTO_IPV4,  	.tuple.src.u.udp.port	= cpu_to_be16(NMBD_PORT),  	.tuple.dst.protonum	= IPPROTO_UDP,  	.me			= THIS_MODULE, -	.help			= help, +	.help			= netbios_ns_help,  	.expect_policy		= &exp_policy,  }; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b729ace1dcc..300ed1eec72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -4,7 +4,7 @@   * (C) 2001 by Jay Schulist <jschlst@samba.org>   * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>   * (C) 2003 by Patrick Mchardy <kaber@trash.net> - * (C) 2005-2008 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>   *   * Initial connection tracking via netlink development funded and   * generally made possible by Network Robots, Inc. 
(www.networkrobots.com) @@ -37,14 +37,18 @@  #include <net/netfilter/nf_conntrack_core.h>  #include <net/netfilter/nf_conntrack_expect.h>  #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h>  #include <net/netfilter/nf_conntrack_l3proto.h>  #include <net/netfilter/nf_conntrack_l4proto.h>  #include <net/netfilter/nf_conntrack_tuple.h>  #include <net/netfilter/nf_conntrack_acct.h>  #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_labels.h>  #ifdef CONFIG_NF_NAT_NEEDED  #include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_helper.h>  #endif  #include <linux/netfilter/nfnetlink.h> @@ -65,7 +69,8 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb,  	nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);  	if (!nest_parms)  		goto nla_put_failure; -	NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum); +	if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) +		goto nla_put_failure;  	if (likely(l4proto->tuple_to_nlattr))  		ret = l4proto->tuple_to_nlattr(skb, tuple); @@ -109,22 +114,24 @@ ctnetlink_dump_tuples(struct sk_buff *skb,  	struct nf_conntrack_l3proto *l3proto;  	struct nf_conntrack_l4proto *l4proto; +	rcu_read_lock();  	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);  	ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); -	if (unlikely(ret < 0)) -		return ret; - -	l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); -	ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); - +	if (ret >= 0) { +		l4proto = __nf_ct_l4proto_find(tuple->src.l3num, +					       tuple->dst.protonum); +		ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); +	} +	rcu_read_unlock();  	return ret;  }  static inline int  ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)  { -	NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status)); +	if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -134,12 +141,13 @@ nla_put_failure:  static inline int  ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)  { -	long timeout = (ct->timeout.expires - jiffies) / HZ; +	long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;  	if (timeout < 0)  		timeout = 0; -	NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout)); +	if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -188,7 +196,8 @@ ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)  	nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);  	if (!nest_helper)  		goto nla_put_failure; -	NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name); +	if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) +		goto nla_put_failure;  	if (helper->to_nlattr)  		helper->to_nlattr(skb, ct); @@ -202,26 +211,72 @@ nla_put_failure:  }  static int -ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, -			enum ip_conntrack_dir dir) +dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, +	      enum ip_conntrack_dir dir, int type)  { -	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; +	enum ctattr_type attr = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; +	struct nf_conn_counter *counter = acct->counter;  	struct nlattr *nest_count; -	const struct nf_conn_counter *acct; +	u64 pkts, bytes; + +	if (type == IPCTNL_MSG_CT_GET_CTRZERO) { +		pkts = atomic64_xchg(&counter[dir].packets, 0); +		bytes = atomic64_xchg(&counter[dir].bytes, 0); +	} else { +		pkts = atomic64_read(&counter[dir].packets); +		bytes = atomic64_read(&counter[dir].bytes); +	} + +	nest_count = nla_nest_start(skb, attr | NLA_F_NESTED); +	if (!nest_count) +		goto nla_put_failure; + +	if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) || +	    nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes))) +		goto nla_put_failure; + +	nla_nest_end(skb, nest_count); + +	return 0; + +nla_put_failure: +	return -1; +} + +static int +ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type) +{ +	struct nf_conn_acct *acct = nf_conn_acct_find(ct); -	acct = nf_conn_acct_find(ct);  	if (!acct)  		return 0; -	nest_count = nla_nest_start(skb, type | NLA_F_NESTED); +	if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) +		return -1; +	if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) +		return -1; + +	return 0; +} + +static int +ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ +	struct nlattr *nest_count; +	const struct nf_conn_tstamp *tstamp; + +	tstamp = nf_conn_tstamp_find(ct); +	if (!tstamp) +		return 0; + +	nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);  	if (!nest_count)  		goto nla_put_failure; -	NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS, -		     cpu_to_be64(acct[dir].packets)); -	NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES, -		     cpu_to_be64(acct[dir].bytes)); - +	if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) || +	    (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, +					       cpu_to_be64(tstamp->stop)))) +		goto nla_put_failure;  	nla_nest_end(skb, nest_count);  	return 0; @@ -234,7 +289,8 @@ nla_put_failure:  static inline int  ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)  { -	NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark)); +	if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -254,14 +310,15 @@ ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)  	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);  	if (ret) -		return ret; +		return 0;  	ret = -1;  	nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);  	if (!nest_secctx)  		goto nla_put_failure; -	NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx); +	if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) +		goto nla_put_failure;  	nla_nest_end(skb, nest_secctx);  	ret = 0; @@ -273,6 +330,40 @@ nla_put_failure:  #define ctnetlink_dump_secctx(a, b) (0)  #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS +static int ctnetlink_label_size(const struct nf_conn *ct) +{ +	struct nf_conn_labels *labels = nf_ct_labels_find(ct); + +	if (!labels) +		return 0; +	return nla_total_size(labels->words * sizeof(long)); +} + +static int +ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) +{ +	struct nf_conn_labels *labels = nf_ct_labels_find(ct); +	unsigned int len, i; + +	if (!labels) +		return 0; + +	len = labels->words * sizeof(long); +	i = 0; +	do { +		if (labels->bits[i] != 0) +			return nla_put(skb, CTA_LABELS, len, labels->bits); +		i++; +	} while (i < labels->words); + +	return 0; +} +#else +#define ctnetlink_dump_labels(a, b) (0) +#define ctnetlink_label_size(a)	(0) +#endif +  #define 
master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)  static inline int @@ -296,9 +387,8 @@ nla_put_failure:  	return -1;  } -#ifdef CONFIG_NF_NAT_NEEDED  static int -dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)  {  	struct nlattr *nest_parms; @@ -306,12 +396,13 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)  	if (!nest_parms)  		goto nla_put_failure; -	NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS, -		     htonl(natseq->correction_pos)); -	NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, -		     htonl(natseq->offset_before)); -	NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER, -		     htonl(natseq->offset_after)); +	if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, +			 htonl(seq->correction_pos)) || +	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, +			 htonl(seq->offset_before)) || +	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, +			 htonl(seq->offset_after))) +		goto nla_put_failure;  	nla_nest_end(skb, nest_parms); @@ -322,32 +413,30 @@ nla_put_failure:  }  static inline int -ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)  { -	struct nf_nat_seq *natseq; -	struct nf_conn_nat *nat = nfct_nat(ct); +	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); +	struct nf_ct_seqadj *seq; -	if (!(ct->status & IPS_SEQ_ADJUST) || !nat) +	if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)  		return 0; -	natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; -	if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) +	seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; +	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)  		return -1; -	natseq = &nat->seq[IP_CT_DIR_REPLY]; -	if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) +	seq = &seqadj->seq[IP_CT_DIR_REPLY]; +	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)  		return -1;  	return 0;  } -#else -#define ctnetlink_dump_nat_seq_adj(a, b) (0) -#endif  static inline int  ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)  { -	NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct)); +	if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -357,7 +446,8 @@ nla_put_failure:  static inline int  ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)  { -	NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))); +	if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -365,16 +455,16 @@ nla_put_failure:  }  static int -ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, -		    int event, struct nf_conn *ct) +ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +		    struct nf_conn *ct)  {  	struct nlmsghdr *nlh;  	struct nfgenmsg *nfmsg;  	struct nlattr *nest_parms; -	unsigned int flags = pid ? NLM_F_MULTI : 0; +	unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; -	event |= NFNL_SUBSYS_CTNETLINK << 8; -	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); +	event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);  	if (nlh == NULL)  		goto nlmsg_failure; @@ -397,21 +487,23 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,  		goto nla_put_failure;  	nla_nest_end(skb, nest_parms); -	if (nf_ct_zone(ct)) -		NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); +	if (nf_ct_zone(ct) && +	    nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) +		goto nla_put_failure;  	if (ctnetlink_dump_status(skb, ct) < 0 ||  	    ctnetlink_dump_timeout(skb, ct) < 0 || -	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || -	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || +	    ctnetlink_dump_acct(skb, ct, type) < 0 || +	    ctnetlink_dump_timestamp(skb, ct) < 0 ||  	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||  	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||  	    ctnetlink_dump_mark(skb, ct) < 0 ||  	    ctnetlink_dump_secctx(skb, ct) < 0 || +	    ctnetlink_dump_labels(skb, ct) < 0 ||  	    ctnetlink_dump_id(skb, ct) < 0 ||  	    ctnetlink_dump_use(skb, ct) < 0 ||  	    ctnetlink_dump_master(skb, ct) < 0 || -	    ctnetlink_dump_nat_seq_adj(skb, ct) < 0) +	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)  		goto nla_put_failure;  	nlmsg_end(skb, nlh); @@ -423,7 +515,6 @@ nla_put_failure:  	return -1;  } -#ifdef CONFIG_NF_CONNTRACK_EVENTS  static inline size_t  ctnetlink_proto_size(const struct nf_conn *ct)  { @@ -443,7 +534,7 @@ ctnetlink_proto_size(const struct nf_conn *ct)  }  static inline size_t -ctnetlink_counters_size(const struct nf_conn *ct) +ctnetlink_acct_size(const struct nf_conn *ct)  {  	if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))  		return 0; @@ -453,16 +544,34 @@ ctnetlink_counters_size(const struct nf_conn *ct)  	       ;  } -#ifdef CONFIG_NF_CONNTRACK_SECMARK -static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct) +static inline int +ctnetlink_secctx_size(const struct nf_conn *ct)  { -	int len; +#ifdef CONFIG_NF_CONNTRACK_SECMARK +	int len, ret; -	security_secid_to_secctx(ct->secmark, NULL, &len); +	ret = security_secid_to_secctx(ct->secmark, NULL, &len); +	if (ret) +		return 0; -	return sizeof(char) * len; +	return nla_total_size(0) /* CTA_SECCTX */ +	       + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ +#else +	return 0; +#endif  } + +static inline size_t +ctnetlink_timestamp_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +	if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) +		return 0; +	return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); +#else +	return 0;  #endif +}  static inline size_t  ctnetlink_nlmsg_size(const struct nf_conn *ct) @@ -474,15 +583,13 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)  	       + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ -	       + ctnetlink_counters_size(ct) +	       + ctnetlink_acct_size(ct) +	       + ctnetlink_timestamp_size(ct)  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */  	       + nla_total_size(0) /* CTA_PROTOINFO */  	       + nla_total_size(0) /* CTA_HELP */  	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ -#ifdef CONFIG_NF_CONNTRACK_SECMARK -	       + nla_total_size(0) /* CTA_SECCTX */ -	       + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* 
CTA_SECCTX_NAME */ -#endif +	       + ctnetlink_secctx_size(ct)  #ifdef CONFIG_NF_NAT_NEEDED  	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */  	       + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ @@ -490,10 +597,15 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)  #ifdef CONFIG_NF_CONNTRACK_MARK  	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */  #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif  	       + ctnetlink_proto_size(ct) +	       + ctnetlink_label_size(ct)  	       ;  } +#ifdef CONFIG_NF_CONNTRACK_EVENTS  static int  ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  { @@ -533,7 +645,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  		goto errout;  	type |= NFNL_SUBSYS_CTNETLINK << 8; -	nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); +	nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);  	if (nlh == NULL)  		goto nlmsg_failure; @@ -557,8 +669,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  		goto nla_put_failure;  	nla_nest_end(skb, nest_parms); -	if (nf_ct_zone(ct)) -		NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); +	if (nf_ct_zone(ct) && +	    nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) +		goto nla_put_failure;  	if (ctnetlink_dump_id(skb, ct) < 0)  		goto nla_put_failure; @@ -567,8 +680,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  		goto nla_put_failure;  	if (events & (1 << IPCT_DESTROY)) { -		if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || -		    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) +		if (ctnetlink_dump_acct(skb, ct, type) < 0 || +		    ctnetlink_dump_timestamp(skb, ct) < 0)  			goto nla_put_failure;  	} else {  		if (ctnetlink_dump_timeout(skb, ct) < 0) @@ -587,13 +700,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  		    && ctnetlink_dump_secctx(skb, ct) < 0)  			goto nla_put_failure;  #endif +		if (events & (1 << IPCT_LABEL) && +		     ctnetlink_dump_labels(skb, ct) < 0) +			goto nla_put_failure;  		if (events & (1 << IPCT_RELATED) &&  		    ctnetlink_dump_master(skb, ct) < 0)  			goto nla_put_failure; -		if (events & (1 << IPCT_NATSEQADJ) && -		    ctnetlink_dump_nat_seq_adj(skb, ct) < 0) +		if (events & (1 << IPCT_SEQADJ) && +		    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)  			goto nla_put_failure;  	} @@ -605,7 +721,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)  	rcu_read_unlock();  	nlmsg_end(skb, nlh); -	err = nfnetlink_send(skb, net, item->pid, group, item->report, +	err = nfnetlink_send(skb, net, item->portid, group, item->report,  			     GFP_ATOMIC);  	if (err == -ENOBUFS || err == -EAGAIN)  		return -ENOBUFS; @@ -629,9 +745,18 @@ static int ctnetlink_done(struct netlink_callback *cb)  {  	if (cb->args[1])  		nf_ct_put((struct nf_conn *)cb->args[1]); +	if (cb->data) +		kfree(cb->data);  	return 0;  } +struct ctnetlink_dump_filter { +	struct { +		u_int32_t val; +		u_int32_t mask; +	} mark; +}; +  static int  ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)  { @@ -641,53 +766,67 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)  	struct hlist_nulls_node *n;  	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);  	u_int8_t l3proto = nfmsg->nfgen_family; +	int res; +	spinlock_t *lockp; + +#ifdef CONFIG_NF_CONNTRACK_MARK +	const struct ctnetlink_dump_filter *filter = 
cb->data; +#endif -	rcu_read_lock();  	last = (struct nf_conn *)cb->args[1]; + +	local_bh_disable();  	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {  restart: -		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[cb->args[0]], +		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; +		spin_lock(lockp); +		if (cb->args[0] >= net->ct.htable_size) { +			spin_unlock(lockp); +			goto out; +		} +		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],  					 hnnode) {  			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)  				continue;  			ct = nf_ct_tuplehash_to_ctrack(h); -			if (!atomic_inc_not_zero(&ct->ct_general.use)) -				continue;  			/* Dump entries of a given L3 protocol number.  			 * If it is not specified, ie. l3proto == 0,  			 * then dump everything. */  			if (l3proto && nf_ct_l3num(ct) != l3proto) -				goto releasect; +				continue;  			if (cb->args[1]) {  				if (ct != last) -					goto releasect; +					continue;  				cb->args[1] = 0;  			} -			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, -						cb->nlh->nlmsg_seq, -						IPCTNL_MSG_CT_NEW, ct) < 0) { +#ifdef CONFIG_NF_CONNTRACK_MARK +			if (filter && !((ct->mark & filter->mark.mask) == +					filter->mark.val)) { +				continue; +			} +#endif +			rcu_read_lock(); +			res = +			ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, +					    cb->nlh->nlmsg_seq, +					    NFNL_MSG_TYPE(cb->nlh->nlmsg_type), +					    ct); +			rcu_read_unlock(); +			if (res < 0) { +				nf_conntrack_get(&ct->ct_general);  				cb->args[1] = (unsigned long)ct; +				spin_unlock(lockp);  				goto out;  			} - -			if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == -						IPCTNL_MSG_CT_GET_CTRZERO) { -				struct nf_conn_counter *acct; - -				acct = nf_conn_acct_find(ct); -				if (acct) -					memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX])); -			} -releasect: -		nf_ct_put(ct);  		} +		spin_unlock(lockp);  		if (cb->args[1]) {  			cb->args[1] = 0;  			goto restart;  		}  	}  out: -	rcu_read_unlock(); +	local_bh_enable();  	if (last)  		nf_ct_put(last); @@ -701,7 +840,9 @@ ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)  	struct nf_conntrack_l3proto *l3proto;  	int ret = 0; -	nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); +	ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); +	if (ret < 0) +		return ret;  	rcu_read_lock();  	l3proto = __nf_ct_l3proto_find(tuple->src.l3num); @@ -761,14 +902,16 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {  static int  ctnetlink_parse_tuple(const struct nlattr * const cda[],  		      struct nf_conntrack_tuple *tuple, -		      enum ctattr_tuple type, u_int8_t l3num) +		      enum ctattr_type type, u_int8_t l3num)  {  	struct nlattr *tb[CTA_TUPLE_MAX+1];  	int err;  	memset(tuple, 0, sizeof(*tuple)); -	nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); +	err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); +	if (err < 0) +		return err;  	if (!tb[CTA_TUPLE_IP])  		return -EINVAL; @@ -811,21 +954,29 @@ ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)  }  static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { -	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING }, +	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING, +				    .len = NF_CT_HELPER_NAME_LEN - 1 },  };  static inline int -ctnetlink_parse_help(const struct nlattr *attr, char **helper_name) +ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, +		     struct nlattr **helpinfo)  { +	int err;  	struct nlattr *tb[CTA_HELP_MAX+1]; 
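The CTA_MARK/CTA_MARK_MASK filter above travels in netlink_dump_control.data: allocated when the dump is set up, visible as cb->data in every callback, and reclaimed by the ->done handler (ctnetlink_done() now kfree()s it). The ownership pattern in brief, with hypothetical example_* names:

	struct example_filter {
		u32 val, mask;
	};

	static int example_dump_cb(struct sk_buff *skb, struct netlink_callback *cb)
	{
		/* cb->data is the example_filter; emit matching entries here */
		return skb->len;	/* returning 0 would end the dump */
	}

	static int example_done(struct netlink_callback *cb)
	{
		kfree(cb->data);	/* per-dump state from example_start() */
		return 0;
	}

	static int example_start(struct sock *nl, struct sk_buff *skb,
				 const struct nlmsghdr *nlh)
	{
		struct netlink_dump_control c = {
			.dump = example_dump_cb,
			.done = example_done,
		};

		c.data = kzalloc(sizeof(struct example_filter), GFP_ATOMIC);
		if (!c.data)
			return -ENOMEM;
		return netlink_dump_start(nl, skb, nlh, &c);
	}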
-	nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); +	err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); +	if (err < 0) +		return err;  	if (!tb[CTA_HELP_NAME])  		return -EINVAL;  	*helper_name = nla_data(tb[CTA_HELP_NAME]); +	if (tb[CTA_HELP_INFO]) +		*helpinfo = tb[CTA_HELP_INFO]; +  	return 0;  } @@ -841,7 +992,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {  	[CTA_ID]		= { .type = NLA_U32 },  	[CTA_NAT_DST]		= { .type = NLA_NESTED },  	[CTA_TUPLE_MASTER]	= { .type = NLA_NESTED }, +	[CTA_NAT_SEQ_ADJ_ORIG]  = { .type = NLA_NESTED }, +	[CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },  	[CTA_ZONE]		= { .type = NLA_U16 }, +	[CTA_MARK_MASK]		= { .type = NLA_U32 }, +	[CTA_LABELS]		= { .type = NLA_BINARY, +				    .len = NF_CT_LABELS_MAX_SIZE }, +	[CTA_LABELS_MASK]	= { .type = NLA_BINARY, +				    .len = NF_CT_LABELS_MAX_SIZE },  };  static int @@ -869,7 +1027,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,  	else {  		/* Flush the whole table */  		nf_conntrack_flush_report(net, -					 NETLINK_CB(skb).pid, +					 NETLINK_CB(skb).portid,  					 nlmsg_report(nlh));  		return 0;  	} @@ -891,20 +1049,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,  		}  	} -	if (nf_conntrack_event_report(IPCT_DESTROY, ct, -				      NETLINK_CB(skb).pid, -				      nlmsg_report(nlh)) < 0) { -		nf_ct_delete_from_lists(ct); -		/* we failed to report the event, try later */ -		nf_ct_insert_dying_list(ct); -		nf_ct_put(ct); -		return 0; -	} +	if (del_timer(&ct->timeout)) +		nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); -	/* death_by_timeout would report the event again */ -	set_bit(IPS_DYING_BIT, &ct->status); - -	nf_ct_kill(ct);  	nf_ct_put(ct);  	return 0; @@ -925,9 +1072,28 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,  	u16 zone;  	int err; -	if (nlh->nlmsg_flags & NLM_F_DUMP) -		return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, -					  ctnetlink_done); +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = ctnetlink_dump_table, +			.done = ctnetlink_done, +		}; +#ifdef CONFIG_NF_CONNTRACK_MARK +		if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { +			struct ctnetlink_dump_filter *filter; + +			filter = kzalloc(sizeof(struct ctnetlink_dump_filter), +					 GFP_ATOMIC); +			if (filter == NULL) +				return -ENOMEM; + +			filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); +			filter->mark.mask = +				ntohl(nla_get_be32(cda[CTA_MARK_MASK])); +			c.data = filter; +		} +#endif +		return netlink_dump_start(ctnl, skb, nlh, &c); +	}  	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);  	if (err < 0) @@ -957,14 +1123,14 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,  	}  	rcu_read_lock(); -	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, -				  IPCTNL_MSG_CT_NEW, ct); +	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, +				  NFNL_MSG_TYPE(nlh->nlmsg_type), ct);  	rcu_read_unlock();  	nf_ct_put(ct);  	if (err <= 0)  		goto free; -	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); +	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);  	if (err < 0)  		goto out; @@ -973,7 +1139,125 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,  free:  	kfree_skb(skb2);  out: -	return err; +	/* this avoids a loop in nfnetlink. */ +	return err == -EAGAIN ? 
-ENOBUFS : err; +} + +static int ctnetlink_done_list(struct netlink_callback *cb) +{ +	if (cb->args[1]) +		nf_ct_put((struct nf_conn *)cb->args[1]); +	return 0; +} + +static int +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) +{ +	struct nf_conn *ct, *last; +	struct nf_conntrack_tuple_hash *h; +	struct hlist_nulls_node *n; +	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	u_int8_t l3proto = nfmsg->nfgen_family; +	int res; +	int cpu; +	struct hlist_nulls_head *list; +	struct net *net = sock_net(skb->sk); + +	if (cb->args[2]) +		return 0; + +	last = (struct nf_conn *)cb->args[1]; + +	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { +		struct ct_pcpu *pcpu; + +		if (!cpu_possible(cpu)) +			continue; + +		pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); +		spin_lock_bh(&pcpu->lock); +		list = dying ? &pcpu->dying : &pcpu->unconfirmed; +restart: +		hlist_nulls_for_each_entry(h, n, list, hnnode) { +			ct = nf_ct_tuplehash_to_ctrack(h); +			if (l3proto && nf_ct_l3num(ct) != l3proto) +				continue; +			if (cb->args[1]) { +				if (ct != last) +					continue; +				cb->args[1] = 0; +			} +			rcu_read_lock(); +			res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, +						  cb->nlh->nlmsg_seq, +						  NFNL_MSG_TYPE(cb->nlh->nlmsg_type), +						  ct); +			rcu_read_unlock(); +			if (res < 0) { +				if (!atomic_inc_not_zero(&ct->ct_general.use)) +					continue; +				cb->args[0] = cpu; +				cb->args[1] = (unsigned long)ct; +				spin_unlock_bh(&pcpu->lock); +				goto out; +			} +		} +		if (cb->args[1]) { +			cb->args[1] = 0; +			goto restart; +		} +		spin_unlock_bh(&pcpu->lock); +	} +	cb->args[2] = 1; +out: +	if (last) +		nf_ct_put(last); + +	return skb->len; +} + +static int +ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) +{ +	return ctnetlink_dump_list(skb, cb, true); +} + +static int +ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, +		       const struct nlmsghdr *nlh, +		       const struct nlattr * const cda[]) +{ +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = ctnetlink_dump_dying, +			.done = ctnetlink_done_list, +		}; +		return netlink_dump_start(ctnl, skb, nlh, &c); +	} + +	return -EOPNOTSUPP; +} + +static int +ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) +{ +	return ctnetlink_dump_list(skb, cb, false); +} + +static int +ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const cda[]) +{ +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = ctnetlink_dump_unconfirmed, +			.done = ctnetlink_done_list, +		}; +		return netlink_dump_start(ctnl, skb, nlh, &c); +	} + +	return -EOPNOTSUPP;  }  #ifdef CONFIG_NF_NAT_NEEDED @@ -983,21 +1267,19 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,  			  const struct nlattr *attr)  {  	typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; +	int err;  	parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);  	if (!parse_nat_setup) {  #ifdef CONFIG_MODULES  		rcu_read_unlock(); -		spin_unlock_bh(&nf_conntrack_lock); -		nfnl_unlock(); -		if (request_module("nf-nat-ipv4") < 0) { -			nfnl_lock(); -			spin_lock_bh(&nf_conntrack_lock); +		nfnl_unlock(NFNL_SUBSYS_CTNETLINK); +		if (request_module("nf-nat") < 0) { +			nfnl_lock(NFNL_SUBSYS_CTNETLINK);  			rcu_read_lock();  			return -EOPNOTSUPP;  		} -		nfnl_lock(); -		spin_lock_bh(&nf_conntrack_lock); +		nfnl_lock(NFNL_SUBSYS_CTNETLINK);  		
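The ctnetlink_parse_nat_setup() changes around this point follow nfnetlink's module-autoload idiom: both rcu_read_lock() and the per-subsystem nfnl mutex must be dropped before request_module() can sleep, and once the module is in, -EAGAIN tells nfnetlink to replay the whole message against the now-populated hook. The shape of it, condensed (the hook and module names are the ones used in the hunk, the retry logic is abbreviated):

	parse = rcu_dereference(nfnetlink_parse_nat_setup_hook);
	if (!parse) {
		rcu_read_unlock();
		nfnl_unlock(NFNL_SUBSYS_CTNETLINK);	/* request_module() may sleep */
		if (request_module("nf-nat") < 0) {
			nfnl_lock(NFNL_SUBSYS_CTNETLINK);
			rcu_read_lock();
			return -EOPNOTSUPP;
		}
		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
		rcu_read_lock();
		return -EAGAIN;		/* nfnetlink replays the request */
	}

The second stage repeats the same dance with "nf-nat-%u" when the generic module loaded but the per-family backend is still missing.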
rcu_read_lock();  		if (nfnetlink_parse_nat_setup_hook)  			return -EAGAIN; @@ -1005,7 +1287,23 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,  		return -EOPNOTSUPP;  	} -	return parse_nat_setup(ct, manip, attr); +	err = parse_nat_setup(ct, manip, attr); +	if (err == -EAGAIN) { +#ifdef CONFIG_MODULES +		rcu_read_unlock(); +		nfnl_unlock(NFNL_SUBSYS_CTNETLINK); +		if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) { +			nfnl_lock(NFNL_SUBSYS_CTNETLINK); +			rcu_read_lock(); +			return -EOPNOTSUPP; +		} +		nfnl_lock(NFNL_SUBSYS_CTNETLINK); +		rcu_read_lock(); +#else +		err = -EOPNOTSUPP; +#endif +	} +	return err;  }  #endif @@ -1036,27 +1334,25 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])  }  static int -ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])  {  #ifdef CONFIG_NF_NAT_NEEDED  	int ret; -	if (cda[CTA_NAT_DST]) { -		ret = ctnetlink_parse_nat_setup(ct, -						IP_NAT_MANIP_DST, -						cda[CTA_NAT_DST]); -		if (ret < 0) -			return ret; -	} -	if (cda[CTA_NAT_SRC]) { -		ret = ctnetlink_parse_nat_setup(ct, -						IP_NAT_MANIP_SRC, -						cda[CTA_NAT_SRC]); -		if (ret < 0) -			return ret; -	} -	return 0; +	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) +		return 0; + +	ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, +					cda[CTA_NAT_DST]); +	if (ret < 0) +		return ret; + +	ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, +					cda[CTA_NAT_SRC]); +	return ret;  #else +	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) +		return 0;  	return -EOPNOTSUPP;  #endif  } @@ -1067,13 +1363,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])  	struct nf_conntrack_helper *helper;  	struct nf_conn_help *help = nfct_help(ct);  	char *helpname = NULL; +	struct nlattr *helpinfo = NULL;  	int err;  	/* don't change helper of sibling connections */  	if (ct->master)  		return -EBUSY; -	err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); +	err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);  	if (err < 0)  		return err; @@ -1081,7 +1378,7 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])  		if (help && help->helper) {  			/* we had a helper before ... */  			nf_ct_remove_expectations(ct); -			rcu_assign_pointer(help->helper, NULL); +			RCU_INIT_POINTER(help->helper, NULL);  		}  		return 0; @@ -1091,14 +1388,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])  					    nf_ct_protonum(ct));  	if (helper == NULL) {  #ifdef CONFIG_MODULES -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		if (request_module("nfct-helper-%s", helpname) < 0) { -			spin_lock_bh(&nf_conntrack_lock); +			spin_lock_bh(&nf_conntrack_expect_lock);  			return -EOPNOTSUPP;  		} -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),  						    nf_ct_protonum(ct));  		if (helper) @@ -1108,20 +1405,17 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])  	}  	if (help) { -		if (help->helper == helper) +		if (help->helper == helper) { +			/* update private helper data if allowed. 
*/ +			if (helper->from_nlattr) +				helper->from_nlattr(helpinfo, ct);  			return 0; -		if (help->helper) +		} else  			return -EBUSY; -		/* need to zero data of old helper */ -		memset(&help->help, 0, sizeof(help->help)); -	} else { -		/* we cannot set a helper for an existing conntrack */ -		return -EOPNOTSUPP;  	} -	rcu_assign_pointer(help->helper, helper); - -	return 0; +	/* we cannot set a helper for an existing conntrack */ +	return -EOPNOTSUPP;  }  static inline int @@ -1152,7 +1446,9 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]  	struct nf_conntrack_l4proto *l4proto;  	int err = 0; -	nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); +	err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); +	if (err < 0) +		return err;  	rcu_read_lock();  	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); @@ -1163,63 +1459,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]  	return err;  } -#ifdef CONFIG_NF_NAT_NEEDED -static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { -	[CTA_NAT_SEQ_CORRECTION_POS]	= { .type = NLA_U32 }, -	[CTA_NAT_SEQ_OFFSET_BEFORE]	= { .type = NLA_U32 }, -	[CTA_NAT_SEQ_OFFSET_AFTER]	= { .type = NLA_U32 }, +static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { +	[CTA_SEQADJ_CORRECTION_POS]	= { .type = NLA_U32 }, +	[CTA_SEQADJ_OFFSET_BEFORE]	= { .type = NLA_U32 }, +	[CTA_SEQADJ_OFFSET_AFTER]	= { .type = NLA_U32 },  };  static inline int -change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)  { -	struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; +	int err; +	struct nlattr *cda[CTA_SEQADJ_MAX+1]; -	nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); +	err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); +	if (err < 0) +		return err; -	if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) +	if (!cda[CTA_SEQADJ_CORRECTION_POS])  		return -EINVAL; -	natseq->correction_pos = -		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); +	seq->correction_pos = +		ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); -	if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) +	if (!cda[CTA_SEQADJ_OFFSET_BEFORE])  		return -EINVAL; -	natseq->offset_before = -		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); +	seq->offset_before = +		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); -	if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) +	if (!cda[CTA_SEQADJ_OFFSET_AFTER])  		return -EINVAL; -	natseq->offset_after = -		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); +	seq->offset_after = +		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));  	return 0;  }  static int -ctnetlink_change_nat_seq_adj(struct nf_conn *ct, -			     const struct nlattr * const cda[]) +ctnetlink_change_seq_adj(struct nf_conn *ct, +			 const struct nlattr * const cda[])  { +	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);  	int ret = 0; -	struct nf_conn_nat *nat = nfct_nat(ct); -	if (!nat) +	if (!seqadj)  		return 0; -	if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { -		ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], -					 cda[CTA_NAT_SEQ_ADJ_ORIG]); +	if (cda[CTA_SEQ_ADJ_ORIG]) { +		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], +				     cda[CTA_SEQ_ADJ_ORIG]);  		if (ret < 0)  			return ret;  		ct->status |= IPS_SEQ_ADJUST;  	} -	if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { -		ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], -					 cda[CTA_NAT_SEQ_ADJ_REPLY]); +	if (cda[CTA_SEQ_ADJ_REPLY]) { +		ret = 
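/*
 * [Editor's aside] change_seq_adj() above rejects the nest with -EINVAL
 * unless all three CTA_SEQADJ_* fields are present. A userspace request
 * would therefore build the nest as below (libmnl sketch; the CTA_SEQADJ_*
 * and CTA_SEQ_ADJ_ORIG names are introduced by this very patch, so the
 * installed headers must already contain them).
 */
#include <stdint.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static void put_seq_adj_orig(struct nlmsghdr *nlh, uint32_t pos,
			     uint32_t before, uint32_t after)
{
	struct nlattr *nest = mnl_attr_nest_start(nlh, CTA_SEQ_ADJ_ORIG);

	/* all three are mandatory, matching the -EINVAL checks above */
	mnl_attr_put_u32(nlh, CTA_SEQADJ_CORRECTION_POS, htonl(pos));
	mnl_attr_put_u32(nlh, CTA_SEQADJ_OFFSET_BEFORE, htonl(before));
	mnl_attr_put_u32(nlh, CTA_SEQADJ_OFFSET_AFTER, htonl(after));
	mnl_attr_nest_end(nlh, nest);
}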
change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], +				     cda[CTA_SEQ_ADJ_REPLY]);  		if (ret < 0)  			return ret; @@ -1228,7 +1526,31 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct,  	return 0;  } + +static int +ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS +	size_t len = nla_len(cda[CTA_LABELS]); +	const void *mask = cda[CTA_LABELS_MASK]; + +	if (len & (sizeof(u32)-1)) /* must be multiple of u32 */ +		return -EINVAL; + +	if (mask) { +		if (nla_len(cda[CTA_LABELS_MASK]) == 0 || +		    nla_len(cda[CTA_LABELS_MASK]) != len) +			return -EINVAL; +		mask = nla_data(cda[CTA_LABELS_MASK]); +	} + +	len /= sizeof(u32); + +	return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len); +#else +	return -EOPNOTSUPP;  #endif +}  static int  ctnetlink_change_conntrack(struct nf_conn *ct, @@ -1269,13 +1591,17 @@ ctnetlink_change_conntrack(struct nf_conn *ct,  		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));  #endif -#ifdef CONFIG_NF_NAT_NEEDED -	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { -		err = ctnetlink_change_nat_seq_adj(ct, cda); +	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { +		err = ctnetlink_change_seq_adj(ct, cda); +		if (err < 0) +			return err; +	} + +	if (cda[CTA_LABELS]) { +		err = ctnetlink_attach_labels(ct, cda);  		if (err < 0)  			return err;  	} -#endif  	return 0;  } @@ -1290,6 +1616,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,  	struct nf_conn *ct;  	int err = -EINVAL;  	struct nf_conntrack_helper *helper; +	struct nf_conn_tstamp *tstamp;  	ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);  	if (IS_ERR(ct)) @@ -1304,8 +1631,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,  	rcu_read_lock();   	if (cda[CTA_HELP]) {  		char *helpname = NULL; -  - 		err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); +		struct nlattr *helpinfo = NULL; + +		err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);   		if (err < 0)  			goto err2; @@ -1334,14 +1662,17 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,  		} else {  			struct nf_conn_help *help; -			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); +			help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);  			if (help == NULL) {  				err = -ENOMEM;  				goto err2;  			} +			/* set private helper data if allowed. */ +			if (helper->from_nlattr) +				helper->from_nlattr(helpinfo, ct);  			/* not in hash table yet so not strictly necessary */ -			rcu_assign_pointer(help->helper, helper); +			RCU_INIT_POINTER(help->helper, helper);  		}  	} else {  		/* try an implicit helper assignation */ @@ -1350,14 +1681,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,  			goto err2;  	} -	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { -		err = ctnetlink_change_nat(ct, cda); -		if (err < 0) -			goto err2; -	} +	err = ctnetlink_setup_nat(ct, cda); +	if (err < 0) +		goto err2;  	nf_ct_acct_ext_add(ct, GFP_ATOMIC); +	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);  	nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); +	nf_ct_labels_ext_add(ct); +  	/* we must add conntrack extensions before confirmation. 
*/  	ct->status |= IPS_CONFIRMED; @@ -1367,14 +1699,13 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,  			goto err2;  	} -#ifdef CONFIG_NF_NAT_NEEDED -	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { -		err = ctnetlink_change_nat_seq_adj(ct, cda); +	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { +		err = ctnetlink_change_seq_adj(ct, cda);  		if (err < 0)  			goto err2;  	} -#endif +	memset(&ct->proto, 0, sizeof(ct->proto));  	if (cda[CTA_PROTOINFO]) {  		err = ctnetlink_change_protoinfo(ct, cda);  		if (err < 0) @@ -1405,9 +1736,14 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,  		__set_bit(IPS_EXPECTED_BIT, &ct->status);  		ct->master = master_ct;  	} +	tstamp = nf_conn_tstamp_find(ct); +	if (tstamp) +		tstamp->start = ktime_to_ns(ktime_get_real()); + +	err = nf_conntrack_hash_check_insert(ct); +	if (err < 0) +		goto err2; -	add_timer(&ct->timeout); -	nf_conntrack_hash_insert(ct);  	rcu_read_unlock();  	return ct; @@ -1428,6 +1764,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,  	struct nf_conntrack_tuple otuple, rtuple;  	struct nf_conntrack_tuple_hash *h = NULL;  	struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nf_conn *ct;  	u_int8_t u3 = nfmsg->nfgen_family;  	u16 zone;  	int err; @@ -1448,78 +1785,460 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,  			return err;  	} -	spin_lock_bh(&nf_conntrack_lock);  	if (cda[CTA_TUPLE_ORIG]) -		h = __nf_conntrack_find(net, zone, &otuple); +		h = nf_conntrack_find_get(net, zone, &otuple);  	else if (cda[CTA_TUPLE_REPLY]) -		h = __nf_conntrack_find(net, zone, &rtuple); +		h = nf_conntrack_find_get(net, zone, &rtuple);  	if (h == NULL) {  		err = -ENOENT;  		if (nlh->nlmsg_flags & NLM_F_CREATE) { -			struct nf_conn *ct;  			enum ip_conntrack_events events; +			if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) +				return -EINVAL; +  			ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,  							&rtuple, u3); -			if (IS_ERR(ct)) { -				err = PTR_ERR(ct); -				goto out_unlock; -			} +			if (IS_ERR(ct)) +				return PTR_ERR(ct); +  			err = 0; -			nf_conntrack_get(&ct->ct_general); -			spin_unlock_bh(&nf_conntrack_lock);  			if (test_bit(IPS_EXPECTED_BIT, &ct->status))  				events = IPCT_RELATED;  			else  				events = IPCT_NEW; +			if (cda[CTA_LABELS] && +			    ctnetlink_attach_labels(ct, cda) == 0) +				events |= (1 << IPCT_LABEL); +  			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |  						      (1 << IPCT_ASSURED) |  						      (1 << IPCT_HELPER) |  						      (1 << IPCT_PROTOINFO) | -						      (1 << IPCT_NATSEQADJ) | +						      (1 << IPCT_SEQADJ) |  						      (1 << IPCT_MARK) | events, -						      ct, NETLINK_CB(skb).pid, +						      ct, NETLINK_CB(skb).portid,  						      nlmsg_report(nlh));  			nf_ct_put(ct); -		} else -			spin_unlock_bh(&nf_conntrack_lock); +		}  		return err;  	}  	/* implicit 'else' */ -	/* We manipulate the conntrack inside the global conntrack table lock, -	 * so there's no need to increase the refcount */  	err = -EEXIST; +	ct = nf_ct_tuplehash_to_ctrack(h);  	if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { -		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - +		spin_lock_bh(&nf_conntrack_expect_lock);  		err = ctnetlink_change_conntrack(ct, cda); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		if (err == 0) { -			nf_conntrack_get(&ct->ct_general); -			spin_unlock_bh(&nf_conntrack_lock);  			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |  						      (1 << IPCT_ASSURED) |  						      (1 << IPCT_HELPER) | +						      
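/*
 * [Editor's aside] With nf_conntrack_hash_check_insert() the creation path
 * above can now fail with -EEXIST instead of racing an unconditional hash
 * insert. Userspace exercises this path via NLM_F_CREATE; a rough sketch
 * with libnetfilter_conntrack follows (error handling omitted, addresses
 * and ports are made up).
 */
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

int main(void)
{
	struct nfct_handle *h;
	struct nf_conntrack *ct = nfct_new();

	nfct_set_attr_u8(ct, ATTR_L3PROTO, AF_INET);
	nfct_set_attr_u32(ct, ATTR_IPV4_SRC, inet_addr("10.0.0.2"));
	nfct_set_attr_u32(ct, ATTR_IPV4_DST, inet_addr("10.0.0.1"));
	nfct_set_attr_u8(ct, ATTR_L4PROTO, IPPROTO_UDP);
	nfct_set_attr_u16(ct, ATTR_PORT_SRC, htons(53000));
	nfct_set_attr_u16(ct, ATTR_PORT_DST, htons(53));
	nfct_set_attr_u32(ct, ATTR_TIMEOUT, 60);
	/* derive the reply tuple; the patch now demands both tuples */
	nfct_setobjopt(ct, NFCT_SOPT_SETUP_REPLY);

	h = nfct_open(CONNTRACK, 0);
	if (h == NULL)
		return 1;
	/* running this twice returns -1/EEXIST via the kernel path above */
	nfct_query(h, NFCT_Q_CREATE, ct);
	nfct_close(h);
	nfct_destroy(ct);
	return 0;
}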
(1 << IPCT_LABEL) |  						      (1 << IPCT_PROTOINFO) | -						      (1 << IPCT_NATSEQADJ) | +						      (1 << IPCT_SEQADJ) |  						      (1 << IPCT_MARK), -						      ct, NETLINK_CB(skb).pid, +						      ct, NETLINK_CB(skb).portid,  						      nlmsg_report(nlh)); -			nf_ct_put(ct); -		} else -			spin_unlock_bh(&nf_conntrack_lock); +		} +	} + +	nf_ct_put(ct); +	return err; +} + +static int +ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, +				__u16 cpu, const struct ip_conntrack_stat *st) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? NLM_F_MULTI : 0, event; + +	event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version      = NFNETLINK_V0; +	nfmsg->res_id	    = htons(cpu); + +	if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || +	    nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || +	    nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) || +	    nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || +	    nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || +	    nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) || +	    nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) || +	    nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || +	    nla_put_be32(skb, CTA_STATS_INSERT_FAILED, +				htonl(st->insert_failed)) || +	    nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) || +	    nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || +	    nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || +	    nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, +				htonl(st->search_restart))) +		goto nla_put_failure; + +	nlmsg_end(skb, nlh); +	return skb->len; + +nla_put_failure: +nlmsg_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	int cpu; +	struct net *net = sock_net(skb->sk); + +	if (cb->args[0] == nr_cpu_ids) +		return 0; + +	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { +		const struct ip_conntrack_stat *st; + +		if (!cpu_possible(cpu)) +			continue; + +		st = per_cpu_ptr(net->ct.stat, cpu); +		if (ctnetlink_ct_stat_cpu_fill_info(skb, +						    NETLINK_CB(cb->skb).portid, +						    cb->nlh->nlmsg_seq, +						    cpu, st) < 0) +				break; +	} +	cb->args[0] = cpu; + +	return skb->len; +} + +static int +ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb, +		      const struct nlmsghdr *nlh, +		      const struct nlattr * const cda[]) +{ +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = ctnetlink_ct_stat_cpu_dump, +		}; +		return netlink_dump_start(ctnl, skb, nlh, &c); +	} + +	return 0; +} + +static int +ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +			    struct net *net) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? 
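/*
 * [Editor's aside] Each IPCTNL_MSG_CT_GET_STATS_CPU reply above carries the
 * cpu number in nfgenmsg->res_id plus flat big-endian u32 counters. A
 * libmnl receive callback for such messages could look like this (sketch;
 * the CTA_STATS_* names are added by this patch, so matching headers are
 * assumed).
 */
#include <stdio.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static int stats_attr_cb(const struct nlattr *attr, void *data)
{
	const struct nlattr **tb = data;

	if (mnl_attr_type_valid(attr, CTA_STATS_MAX) < 0)
		return MNL_CB_OK;	/* skip attributes newer than us */
	if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
		return MNL_CB_ERROR;
	tb[mnl_attr_get_type(attr)] = attr;
	return MNL_CB_OK;
}

static int stats_msg_cb(const struct nlmsghdr *nlh, void *data)
{
	const struct nlattr *tb[CTA_STATS_MAX + 1] = {};
	struct nfgenmsg *nfg = mnl_nlmsg_get_payload(nlh);

	mnl_attr_parse(nlh, sizeof(*nfg), stats_attr_cb, tb);
	if (tb[CTA_STATS_FOUND])
		printf("cpu=%u found=%u\n", ntohs(nfg->res_id),
		       ntohl(mnl_attr_get_u32(tb[CTA_STATS_FOUND])));
	return MNL_CB_OK;
}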
NLM_F_MULTI : 0, event; +	unsigned int nr_conntracks = atomic_read(&net->ct.count); + +	event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version      = NFNETLINK_V0; +	nfmsg->res_id	    = 0; + +	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) +		goto nla_put_failure; + +	nlmsg_end(skb, nlh); +	return skb->len; + +nla_put_failure: +nlmsg_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb, +		  const struct nlmsghdr *nlh, +		  const struct nlattr * const cda[]) +{ +	struct sk_buff *skb2; +	int err; + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, +					  nlh->nlmsg_seq, +					  NFNL_MSG_TYPE(nlh->nlmsg_type), +					  sock_net(skb->sk)); +	if (err <= 0) +		goto free; + +	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (err < 0) +		goto out; + +	return 0; + +free: +	kfree_skb(skb2); +out: +	/* this avoids a loop in nfnetlink. */ +	return err == -EAGAIN ? -ENOBUFS : err; +} + +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { +	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED }, +	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED }, +	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED }, +	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 }, +	[CTA_EXPECT_ID]		= { .type = NLA_U32 }, +	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING, +				    .len = NF_CT_HELPER_NAME_LEN - 1 }, +	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 }, +	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 }, +	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 }, +	[CTA_EXPECT_NAT]	= { .type = NLA_NESTED }, +	[CTA_EXPECT_FN]		= { .type = NLA_NUL_STRING }, +}; + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, +		       struct nf_conntrack_helper *helper, +		       struct nf_conntrack_tuple *tuple, +		       struct nf_conntrack_tuple *mask); + +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +static size_t +ctnetlink_nfqueue_build_size(const struct nf_conn *ct) +{ +	return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ +	       + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ +	       + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ +	       + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ +	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ +	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ +	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ +	       + nla_total_size(0) /* CTA_PROTOINFO */ +	       + nla_total_size(0) /* CTA_HELP */ +	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ +	       + ctnetlink_secctx_size(ct) +#ifdef CONFIG_NF_NAT_NEEDED +	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ +	       + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK +	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif +	       + ctnetlink_proto_size(ct) +	       ; +} + +static int +ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) +{ +	struct nlattr *nest_parms; + +	rcu_read_lock(); +	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | 
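/*
 * [Editor's aside] The message-size estimate in
 * ctnetlink_nfqueue_build_size() above is a sum of nla_total_size() terms:
 * attribute header plus payload, rounded up to 4-byte alignment. A
 * userspace re-implementation shows the arithmetic.
 */
#include <stdio.h>

#define NLA_HDRLEN	4			/* aligned struct nlattr */
#define NLA_ALIGN(len)	(((len) + 3) & ~3)

static int nla_total_size(int payload)
{
	return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
	printf("nest header: %d\n", nla_total_size(0));	/* 4 */
	printf("u_int8_t:    %d\n", nla_total_size(1));	/* 8: 4 hdr + 1 + pad */
	printf("u_int32_t:   %d\n", nla_total_size(4));	/* 8 */
	return 0;
}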
NLA_F_NESTED); +	if (!nest_parms) +		goto nla_put_failure; +	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) +		goto nla_put_failure; +	nla_nest_end(skb, nest_parms); + +	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); +	if (!nest_parms) +		goto nla_put_failure; +	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) +		goto nla_put_failure; +	nla_nest_end(skb, nest_parms); + +	if (nf_ct_zone(ct)) { +		if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) +			goto nla_put_failure; +	} + +	if (ctnetlink_dump_id(skb, ct) < 0) +		goto nla_put_failure; + +	if (ctnetlink_dump_status(skb, ct) < 0) +		goto nla_put_failure; + +	if (ctnetlink_dump_timeout(skb, ct) < 0) +		goto nla_put_failure; + +	if (ctnetlink_dump_protoinfo(skb, ct) < 0) +		goto nla_put_failure; + +	if (ctnetlink_dump_helpinfo(skb, ct) < 0) +		goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +	if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0) +		goto nla_put_failure; +#endif +	if (ct->master && ctnetlink_dump_master(skb, ct) < 0) +		goto nla_put_failure; + +	if ((ct->status & IPS_SEQ_ADJUST) && +	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0) +		goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_MARK +	if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) +		goto nla_put_failure; +#endif +	if (ctnetlink_dump_labels(skb, ct) < 0) +		goto nla_put_failure; +	rcu_read_unlock(); +	return 0; + +nla_put_failure: +	rcu_read_unlock(); +	return -ENOSPC; +} + +static int +ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) +{ +	int err; + +	if (cda[CTA_TIMEOUT]) { +		err = ctnetlink_change_timeout(ct, cda); +		if (err < 0) +			return err; +	} +	if (cda[CTA_STATUS]) { +		err = ctnetlink_change_status(ct, cda); +		if (err < 0) +			return err; +	} +	if (cda[CTA_HELP]) { +		err = ctnetlink_change_helper(ct, cda); +		if (err < 0) +			return err; +	} +	if (cda[CTA_LABELS]) { +		err = ctnetlink_attach_labels(ct, cda); +		if (err < 0) +			return err; +	} +#if defined(CONFIG_NF_CONNTRACK_MARK) +	if (cda[CTA_MARK]) { +		u32 mask = 0, mark, newmark; +		if (cda[CTA_MARK_MASK]) +			mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + +		mark = ntohl(nla_get_be32(cda[CTA_MARK])); +		newmark = (ct->mark & mask) ^ mark; +		if (newmark != ct->mark) +			ct->mark = newmark; +	} +#endif +	return 0; +} + +static int +ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) +{ +	struct nlattr *cda[CTA_MAX+1]; +	int ret; + +	ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); +	if (ret < 0) +		return ret; + +	spin_lock_bh(&nf_conntrack_expect_lock); +	ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); +	spin_unlock_bh(&nf_conntrack_expect_lock); + +	return ret; +} + +static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, +				       const struct nf_conn *ct, +				       struct nf_conntrack_tuple *tuple, +				       struct nf_conntrack_tuple *mask) +{ +	int err; + +	err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, +				    nf_ct_l3num(ct)); +	if (err < 0) +		return err; + +	return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, +				     nf_ct_l3num(ct)); +} + +static int +ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, +				u32 portid, u32 report) +{ +	struct nlattr *cda[CTA_EXPECT_MAX+1]; +	struct nf_conntrack_tuple tuple, mask; +	struct nf_conntrack_helper *helper = NULL; +	struct nf_conntrack_expect *exp; +	int err; + +	err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, 
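/*
 * [Editor's aside] The CTA_MARK/CTA_MARK_MASK handling in
 * ctnetlink_nfqueue_parse_ct() above preserves the bits outside the mask
 * and replaces the bits inside it (assuming the supplied value has no bits
 * outside the mask). A standalone model of the same expression:
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t ct_mark_update(uint32_t old, uint32_t value, uint32_t mask)
{
	/* same shape as the kernel's: newmark = (ct->mark & ~mask) ^ value */
	return (old & ~mask) ^ value;
}

int main(void)
{
	/* rewrite the low byte to 0x42, preserve everything else */
	printf("0x%08x\n", ct_mark_update(0xab0000ff, 0x42, 0xff));
	return 0;	/* prints 0xab000042 */
}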
exp_nla_policy); +	if (err < 0) +		return err; + +	err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, +					  ct, &tuple, &mask); +	if (err < 0) +		return err; + +	if (cda[CTA_EXPECT_HELP_NAME]) { +		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + +		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), +						    nf_ct_protonum(ct)); +		if (helper == NULL) +			return -EOPNOTSUPP; +	} +	exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, +				     helper, &tuple, &mask); +	if (IS_ERR(exp)) +		return PTR_ERR(exp); + +	err = nf_ct_expect_related_report(exp, portid, report); +	if (err < 0) { +		nf_ct_expect_put(exp);  		return err;  	} -out_unlock: -	spin_unlock_bh(&nf_conntrack_lock); -	return err; +	return 0;  } +static struct nfq_ct_hook ctnetlink_nfqueue_hook = { +	.build_size	= ctnetlink_nfqueue_build_size, +	.build		= ctnetlink_nfqueue_build, +	.parse		= ctnetlink_nfqueue_parse, +	.attach_expect	= ctnetlink_nfqueue_attach_expect, +	.seq_adjust	= nf_ct_tcp_seqadj_set, +}; +#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ +  /***********************************************************************   * EXPECT   ***********************************************************************/ @@ -1564,14 +2283,16 @@ ctnetlink_exp_dump_mask(struct sk_buff *skb,  	if (!nest_parms)  		goto nla_put_failure; +	rcu_read_lock();  	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);  	ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); - -	if (unlikely(ret < 0)) -		goto nla_put_failure; - -	l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); +	if (ret >= 0) { +		l4proto = __nf_ct_l4proto_find(tuple->src.l3num, +					       tuple->dst.protonum);  	ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); +	} +	rcu_read_unlock(); +  	if (unlikely(ret < 0))  		goto nla_put_failure; @@ -1583,13 +2304,20 @@ nla_put_failure:  	return -1;  } +static const union nf_inet_addr any_addr; +  static int  ctnetlink_exp_dump_expect(struct sk_buff *skb,  			  const struct nf_conntrack_expect *exp)  {  	struct nf_conn *master = exp->master; -	long timeout = (exp->timeout.expires - jiffies) / HZ; +	long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;  	struct nf_conn_help *help; +#ifdef CONFIG_NF_NAT_NEEDED +	struct nlattr *nest_parms; +	struct nf_conntrack_tuple nat_tuple = {}; +#endif +	struct nf_ct_helper_expectfn *expfn;  	if (timeout < 0)  		timeout = 0; @@ -1603,17 +2331,45 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,  				 CTA_EXPECT_MASTER) < 0)  		goto nla_put_failure; -	NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); -	NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); -	NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); +#ifdef CONFIG_NF_NAT_NEEDED +	if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || +	    exp->saved_proto.all) { +		nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED); +		if (!nest_parms) +			goto nla_put_failure; + +		if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir))) +			goto nla_put_failure; + +		nat_tuple.src.l3num = nf_ct_l3num(master); +		nat_tuple.src.u3 = exp->saved_addr; +		nat_tuple.dst.protonum = nf_ct_protonum(master); +		nat_tuple.src.u = exp->saved_proto; + +		if (ctnetlink_exp_dump_tuple(skb, &nat_tuple, +						CTA_EXPECT_NAT_TUPLE) < 0) +	                goto nla_put_failure; +	        nla_nest_end(skb, nest_parms); +	} +#endif +	if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || +	    nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) 
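/*
 * [Editor's aside] Note the expectation-timeout computation above now casts
 * both operands to long: with plain unsigned arithmetic, a timer that
 * expired just before a jiffies wraparound would report a huge remaining
 * timeout instead of the small negative value that the following check
 * clamps to zero. Standalone demonstration with a 32-bit "jiffies":
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t jiffies = 5;		/* counter has just wrapped */
	uint32_t expires = 0xfffffff0;	/* timer fired before the wrap */

	printf("unsigned: %u\n", expires - jiffies);		/* bogus huge value */
	printf("signed:   %d\n", (int32_t)(expires - jiffies));	/* -21 */
	return 0;
}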
|| +	    nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || +	    nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) +		goto nla_put_failure;  	help = nfct_help(master);  	if (help) {  		struct nf_conntrack_helper *helper;  		helper = rcu_dereference(help->helper); -		if (helper) -			NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); +		if (helper && +		    nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) +			goto nla_put_failure;  	} +	expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); +	if (expfn != NULL && +	    nla_put_string(skb, CTA_EXPECT_FN, expfn->name)) +		goto nla_put_failure;  	return 0; @@ -1622,15 +2378,15 @@ nla_put_failure:  }  static int -ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,  			int event, const struct nf_conntrack_expect *exp)  {  	struct nlmsghdr *nlh;  	struct nfgenmsg *nfmsg; -	unsigned int flags = pid ? NLM_F_MULTI : 0; +	unsigned int flags = portid ? NLM_F_MULTI : 0;  	event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; -	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);  	if (nlh == NULL)  		goto nlmsg_failure; @@ -1681,7 +2437,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)  		goto errout;  	type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; -	nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); +	nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);  	if (nlh == NULL)  		goto nlmsg_failure; @@ -1696,7 +2452,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)  	rcu_read_unlock();  	nlmsg_end(skb, nlh); -	nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC); +	nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC);  	return 0;  nla_put_failure: @@ -1722,14 +2478,13 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)  	struct net *net = sock_net(skb->sk);  	struct nf_conntrack_expect *exp, *last;  	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); -	struct hlist_node *n;  	u_int8_t l3proto = nfmsg->nfgen_family;  	rcu_read_lock();  	last = (struct nf_conntrack_expect *)cb->args[1];  	for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {  restart: -		hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], +		hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],  				     hnode) {  			if (l3proto && exp->tuple.src.l3num != l3proto)  				continue; @@ -1739,7 +2494,7 @@ restart:  				cb->args[1] = 0;  			}  			if (ctnetlink_exp_fill_info(skb, -						    NETLINK_CB(cb->skb).pid, +						    NETLINK_CB(cb->skb).portid,  						    cb->nlh->nlmsg_seq,  						    IPCTNL_MSG_EXP_NEW,  						    exp) < 0) { @@ -1762,16 +2517,91 @@ out:  	return skb->len;  } -static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { -	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED }, -	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED }, -	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED }, -	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 }, -	[CTA_EXPECT_ID]		= { .type = NLA_U32 }, -	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING }, -	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 }, -	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 }, -}; +static int +ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct nf_conntrack_expect *exp, *last; +	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	struct nf_conn *ct = cb->data; +	struct nf_conn_help *help = 
nfct_help(ct); +	u_int8_t l3proto = nfmsg->nfgen_family; + +	if (cb->args[0]) +		return 0; + +	rcu_read_lock(); +	last = (struct nf_conntrack_expect *)cb->args[1]; +restart: +	hlist_for_each_entry(exp, &help->expectations, lnode) { +		if (l3proto && exp->tuple.src.l3num != l3proto) +			continue; +		if (cb->args[1]) { +			if (exp != last) +				continue; +			cb->args[1] = 0; +		} +		if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, +					    cb->nlh->nlmsg_seq, +					    IPCTNL_MSG_EXP_NEW, +					    exp) < 0) { +			if (!atomic_inc_not_zero(&exp->use)) +				continue; +			cb->args[1] = (unsigned long)exp; +			goto out; +		} +	} +	if (cb->args[1]) { +		cb->args[1] = 0; +		goto restart; +	} +	cb->args[0] = 1; +out: +	rcu_read_unlock(); +	if (last) +		nf_ct_expect_put(last); + +	return skb->len; +} + +static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, +				 const struct nlmsghdr *nlh, +				 const struct nlattr * const cda[]) +{ +	int err; +	struct net *net = sock_net(ctnl); +	struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	u_int8_t u3 = nfmsg->nfgen_family; +	struct nf_conntrack_tuple tuple; +	struct nf_conntrack_tuple_hash *h; +	struct nf_conn *ct; +	u16 zone = 0; +	struct netlink_dump_control c = { +		.dump = ctnetlink_exp_ct_dump_table, +		.done = ctnetlink_exp_done, +	}; + +	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); +	if (err < 0) +		return err; + +	if (cda[CTA_EXPECT_ZONE]) { +		err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); +		if (err < 0) +			return err; +	} + +	h = nf_conntrack_find_get(net, zone, &tuple); +	if (!h) +		return -ENOENT; + +	ct = nf_ct_tuplehash_to_ctrack(h); +	c.data = ct; + +	err = netlink_dump_start(ctnl, skb, nlh, &c); +	nf_ct_put(ct); + +	return err; +}  static int  ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, @@ -1788,16 +2618,24 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,  	int err;  	if (nlh->nlmsg_flags & NLM_F_DUMP) { -		return netlink_dump_start(ctnl, skb, nlh, -					  ctnetlink_exp_dump_table, -					  ctnetlink_exp_done); +		if (cda[CTA_EXPECT_MASTER]) +			return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda); +		else { +			struct netlink_dump_control c = { +				.dump = ctnetlink_exp_dump_table, +				.done = ctnetlink_exp_done, +			}; +			return netlink_dump_start(ctnl, skb, nlh, &c); +		}  	}  	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);  	if (err < 0)  		return err; -	if (cda[CTA_EXPECT_MASTER]) +	if (cda[CTA_EXPECT_TUPLE]) +		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); +	else if (cda[CTA_EXPECT_MASTER])  		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);  	else  		return -EINVAL; @@ -1819,25 +2657,30 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,  	err = -ENOMEM;  	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); -	if (skb2 == NULL) +	if (skb2 == NULL) { +		nf_ct_expect_put(exp);  		goto out; +	}  	rcu_read_lock(); -	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, +	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,  				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);  	rcu_read_unlock(); +	nf_ct_expect_put(exp);  	if (err <= 0)  		goto free; -	nf_ct_expect_put(exp); +	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (err < 0) +		goto out; -	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); +	return 0;  free:  	kfree_skb(skb2);  out: -	nf_ct_expect_put(exp); -	return err; +	/* this avoids a loop in nfnetlink. 
*/ +	return err == -EAGAIN ? -ENOBUFS : err;  }  static int @@ -1849,7 +2692,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  	struct nf_conntrack_expect *exp;  	struct nf_conntrack_tuple tuple;  	struct nfgenmsg *nfmsg = nlmsg_data(nlh); -	struct hlist_node *n, *next; +	struct hlist_node *next;  	u_int8_t u3 = nfmsg->nfgen_family;  	unsigned int i;  	u16 zone; @@ -1879,13 +2722,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  		}  		/* after list removal, usage count == 1 */ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		if (del_timer(&exp->timeout)) { -			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid, +			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,  						   nlmsg_report(nlh));  			nf_ct_expect_put(exp);  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		/* have to put what we 'get' above.  		 * after this line usage count == 0 */  		nf_ct_expect_put(exp); @@ -1894,38 +2737,38 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,  		struct nf_conn_help *m_help;  		/* delete all expectations for this helper */ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		for (i = 0; i < nf_ct_expect_hsize; i++) { -			hlist_for_each_entry_safe(exp, n, next, +			hlist_for_each_entry_safe(exp, next,  						  &net->ct.expect_hash[i],  						  hnode) {  				m_help = nfct_help(exp->master);  				if (!strcmp(m_help->helper->name, name) &&  				    del_timer(&exp->timeout)) {  					nf_ct_unlink_expect_report(exp, -							NETLINK_CB(skb).pid, +							NETLINK_CB(skb).portid,  							nlmsg_report(nlh));  					nf_ct_expect_put(exp);  				}  			}  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  	} else {  		/* This basically means we have to flush everything*/ -		spin_lock_bh(&nf_conntrack_lock); +		spin_lock_bh(&nf_conntrack_expect_lock);  		for (i = 0; i < nf_ct_expect_hsize; i++) { -			hlist_for_each_entry_safe(exp, n, next, +			hlist_for_each_entry_safe(exp, next,  						  &net->ct.expect_hash[i],  						  hnode) {  				if (del_timer(&exp->timeout)) {  					nf_ct_unlink_expect_report(exp, -							NETLINK_CB(skb).pid, +							NETLINK_CB(skb).portid,  							nlmsg_report(nlh));  					nf_ct_expect_put(exp);  				}  			}  		} -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  	}  	return 0; @@ -1934,48 +2777,79 @@ static int  ctnetlink_change_expect(struct nf_conntrack_expect *x,  			const struct nlattr * const cda[])  { -	return -EOPNOTSUPP; +	if (cda[CTA_EXPECT_TIMEOUT]) { +		if (!del_timer(&x->timeout)) +			return -ETIME; + +		x->timeout.expires = jiffies + +			ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; +		add_timer(&x->timeout); +	} +	return 0;  } +static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { +	[CTA_EXPECT_NAT_DIR]	= { .type = NLA_U32 }, +	[CTA_EXPECT_NAT_TUPLE]	= { .type = NLA_NESTED }, +}; +  static int -ctnetlink_create_expect(struct net *net, u16 zone, -			const struct nlattr * const cda[], -			u_int8_t u3, -			u32 pid, int report) +ctnetlink_parse_expect_nat(const struct nlattr *attr, +			   struct nf_conntrack_expect *exp, +			   u_int8_t u3)  { -	struct nf_conntrack_tuple tuple, mask, master_tuple; -	struct nf_conntrack_tuple_hash *h = NULL; -	struct nf_conntrack_expect *exp; -	struct nf_conn *ct; -	struct nf_conn_help *help; -	int err = 0; +#ifdef CONFIG_NF_NAT_NEEDED +	struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; +	
struct nf_conntrack_tuple nat_tuple = {}; +	int err; -	/* caller guarantees that those three CTA_EXPECT_* exist */ -	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); +	err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);  	if (err < 0)  		return err; -	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); -	if (err < 0) -		return err; -	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + +	if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) +		return -EINVAL; + +	err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, +					&nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);  	if (err < 0)  		return err; -	/* Look for master conntrack of this expectation */ -	h = nf_conntrack_find_get(net, zone, &master_tuple); -	if (!h) -		return -ENOENT; -	ct = nf_ct_tuplehash_to_ctrack(h); -	exp = nf_ct_expect_alloc(ct); -	if (!exp) { -		err = -ENOMEM; -		goto out; +	exp->saved_addr = nat_tuple.src.u3; +	exp->saved_proto = nat_tuple.src.u; +	exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); + +	return 0; +#else +	return -EOPNOTSUPP; +#endif +} + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, +		       struct nf_conntrack_helper *helper, +		       struct nf_conntrack_tuple *tuple, +		       struct nf_conntrack_tuple *mask) +{ +	u_int32_t class = 0; +	struct nf_conntrack_expect *exp; +	struct nf_conn_help *help; +	int err; + +	if (cda[CTA_EXPECT_CLASS] && helper) { +		class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); +		if (class > helper->expect_class_max) +			return ERR_PTR(-EINVAL);  	} +	exp = nf_ct_expect_alloc(ct); +	if (!exp) +		return ERR_PTR(-ENOMEM); +  	help = nfct_help(ct);  	if (!help) {  		if (!cda[CTA_EXPECT_TIMEOUT]) {  			err = -EINVAL; -			goto out; +			goto err_out;  		}  		exp->timeout.expires =  		  jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; @@ -1992,20 +2866,105 @@ ctnetlink_create_expect(struct net *net, u16 zone,  		} else  			exp->flags = 0;  	} +	if (cda[CTA_EXPECT_FN]) { +		const char *name = nla_data(cda[CTA_EXPECT_FN]); +		struct nf_ct_helper_expectfn *expfn; -	exp->class = 0; -	exp->expectfn = NULL; -	exp->master = ct; -	exp->helper = NULL; -	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); -	memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3)); -	exp->mask.src.u.all = mask.src.u.all; +		expfn = nf_ct_helper_expectfn_find_by_name(name); +		if (expfn == NULL) { +			err = -EINVAL; +			goto err_out; +		} +		exp->expectfn = expfn->expectfn; +	} else +		exp->expectfn = NULL; -	err = nf_ct_expect_related_report(exp, pid, report); +	exp->class = class; +	exp->master = ct; +	exp->helper = helper; +	exp->tuple = *tuple; +	exp->mask.src.u3 = mask->src.u3; +	exp->mask.src.u.all = mask->src.u.all; + +	if (cda[CTA_EXPECT_NAT]) { +		err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], +						 exp, nf_ct_l3num(ct)); +		if (err < 0) +			goto err_out; +	} +	return exp; +err_out:  	nf_ct_expect_put(exp); +	return ERR_PTR(err); +} -out: -	nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); +static int +ctnetlink_create_expect(struct net *net, u16 zone, +			const struct nlattr * const cda[], +			u_int8_t u3, u32 portid, int report) +{ +	struct nf_conntrack_tuple tuple, mask, master_tuple; +	struct nf_conntrack_tuple_hash *h = NULL; +	struct nf_conntrack_helper *helper = NULL; +	struct nf_conntrack_expect *exp; +	struct nf_conn *ct; +	int err; + +	/* caller guarantees that those three CTA_EXPECT_* exist */ +	err = 
ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); +	if (err < 0) +		return err; +	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); +	if (err < 0) +		return err; +	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); +	if (err < 0) +		return err; + +	/* Look for master conntrack of this expectation */ +	h = nf_conntrack_find_get(net, zone, &master_tuple); +	if (!h) +		return -ENOENT; +	ct = nf_ct_tuplehash_to_ctrack(h); + +	if (cda[CTA_EXPECT_HELP_NAME]) { +		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + +		helper = __nf_conntrack_helper_find(helpname, u3, +						    nf_ct_protonum(ct)); +		if (helper == NULL) { +#ifdef CONFIG_MODULES +			if (request_module("nfct-helper-%s", helpname) < 0) { +				err = -EOPNOTSUPP; +				goto err_ct; +			} +			helper = __nf_conntrack_helper_find(helpname, u3, +							    nf_ct_protonum(ct)); +			if (helper) { +				err = -EAGAIN; +				goto err_ct; +			} +#endif +			err = -EOPNOTSUPP; +			goto err_ct; +		} +	} + +	exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); +	if (IS_ERR(exp)) { +		err = PTR_ERR(exp); +		goto err_ct; +	} + +	err = nf_ct_expect_related_report(exp, portid, report); +	if (err < 0) +		goto err_exp; + +	return 0; +err_exp: +	nf_ct_expect_put(exp); +err_ct: +	nf_ct_put(ct);  	return err;  } @@ -2035,16 +2994,16 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,  	if (err < 0)  		return err; -	spin_lock_bh(&nf_conntrack_lock); +	spin_lock_bh(&nf_conntrack_expect_lock);  	exp = __nf_ct_expect_find(net, zone, &tuple);  	if (!exp) { -		spin_unlock_bh(&nf_conntrack_lock); +		spin_unlock_bh(&nf_conntrack_expect_lock);  		err = -ENOENT;  		if (nlh->nlmsg_flags & NLM_F_CREATE) {  			err = ctnetlink_create_expect(net, zone, cda,  						      u3, -						      NETLINK_CB(skb).pid, +						      NETLINK_CB(skb).portid,  						      nlmsg_report(nlh));  		}  		return err; @@ -2053,11 +3012,84 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,  	err = -EEXIST;  	if (!(nlh->nlmsg_flags & NLM_F_EXCL))  		err = ctnetlink_change_expect(exp, cda); -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return err;  } +static int +ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, +			     const struct ip_conntrack_stat *st) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? 
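/*
 * [Editor's aside] ctnetlink_create_expect() above now honours an explicit
 * CTA_EXPECT_HELP_NAME and the new NAT/class/fn attributes. A rough
 * userspace counterpart using libnetfilter_conntrack's expectation API
 * (attribute names as in the library headers; master/expected/mask tuples
 * abbreviated, error handling omitted, master entry assumed to exist):
 */
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

static struct nf_conntrack *udp_tuple(const char *src, const char *dst,
				      uint16_t dport)
{
	struct nf_conntrack *ct = nfct_new();

	nfct_set_attr_u8(ct, ATTR_L3PROTO, AF_INET);
	nfct_set_attr_u32(ct, ATTR_IPV4_SRC, inet_addr(src));
	nfct_set_attr_u32(ct, ATTR_IPV4_DST, inet_addr(dst));
	nfct_set_attr_u8(ct, ATTR_L4PROTO, IPPROTO_UDP);
	nfct_set_attr_u16(ct, ATTR_PORT_DST, htons(dport));
	return ct;
}

int main(void)
{
	struct nfct_handle *h = nfct_open(EXPECT, 0);
	struct nf_expect *exp = nfexp_new();

	nfexp_set_attr(exp, ATTR_EXP_MASTER,
		       udp_tuple("10.0.0.2", "10.0.0.1", 69));
	nfexp_set_attr(exp, ATTR_EXP_EXPECTED,
		       udp_tuple("10.0.0.1", "10.0.0.2", 0));
	nfexp_set_attr(exp, ATTR_EXP_MASK,
		       udp_tuple("255.255.255.255", "255.255.255.255", 0));
	nfexp_set_attr_u32(exp, ATTR_EXP_TIMEOUT, 300);

	if (h == NULL)
		return 1;
	nfexp_query(h, NFCT_Q_CREATE, exp);
	nfct_close(h);
	return 0;
}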
NLM_F_MULTI : 0, event; + +	event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version      = NFNETLINK_V0; +	nfmsg->res_id	    = htons(cpu); + +	if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || +	    nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || +	    nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) +		goto nla_put_failure; + +	nlmsg_end(skb, nlh); +	return skb->len; + +nla_put_failure: +nlmsg_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	int cpu; +	struct net *net = sock_net(skb->sk); + +	if (cb->args[0] == nr_cpu_ids) +		return 0; + +	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { +		const struct ip_conntrack_stat *st; + +		if (!cpu_possible(cpu)) +			continue; + +		st = per_cpu_ptr(net->ct.stat, cpu); +		if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid, +						 cb->nlh->nlmsg_seq, +						 cpu, st) < 0) +			break; +	} +	cb->args[0] = cpu; + +	return skb->len; +} + +static int +ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb, +		       const struct nlmsghdr *nlh, +		       const struct nlattr * const cda[]) +{ +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = ctnetlink_exp_stat_cpu_dump, +		}; +		return netlink_dump_start(ctnl, skb, nlh, &c); +	} + +	return 0; +} +  #ifdef CONFIG_NF_CONNTRACK_EVENTS  static struct nf_ct_event_notifier ctnl_notifier = {  	.fcn = ctnetlink_conntrack_event, @@ -2081,6 +3113,10 @@ static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {  	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,  					    .attr_count = CTA_MAX,  					    .policy = ct_nla_policy }, +	[IPCTNL_MSG_CT_GET_STATS_CPU]	= { .call = ctnetlink_stat_ct_cpu }, +	[IPCTNL_MSG_CT_GET_STATS]	= { .call = ctnetlink_stat_ct }, +	[IPCTNL_MSG_CT_GET_DYING]	= { .call = ctnetlink_get_ct_dying }, +	[IPCTNL_MSG_CT_GET_UNCONFIRMED]	= { .call = ctnetlink_get_ct_unconfirmed },  };  static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { @@ -2093,6 +3129,7 @@ static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {  	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,  					    .attr_count = CTA_EXPECT_MAX,  					    .policy = exp_nla_policy }, +	[IPCTNL_MSG_EXP_GET_STATS_CPU]	= { .call = ctnetlink_stat_exp_cpu },  };  static const struct nfnetlink_subsystem ctnl_subsys = { @@ -2113,6 +3150,54 @@ MODULE_ALIAS("ip_conntrack_netlink");  MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);  MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); +static int __net_init ctnetlink_net_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS +	int ret; + +	ret = nf_conntrack_register_notifier(net, &ctnl_notifier); +	if (ret < 0) { +		pr_err("ctnetlink_init: cannot register notifier.\n"); +		goto err_out; +	} + +	ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); +	if (ret < 0) { +		pr_err("ctnetlink_init: cannot expect register notifier.\n"); +		goto err_unreg_notifier; +	} +#endif +	return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: +	nf_conntrack_unregister_notifier(net, &ctnl_notifier); +err_out: +	return ret; +#endif +} + +static void ctnetlink_net_exit(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS +	
nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); +	nf_conntrack_unregister_notifier(net, &ctnl_notifier); +#endif +} + +static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) +{ +	struct net *net; + +	list_for_each_entry(net, net_exit_list, exit_list) +		ctnetlink_net_exit(net); +} + +static struct pernet_operations ctnetlink_net_ops = { +	.init		= ctnetlink_net_init, +	.exit_batch	= ctnetlink_net_exit_batch, +}; +  static int __init ctnetlink_init(void)  {  	int ret; @@ -2130,28 +3215,19 @@ static int __init ctnetlink_init(void)  		goto err_unreg_subsys;  	} -#ifdef CONFIG_NF_CONNTRACK_EVENTS -	ret = nf_conntrack_register_notifier(&ctnl_notifier); +	ret = register_pernet_subsys(&ctnetlink_net_ops);  	if (ret < 0) { -		pr_err("ctnetlink_init: cannot register notifier.\n"); +		pr_err("ctnetlink_init: cannot register pernet operations\n");  		goto err_unreg_exp_subsys;  	} - -	ret = nf_ct_expect_register_notifier(&ctnl_notifier_exp); -	if (ret < 0) { -		pr_err("ctnetlink_init: cannot expect register notifier.\n"); -		goto err_unreg_notifier; -	} +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +	/* setup interaction between nf_queue and nf_conntrack_netlink. */ +	RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);  #endif -  	return 0; -#ifdef CONFIG_NF_CONNTRACK_EVENTS -err_unreg_notifier: -	nf_conntrack_unregister_notifier(&ctnl_notifier);  err_unreg_exp_subsys:  	nfnetlink_subsys_unregister(&ctnl_exp_subsys); -#endif  err_unreg_subsys:  	nfnetlink_subsys_unregister(&ctnl_subsys);  err_out: @@ -2162,14 +3238,12 @@ static void __exit ctnetlink_exit(void)  {  	pr_info("ctnetlink: unregistering from nfnetlink.\n"); -	nf_ct_remove_userspace_expectations(); -#ifdef CONFIG_NF_CONNTRACK_EVENTS -	nf_ct_expect_unregister_notifier(&ctnl_notifier_exp); -	nf_conntrack_unregister_notifier(&ctnl_notifier); -#endif - +	unregister_pernet_subsys(&ctnetlink_net_ops);  	nfnetlink_subsys_unregister(&ctnl_exp_subsys);  	nfnetlink_subsys_unregister(&ctnl_subsys); +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +	RCU_INIT_POINTER(nfq_ct_hook, NULL); +#endif  }  module_init(ctnetlink_init); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 088944824e1..825c3e3f830 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -11,10 +11,12 @@   *   * Development of this code funded by Astaro AG (http://www.astaro.com/)   * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + *   * Limitations:   * 	 - We blindly assume that control connections are always   * 	   established in PNS->PAC direction.  
This is a violation - * 	   of RFFC2673 + *	   of RFC 2637   * 	 - We can only support one single call within each session   * TODO:   *	 - testing of incoming PPTP calls @@ -45,14 +47,14 @@ static DEFINE_SPINLOCK(nf_pptp_lock);  int  (*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,  			     struct nf_conn *ct, enum ip_conntrack_info ctinfo, -			     struct PptpControlHeader *ctlh, +			     unsigned int protoff, struct PptpControlHeader *ctlh,  			     union pptp_ctrl_union *pptpReq) __read_mostly;  EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);  int  (*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,  			    struct nf_conn *ct, enum ip_conntrack_info ctinfo, -			    struct PptpControlHeader *ctlh, +			    unsigned int protoff, struct PptpControlHeader *ctlh,  			    union pptp_ctrl_union *pptpReq) __read_mostly;  EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); @@ -174,7 +176,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,  static void pptp_destroy_siblings(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); -	const struct nf_conn_help *help = nfct_help(ct); +	const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);  	struct nf_conntrack_tuple t;  	nf_ct_gre_keymap_destroy(ct); @@ -182,16 +184,16 @@ static void pptp_destroy_siblings(struct nf_conn *ct)  	/* try original (pns->pac) tuple */  	memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));  	t.dst.protonum = IPPROTO_GRE; -	t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id; -	t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id; +	t.src.u.gre.key = ct_pptp_info->pns_call_id; +	t.dst.u.gre.key = ct_pptp_info->pac_call_id;  	if (!destroy_sibling_or_exp(net, ct, &t))  		pr_debug("failed to timeout original pns->pac ct/exp\n");  	/* try reply (pac->pns) tuple */  	memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));  	t.dst.protonum = IPPROTO_GRE; -	t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id; -	t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id; +	t.src.u.gre.key = ct_pptp_info->pac_call_id; +	t.dst.u.gre.key = ct_pptp_info->pns_call_id;  	if (!destroy_sibling_or_exp(net, ct, &t))  		pr_debug("failed to timeout reply pac->pns ct/exp\n");  } @@ -262,14 +264,14 @@ out_unexpect_orig:  }  static inline int -pptp_inbound_pkt(struct sk_buff *skb, +pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,  		 struct PptpControlHeader *ctlh,  		 union pptp_ctrl_union *pptpReq,  		 unsigned int reqlen,  		 struct nf_conn *ct,  		 enum ip_conntrack_info ctinfo)  { -	struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; +	struct nf_ct_pptp_master *info = nfct_help_data(ct);  	u_int16_t msg;  	__be16 cid = 0, pcid = 0;  	typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; @@ -364,6 +366,7 @@ pptp_inbound_pkt(struct sk_buff *skb,  		break;  	case PPTP_WAN_ERROR_NOTIFY: +	case PPTP_SET_LINK_INFO:  	case PPTP_ECHO_REQUEST:  	case PPTP_ECHO_REPLY:  		/* I don't have to explain these ;) */ @@ -375,7 +378,8 @@ pptp_inbound_pkt(struct sk_buff *skb,  	nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);  	if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) -		return nf_nat_pptp_inbound(skb, ct, ctinfo, ctlh, pptpReq); +		return nf_nat_pptp_inbound(skb, ct, ctinfo, +					   protoff, ctlh, pptpReq);  	return NF_ACCEPT;  invalid: @@ -388,14 +392,14 @@ invalid:  }  static inline int -pptp_outbound_pkt(struct sk_buff *skb, +pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,  		  struct PptpControlHeader *ctlh,  		  union pptp_ctrl_union *pptpReq, 
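/*
 * [Editor's aside] The pptp conversion above replaces the fixed help union
 * with per-helper private data sized by .data_len and reached through
 * nfct_help_data(). A minimal userspace model of that flexible-array
 * layout (names here are illustrative, not the kernel's):
 */
#include <stdio.h>
#include <stdlib.h>

struct helper_ext_model {
	unsigned int data_len;
	unsigned char data[];		/* per-helper private area */
};

struct pptp_master_model {		/* stands in for nf_ct_pptp_master */
	unsigned short pns_call_id;
	unsigned short pac_call_id;
};

int main(void)
{
	struct helper_ext_model *ext =
		malloc(sizeof(*ext) + sizeof(struct pptp_master_model));
	struct pptp_master_model *info;

	if (ext == NULL)
		return 1;
	ext->data_len = sizeof(struct pptp_master_model);
	info = (struct pptp_master_model *)ext->data;	/* cf. nfct_help_data(ct) */
	info->pns_call_id = 1;
	info->pac_call_id = 2;
	printf("helper private area: %u bytes\n", ext->data_len);
	free(ext);
	return 0;
}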
 		  unsigned int reqlen,  		  struct nf_conn *ct,  		  enum ip_conntrack_info ctinfo)  { -	struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; +	struct nf_ct_pptp_master *info = nfct_help_data(ct);  	u_int16_t msg;  	__be16 cid = 0, pcid = 0;  	typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; @@ -470,7 +474,8 @@ pptp_outbound_pkt(struct sk_buff *skb,  	nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);  	if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) -		return nf_nat_pptp_outbound(skb, ct, ctinfo, ctlh, pptpReq); +		return nf_nat_pptp_outbound(skb, ct, ctinfo, +					    protoff, ctlh, pptpReq);  	return NF_ACCEPT;  invalid: @@ -505,7 +510,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,  {  	int dir = CTINFO2DIR(ctinfo); -	const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; +	const struct nf_ct_pptp_master *info = nfct_help_data(ct);  	const struct tcphdr *tcph;  	struct tcphdr _tcph;  	const struct pptp_pkt_hdr *pptph; @@ -519,8 +524,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,  	u_int16_t msg;  	/* don't do any tracking before tcp handshake complete */ -	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) +	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)  		return NF_ACCEPT;  	nexthdr_off = protoff; @@ -570,11 +574,11 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,  	 * established from PNS->PAC.  However, RFC makes no guarantee */  	if (dir == IP_CT_DIR_ORIGINAL)  		/* client -> server (PNS -> PAC) */ -		ret = pptp_outbound_pkt(skb, ctlh, pptpReq, reqlen, ct, +		ret = pptp_outbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,  					ctinfo);  	else  		/* server -> client (PAC -> PNS) */ -		ret = pptp_inbound_pkt(skb, ctlh, pptpReq, reqlen, ct, +		ret = pptp_inbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,  				       ctinfo);  	pr_debug("sstate: %d->%d, cstate: %d->%d\n",  		 oldsstate, info->sstate, oldcstate, info->cstate); @@ -592,6 +596,7 @@ static const struct nf_conntrack_expect_policy pptp_exp_policy = {  static struct nf_conntrack_helper pptp __read_mostly = {  	.name			= "pptp",  	.me			= THIS_MODULE, +	.data_len		= sizeof(struct nf_ct_pptp_master),  	.tuple.src.l3num	= AF_INET,  	.tuple.src.u.tcp.port	= cpu_to_be16(PPTP_CONTROL_PORT),  	.tuple.dst.protonum	= IPPROTO_TCP, @@ -600,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = {  	.expect_policy		= &pptp_exp_policy,  }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ -	nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { -	.exit = nf_conntrack_pptp_net_exit, -}; -  static int __init nf_conntrack_pptp_init(void)  { -	int rv; - -	rv = nf_conntrack_helper_register(&pptp); -	if (rv < 0) -		return rv; -	rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); -	if (rv < 0) -		nf_conntrack_helper_unregister(&pptp); -	return rv; +	return nf_conntrack_helper_register(&pptp);  }  static void __exit nf_conntrack_pptp_fini(void)  {  	nf_conntrack_helper_unregister(&pptp); -	unregister_pernet_subsys(&nf_conntrack_pptp_net_ops);  }  module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index dc7bb74110d..b65d5864b6d 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -3,6 +3,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2006 Netfilter Core Team 
<coreteam@netfilter.org>   * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -21,7 +22,6 @@  #include <linux/notifier.h>  #include <linux/kernel.h>  #include <linux/netdevice.h> -#include <linux/rtnetlink.h>  #include <net/netfilter/nf_conntrack.h>  #include <net/netfilter/nf_conntrack_l3proto.h> @@ -36,28 +36,32 @@ static DEFINE_MUTEX(nf_ct_proto_mutex);  #ifdef CONFIG_SYSCTL  static int -nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, -		      struct ctl_table *table, unsigned int *users) +nf_ct_register_sysctl(struct net *net, +		      struct ctl_table_header **header, +		      const char *path, +		      struct ctl_table *table)  {  	if (*header == NULL) { -		*header = register_sysctl_paths(path, table); +		*header = register_net_sysctl(net, path, table);  		if (*header == NULL)  			return -ENOMEM;  	} -	if (users != NULL) -		(*users)++; +  	return 0;  }  static void  nf_ct_unregister_sysctl(struct ctl_table_header **header, -			struct ctl_table *table, unsigned int *users) +			struct ctl_table **table, +			unsigned int users)  { -	if (users != NULL && --*users > 0) +	if (users > 0)  		return; -	unregister_sysctl_table(*header); +	unregister_net_sysctl_table(*header); +	kfree(*table);  	*header = NULL; +	*table = NULL;  }  #endif @@ -88,12 +92,6 @@ nf_ct_l3proto_find_get(u_int16_t l3proto)  }  EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); -void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) -{ -	module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); -  int  nf_ct_l3proto_try_module_get(unsigned short l3proto)  { @@ -127,6 +125,27 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)  }  EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); +struct nf_conntrack_l4proto * +nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) +{ +	struct nf_conntrack_l4proto *p; + +	rcu_read_lock(); +	p = __nf_ct_l4proto_find(l3num, l4num); +	if (!try_module_get(p->me)) +		p = &nf_conntrack_l4proto_generic; +	rcu_read_unlock(); + +	return p; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); + +void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) +{ +	module_put(p->me); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); +  static int kill_l3proto(struct nf_conn *i, void *data)  {  	return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; @@ -140,32 +159,58 @@ static int kill_l4proto(struct nf_conn *i, void *data)  	       nf_ct_l3num(i) == l4proto->l3proto;  } -static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) +static struct nf_ip_net *nf_ct_l3proto_net(struct net *net, +					   struct nf_conntrack_l3proto *l3proto)  { -	int err = 0; +	if (l3proto->l3proto == PF_INET) +		return &net->ct.nf_ct_proto; +	else +		return NULL; +} -#ifdef CONFIG_SYSCTL -	if (l3proto->ctl_table != NULL) { -		err = nf_ct_register_sysctl(&l3proto->ctl_table_header, +static int nf_ct_l3proto_register_sysctl(struct net *net, +					 struct nf_conntrack_l3proto *l3proto) +{ +	int err = 0; +	struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); +	/* nf_conntrack_l3proto_ipv6 doesn't support sysctl */ +	if (in == NULL) +		return 0; + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +	if (in->ctl_table != NULL) { +		err = nf_ct_register_sysctl(net, +					    &in->ctl_table_header,  					    l3proto->ctl_table_path, -					    
l3proto->ctl_table, NULL); +					    in->ctl_table); +		if (err < 0) { +			kfree(in->ctl_table); +			in->ctl_table = NULL; +		}  	}  #endif  	return err;  } -static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto) +static void nf_ct_l3proto_unregister_sysctl(struct net *net, +					    struct nf_conntrack_l3proto *l3proto)  { -#ifdef CONFIG_SYSCTL -	if (l3proto->ctl_table_header != NULL) -		nf_ct_unregister_sysctl(&l3proto->ctl_table_header, -					l3proto->ctl_table, NULL); +	struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); + +	if (in == NULL) +		return; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +	if (in->ctl_table_header != NULL) +		nf_ct_unregister_sysctl(&in->ctl_table_header, +					&in->ctl_table, +					0);  #endif  } -int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)  {  	int ret = 0; +	struct nf_conntrack_l3proto *old;  	if (proto->l3proto >= AF_MAX)  		return -EBUSY; @@ -174,15 +219,13 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)  		return -EINVAL;  	mutex_lock(&nf_ct_proto_mutex); -	if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { +	old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], +					lockdep_is_held(&nf_ct_proto_mutex)); +	if (old != &nf_conntrack_l3proto_generic) {  		ret = -EBUSY;  		goto out_unlock;  	} -	ret = nf_ct_l3proto_register_sysctl(proto); -	if (ret < 0) -		goto out_unlock; -  	if (proto->nlattr_tuple_size)  		proto->nla_size = 3 * proto->nlattr_tuple_size(); @@ -191,81 +234,131 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)  out_unlock:  	mutex_unlock(&nf_ct_proto_mutex);  	return ret; +  } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_pernet_register(struct net *net, +				  struct nf_conntrack_l3proto *proto)  { -	struct net *net; +	int ret = 0; + +	if (proto->init_net) { +		ret = proto->init_net(net); +		if (ret < 0) +			return ret; +	} +	return nf_ct_l3proto_register_sysctl(net, proto); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); + +void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{  	BUG_ON(proto->l3proto >= AF_MAX);  	mutex_lock(&nf_ct_proto_mutex); -	BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); +	BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], +					 lockdep_is_held(&nf_ct_proto_mutex) +					 ) != proto);  	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],  			   &nf_conntrack_l3proto_generic); -	nf_ct_l3proto_unregister_sysctl(proto);  	mutex_unlock(&nf_ct_proto_mutex);  	synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); + +void nf_ct_l3proto_pernet_unregister(struct net *net, +				     struct nf_conntrack_l3proto *proto) +{ +	nf_ct_l3proto_unregister_sysctl(net, proto);  	/* Remove all contrack entries for this protocol */ -	rtnl_lock(); -	for_each_net(net) -		nf_ct_iterate_cleanup(net, kill_l3proto, proto); -	rtnl_unlock(); +	nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);  } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); -static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) +static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, +					      struct nf_conntrack_l4proto *l4proto) +{ +	if (l4proto->get_net_proto) { +		/* 
statically built-in protocols use static per-net */ +		return l4proto->get_net_proto(net); +	} else if (l4proto->net_id) { +		/* ... and loadable protocols use dynamic per-net */ +		return net_generic(net, *l4proto->net_id); +	} +	return NULL; +} + +static +int nf_ct_l4proto_register_sysctl(struct net *net, +				  struct nf_proto_net *pn, +				  struct nf_conntrack_l4proto *l4proto)  {  	int err = 0;  #ifdef CONFIG_SYSCTL -	if (l4proto->ctl_table != NULL) { -		err = nf_ct_register_sysctl(l4proto->ctl_table_header, -					    nf_net_netfilter_sysctl_path, -					    l4proto->ctl_table, -					    l4proto->ctl_table_users); -		if (err < 0) -			goto out; +	if (pn->ctl_table != NULL) { +		err = nf_ct_register_sysctl(net, +					    &pn->ctl_table_header, +					    "net/netfilter", +					    pn->ctl_table); +		if (err < 0) { +			if (!pn->users) { +				kfree(pn->ctl_table); +				pn->ctl_table = NULL; +			} +		}  	}  #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	if (l4proto->ctl_compat_table != NULL) { -		err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header, -					    nf_net_ipv4_netfilter_sysctl_path, -					    l4proto->ctl_compat_table, NULL); +	if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) { +		if (err < 0) { +			nf_ct_kfree_compat_sysctl_table(pn); +			goto out; +		} +		err = nf_ct_register_sysctl(net, +					    &pn->ctl_compat_header, +					    "net/ipv4/netfilter", +					    pn->ctl_compat_table);  		if (err == 0)  			goto out; -		nf_ct_unregister_sysctl(l4proto->ctl_table_header, -					l4proto->ctl_table, -					l4proto->ctl_table_users); + +		nf_ct_kfree_compat_sysctl_table(pn); +		nf_ct_unregister_sysctl(&pn->ctl_table_header, +					&pn->ctl_table, +					pn->users);  	} -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  out: +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif /* CONFIG_SYSCTL */  	return err;  } -static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto) +static +void nf_ct_l4proto_unregister_sysctl(struct net *net, +				     struct nf_proto_net *pn, +				     struct nf_conntrack_l4proto *l4proto)  {  #ifdef CONFIG_SYSCTL -	if (l4proto->ctl_table_header != NULL && -	    *l4proto->ctl_table_header != NULL) -		nf_ct_unregister_sysctl(l4proto->ctl_table_header, -					l4proto->ctl_table, -					l4proto->ctl_table_users); +	if (pn->ctl_table_header != NULL) +		nf_ct_unregister_sysctl(&pn->ctl_table_header, +					&pn->ctl_table, +					pn->users); +  #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	if (l4proto->ctl_compat_table_header != NULL) -		nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header, -					l4proto->ctl_compat_table, NULL); +	if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL) +		nf_ct_unregister_sysctl(&pn->ctl_compat_header, +					&pn->ctl_compat_table, +					0);  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif /* CONFIG_SYSCTL */  }  /* FIXME: Allow NULL functions and sub in pointers to generic for     them. --RR */ -int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)  {  	int ret = 0; @@ -279,7 +372,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)  	mutex_lock(&nf_ct_proto_mutex);  	if (!nf_ct_protos[l4proto->l3proto]) {  		/* l3proto may be loaded latter. 
*/ -		struct nf_conntrack_l4proto **proto_array; +		struct nf_conntrack_l4proto __rcu **proto_array;  		int i;  		proto_array = kmalloc(MAX_NF_CT_PROTO * @@ -291,7 +384,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)  		}  		for (i = 0; i < MAX_NF_CT_PROTO; i++) -			proto_array[i] = &nf_conntrack_l4proto_generic; +			RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);  		/* Before making proto_array visible to lockless readers,  		 * we must make sure its content is committed to memory. @@ -299,16 +392,14 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)  		smp_wmb();  		nf_ct_protos[l4proto->l3proto] = proto_array; -	} else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != -					&nf_conntrack_l4proto_generic) { +	} else if (rcu_dereference_protected( +			nf_ct_protos[l4proto->l3proto][l4proto->l4proto], +			lockdep_is_held(&nf_ct_proto_mutex) +			) != &nf_conntrack_l4proto_generic) {  		ret = -EBUSY;  		goto out_unlock;  	} -	ret = nf_ct_l4proto_register_sysctl(l4proto); -	if (ret < 0) -		goto out_unlock; -  	l4proto->nla_size = 0;  	if (l4proto->nlattr_size)  		l4proto->nla_size += l4proto->nlattr_size(); @@ -317,45 +408,106 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)  	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],  			   l4proto); -  out_unlock:  	mutex_unlock(&nf_ct_proto_mutex);  	return ret;  } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); -void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_pernet_register(struct net *net, +				  struct nf_conntrack_l4proto *l4proto)  { -	struct net *net; +	int ret = 0; +	struct nf_proto_net *pn = NULL; +	if (l4proto->init_net) { +		ret = l4proto->init_net(net, l4proto->l3proto); +		if (ret < 0) +			goto out; +	} + +	pn = nf_ct_l4proto_net(net, l4proto); +	if (pn == NULL) +		goto out; + +	ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto); +	if (ret < 0) +		goto out; + +	pn->users++; +out: +	return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); + +void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +{  	BUG_ON(l4proto->l3proto >= PF_MAX);  	mutex_lock(&nf_ct_proto_mutex); -	BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); +	BUG_ON(rcu_dereference_protected( +			nf_ct_protos[l4proto->l3proto][l4proto->l4proto], +			lockdep_is_held(&nf_ct_proto_mutex) +			) != l4proto);  	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],  			   &nf_conntrack_l4proto_generic); -	nf_ct_l4proto_unregister_sysctl(l4proto);  	mutex_unlock(&nf_ct_proto_mutex);  	synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); + +void nf_ct_l4proto_pernet_unregister(struct net *net, +				     struct nf_conntrack_l4proto *l4proto) +{ +	struct nf_proto_net *pn = NULL; + +	pn = nf_ct_l4proto_net(net, l4proto); +	if (pn == NULL) +		return; + +	pn->users--; +	nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);  	/* Remove all contrack entries for this protocol */ -	rtnl_lock(); -	for_each_net(net) -		nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); -	rtnl_unlock(); +	nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);  } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); -int nf_conntrack_proto_init(void) +int nf_conntrack_proto_pernet_init(struct net *net)  { -	unsigned int i;  	int err; +	struct nf_proto_net *pn = 
nf_ct_l4proto_net(net, +					&nf_conntrack_l4proto_generic); -	err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); +	err = nf_conntrack_l4proto_generic.init_net(net, +					nf_conntrack_l4proto_generic.l3proto);  	if (err < 0)  		return err; +	err = nf_ct_l4proto_register_sysctl(net, +					    pn, +					    &nf_conntrack_l4proto_generic); +	if (err < 0) +		return err; + +	pn->users++; +	return 0; +} + +void nf_conntrack_proto_pernet_fini(struct net *net) +{ +	struct nf_proto_net *pn = nf_ct_l4proto_net(net, +					&nf_conntrack_l4proto_generic); + +	pn->users--; +	nf_ct_l4proto_unregister_sysctl(net, +					pn, +					&nf_conntrack_l4proto_generic); +} +int nf_conntrack_proto_init(void) +{ +	unsigned int i;  	for (i = 0; i < AF_MAX; i++)  		rcu_assign_pointer(nf_ct_l3protos[i],  				   &nf_conntrack_l3proto_generic); @@ -365,9 +517,6 @@ int nf_conntrack_proto_init(void)  void nf_conntrack_proto_fini(void)  {  	unsigned int i; - -	nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); -  	/* free l3proto protocol tables */  	for (i = 0; i < PF_MAX; i++)  		kfree(nf_ct_protos[i]); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5292560d6d4..cb372f96f10 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -98,7 +98,7 @@ static const char * const dccp_state_names[] = {  #define sIV	CT_DCCP_INVALID  /* - * DCCP state transistion table + * DCCP state transition table   *   * The assumption is the same as for TCP tracking:   * @@ -387,12 +387,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =  /* this module per-net specifics */  static int dccp_net_id __read_mostly;  struct dccp_net { +	struct nf_proto_net pn;  	int dccp_loose;  	unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -#ifdef CONFIG_SYSCTL -	struct ctl_table_header *sysctl_header; -	struct ctl_table *sysctl_table; -#endif  };  static inline struct dccp_net *dccp_pernet(struct net *net) @@ -423,7 +420,7 @@ static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,  }  static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, -		     unsigned int dataoff) +		     unsigned int dataoff, unsigned int *timeouts)  {  	struct net *net = nf_ct_net(ct);  	struct dccp_net *dn; @@ -431,7 +428,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,  	const char *msg;  	u_int8_t state; -	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); +	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);  	BUG_ON(dh == NULL);  	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; @@ -452,11 +449,15 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,  	ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;  	ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;  	ct->proto.dccp.state = CT_DCCP_NONE; +	ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; +	ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; +	ct->proto.dccp.handshake_seq = 0;  	return true;  out_invalid:  	if (LOG_INVALID(net, IPPROTO_DCCP)) -		nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); +		nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, +			      NULL, "%s", msg);  	return false;  } @@ -469,18 +470,23 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)  		     ntohl(dhack->dccph_ack_nr_low);  } +static unsigned int *dccp_get_timeouts(struct net *net) +{ +	return dccp_pernet(net)->dccp_timeout; +} +  static int dccp_packet(struct nf_conn 
*ct, const struct sk_buff *skb,  		       unsigned int dataoff, enum ip_conntrack_info ctinfo, -		       u_int8_t pf, unsigned int hooknum) +		       u_int8_t pf, unsigned int hooknum, +		       unsigned int *timeouts)  {  	struct net *net = nf_ct_net(ct); -	struct dccp_net *dn;  	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);  	struct dccp_hdr _dh, *dh;  	u_int8_t type, old_state, new_state;  	enum ct_dccp_roles role; -	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); +	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);  	BUG_ON(dh == NULL);  	type = dh->dccph_type; @@ -537,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,  		spin_unlock_bh(&ct->lock);  		if (LOG_INVALID(net, IPPROTO_DCCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				      "nf_ct_dccp: invalid packet ignored ");  		return NF_ACCEPT;  	case CT_DCCP_INVALID:  		spin_unlock_bh(&ct->lock);  		if (LOG_INVALID(net, IPPROTO_DCCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				      "nf_ct_dccp: invalid state transition ");  		return -NF_ACCEPT;  	} @@ -556,8 +562,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,  	if (new_state != old_state)  		nf_conntrack_event_cache(IPCT_PROTOINFO, ct); -	dn = dccp_pernet(net); -	nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]); +	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);  	return NF_ACCEPT;  } @@ -572,7 +577,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,  	unsigned int cscov;  	const char *msg; -	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); +	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);  	if (dh == NULL) {  		msg = "nf_ct_dccp: short packet ";  		goto out_invalid; @@ -609,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,  out_invalid:  	if (LOG_INVALID(net, IPPROTO_DCCP)) -		nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); +		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg);  	return -NF_ACCEPT;  } @@ -626,7 +631,7 @@ static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)  	return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);  } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,  			  struct nf_conn *ct)  { @@ -636,11 +641,12 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,  	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);  	if (!nest_parms)  		goto nla_put_failure; -	NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); -	NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, -		   ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); -	NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, -		     cpu_to_be64(ct->proto.dccp.handshake_seq)); +	if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || +	    nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, +		       ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || +	    nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, +			 cpu_to_be64(ct->proto.dccp.handshake_seq))) +		goto nla_put_failure;  	nla_nest_end(skb, nest_parms);  	spin_unlock_bh(&ct->lock);  	return 0; @@ -699,8 +705,62 @@ static int dccp_nlattr_size(void)  	return nla_total_size(0)	/* CTA_PROTOINFO_DCCP */  		+ nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1);  } +  #endif +#if 
IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], +				      struct net *net, void *data) +{ +	struct dccp_net *dn = dccp_pernet(net); +	unsigned int *timeouts = data; +	int i; + +	/* set default DCCP timeouts. */ +	for (i=0; i<CT_DCCP_MAX; i++) +		timeouts[i] = dn->dccp_timeout[i]; + +	/* there's a 1:1 mapping between attributes and protocol states. */ +	for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { +		if (tb[i]) { +			timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; +		} +	} +	return 0; +} + +static int +dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +        const unsigned int *timeouts = data; +	int i; + +	for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { +		if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) +			goto nla_put_failure; +	} +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy +dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { +	[CTA_TIMEOUT_DCCP_REQUEST]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_DCCP_RESPOND]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_DCCP_PARTOPEN]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_DCCP_OPEN]		= { .type = NLA_U32 }, +	[CTA_TIMEOUT_DCCP_CLOSEREQ]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_DCCP_CLOSING]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_DCCP_TIMEWAIT]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +  #ifdef CONFIG_SYSCTL  /* template, data assigned later */  static struct ctl_table dccp_sysctl_table[] = { @@ -756,6 +816,55 @@ static struct ctl_table dccp_sysctl_table[] = {  };  #endif /* CONFIG_SYSCTL */ +static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, +				     struct dccp_net *dn) +{ +#ifdef CONFIG_SYSCTL +	if (pn->ctl_table) +		return 0; + +	pn->ctl_table = kmemdup(dccp_sysctl_table, +				sizeof(dccp_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; + +	pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; +	pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; +	pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; +	pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; +	pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; +	pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; +	pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; +	pn->ctl_table[7].data = &dn->dccp_loose; + +	/* Don't export sysctls to unprivileged users */ +	if (net->user_ns != &init_user_ns) +		pn->ctl_table[0].procname = NULL; +#endif +	return 0; +} + +static int dccp_init_net(struct net *net, u_int16_t proto) +{ +	struct dccp_net *dn = dccp_pernet(net); +	struct nf_proto_net *pn = &dn->pn; + +	if (!pn->users) { +		/* default values */ +		dn->dccp_loose = 1; +		dn->dccp_timeout[CT_DCCP_REQUEST]	= 2 * DCCP_MSL; +		dn->dccp_timeout[CT_DCCP_RESPOND]	= 4 * DCCP_MSL; +		dn->dccp_timeout[CT_DCCP_PARTOPEN]	= 4 * DCCP_MSL; +		dn->dccp_timeout[CT_DCCP_OPEN]		= 12 * 3600 * HZ; +		dn->dccp_timeout[CT_DCCP_CLOSEREQ]	= 64 * HZ; +		dn->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ; +		dn->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL; +	} + +	return dccp_kmemdup_sysctl_table(net, pn, dn); +} +  static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {  	.l3proto		= AF_INET,  	.l4proto		= IPPROTO_DCCP, @@ -764,10 +873,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {  	.invert_tuple		= dccp_invert_tuple,  	.new			= dccp_new, 
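/*
 * With timeouts moved into per-netns storage, the conntrack core now
 * fetches the state/timeout array once via ->get_timeouts() and hands
 * it down to ->new() and ->packet(); a simplified sketch of that
 * calling convention (not the literal core code):
 *
 *	unsigned int *timeouts = l4proto->get_timeouts(net);
 *	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum,
 *			      timeouts);
 *
 * For DCCP, get_timeouts resolves to dccp_pernet(net)->dccp_timeout,
 * indexed by the state computed from the transition table above.
 */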
 	.packet			= dccp_packet, +	.get_timeouts		= dccp_get_timeouts,  	.error			= dccp_error,  	.print_tuple		= dccp_print_tuple,  	.print_conntrack	= dccp_print_conntrack, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.to_nlattr		= dccp_to_nlattr,  	.nlattr_size		= dccp_nlattr_size,  	.from_nlattr		= nlattr_to_dccp, @@ -776,6 +886,17 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nla_policy		= nf_ct_port_nla_policy,  #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= dccp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= dccp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_DCCP_MAX, +		.obj_size	= sizeof(unsigned int) * CT_DCCP_MAX, +		.nla_policy	= dccp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.net_id			= &dccp_net_id, +	.init_net		= dccp_init_net,  };  static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { @@ -786,10 +907,11 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {  	.invert_tuple		= dccp_invert_tuple,  	.new			= dccp_new,  	.packet			= dccp_packet, +	.get_timeouts		= dccp_get_timeouts,  	.error			= dccp_error,  	.print_tuple		= dccp_print_tuple,  	.print_conntrack	= dccp_print_conntrack, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.to_nlattr		= dccp_to_nlattr,  	.nlattr_size		= dccp_nlattr_size,  	.from_nlattr		= nlattr_to_dccp, @@ -798,55 +920,43 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nla_policy		= nf_ct_port_nla_policy,  #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= dccp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= dccp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_DCCP_MAX, +		.obj_size	= sizeof(unsigned int) * CT_DCCP_MAX, +		.nla_policy	= dccp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.net_id			= &dccp_net_id, +	.init_net		= dccp_init_net,  };  static __net_init int dccp_net_init(struct net *net)  { -	struct dccp_net *dn = dccp_pernet(net); - -	/* default values */ -	dn->dccp_loose = 1; -	dn->dccp_timeout[CT_DCCP_REQUEST]	= 2 * DCCP_MSL; -	dn->dccp_timeout[CT_DCCP_RESPOND]	= 4 * DCCP_MSL; -	dn->dccp_timeout[CT_DCCP_PARTOPEN]	= 4 * DCCP_MSL; -	dn->dccp_timeout[CT_DCCP_OPEN]		= 12 * 3600 * HZ; -	dn->dccp_timeout[CT_DCCP_CLOSEREQ]	= 64 * HZ; -	dn->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ; -	dn->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL; - -#ifdef CONFIG_SYSCTL -	dn->sysctl_table = kmemdup(dccp_sysctl_table, -			sizeof(dccp_sysctl_table), GFP_KERNEL); -	if (!dn->sysctl_table) -		return -ENOMEM; - -	dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; -	dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; -	dn->sysctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; -	dn->sysctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; -	dn->sysctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; -	dn->sysctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; -	dn->sysctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; -	dn->sysctl_table[7].data = &dn->dccp_loose; - -	dn->sysctl_header = register_net_sysctl_table(net, -			nf_net_netfilter_sysctl_path, dn->sysctl_table); -	if (!dn->sysctl_header) { -		kfree(dn->sysctl_table); -		return -ENOMEM; +	int ret = 0; +	ret = 
nf_ct_l4proto_pernet_register(net, &dccp_proto4); +	if (ret < 0) { +		pr_err("nf_conntrack_dccp4: pernet registration failed.\n"); +		goto out; +	} +	ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6); +	if (ret < 0) { +		pr_err("nf_conntrack_dccp6: pernet registration failed.\n"); +		goto cleanup_dccp4;  	} -#endif -  	return 0; +cleanup_dccp4: +	nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); +out: +	return ret;  }  static __net_exit void dccp_net_exit(struct net *net)  { -	struct dccp_net *dn = dccp_pernet(net); -#ifdef CONFIG_SYSCTL -	unregister_net_sysctl_table(dn->sysctl_header); -	kfree(dn->sysctl_table); -#endif +	nf_ct_l4proto_pernet_unregister(net, &dccp_proto6); +	nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);  }  static struct pernet_operations dccp_net_ops = { @@ -858,34 +968,34 @@ static struct pernet_operations dccp_net_ops = {  static int __init nf_conntrack_proto_dccp_init(void)  { -	int err; +	int ret; -	err = register_pernet_subsys(&dccp_net_ops); -	if (err < 0) -		goto err1; +	ret = register_pernet_subsys(&dccp_net_ops); +	if (ret < 0) +		goto out_pernet; -	err = nf_conntrack_l4proto_register(&dccp_proto4); -	if (err < 0) -		goto err2; +	ret = nf_ct_l4proto_register(&dccp_proto4); +	if (ret < 0) +		goto out_dccp4; -	err = nf_conntrack_l4proto_register(&dccp_proto6); -	if (err < 0) -		goto err3; -	return 0; +	ret = nf_ct_l4proto_register(&dccp_proto6); +	if (ret < 0) +		goto out_dccp6; -err3: -	nf_conntrack_l4proto_unregister(&dccp_proto4); -err2: +	return 0; +out_dccp6: +	nf_ct_l4proto_unregister(&dccp_proto4); +out_dccp4:  	unregister_pernet_subsys(&dccp_net_ops); -err1: -	return err; +out_pernet: +	return ret;  }  static void __exit nf_conntrack_proto_dccp_fini(void)  { +	nf_ct_l4proto_unregister(&dccp_proto6); +	nf_ct_l4proto_unregister(&dccp_proto4);  	unregister_pernet_subsys(&dccp_net_ops); -	nf_conntrack_l4proto_unregister(&dccp_proto6); -	nf_conntrack_l4proto_unregister(&dccp_proto4);  }  module_init(nf_conntrack_proto_dccp_init); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e2091d0c7a2..d25f2937764 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -14,6 +14,11 @@  static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; +static inline struct nf_generic_net *generic_pernet(struct net *net) +{ +	return &net->ct.nf_ct_proto.generic; +} +  static bool generic_pkt_to_tuple(const struct sk_buff *skb,  				 unsigned int dataoff,  				 struct nf_conntrack_tuple *tuple) @@ -40,31 +45,77 @@ static int generic_print_tuple(struct seq_file *s,  	return 0;  } +static unsigned int *generic_get_timeouts(struct net *net) +{ +	return &(generic_pernet(net)->timeout); +} +  /* Returns verdict for packet, or -1 for invalid. */ -static int packet(struct nf_conn *ct, -		  const struct sk_buff *skb, -		  unsigned int dataoff, -		  enum ip_conntrack_info ctinfo, -		  u_int8_t pf, -		  unsigned int hooknum) +static int generic_packet(struct nf_conn *ct, +			  const struct sk_buff *skb, +			  unsigned int dataoff, +			  enum ip_conntrack_info ctinfo, +			  u_int8_t pf, +			  unsigned int hooknum, +			  unsigned int *timeout)  { -	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout); +	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);  	return NF_ACCEPT;  }  /* Called when a new connection for this protocol found. 
*/ -static bool new(struct nf_conn *ct, const struct sk_buff *skb, -		unsigned int dataoff) +static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, +			unsigned int dataoff, unsigned int *timeouts)  {  	return true;  } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], +					 struct net *net, void *data) +{ +	unsigned int *timeout = data; +	struct nf_generic_net *gn = generic_pernet(net); + +	if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) +		*timeout = +		    ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ; +	else { +		/* Set default generic timeout. */ +		*timeout = gn->timeout; +	} + +	return 0; +} + +static int +generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +	const unsigned int *timeout = data; + +	if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ))) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +        return -ENOSPC; +} + +static const struct nla_policy +generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { +	[CTA_TIMEOUT_GENERIC_TIMEOUT]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +  #ifdef CONFIG_SYSCTL -static struct ctl_table_header *generic_sysctl_header;  static struct ctl_table generic_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_generic_timeout", -		.data		= &nf_ct_generic_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -75,7 +126,6 @@ static struct ctl_table generic_sysctl_table[] = {  static struct ctl_table generic_compat_sysctl_table[] = {  	{  		.procname	= "ip_conntrack_generic_timeout", -		.data		= &nf_ct_generic_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -85,6 +135,62 @@ static struct ctl_table generic_compat_sysctl_table[] = {  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif /* CONFIG_SYSCTL */ +static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, +					struct nf_generic_net *gn) +{ +#ifdef CONFIG_SYSCTL +	pn->ctl_table = kmemdup(generic_sysctl_table, +				sizeof(generic_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; + +	pn->ctl_table[0].data = &gn->timeout; +#endif +	return 0; +} + +static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, +					       struct nf_generic_net *gn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +	pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table, +				       sizeof(generic_compat_sysctl_table), +				       GFP_KERNEL); +	if (!pn->ctl_compat_table) +		return -ENOMEM; + +	pn->ctl_compat_table[0].data = &gn->timeout; +#endif +#endif +	return 0; +} + +static int generic_init_net(struct net *net, u_int16_t proto) +{ +	int ret; +	struct nf_generic_net *gn = generic_pernet(net); +	struct nf_proto_net *pn = &gn->pn; + +	gn->timeout = nf_ct_generic_timeout; + +	ret = generic_kmemdup_compat_sysctl_table(pn, gn); +	if (ret < 0) +		return ret; + +	ret = generic_kmemdup_sysctl_table(pn, gn); +	if (ret < 0) +		nf_ct_kfree_compat_sysctl_table(pn); + +	return ret; +} + +static struct nf_proto_net *generic_get_net_proto(struct net *net) +{ +	return &net->ct.nf_ct_proto.generic.pn; +} +  struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =  {  	.l3proto		= PF_UNSPEC, @@ -93,13 +199,18 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =  	
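/*
 * Being built in, the generic tracker keeps its per-net state embedded
 * in struct netns_ct and exposes it through ->get_net_proto(); modular
 * trackers (DCCP, SCTP, GRE) hand the core a net_generic id instead.
 * Condensing nf_ct_l4proto_net() from nf_conntrack_proto.c above:
 *
 *	if (l4proto->get_net_proto)
 *		pn = l4proto->get_net_proto(net);
 *	else if (l4proto->net_id)
 *		pn = net_generic(net, *l4proto->net_id);
 *	else
 *		pn = NULL;
 */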
.pkt_to_tuple		= generic_pkt_to_tuple,  	.invert_tuple		= generic_invert_tuple,  	.print_tuple		= generic_print_tuple, -	.packet			= packet, -	.new			= new, -#ifdef CONFIG_SYSCTL -	.ctl_table_header	= &generic_sysctl_header, -	.ctl_table		= generic_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	.ctl_compat_table	= generic_compat_sysctl_table, -#endif -#endif +	.packet			= generic_packet, +	.get_timeouts		= generic_get_timeouts, +	.new			= generic_new, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= generic_timeout_nlattr_to_obj, +		.obj_to_nlattr	= generic_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_GENERIC_MAX, +		.obj_size	= sizeof(unsigned int), +		.nla_policy	= generic_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.init_net		= generic_init_net, +	.get_net_proto		= generic_get_net_proto,  }; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index cf616e55ca4..d5665739e3b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -21,6 +21,7 @@   *   * Development of this code funded by Astaro AG (http://www.astaro.com/)   * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   */  #include <linux/module.h> @@ -41,18 +42,33 @@  #include <linux/netfilter/nf_conntrack_proto_gre.h>  #include <linux/netfilter/nf_conntrack_pptp.h> -#define GRE_TIMEOUT		(30 * HZ) -#define GRE_STREAM_TIMEOUT	(180 * HZ) +enum grep_conntrack { +	GRE_CT_UNREPLIED, +	GRE_CT_REPLIED, +	GRE_CT_MAX +}; + +static unsigned int gre_timeouts[GRE_CT_MAX] = { +	[GRE_CT_UNREPLIED]	= 30*HZ, +	[GRE_CT_REPLIED]	= 180*HZ, +};  static int proto_gre_net_id __read_mostly;  struct netns_proto_gre { +	struct nf_proto_net	nf;  	rwlock_t		keymap_lock;  	struct list_head	keymap_list; +	unsigned int		gre_timeouts[GRE_CT_MAX];  }; -void nf_ct_gre_keymap_flush(struct net *net) +static inline struct netns_proto_gre *gre_pernet(struct net *net) +{ +	return net_generic(net, proto_gre_net_id); +} + +static void nf_ct_gre_keymap_flush(struct net *net)  { -	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); +	struct netns_proto_gre *net_gre = gre_pernet(net);  	struct nf_ct_gre_keymap *km, *tmp;  	write_lock_bh(&net_gre->keymap_lock); @@ -62,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net)  	}  	write_unlock_bh(&net_gre->keymap_lock);  } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush);  static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,  				const struct nf_conntrack_tuple *t) @@ -77,7 +92,7 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,  /* look up the source key for a given tuple */  static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)  { -	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); +	struct netns_proto_gre *net_gre = gre_pernet(net);  	struct nf_ct_gre_keymap *km;  	__be16 key = 0; @@ -101,11 +116,11 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,  			 struct nf_conntrack_tuple *t)  {  	struct net *net = nf_ct_net(ct); -	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); -	struct nf_conn_help *help = nfct_help(ct); +	struct netns_proto_gre *net_gre = gre_pernet(net); +	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);  	struct nf_ct_gre_keymap **kmp, *km; -	kmp = &help->help.ct_pptp_info.keymap[dir]; +	kmp = &ct_pptp_info->keymap[dir];  	if (*kmp) {  		/* check whether it's a retransmission */  		
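/*
 * ct_pptp_info above now comes from nfct_help_data() rather than the
 * old fixed help union: each helper declares how much private data it
 * needs (.data_len, set to sizeof(struct nf_ct_pptp_master) for pptp)
 * and the core carves that space out of the helper extension.  A rough
 * sketch of the accessor, assuming the helper extension layout used by
 * this series (illustrative, not a verbatim copy):
 *
 *	static inline void *nfct_help_data(const struct nf_conn *ct)
 *	{
 *		struct nf_conn_help *help = nfct_help(ct);
 *		return (void *)help->data;
 *	}
 */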
read_lock_bh(&net_gre->keymap_lock); @@ -142,20 +157,20 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);  void nf_ct_gre_keymap_destroy(struct nf_conn *ct)  {  	struct net *net = nf_ct_net(ct); -	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); -	struct nf_conn_help *help = nfct_help(ct); +	struct netns_proto_gre *net_gre = gre_pernet(net); +	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);  	enum ip_conntrack_dir dir;  	pr_debug("entering for ct %p\n", ct);  	write_lock_bh(&net_gre->keymap_lock);  	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { -		if (help->help.ct_pptp_info.keymap[dir]) { +		if (ct_pptp_info->keymap[dir]) {  			pr_debug("removing %p from list\n", -				 help->help.ct_pptp_info.keymap[dir]); -			list_del(&help->help.ct_pptp_info.keymap[dir]->list); -			kfree(help->help.ct_pptp_info.keymap[dir]); -			help->help.ct_pptp_info.keymap[dir] = NULL; +				 ct_pptp_info->keymap[dir]); +			list_del(&ct_pptp_info->keymap[dir]->list); +			kfree(ct_pptp_info->keymap[dir]); +			ct_pptp_info->keymap[dir] = NULL;  		}  	}  	write_unlock_bh(&net_gre->keymap_lock); @@ -227,13 +242,19 @@ static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)  			  (ct->proto.gre.stream_timeout / HZ));  } +static unsigned int *gre_get_timeouts(struct net *net) +{ +	return gre_pernet(net)->gre_timeouts; +} +  /* Returns verdict for packet, and may modify conntrack */  static int gre_packet(struct nf_conn *ct,  		      const struct sk_buff *skb,  		      unsigned int dataoff,  		      enum ip_conntrack_info ctinfo,  		      u_int8_t pf, -		      unsigned int hooknum) +		      unsigned int hooknum, +		      unsigned int *timeouts)  {  	/* If we've seen traffic both ways, this is a GRE connection.  	 * Extend timeout. */ @@ -241,8 +262,8 @@ static int gre_packet(struct nf_conn *ct,  		nf_ct_refresh_acct(ct, ctinfo, skb,  				   ct->proto.gre.stream_timeout);  		/* Also, more likely to be important, and not a probe. */ -		set_bit(IPS_ASSURED_BIT, &ct->status); -		nf_conntrack_event_cache(IPCT_ASSURED, ct); +		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) +			nf_conntrack_event_cache(IPCT_ASSURED, ct);  	} else  		nf_ct_refresh_acct(ct, ctinfo, skb,  				   ct->proto.gre.timeout); @@ -252,15 +273,15 @@ static int gre_packet(struct nf_conn *ct,  /* Called when a new connection for this protocol found. */  static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, -		    unsigned int dataoff) +		    unsigned int dataoff, unsigned int *timeouts)  {  	pr_debug(": ");  	nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);  	/* initialize to sane value.  Ideally a conntrack helper  	 * (e.g. in case of pptp) is increasing them */ -	ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; -	ct->proto.gre.timeout = GRE_TIMEOUT; +	ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; +	ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];  	return true;  } @@ -278,6 +299,68 @@ static void gre_destroy(struct nf_conn *ct)  		nf_ct_gre_keymap_destroy(master);  } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], +				     struct net *net, void *data) +{ +	unsigned int *timeouts = data; +	struct netns_proto_gre *net_gre = gre_pernet(net); + +	/* set default timeouts for GRE. 
*/ +	timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; +	timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; + +	if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { +		timeouts[GRE_CT_UNREPLIED] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ; +	} +	if (tb[CTA_TIMEOUT_GRE_REPLIED]) { +		timeouts[GRE_CT_REPLIED] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ; +	} +	return 0; +} + +static int +gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +	const unsigned int *timeouts = data; + +	if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED, +			 htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED, +			 htonl(timeouts[GRE_CT_REPLIED] / HZ))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy +gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { +	[CTA_TIMEOUT_GRE_UNREPLIED]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_GRE_REPLIED]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +static int gre_init_net(struct net *net, u_int16_t proto) +{ +	struct netns_proto_gre *net_gre = gre_pernet(net); +	int i; + +	rwlock_init(&net_gre->keymap_lock); +	INIT_LIST_HEAD(&net_gre->keymap_list); +	for (i = 0; i < GRE_CT_MAX; i++) +		net_gre->gre_timeouts[i] = gre_timeouts[i]; + +	return 0; +} +  /* protocol helper struct */  static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {  	.l3proto	 = AF_INET, @@ -287,30 +370,42 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {  	.invert_tuple	 = gre_invert_tuple,  	.print_tuple	 = gre_print_tuple,  	.print_conntrack = gre_print_conntrack, +	.get_timeouts    = gre_get_timeouts,  	.packet		 = gre_packet,  	.new		 = gre_new,  	.destroy	 = gre_destroy,  	.me 		 = THIS_MODULE, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,  	.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,  	.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,  	.nla_policy	 = nf_ct_port_nla_policy,  #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout    = { +		.nlattr_to_obj	= gre_timeout_nlattr_to_obj, +		.obj_to_nlattr	= gre_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_GRE_MAX, +		.obj_size	= sizeof(unsigned int) * GRE_CT_MAX, +		.nla_policy	= gre_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.net_id		= &proto_gre_net_id, +	.init_net	= gre_init_net,  };  static int proto_gre_net_init(struct net *net)  { -	struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); - -	rwlock_init(&net_gre->keymap_lock); -	INIT_LIST_HEAD(&net_gre->keymap_list); - -	return 0; +	int ret = 0; +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4); +	if (ret < 0) +		pr_err("nf_conntrack_gre4: pernet registration failed.\n"); +	return ret;  }  static void proto_gre_net_exit(struct net *net)  { +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);  	nf_ct_gre_keymap_flush(net);  } @@ -323,20 +418,26 @@ static struct pernet_operations proto_gre_net_ops = {  static int __init nf_ct_proto_gre_init(void)  { -	int rv; - -	rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); -	if (rv < 0) -		return rv; -	rv = register_pernet_subsys(&proto_gre_net_ops); -	if (rv < 0) -		nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); -	return rv; +	int ret; + +	ret = 
register_pernet_subsys(&proto_gre_net_ops); +	if (ret < 0) +		goto out_pernet; + +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); +	if (ret < 0) +		goto out_gre4; + +	return 0; +out_gre4: +	unregister_pernet_subsys(&proto_gre_net_ops); +out_pernet: +	return ret;  }  static void __exit nf_ct_proto_gre_fini(void)  { -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);  	unregister_pernet_subsys(&proto_gre_net_ops);  } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index c6049c2d5ea..1314d33f6bc 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -1,6 +1,9 @@  /*   * Connection tracking protocol helper module for SCTP.   * + * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> + * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> + *   * SCTP is defined in RFC 2960. References to various sections in this code   * are to this RFC.   * @@ -107,9 +110,9 @@ static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {  /* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},  /* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},  /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/  /* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */  /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}  	},  	{ @@ -121,12 +124,23 @@ static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {  /* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},  /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},  /* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */  /* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},  /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}  	}  }; +static int sctp_net_id	__read_mostly; +struct sctp_net { +	struct nf_proto_net pn; +	unsigned int timeouts[SCTP_CONNTRACK_MAX]; +}; + +static inline struct sctp_net *sctp_pernet(struct net *net) +{ +	return net_generic(net, sctp_net_id); +} +  static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,  			      struct nf_conntrack_tuple *tuple)  { @@ -279,13 +293,19 @@ static int sctp_new_state(enum ip_conntrack_dir dir,  	return sctp_conntracks[dir][i][cur_state];  } +static unsigned int *sctp_get_timeouts(struct net *net) +{ +	return sctp_pernet(net)->timeouts; +} +  /* Returns verdict for packet, or -NF_ACCEPT for invalid. 
*/  static int sctp_packet(struct nf_conn *ct,  		       const struct sk_buff *skb,  		       unsigned int dataoff,  		       enum ip_conntrack_info ctinfo,  		       u_int8_t pf, -		       unsigned int hooknum) +		       unsigned int hooknum, +		       unsigned int *timeouts)  {  	enum sctp_conntrack new_state, old_state;  	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -370,7 +390,7 @@ static int sctp_packet(struct nf_conn *ct,  	}  	spin_unlock_bh(&ct->lock); -	nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]); +	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);  	if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&  	    dir == IP_CT_DIR_REPLY && @@ -390,7 +410,7 @@ out:  /* Called when a new connection for this protocol found. */  static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, -		     unsigned int dataoff) +		     unsigned int dataoff, unsigned int *timeouts)  {  	enum sctp_conntrack new_state;  	const struct sctphdr *sh; @@ -413,6 +433,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,  	    test_bit(SCTP_CID_COOKIE_ACK, map))  		return false; +	memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));  	new_state = SCTP_CONNTRACK_MAX;  	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {  		/* Don't need lock here: this conntrack not in circulation yet */ @@ -460,7 +481,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,  	return true;  } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  #include <linux/netfilter/nfnetlink.h>  #include <linux/netfilter/nfnetlink_conntrack.h> @@ -475,15 +496,12 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,  	if (!nest_parms)  		goto nla_put_failure; -	NLA_PUT_U8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state); - -	NLA_PUT_BE32(skb, -		     CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, -		     ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]); - -	NLA_PUT_BE32(skb, -		     CTA_PROTOINFO_SCTP_VTAG_REPLY, -		     ct->proto.sctp.vtag[IP_CT_DIR_REPLY]); +	if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) || +	    nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, +			 ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || +	    nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, +			 ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) +		goto nla_put_failure;  	spin_unlock_bh(&ct->lock); @@ -542,55 +560,100 @@ static int sctp_nlattr_size(void)  }  #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], +				      struct net *net, void *data) +{ +	unsigned int *timeouts = data; +	struct sctp_net *sn = sctp_pernet(net); +	int i; + +	/* set default SCTP timeouts. */ +	for (i=0; i<SCTP_CONNTRACK_MAX; i++) +		timeouts[i] = sn->timeouts[i]; + +	/* there's a 1:1 mapping between attributes and protocol states. 
*/ +	for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { +		if (tb[i]) { +			timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; +		} +	} +	return 0; +} + +static int +sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +        const unsigned int *timeouts = data; +	int i; + +	for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { +	        if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) +			goto nla_put_failure; +	} +        return 0; + +nla_put_failure: +        return -ENOSPC; +} + +static const struct nla_policy +sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { +	[CTA_TIMEOUT_SCTP_CLOSED]		= { .type = NLA_U32 }, +	[CTA_TIMEOUT_SCTP_COOKIE_WAIT]		= { .type = NLA_U32 }, +	[CTA_TIMEOUT_SCTP_COOKIE_ECHOED]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_SCTP_ESTABLISHED]		= { .type = NLA_U32 }, +	[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +  #ifdef CONFIG_SYSCTL -static unsigned int sctp_sysctl_table_users; -static struct ctl_table_header *sctp_sysctl_header;  static struct ctl_table sctp_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_sctp_timeout_closed", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_CLOSED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_sctp_timeout_cookie_wait", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_sctp_timeout_cookie_echoed", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_sctp_timeout_established", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_sctp_timeout_shutdown_sent", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_sctp_timeout_shutdown_recd", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_sctp_timeout_shutdown_ack_sent", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -602,49 +665,42 @@ static struct ctl_table sctp_sysctl_table[] = {  static struct ctl_table sctp_compat_sysctl_table[] = {  	{  		.procname	= "ip_conntrack_sctp_timeout_closed", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_CLOSED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_sctp_timeout_cookie_wait", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_sctp_timeout_cookie_echoed", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		
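/*
 * As with the table above, the .data pointers are removed on purpose:
 * both tables are now templates.  sctp_kmemdup_sysctl_table() and its
 * _compat sibling below kmemdup() a copy per namespace and aim each
 * entry at that namespace's timeout slot, e.g.
 *
 *	pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
 *
 * so a write through sysctl only touches the owning netns.
 */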
.procname	= "ip_conntrack_sctp_timeout_established", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_sctp_timeout_shutdown_sent", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_sctp_timeout_shutdown_recd", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_sctp_timeout_shutdown_ack_sent", -		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -654,6 +710,80 @@ static struct ctl_table sctp_compat_sysctl_table[] = {  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif +static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, +				     struct sctp_net *sn) +{ +#ifdef CONFIG_SYSCTL +	if (pn->ctl_table) +		return 0; + +	pn->ctl_table = kmemdup(sctp_sysctl_table, +				sizeof(sctp_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; + +	pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; +	pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; +	pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; +	pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; +	pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; +	pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; +	pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; +#endif +	return 0; +} + +static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, +					    struct sctp_net *sn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +	pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table, +				       sizeof(sctp_compat_sysctl_table), +				       GFP_KERNEL); +	if (!pn->ctl_compat_table) +		return -ENOMEM; + +	pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; +	pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; +	pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; +	pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; +	pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; +	pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; +	pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; +#endif +#endif +	return 0; +} + +static int sctp_init_net(struct net *net, u_int16_t proto) +{ +	int ret; +	struct sctp_net *sn = sctp_pernet(net); +	struct nf_proto_net *pn = &sn->pn; + +	if (!pn->users) { +		int i; + +		for (i = 0; i < SCTP_CONNTRACK_MAX; i++) +			sn->timeouts[i] = sctp_timeouts[i]; +	} + +	if (proto == AF_INET) { +		ret = sctp_kmemdup_compat_sysctl_table(pn, sn); +		if (ret < 0) +			return ret; + +		ret = sctp_kmemdup_sysctl_table(pn, sn); +		if (ret < 0) +			nf_ct_kfree_compat_sysctl_table(pn); +	} else +		ret = sctp_kmemdup_sysctl_table(pn, sn); + +	return ret; +} +  static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {  	.l3proto		= PF_INET,  	.l4proto 		= IPPROTO_SCTP, @@ -663,9 +793,10 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {  	.print_tuple 		= 
sctp_print_tuple,  	.print_conntrack	= sctp_print_conntrack,  	.packet 		= sctp_packet, +	.get_timeouts		= sctp_get_timeouts,  	.new 			= sctp_new,  	.me 			= THIS_MODULE, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.to_nlattr		= sctp_to_nlattr,  	.nlattr_size		= sctp_nlattr_size,  	.from_nlattr		= nlattr_to_sctp, @@ -674,14 +805,17 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nla_policy		= nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &sctp_sysctl_table_users, -	.ctl_table_header	= &sctp_sysctl_header, -	.ctl_table		= sctp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	.ctl_compat_table	= sctp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= sctp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= sctp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_SCTP_MAX, +		.obj_size	= sizeof(unsigned int) * SCTP_CONNTRACK_MAX, +		.nla_policy	= sctp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.net_id			= &sctp_net_id, +	.init_net		= sctp_init_net,  };  static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { @@ -693,9 +827,10 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {  	.print_tuple 		= sctp_print_tuple,  	.print_conntrack	= sctp_print_conntrack,  	.packet 		= sctp_packet, +	.get_timeouts		= sctp_get_timeouts,  	.new 			= sctp_new,  	.me 			= THIS_MODULE, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.to_nlattr		= sctp_to_nlattr,  	.nlattr_size		= sctp_nlattr_size,  	.from_nlattr		= nlattr_to_sctp, @@ -703,41 +838,85 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {  	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nla_policy		= nf_ct_port_nla_policy, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= sctp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= sctp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_SCTP_MAX, +		.obj_size	= sizeof(unsigned int) * SCTP_CONNTRACK_MAX, +		.nla_policy	= sctp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &sctp_sysctl_table_users, -	.ctl_table_header	= &sctp_sysctl_header, -	.ctl_table		= sctp_sysctl_table, -#endif +	.net_id			= &sctp_net_id, +	.init_net		= sctp_init_net,  }; -static int __init nf_conntrack_proto_sctp_init(void) +static int sctp_net_init(struct net *net)  { -	int ret; +	int ret = 0; -	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4); -	if (ret) { -		pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n"); +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4); +	if (ret < 0) { +		pr_err("nf_conntrack_sctp4: pernet registration failed.\n");  		goto out;  	} -	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6); -	if (ret) { -		pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n"); +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6); +	if (ret < 0) { +		pr_err("nf_conntrack_sctp6: pernet registration failed.\n");  		goto cleanup_sctp4;  	} +	return 0; +cleanup_sctp4: +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); +out:  	return ret; +} 
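/*
 * Every tracker converted in this series ends up with the same two-level
 * shape: namespace state (timeout array plus duplicated sysctl tables)
 * is set up from pernet_operations, while the protocol is hooked into
 * nf_ct_protos[][] exactly once at module init.  A minimal skeleton of
 * that pattern for a hypothetical "foo" tracker; all foo_* names are
 * invented for illustration:
 */
static int foo_net_init(struct net *net)
{
	/* per-netns half: allocate sysctls, seed default timeouts */
	return nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_foo);
}

static void foo_net_exit(struct net *net)
{
	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_foo);
}

static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.exit = foo_net_exit,
	.id   = &foo_net_id,
	.size = sizeof(struct foo_net),
};

static int __init foo_init(void)
{
	/* global half: visible to all namespaces once registered */
	int ret = register_pernet_subsys(&foo_net_ops);

	if (ret < 0)
		return ret;
	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_foo);
	if (ret < 0)
		unregister_pernet_subsys(&foo_net_ops);
	return ret;
}
/*
 * Teardown runs in the opposite order (unregister the l4proto first,
 * then the pernet ops), mirroring nf_conntrack_proto_sctp_fini() below.
 */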
+ +static void sctp_net_exit(struct net *net) +{ +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6); +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); +} + +static struct pernet_operations sctp_net_ops = { +	.init = sctp_net_init, +	.exit = sctp_net_exit, +	.id   = &sctp_net_id, +	.size = sizeof(struct sctp_net), +}; - cleanup_sctp4: -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); - out: +static int __init nf_conntrack_proto_sctp_init(void) +{ +	int ret; + +	ret = register_pernet_subsys(&sctp_net_ops); +	if (ret < 0) +		goto out_pernet; + +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4); +	if (ret < 0) +		goto out_sctp4; + +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6); +	if (ret < 0) +		goto out_sctp6; + +	return 0; +out_sctp6: +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +out_sctp4: +	unregister_pernet_subsys(&sctp_net_ops); +out_pernet:  	return ret;  }  static void __exit nf_conntrack_proto_sctp_fini(void)  { -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6); -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +	unregister_pernet_subsys(&sctp_net_ops);  }  module_init(nf_conntrack_proto_sctp_init); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3fb2b73b24d..44d1ea32570 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,5 +1,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -25,6 +27,8 @@  #include <net/netfilter/nf_conntrack.h>  #include <net/netfilter/nf_conntrack_l4proto.h>  #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h>  #include <net/netfilter/nf_log.h>  #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>  #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> @@ -64,13 +68,7 @@ static const char *const tcp_conntrack_names[] = {  #define HOURS * 60 MINS  #define DAYS * 24 HOURS -/* RFC1122 says the R2 limit should be at least 100 seconds. -   Linux uses 15 packets as limit, which corresponds -   to ~13-30min depending on RTO. */ -static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly    =   5 MINS; -static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly =   5 MINS; - -static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { +static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {  	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,  	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,  	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS, @@ -80,6 +78,11 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = {  	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,  	[TCP_CONNTRACK_CLOSE]		= 10 SECS,  	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS, +/* RFC1122 says the R2 limit should be at least 100 seconds. +   Linux uses 15 packets as limit, which corresponds +   to ~13-30min depending on RTO. 
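+   The 5 MINS defaults below stay above that floor while still
+   letting stuck, endlessly retransmitting flows time out.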
*/ +	[TCP_CONNTRACK_RETRANS]		= 5 MINS, +	[TCP_CONNTRACK_UNACK]		= 5 MINS,  };  #define sNO TCP_CONNTRACK_NONE @@ -159,21 +162,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {   *	sCL -> sSS   */  /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/ -/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },  /*   *	sNO -> sIV	Too late and no reason to do anything   *	sSS -> sIV	Client can't send SYN and then SYN/ACK   *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open - *	sSR -> sIG - *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state - *			are errors. Receiver will reply with RST - *			and close the connection. - *			Or we are not in sync and hold a dead connection. - *	sFW -> sIG - *	sCW -> sIG - *	sLA -> sIG - *	sTW -> sIG - *	sCL -> sIG + *	sSR -> sSR	Late retransmitted SYN/ACK in simultaneous open + *	sES -> sIV	Invalid SYN/ACK packets sent by the client + *	sFW -> sIV + *	sCW -> sIV + *	sLA -> sIV + *	sTW -> sIV + *	sCL -> sIV   */  /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/  /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, @@ -227,11 +227,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {   *	sCL -> sIV   */  /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },  /*   *	sSS -> sSR	Standard open.   *	sS2 -> sSR	Simultaneous open - *	sSR -> sSR	Retransmitted SYN/ACK. + *	sSR -> sIG	Retransmitted SYN/ACK, ignore it.   *	sES -> sIG	Late retransmitted SYN/ACK?   *	sFW -> sIG	Might be SYN/ACK answering ignored SYN   *	sCW -> sIG @@ -271,6 +271,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {  	}  }; +static inline struct nf_tcp_net *tcp_pernet(struct net *net) +{ +	return &net->ct.nf_ct_proto.tcp; +} +  static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,  			     struct nf_conntrack_tuple *tuple)  { @@ -409,7 +414,7 @@ static void tcp_options(const struct sk_buff *skb,  			if (opsize < 2) /* "silly options" */  				return;  			if (opsize > length) -				break;	/* don't parse partial options */ +				return;	/* don't parse partial options */  			if (opcode == TCPOPT_SACK_PERM  			    && opsize == TCPOLEN_SACK_PERM) @@ -447,7 +452,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,  	BUG_ON(ptr == NULL);  	/* Fast path for timestamp-only option */ -	if (length == TCPOLEN_TSTAMP_ALIGNED*4 +	if (length == TCPOLEN_TSTAMP_ALIGNED  	    && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)  				       | (TCPOPT_NOP << 16)  				       | (TCPOPT_TIMESTAMP << 8) @@ -469,7 +474,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,  			if (opsize < 2) /* "silly options" */  				return;  			if (opsize > length) -				break;	/* don't parse partial options */ +				return;	/* don't parse partial options */  			if (opcode == TCPOPT_SACK  			    && opsize >= (TCPOLEN_SACK_BASE @@ -492,21 +497,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,  	}  } -#ifdef CONFIG_NF_NAT_NEEDED -static inline s16 nat_offset(const struct nf_conn *ct, -			     enum ip_conntrack_dir dir, -			     u32 seq) -{ -	typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); - -	return get_offset != NULL ? get_offset(ct, dir, seq) : 0; -} -#define NAT_OFFSET(pf, ct, dir, seq) \ -	(pf == NFPROTO_IPV4 ? 
nat_offset(ct, dir, seq) : 0) -#else -#define NAT_OFFSET(pf, ct, dir, seq)	0 -#endif -  static bool tcp_in_window(const struct nf_conn *ct,  			  struct ip_ct_tcp *state,  			  enum ip_conntrack_dir dir, @@ -517,12 +507,13 @@ static bool tcp_in_window(const struct nf_conn *ct,  			  u_int8_t pf)  {  	struct net *net = nf_ct_net(ct); +	struct nf_tcp_net *tn = tcp_pernet(net);  	struct ip_ct_tcp_state *sender = &state->seen[dir];  	struct ip_ct_tcp_state *receiver = &state->seen[!dir];  	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;  	__u32 seq, ack, sack, end, win, swin; -	s16 receiver_offset; -	bool res; +	s32 receiver_offset; +	bool res, in_recv_win;  	/*  	 * Get the required data from the packet. @@ -536,7 +527,7 @@ static bool tcp_in_window(const struct nf_conn *ct,  		tcp_sack(skb, dataoff, tcph, &sack);  	/* Take into account NAT sequence number mangling */ -	receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1); +	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);  	ack -= receiver_offset;  	sack -= receiver_offset; @@ -585,8 +576,8 @@ static bool tcp_in_window(const struct nf_conn *ct,  			 * Let's try to use the data from the packet.  			 */  			sender->td_end = end; -			win <<= sender->td_scale; -			sender->td_maxwin = (win == 0 ? 1 : win); +			swin = win << sender->td_scale; +			sender->td_maxwin = (swin == 0 ? 1 : swin);  			sender->td_maxend = end + sender->td_maxwin;  			/*  			 * We haven't seen traffic in the other direction yet @@ -628,15 +619,9 @@ static bool tcp_in_window(const struct nf_conn *ct,  		ack = sack = receiver->td_end;  	} -	if (seq == end -	    && (!tcph->rst -		|| (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) +	if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)  		/* -		 * Packets contains no data: we assume it is valid -		 * and check the ack value only. -		 * However RST segments are always validated by their -		 * SEQ number, except when seq == 0 (reset sent answering -		 * SYN. +		 * RST sent answering SYN.  		 */  		seq = end = sender->td_end; @@ -651,14 +636,18 @@ static bool tcp_in_window(const struct nf_conn *ct,  		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,  		 receiver->td_scale); +	/* Is the ending sequence in the receive window (if available)? */ +	in_recv_win = !receiver->td_maxwin || +		      after(end, sender->td_end - receiver->td_maxwin - 1); +  	pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",  		 before(seq, sender->td_maxend + 1), -		 after(end, sender->td_end - receiver->td_maxwin - 1), +		 (in_recv_win ? 1 : 0),  		 before(sack, receiver->td_end + 1),  		 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));  	if (before(seq, sender->td_maxend + 1) && -	    after(end, sender->td_end - receiver->td_maxwin - 1) && +	    in_recv_win &&  	    before(sack, receiver->td_end + 1) &&  	    after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {  		/* @@ -721,13 +710,13 @@ static bool tcp_in_window(const struct nf_conn *ct,  	} else {  		res = false;  		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || -		    nf_ct_tcp_be_liberal) +		    tn->tcp_be_liberal)  			res = true;  		if (!res && LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  			"nf_ct_tcp: %s ",  			before(seq, sender->td_maxend + 1) ? -			after(end, sender->td_end - receiver->td_maxwin - 1) ? +			in_recv_win ?  			before(sack, receiver->td_end + 1) ?  			after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? 
"BUG"  			: "ACK is under the lower bound (possible overly delayed ACK)" @@ -776,7 +765,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,  	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);  	if (th == NULL) {  		if (LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				"nf_ct_tcp: short packet ");  		return -NF_ACCEPT;  	} @@ -784,7 +773,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,  	/* Not whole TCP header or malformed packet */  	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {  		if (LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				"nf_ct_tcp: truncated/malformed packet ");  		return -NF_ACCEPT;  	} @@ -797,7 +786,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,  	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&  	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {  		if (LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				  "nf_ct_tcp: bad TCP checksum ");  		return -NF_ACCEPT;  	} @@ -806,7 +795,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,  	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));  	if (!tcp_valid_flags[tcpflags]) {  		if (LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				  "nf_ct_tcp: invalid TCP flag combination ");  		return -NF_ACCEPT;  	} @@ -814,15 +803,22 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,  	return NF_ACCEPT;  } +static unsigned int *tcp_get_timeouts(struct net *net) +{ +	return tcp_pernet(net)->timeouts; +} +  /* Returns verdict for packet, or -1 for invalid. */  static int tcp_packet(struct nf_conn *ct,  		      const struct sk_buff *skb,  		      unsigned int dataoff,  		      enum ip_conntrack_info ctinfo,  		      u_int8_t pf, -		      unsigned int hooknum) +		      unsigned int hooknum, +		      unsigned int *timeouts)  {  	struct net *net = nf_ct_net(ct); +	struct nf_tcp_net *tn = tcp_pernet(net);  	struct nf_conntrack_tuple *tuple;  	enum tcp_conntrack new_state, old_state;  	enum ip_conntrack_dir dir; @@ -946,16 +942,32 @@ static int tcp_packet(struct nf_conn *ct,  		}  		spin_unlock_bh(&ct->lock);  		if (LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, -				  "nf_ct_tcp: invalid packet ignored "); +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, +				  "nf_ct_tcp: invalid packet ignored in " +				  "state %s ", tcp_conntrack_names[old_state]);  		return NF_ACCEPT;  	case TCP_CONNTRACK_MAX: +		/* Special case for SYN proxy: when the SYN to the server or +		 * the SYN/ACK from the server is lost, the client may transmit +		 * a keep-alive packet while in SYN_SENT state. This needs to +		 * be associated with the original conntrack entry in order to +		 * generate a new SYN with the correct sequence number. 
+		 */ +		if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && +		    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && +		    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && +		    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { +			pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); +			spin_unlock_bh(&ct->lock); +			return NF_ACCEPT; +		} +  		/* Invalid packet */  		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",  			 dir, get_conntrack_index(th), old_state);  		spin_unlock_bh(&ct->lock);  		if (LOG_INVALID(net, IPPROTO_TCP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				  "nf_ct_tcp: invalid state ");  		return -NF_ACCEPT;  	case TCP_CONNTRACK_CLOSE: @@ -965,8 +977,8 @@ static int tcp_packet(struct nf_conn *ct,  			/* Invalid RST  */  			spin_unlock_bh(&ct->lock);  			if (LOG_INVALID(net, IPPROTO_TCP)) -				nf_log_packet(pf, 0, skb, NULL, NULL, NULL, -					  "nf_ct_tcp: invalid RST "); +				nf_log_packet(net, pf, 0, skb, NULL, NULL, +					      NULL, "nf_ct_tcp: invalid RST ");  			return -NF_ACCEPT;  		}  		if (index == TCP_RST_SET @@ -1014,15 +1026,15 @@ static int tcp_packet(struct nf_conn *ct,  	    && new_state == TCP_CONNTRACK_FIN_WAIT)  		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; -	if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && -	    tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans) -		timeout = nf_ct_tcp_timeout_max_retrans; +	if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && +	    timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) +		timeout = timeouts[TCP_CONNTRACK_RETRANS];  	else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &  		 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && -		 tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged) -		timeout = nf_ct_tcp_timeout_unacknowledged; +		 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) +		timeout = timeouts[TCP_CONNTRACK_UNACK];  	else -		timeout = tcp_timeouts[new_state]; +		timeout = timeouts[new_state];  	spin_unlock_bh(&ct->lock);  	if (new_state != old_state) @@ -1037,6 +1049,12 @@ static int tcp_packet(struct nf_conn *ct,  			nf_ct_kill_acct(ct, ctinfo, skb);  			return NF_ACCEPT;  		} +		/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection +		 * pickup with loose=1. Avoid large ESTABLISHED timeout. +		 */ +		if (new_state == TCP_CONNTRACK_ESTABLISHED && +		    timeout > timeouts[TCP_CONNTRACK_UNACK]) +			timeout = timeouts[TCP_CONNTRACK_UNACK];  	} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)  		   && (old_state == TCP_CONNTRACK_SYN_RECV  		       || old_state == TCP_CONNTRACK_ESTABLISHED) @@ -1054,11 +1072,13 @@ static int tcp_packet(struct nf_conn *ct,  /* Called when a new connection for this protocol found. 
*/  static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, -		    unsigned int dataoff) +		    unsigned int dataoff, unsigned int *timeouts)  {  	enum tcp_conntrack new_state;  	const struct tcphdr *th;  	struct tcphdr _tcph; +	struct net *net = nf_ct_net(ct); +	struct nf_tcp_net *tn = tcp_pernet(net);  	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];  	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; @@ -1066,9 +1086,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,  	BUG_ON(th == NULL);  	/* Don't need lock here: this conntrack not in circulation yet */ -	new_state -		= tcp_conntracks[0][get_conntrack_index(th)] -		[TCP_CONNTRACK_NONE]; +	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];  	/* Invalid: delete conntrack */  	if (new_state >= TCP_CONNTRACK_MAX) { @@ -1077,6 +1095,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,  	}  	if (new_state == TCP_CONNTRACK_SYN_SENT) { +		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));  		/* SYN packet */  		ct->proto.tcp.seen[0].td_end =  			segment_seq_plus_len(ntohl(th->seq), skb->len, @@ -1088,11 +1107,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,  			ct->proto.tcp.seen[0].td_end;  		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); -		ct->proto.tcp.seen[1].flags = 0; -	} else if (nf_ct_tcp_loose == 0) { +	} else if (tn->tcp_loose == 0) {  		/* Don't try to pick up connections. */  		return false;  	} else { +		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));  		/*  		 * We are in the middle of a connection,  		 * its history is lost for us. @@ -1107,7 +1126,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,  		ct->proto.tcp.seen[0].td_maxend =  			ct->proto.tcp.seen[0].td_end +  			ct->proto.tcp.seen[0].td_maxwin; -		ct->proto.tcp.seen[0].td_scale = 0;  		/* We assume SACK and liberal window checking to handle  		 * window scaling */ @@ -1116,13 +1134,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,  					      IP_CT_TCP_FLAG_BE_LIBERAL;  	} -	ct->proto.tcp.seen[1].td_end = 0; -	ct->proto.tcp.seen[1].td_maxend = 0; -	ct->proto.tcp.seen[1].td_maxwin = 0; -	ct->proto.tcp.seen[1].td_scale = 0; -  	/* tcp_packet will set them */ -	ct->proto.tcp.state = TCP_CONNTRACK_NONE;  	ct->proto.tcp.last_index = TCP_NONE_SET;  	pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " @@ -1134,7 +1146,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,  	return true;  } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  #include <linux/netfilter/nfnetlink.h>  #include <linux/netfilter/nfnetlink_conntrack.h> @@ -1150,21 +1162,22 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,  	if (!nest_parms)  		goto nla_put_failure; -	NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state); - -	NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, -		   ct->proto.tcp.seen[0].td_scale); - -	NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, -		   ct->proto.tcp.seen[1].td_scale); +	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || +	    nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, +		       ct->proto.tcp.seen[0].td_scale) || +	    nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, +		       ct->proto.tcp.seen[1].td_scale)) +		goto nla_put_failure;  	tmp.flags = ct->proto.tcp.seen[0].flags; -	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, -		sizeof(struct 
nf_ct_tcp_flags), &tmp); +	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, +		    sizeof(struct nf_ct_tcp_flags), &tmp)) +		goto nla_put_failure;  	tmp.flags = ct->proto.tcp.seen[1].flags; -	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, -		sizeof(struct nf_ct_tcp_flags), &tmp); +	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, +		    sizeof(struct nf_ct_tcp_flags), &tmp)) +		goto nla_put_failure;  	spin_unlock_bh(&ct->lock);  	nla_nest_end(skb, nest_parms); @@ -1247,97 +1260,194 @@ static int tcp_nlattr_tuple_size(void)  }  #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], +				     struct net *net, void *data) +{ +	unsigned int *timeouts = data; +	struct nf_tcp_net *tn = tcp_pernet(net); +	int i; + +	/* set default TCP timeouts. */ +	for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) +		timeouts[i] = tn->timeouts[i]; + +	if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { +		timeouts[TCP_CONNTRACK_SYN_SENT] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { +		timeouts[TCP_CONNTRACK_SYN_RECV] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { +		timeouts[TCP_CONNTRACK_ESTABLISHED] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { +		timeouts[TCP_CONNTRACK_FIN_WAIT] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { +		timeouts[TCP_CONNTRACK_CLOSE_WAIT] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { +		timeouts[TCP_CONNTRACK_LAST_ACK] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { +		timeouts[TCP_CONNTRACK_TIME_WAIT] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_CLOSE]) { +		timeouts[TCP_CONNTRACK_CLOSE] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { +		timeouts[TCP_CONNTRACK_SYN_SENT2] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_RETRANS]) { +		timeouts[TCP_CONNTRACK_RETRANS] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; +	} +	if (tb[CTA_TIMEOUT_TCP_UNACK]) { +		timeouts[TCP_CONNTRACK_UNACK] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; +	} +	return 0; +} + +static int +tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +	const unsigned int *timeouts = data; + +	if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, +			htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, +			 htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, +			 htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, +			 htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, +			 htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, +			 htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, +			 htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, +			 htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, +			 
htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, +			 htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, +			 htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { +	[CTA_TIMEOUT_TCP_SYN_SENT]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_SYN_RECV]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_ESTABLISHED]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_FIN_WAIT]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_CLOSE_WAIT]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_LAST_ACK]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_TIME_WAIT]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_CLOSE]		= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_SYN_SENT2]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +  #ifdef CONFIG_SYSCTL -static unsigned int tcp_sysctl_table_users; -static struct ctl_table_header *tcp_sysctl_header;  static struct ctl_table tcp_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_tcp_timeout_syn_sent", -		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_syn_recv", -		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_established", -		.data		= &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_fin_wait", -		.data		= &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_close_wait", -		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_last_ack", -		.data		= &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_time_wait", -		.data		= &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_close", -		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_max_retrans", -		.data		= &nf_ct_tcp_timeout_max_retrans,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_timeout_unacknowledged", -		.data		= &nf_ct_tcp_timeout_unacknowledged,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_tcp_loose", -		.data		= &nf_ct_tcp_loose,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname       = "nf_conntrack_tcp_be_liberal", -		.data           = &nf_ct_tcp_be_liberal,  		
.maxlen         = sizeof(unsigned int),  		.mode           = 0644,  		.proc_handler   = proc_dointvec,  	},  	{  		.procname	= "nf_conntrack_tcp_max_retrans", -		.data		= &nf_ct_tcp_max_retrans,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec, @@ -1349,91 +1459,78 @@ static struct ctl_table tcp_sysctl_table[] = {  static struct ctl_table tcp_compat_sysctl_table[] = {  	{  		.procname	= "ip_conntrack_tcp_timeout_syn_sent", -		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_syn_sent2", -		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_syn_recv", -		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_established", -		.data		= &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_fin_wait", -		.data		= &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_close_wait", -		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_last_ack", -		.data		= &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_time_wait", -		.data		= &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_close", -		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE],  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_timeout_max_retrans", -		.data		= &nf_ct_tcp_timeout_max_retrans,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_tcp_loose", -		.data		= &nf_ct_tcp_loose,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "ip_conntrack_tcp_be_liberal", -		.data		= &nf_ct_tcp_be_liberal,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "ip_conntrack_tcp_max_retrans", -		.data		= &nf_ct_tcp_max_retrans,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec, @@ -1443,6 +1540,101 @@ static struct ctl_table tcp_compat_sysctl_table[] = {  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif /* CONFIG_SYSCTL */ +static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, +				    struct nf_tcp_net *tn) +{ +#ifdef CONFIG_SYSCTL +	if (pn->ctl_table) +		return 0; + +	pn->ctl_table = kmemdup(tcp_sysctl_table, +				sizeof(tcp_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; + +	pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; +	pn->ctl_table[1].data = 
&tn->timeouts[TCP_CONNTRACK_SYN_RECV]; +	pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; +	pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; +	pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; +	pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; +	pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; +	pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; +	pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; +	pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK]; +	pn->ctl_table[10].data = &tn->tcp_loose; +	pn->ctl_table[11].data = &tn->tcp_be_liberal; +	pn->ctl_table[12].data = &tn->tcp_max_retrans; +#endif +	return 0; +} + +static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, +					   struct nf_tcp_net *tn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +	pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, +				       sizeof(tcp_compat_sysctl_table), +				       GFP_KERNEL); +	if (!pn->ctl_compat_table) +		return -ENOMEM; + +	pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; +	pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2]; +	pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; +	pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; +	pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; +	pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; +	pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; +	pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; +	pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; +	pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; +	pn->ctl_compat_table[10].data = &tn->tcp_loose; +	pn->ctl_compat_table[11].data = &tn->tcp_be_liberal; +	pn->ctl_compat_table[12].data = &tn->tcp_max_retrans; +#endif +#endif +	return 0; +} + +static int tcp_init_net(struct net *net, u_int16_t proto) +{ +	int ret; +	struct nf_tcp_net *tn = tcp_pernet(net); +	struct nf_proto_net *pn = &tn->pn; + +	if (!pn->users) { +		int i; + +		for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) +			tn->timeouts[i] = tcp_timeouts[i]; + +		tn->tcp_loose = nf_ct_tcp_loose; +		tn->tcp_be_liberal = nf_ct_tcp_be_liberal; +		tn->tcp_max_retrans = nf_ct_tcp_max_retrans; +	} + +	if (proto == AF_INET) { +		ret = tcp_kmemdup_compat_sysctl_table(pn, tn); +		if (ret < 0) +			return ret; + +		ret = tcp_kmemdup_sysctl_table(pn, tn); +		if (ret < 0) +			nf_ct_kfree_compat_sysctl_table(pn); +	} else +		ret = tcp_kmemdup_sysctl_table(pn, tn); + +	return ret; +} + +static struct nf_proto_net *tcp_get_net_proto(struct net *net) +{ +	return &net->ct.nf_ct_proto.tcp.pn; +} +  struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =  {  	.l3proto		= PF_INET, @@ -1453,9 +1645,10 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =  	.print_tuple 		= tcp_print_tuple,  	.print_conntrack 	= tcp_print_conntrack,  	.packet 		= tcp_packet, +	.get_timeouts		= tcp_get_timeouts,  	.new 			= tcp_new,  	.error			= tcp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.to_nlattr		= tcp_to_nlattr,  	.nlattr_size		= tcp_nlattr_size,  	.from_nlattr		= nlattr_to_tcp, @@ -1464,14 +1657,18 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =  	.nlattr_tuple_size	= tcp_nlattr_tuple_size,  	.nla_policy		= 
nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &tcp_sysctl_table_users, -	.ctl_table_header	= &tcp_sysctl_header, -	.ctl_table		= tcp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	.ctl_compat_table	= tcp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_TCP_MAX, +		.obj_size	= sizeof(unsigned int) * +					TCP_CONNTRACK_TIMEOUT_MAX, +		.nla_policy	= tcp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.init_net		= tcp_init_net, +	.get_net_proto		= tcp_get_net_proto,  };  EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); @@ -1485,9 +1682,10 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =  	.print_tuple 		= tcp_print_tuple,  	.print_conntrack 	= tcp_print_conntrack,  	.packet 		= tcp_packet, +	.get_timeouts		= tcp_get_timeouts,  	.new 			= tcp_new,  	.error			= tcp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.to_nlattr		= tcp_to_nlattr,  	.nlattr_size		= tcp_nlattr_size,  	.from_nlattr		= nlattr_to_tcp, @@ -1496,10 +1694,17 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =  	.nlattr_tuple_size	= tcp_nlattr_tuple_size,  	.nla_policy		= nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &tcp_sysctl_table_users, -	.ctl_table_header	= &tcp_sysctl_header, -	.ctl_table		= tcp_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_TCP_MAX, +		.obj_size	= sizeof(unsigned int) * +					TCP_CONNTRACK_TIMEOUT_MAX, +		.nla_policy	= tcp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.init_net		= tcp_init_net, +	.get_net_proto		= tcp_get_net_proto,  };  EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8289088b821..9d7721cbce4 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -1,5 +1,6 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -25,8 +26,15 @@  #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>  #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ; -static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ; +static unsigned int udp_timeouts[UDP_CT_MAX] = { +	[UDP_CT_UNREPLIED]	= 30*HZ, +	[UDP_CT_REPLIED]	= 180*HZ, +}; + +static inline struct nf_udp_net *udp_pernet(struct net *net) +{ +	return &net->ct.nf_ct_proto.udp; +}  static bool udp_pkt_to_tuple(const struct sk_buff *skb,  			     unsigned int dataoff, @@ -63,30 +71,38 @@ static int udp_print_tuple(struct seq_file *s,  			  ntohs(tuple->dst.u.udp.port));  } +static unsigned int *udp_get_timeouts(struct net *net) +{ +	return udp_pernet(net)->timeouts; +} +  /* Returns verdict for packet, and may modify conntracktype */  static int udp_packet(struct nf_conn *ct,  		      const struct sk_buff *skb,  		      unsigned int dataoff,  		      
enum ip_conntrack_info ctinfo,  		      u_int8_t pf, -		      unsigned int hooknum) +		      unsigned int hooknum, +		      unsigned int *timeouts)  {  	/* If we've seen traffic both ways, this is some kind of UDP  	   stream.  Extend timeout. */  	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { -		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); +		nf_ct_refresh_acct(ct, ctinfo, skb, +				   timeouts[UDP_CT_REPLIED]);  		/* Also, more likely to be important, and not a probe */  		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))  			nf_conntrack_event_cache(IPCT_ASSURED, ct); -	} else -		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); - +	} else { +		nf_ct_refresh_acct(ct, ctinfo, skb, +				   timeouts[UDP_CT_UNREPLIED]); +	}  	return NF_ACCEPT;  }  /* Called when a new connection for this protocol found. */  static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, -		    unsigned int dataoff) +		    unsigned int dataoff, unsigned int *timeouts)  {  	return true;  } @@ -104,7 +120,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,  	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);  	if (hdr == NULL) {  		if (LOG_INVALID(net, IPPROTO_UDP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				      "nf_ct_udp: short packet ");  		return -NF_ACCEPT;  	} @@ -112,7 +128,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,  	/* Truncated/malformed packets */  	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {  		if (LOG_INVALID(net, IPPROTO_UDP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				"nf_ct_udp: truncated/malformed packet ");  		return -NF_ACCEPT;  	} @@ -128,7 +144,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,  	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&  	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {  		if (LOG_INVALID(net, IPPROTO_UDP)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				"nf_ct_udp: bad UDP checksum ");  		return -NF_ACCEPT;  	} @@ -136,20 +152,65 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,  	return NF_ACCEPT;  } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], +				     struct net *net, void *data) +{ +	unsigned int *timeouts = data; +	struct nf_udp_net *un = udp_pernet(net); + +	/* set default timeouts for UDP. 
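+	   Attributes absent from the netlink message keep the
+	   per-netns defaults copied here.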
*/ +	timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; +	timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; + +	if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { +		timeouts[UDP_CT_UNREPLIED] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; +	} +	if (tb[CTA_TIMEOUT_UDP_REPLIED]) { +		timeouts[UDP_CT_REPLIED] = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; +	} +	return 0; +} + +static int +udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +	const unsigned int *timeouts = data; + +	if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, +			 htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, +			 htonl(timeouts[UDP_CT_REPLIED] / HZ))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy +udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { +       [CTA_TIMEOUT_UDP_UNREPLIED]	= { .type = NLA_U32 }, +       [CTA_TIMEOUT_UDP_REPLIED]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +  #ifdef CONFIG_SYSCTL -static unsigned int udp_sysctl_table_users; -static struct ctl_table_header *udp_sysctl_header;  static struct ctl_table udp_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_udp_timeout", -		.data		= &nf_ct_udp_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_udp_timeout_stream", -		.data		= &nf_ct_udp_timeout_stream,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -160,14 +221,12 @@ static struct ctl_table udp_sysctl_table[] = {  static struct ctl_table udp_compat_sysctl_table[] = {  	{  		.procname	= "ip_conntrack_udp_timeout", -		.data		= &nf_ct_udp_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "ip_conntrack_udp_timeout_stream", -		.data		= &nf_ct_udp_timeout_stream,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -177,6 +236,73 @@ static struct ctl_table udp_compat_sysctl_table[] = {  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif /* CONFIG_SYSCTL */ +static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, +				    struct nf_udp_net *un) +{ +#ifdef CONFIG_SYSCTL +	if (pn->ctl_table) +		return 0; +	pn->ctl_table = kmemdup(udp_sysctl_table, +				sizeof(udp_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; +	pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; +	pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED]; +#endif +	return 0; +} + +static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, +					   struct nf_udp_net *un) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +	pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table, +				       sizeof(udp_compat_sysctl_table), +				       GFP_KERNEL); +	if (!pn->ctl_compat_table) +		return -ENOMEM; + +	pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; +	pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED]; +#endif +#endif +	return 0; +} + +static int udp_init_net(struct net *net, u_int16_t proto) +{ +	int ret; +	struct nf_udp_net *un = udp_pernet(net); +	struct nf_proto_net *pn = &un->pn; + +	if (!pn->users) { +		int i; + +		for (i = 0; i < UDP_CT_MAX; i++) +			un->timeouts[i] = udp_timeouts[i]; +	} + +	if (proto == AF_INET) { +		ret = udp_kmemdup_compat_sysctl_table(pn, un); +		if (ret < 0) +			return ret; + +		ret = 
udp_kmemdup_sysctl_table(pn, un); +		if (ret < 0) +			nf_ct_kfree_compat_sysctl_table(pn); +	} else +		ret = udp_kmemdup_sysctl_table(pn, un); + +	return ret; +} + +static struct nf_proto_net *udp_get_net_proto(struct net *net) +{ +	return &net->ct.nf_ct_proto.udp.pn; +} +  struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =  {  	.l3proto		= PF_INET, @@ -186,22 +312,26 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =  	.invert_tuple		= udp_invert_tuple,  	.print_tuple		= udp_print_tuple,  	.packet			= udp_packet, +	.get_timeouts		= udp_get_timeouts,  	.new			= udp_new,  	.error			= udp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,  	.nla_policy		= nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &udp_sysctl_table_users, -	.ctl_table_header	= &udp_sysctl_header, -	.ctl_table		= udp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	.ctl_compat_table	= udp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= udp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= udp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_UDP_MAX, +		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, +		.nla_policy	= udp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.init_net		= udp_init_net, +	.get_net_proto		= udp_get_net_proto,  };  EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); @@ -214,18 +344,25 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =  	.invert_tuple		= udp_invert_tuple,  	.print_tuple		= udp_print_tuple,  	.packet			= udp_packet, +	.get_timeouts		= udp_get_timeouts,  	.new			= udp_new,  	.error			= udp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,  	.nla_policy		= nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &udp_sysctl_table_users, -	.ctl_table_header	= &udp_sysctl_header, -	.ctl_table		= udp_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= udp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= udp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_UDP_MAX, +		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, +		.nla_policy	= udp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.init_net		= udp_init_net, +	.get_net_proto		= udp_get_net_proto,  };  EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 263b5a72588..2750e6c69f8 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -24,8 +24,27 @@  #include <net/netfilter/nf_conntrack_ecache.h>  #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_udplite_timeout __read_mostly = 30*HZ; -static unsigned int nf_ct_udplite_timeout_stream __read_mostly = 180*HZ; +enum udplite_conntrack { +	UDPLITE_CT_UNREPLIED, +	UDPLITE_CT_REPLIED, +	UDPLITE_CT_MAX +}; + +static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { +	[UDPLITE_CT_UNREPLIED]	= 30*HZ, +	
[UDPLITE_CT_REPLIED]	= 180*HZ, +}; + +static int udplite_net_id __read_mostly; +struct udplite_net { +	struct nf_proto_net pn; +	unsigned int timeouts[UDPLITE_CT_MAX]; +}; + +static inline struct udplite_net *udplite_pernet(struct net *net) +{ +	return net_generic(net, udplite_net_id); +}  static bool udplite_pkt_to_tuple(const struct sk_buff *skb,  				 unsigned int dataoff, @@ -60,31 +79,38 @@ static int udplite_print_tuple(struct seq_file *s,  			  ntohs(tuple->dst.u.udp.port));  } +static unsigned int *udplite_get_timeouts(struct net *net) +{ +	return udplite_pernet(net)->timeouts; +} +  /* Returns verdict for packet, and may modify conntracktype */  static int udplite_packet(struct nf_conn *ct,  			  const struct sk_buff *skb,  			  unsigned int dataoff,  			  enum ip_conntrack_info ctinfo,  			  u_int8_t pf, -			  unsigned int hooknum) +			  unsigned int hooknum, +			  unsigned int *timeouts)  {  	/* If we've seen traffic both ways, this is some kind of UDP  	   stream.  Extend timeout. */  	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {  		nf_ct_refresh_acct(ct, ctinfo, skb, -				   nf_ct_udplite_timeout_stream); +				   timeouts[UDPLITE_CT_REPLIED]);  		/* Also, more likely to be important, and not a probe */  		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))  			nf_conntrack_event_cache(IPCT_ASSURED, ct); -	} else -		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout); - +	} else { +		nf_ct_refresh_acct(ct, ctinfo, skb, +				   timeouts[UDPLITE_CT_UNREPLIED]); +	}  	return NF_ACCEPT;  }  /* Called when a new connection for this protocol found. */  static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, -			unsigned int dataoff) +			unsigned int dataoff, unsigned int *timeouts)  {  	return true;  } @@ -105,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,  	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);  	if (hdr == NULL) {  		if (LOG_INVALID(net, IPPROTO_UDPLITE)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				      "nf_ct_udplite: short packet ");  		return -NF_ACCEPT;  	} @@ -115,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,  		cscov = udplen;  	else if (cscov < sizeof(*hdr) || cscov > udplen) {  		if (LOG_INVALID(net, IPPROTO_UDPLITE)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				"nf_ct_udplite: invalid checksum coverage ");  		return -NF_ACCEPT;  	} @@ -123,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,  	/* UDPLITE mandates checksums */  	if (!hdr->check) {  		if (LOG_INVALID(net, IPPROTO_UDPLITE)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				      "nf_ct_udplite: checksum missing ");  		return -NF_ACCEPT;  	} @@ -133,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,  	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,  	    			pf)) {  		if (LOG_INVALID(net, IPPROTO_UDPLITE)) -			nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,  				      "nf_ct_udplite: bad UDPLite checksum ");  		return -NF_ACCEPT;  	} @@ -141,20 +167,65 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,  	return NF_ACCEPT;  } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int 
udplite_timeout_nlattr_to_obj(struct nlattr *tb[], +					 struct net *net, void *data) +{ +	unsigned int *timeouts = data; +	struct udplite_net *un = udplite_pernet(net); + +	/* set default timeouts for UDPlite. */ +	timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED]; +	timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED]; + +	if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) { +		timeouts[UDPLITE_CT_UNREPLIED] = +		  ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ; +	} +	if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) { +		timeouts[UDPLITE_CT_REPLIED] = +		  ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ; +	} +	return 0; +} + +static int +udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +	const unsigned int *timeouts = data; + +	if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED, +			 htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) || +	    nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED, +			 htonl(timeouts[UDPLITE_CT_REPLIED] / HZ))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy +udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = { +	[CTA_TIMEOUT_UDPLITE_UNREPLIED]	= { .type = NLA_U32 }, +	[CTA_TIMEOUT_UDPLITE_REPLIED]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +  #ifdef CONFIG_SYSCTL -static unsigned int udplite_sysctl_table_users; -static struct ctl_table_header *udplite_sysctl_header;  static struct ctl_table udplite_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_udplite_timeout", -		.data		= &nf_ct_udplite_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies,  	},  	{  		.procname	= "nf_conntrack_udplite_timeout_stream", -		.data		= &nf_ct_udplite_timeout_stream,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -163,6 +234,40 @@ static struct ctl_table udplite_sysctl_table[] = {  };  #endif /* CONFIG_SYSCTL */ +static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn, +					struct udplite_net *un) +{ +#ifdef CONFIG_SYSCTL +	if (pn->ctl_table) +		return 0; + +	pn->ctl_table = kmemdup(udplite_sysctl_table, +				sizeof(udplite_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; + +	pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED]; +	pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED]; +#endif +	return 0; +} + +static int udplite_init_net(struct net *net, u_int16_t proto) +{ +	struct udplite_net *un = udplite_pernet(net); +	struct nf_proto_net *pn = &un->pn; + +	if (!pn->users) { +		int i; + +		for (i = 0 ; i < UDPLITE_CT_MAX; i++) +			un->timeouts[i] = udplite_timeouts[i]; +	} + +	return udplite_kmemdup_sysctl_table(pn, un); +} +  static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =  {  	.l3proto		= PF_INET, @@ -172,19 +277,27 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =  	.invert_tuple		= udplite_invert_tuple,  	.print_tuple		= udplite_print_tuple,  	.packet			= udplite_packet, +	.get_timeouts		= udplite_get_timeouts,  	.new			= udplite_new,  	.error			= udplite_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,  	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nla_policy		= nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= 
&udplite_sysctl_table_users, -	.ctl_table_header	= &udplite_sysctl_header, -	.ctl_table		= udplite_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= udplite_timeout_nlattr_to_obj, +		.obj_to_nlattr	= udplite_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_UDPLITE_MAX, +		.obj_size	= sizeof(unsigned int) * +					CTA_TIMEOUT_UDPLITE_MAX, +		.nla_policy	= udplite_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.net_id			= &udplite_net_id, +	.init_net		= udplite_init_net,  };  static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = @@ -196,42 +309,94 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =  	.invert_tuple		= udplite_invert_tuple,  	.print_tuple		= udplite_print_tuple,  	.packet			= udplite_packet, +	.get_timeouts		= udplite_get_timeouts,  	.new			= udplite_new,  	.error			= udplite_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)  	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,  	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,  	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,  	.nla_policy		= nf_ct_port_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_users	= &udplite_sysctl_table_users, -	.ctl_table_header	= &udplite_sysctl_header, -	.ctl_table		= udplite_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= udplite_timeout_nlattr_to_obj, +		.obj_to_nlattr	= udplite_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_UDPLITE_MAX, +		.obj_size	= sizeof(unsigned int) * +					CTA_TIMEOUT_UDPLITE_MAX, +		.nla_policy	= udplite_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.net_id			= &udplite_net_id, +	.init_net		= udplite_init_net, +}; + +static int udplite_net_init(struct net *net) +{ +	int ret = 0; + +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4); +	if (ret < 0) { +		pr_err("nf_conntrack_udplite4: pernet registration failed.\n"); +		goto out; +	} +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6); +	if (ret < 0) { +		pr_err("nf_conntrack_udplite6: pernet registration failed.\n"); +		goto cleanup_udplite4; +	} +	return 0; + +cleanup_udplite4: +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); +out: +	return ret; +} + +static void udplite_net_exit(struct net *net) +{ +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6); +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); +} + +static struct pernet_operations udplite_net_ops = { +	.init = udplite_net_init, +	.exit = udplite_net_exit, +	.id   = &udplite_net_id, +	.size = sizeof(struct udplite_net),  };  static int __init nf_conntrack_proto_udplite_init(void)  { -	int err; +	int ret; + +	ret = register_pernet_subsys(&udplite_net_ops); +	if (ret < 0) +		goto out_pernet; + +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4); +	if (ret < 0) +		goto out_udplite4; + +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6); +	if (ret < 0) +		goto out_udplite6; -	err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4); -	if (err < 0) -		goto err1; -	err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6); -	if (err < 0) -		goto err2;  	return 0; -err2: -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); -err1: -	return err; +out_udplite6: +	
nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +out_udplite4: +	unregister_pernet_subsys(&udplite_net_ops); +out_pernet: +	return ret;  }  static void __exit nf_conntrack_proto_udplite_exit(void)  { -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6); -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +	unregister_pernet_subsys(&udplite_net_ops);  }  module_init(nf_conntrack_proto_udplite_init); diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index d9e27734b2a..4a2134fd3fc 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -69,16 +69,15 @@ static int help(struct sk_buff *skb,  	void *sb_ptr;  	int ret = NF_ACCEPT;  	int dir = CTINFO2DIR(ctinfo); -	struct nf_ct_sane_master *ct_sane_info; +	struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);  	struct nf_conntrack_expect *exp;  	struct nf_conntrack_tuple *tuple;  	struct sane_request *req;  	struct sane_reply_net_start *reply; -	ct_sane_info = &nfct_help(ct)->help.ct_sane_info;  	/* Until there's been traffic both ways, don't look in packets. */  	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) +	    ctinfo != IP_CT_ESTABLISHED_REPLY)  		return NF_ACCEPT;  	/* Not a full tcp header? */ @@ -139,6 +138,7 @@ static int help(struct sk_buff *skb,  	exp = nf_ct_expect_alloc(ct);  	if (exp == NULL) { +		nf_ct_helper_log(skb, ct, "cannot alloc expectation");  		ret = NF_DROP;  		goto out;  	} @@ -152,8 +152,10 @@ static int help(struct sk_buff *skb,  	nf_ct_dump_tuple(&exp->tuple);  	/* Can't expect this?  Best to drop packet now. */ -	if (nf_ct_expect_related(exp) != 0) +	if (nf_ct_expect_related(exp) != 0) { +		nf_ct_helper_log(skb, ct, "cannot add expectation");  		ret = NF_DROP; +	}  	nf_ct_expect_put(exp); @@ -163,7 +165,6 @@ out:  }  static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; -static char sane_names[MAX_PORTS][2][sizeof("sane-65535")] __read_mostly;  static const struct nf_conntrack_expect_policy sane_exp_policy = {  	.max_expected	= 1, @@ -190,7 +191,6 @@ static void nf_conntrack_sane_fini(void)  static int __init nf_conntrack_sane_init(void)  {  	int i, j = -1, ret = 0; -	char *tmpname;  	sane_buffer = kmalloc(65536, GFP_KERNEL);  	if (!sane_buffer) @@ -205,17 +205,16 @@ static int __init nf_conntrack_sane_init(void)  		sane[i][0].tuple.src.l3num = PF_INET;  		sane[i][1].tuple.src.l3num = PF_INET6;  		for (j = 0; j < 2; j++) { +			sane[i][j].data_len = sizeof(struct nf_ct_sane_master);  			sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);  			sane[i][j].tuple.dst.protonum = IPPROTO_TCP;  			sane[i][j].expect_policy = &sane_exp_policy;  			sane[i][j].me = THIS_MODULE;  			sane[i][j].help = help; -			tmpname = &sane_names[i][j][0];  			if (ports[i] == SANE_PORT) -				sprintf(tmpname, "sane"); +				sprintf(sane[i][j].name, "sane");  			else -				sprintf(tmpname, "sane-%d", ports[i]); -			sane[i][j].name = tmpname; +				sprintf(sane[i][j].name, "sane-%d", ports[i]);  			pr_debug("nf_ct_sane: registering helper for pf: %d "  				 "port: %d\n", diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c new file mode 100644 index 00000000000..f6e2ae91a80 --- /dev/null +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -0,0 +1,243 @@ +#include <linux/types.h> +#include <linux/netfilter.h> +#include <net/tcp.h> + +#include 
<net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> + +int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, +		      s32 off) +{ +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	struct nf_conn_seqadj *seqadj; +	struct nf_ct_seqadj *this_way; + +	if (off == 0) +		return 0; + +	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + +	seqadj = nfct_seqadj(ct); +	this_way = &seqadj->seq[dir]; +	this_way->offset_before	 = off; +	this_way->offset_after	 = off; +	return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); + +int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, +		     __be32 seq, s32 off) +{ +	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	struct nf_ct_seqadj *this_way; + +	if (off == 0) +		return 0; + +	if (unlikely(!seqadj)) { +		WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); +		return 0; +	} + +	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + +	spin_lock_bh(&ct->lock); +	this_way = &seqadj->seq[dir]; +	if (this_way->offset_before == this_way->offset_after || +	    before(this_way->correction_pos, ntohl(seq))) { +		this_way->correction_pos = ntohl(seq); +		this_way->offset_before	 = this_way->offset_after; +		this_way->offset_after	+= off; +	} +	spin_unlock_bh(&ct->lock); +	return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); + +void nf_ct_tcp_seqadj_set(struct sk_buff *skb, +			  struct nf_conn *ct, enum ip_conntrack_info ctinfo, +			  s32 off) +{ +	const struct tcphdr *th; + +	if (nf_ct_protonum(ct) != IPPROTO_TCP) +		return; + +	th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); +	nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); + +/* Adjust one found SACK option including checksum correction */ +static void nf_ct_sack_block_adjust(struct sk_buff *skb, +				    struct tcphdr *tcph, +				    unsigned int sackoff, +				    unsigned int sackend, +				    struct nf_ct_seqadj *seq) +{ +	while (sackoff < sackend) { +		struct tcp_sack_block_wire *sack; +		__be32 new_start_seq, new_end_seq; + +		sack = (void *)skb->data + sackoff; +		if (after(ntohl(sack->start_seq) - seq->offset_before, +			  seq->correction_pos)) +			new_start_seq = htonl(ntohl(sack->start_seq) - +					seq->offset_after); +		else +			new_start_seq = htonl(ntohl(sack->start_seq) - +					seq->offset_before); + +		if (after(ntohl(sack->end_seq) - seq->offset_before, +			  seq->correction_pos)) +			new_end_seq = htonl(ntohl(sack->end_seq) - +				      seq->offset_after); +		else +			new_end_seq = htonl(ntohl(sack->end_seq) - +				      seq->offset_before); + +		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", +			 ntohl(sack->start_seq), new_start_seq, +			 ntohl(sack->end_seq), new_end_seq); + +		inet_proto_csum_replace4(&tcph->check, skb, +					 sack->start_seq, new_start_seq, 0); +		inet_proto_csum_replace4(&tcph->check, skb, +					 sack->end_seq, new_end_seq, 0); +		sack->start_seq = new_start_seq; +		sack->end_seq = new_end_seq; +		sackoff += sizeof(*sack); +	} +} + +/* TCP SACK sequence number adjustment */ +static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, +				      unsigned int protoff, +				      struct tcphdr *tcph, +				      struct nf_conn *ct, +				      enum ip_conntrack_info ctinfo) +{ +	unsigned int dir, optoff, optend; +	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + +	optoff = protoff + sizeof(struct tcphdr); +	optend = protoff + tcph->doff * 4; + +	if 
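
nf_ct_seqadj_set() above records a correction point and two offsets per direction: segments whose sequence number lies after correction_pos get offset_after, while earlier (typically retransmitted) segments still get offset_before. A userspace sketch of that decision with illustrative types; seq_after() mirrors the kernel's wrap-safe after() comparison:

	#include <stdio.h>
	#include <stdint.h>

	struct seqadj {
		uint32_t correction_pos;
		int32_t  offset_before;
		int32_t  offset_after;
	};

	/* wrap-safe "a comes after b", like the kernel's after() */
	static int seq_after(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) > 0;
	}

	static uint32_t adjust_seq(uint32_t seq, const struct seqadj *s)
	{
		return seq + (seq_after(seq, s->correction_pos) ?
			      s->offset_after : s->offset_before);
	}

	int main(void)
	{
		struct seqadj s = { .correction_pos = 1000,
				    .offset_before = 0, .offset_after = 20 };

		printf("%u -> %u\n", 990u,  adjust_seq(990, &s));	/* old offset */
		printf("%u -> %u\n", 1001u, adjust_seq(1001, &s));	/* new offset */
		return 0;
	}
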
(!skb_make_writable(skb, optend)) +		return 0; + +	dir = CTINFO2DIR(ctinfo); + +	while (optoff < optend) { +		/* Usually: option, length. */ +		unsigned char *op = skb->data + optoff; + +		switch (op[0]) { +		case TCPOPT_EOL: +			return 1; +		case TCPOPT_NOP: +			optoff++; +			continue; +		default: +			/* no partial options */ +			if (optoff + 1 == optend || +			    optoff + op[1] > optend || +			    op[1] < 2) +				return 0; +			if (op[0] == TCPOPT_SACK && +			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK && +			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) +				nf_ct_sack_block_adjust(skb, tcph, optoff + 2, +							optoff+op[1], +							&seqadj->seq[!dir]); +			optoff += op[1]; +		} +	} +	return 1; +} + +/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */ +int nf_ct_seq_adjust(struct sk_buff *skb, +		     struct nf_conn *ct, enum ip_conntrack_info ctinfo, +		     unsigned int protoff) +{ +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	struct tcphdr *tcph; +	__be32 newseq, newack; +	s32 seqoff, ackoff; +	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); +	struct nf_ct_seqadj *this_way, *other_way; +	int res; + +	this_way  = &seqadj->seq[dir]; +	other_way = &seqadj->seq[!dir]; + +	if (!skb_make_writable(skb, protoff + sizeof(*tcph))) +		return 0; + +	tcph = (void *)skb->data + protoff; +	spin_lock_bh(&ct->lock); +	if (after(ntohl(tcph->seq), this_way->correction_pos)) +		seqoff = this_way->offset_after; +	else +		seqoff = this_way->offset_before; + +	if (after(ntohl(tcph->ack_seq) - other_way->offset_before, +		  other_way->correction_pos)) +		ackoff = other_way->offset_after; +	else +		ackoff = other_way->offset_before; + +	newseq = htonl(ntohl(tcph->seq) + seqoff); +	newack = htonl(ntohl(tcph->ack_seq) - ackoff); + +	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); +	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + +	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", +		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), +		 ntohl(newack)); + +	tcph->seq = newseq; +	tcph->ack_seq = newack; + +	res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); +	spin_unlock_bh(&ct->lock); + +	return res; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); + +s32 nf_ct_seq_offset(const struct nf_conn *ct, +		     enum ip_conntrack_dir dir, +		     u32 seq) +{ +	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); +	struct nf_ct_seqadj *this_way; + +	if (!seqadj) +		return 0; + +	this_way = &seqadj->seq[dir]; +	return after(seq, this_way->correction_pos) ? 
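
The option walk in nf_ct_sack_adjust() follows the classic TCP rules: EOL ends the list, NOP is a single byte, and every other option carries a length byte that must be at least 2 and must not run past the end of the header. A self-contained sketch of the same walk over a plain byte buffer:

	#include <stdio.h>
	#include <stddef.h>

	#define TCPOPT_EOL	0
	#define TCPOPT_NOP	1

	static void walk_tcp_options(const unsigned char *opt, size_t len)
	{
		size_t off = 0;

		while (off < len) {
			if (opt[off] == TCPOPT_EOL)
				return;
			if (opt[off] == TCPOPT_NOP) {
				off++;
				continue;
			}
			/* no partial options */
			if (off + 1 >= len || opt[off + 1] < 2 ||
			    off + opt[off + 1] > len)
				return;
			printf("option %u, length %u\n", opt[off], opt[off + 1]);
			off += opt[off + 1];
		}
	}

	int main(void)
	{
		/* NOP, NOP, SACK-permitted (kind 4, len 2), EOL padding */
		const unsigned char opts[] = { 1, 1, 4, 2, 0, 0, 0, 0 };

		walk_tcp_options(opts, sizeof(opts));
		return 0;
	}
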
+		 this_way->offset_after : this_way->offset_before; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_offset); + +static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { +	.len	= sizeof(struct nf_conn_seqadj), +	.align	= __alignof__(struct nf_conn_seqadj), +	.id	= NF_CT_EXT_SEQADJ, +}; + +int nf_conntrack_seqadj_init(void) +{ +	return nf_ct_extend_register(&nf_ct_seqadj_extend); +} + +void nf_conntrack_seqadj_fini(void) +{ +	nf_ct_extend_unregister(&nf_ct_seqadj_extend); +} diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index bcf47eb518e..4c3ba1c8d68 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -52,60 +52,8 @@ module_param(sip_direct_media, int, 0600);  MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling "  				   "endpoints only (default 1)"); -unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int dataoff, -				const char **dptr, -				unsigned int *datalen) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_hook); - -void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, s16 off) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook); - -unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb, -				       unsigned int dataoff, -				       const char **dptr, -				       unsigned int *datalen, -				       struct nf_conntrack_expect *exp, -				       unsigned int matchoff, -				       unsigned int matchlen) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook); - -unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int dataoff, -				     const char **dptr, -				     unsigned int *datalen, -				     unsigned int sdpoff, -				     enum sdp_header_types type, -				     enum sdp_header_types term, -				     const union nf_inet_addr *addr) -				     __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook); - -unsigned int (*nf_nat_sdp_port_hook)(struct sk_buff *skb, unsigned int dataoff, -				     const char **dptr, -				     unsigned int *datalen, -				     unsigned int matchoff, -				     unsigned int matchlen, -				     u_int16_t port) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook); - -unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb, -					unsigned int dataoff, -					const char **dptr, -					unsigned int *datalen, -					unsigned int sdpoff, -					const union nf_inet_addr *addr) -					__read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook); - -unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int dataoff, -				      const char **dptr, -				      unsigned int *datalen, -				      struct nf_conntrack_expect *rtp_exp, -				      struct nf_conntrack_expect *rtcp_exp, -				      unsigned int mediaoff, -				      unsigned int medialen, -				      union nf_inet_addr *rtp_addr) -				      __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook); +const struct nf_nat_sip_hooks *nf_nat_sip_hooks; +EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);  static int string_len(const struct nf_conn *ct, const char *dptr,  		      const char *limit, int *shift) @@ -183,12 +131,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr,  	return len + digits_len(ct, dptr, limit, shift);  } -static int parse_addr(const struct nf_conn *ct, const char *cp, -                      const char **endp, union nf_inet_addr *addr, -                      const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, +			  const char **endp, union nf_inet_addr *addr, +			  const char *limit, bool delim)  {  	const char *end; -	int ret 
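
The SIP rewrite above collapses six separately exported NAT hook pointers into a single nf_nat_sip_hooks structure, so the helper dereferences one RCU-protected pointer instead of many. A sketch of the ops-struct idiom with illustrative names; the plain pointer assignment stands in for rcu_assign_pointer()/rcu_dereference():

	#include <stdio.h>

	struct sip_hooks {
		int (*msg)(const char *buf);
		int (*expect)(const char *buf);
	};

	static const struct sip_hooks *hooks;	/* one published pointer */

	static int nat_msg(const char *buf)    { printf("NAT msg: %s\n", buf); return 1; }
	static int nat_expect(const char *buf) { printf("NAT exp: %s\n", buf); return 1; }

	static const struct sip_hooks nat_hooks = {
		.msg	= nat_msg,
		.expect	= nat_expect,
	};

	int main(void)
	{
		hooks = &nat_hooks;	/* NAT module load */
		if (hooks)		/* helper side: NULL until the module loads */
			hooks->msg("INVITE");
		return 0;
	}
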
= 0; +	int ret;  	if (!ct)  		return 0; @@ -197,16 +145,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,  	switch (nf_ct_l3num(ct)) {  	case AF_INET:  		ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); +		if (ret == 0) +			return 0;  		break;  	case AF_INET6: +		if (cp < limit && *cp == '[') +			cp++; +		else if (delim) +			return 0; +  		ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); +		if (ret == 0) +			return 0; + +		if (end < limit && *end == ']') +			end++; +		else if (delim) +			return 0;  		break;  	default:  		BUG();  	} -	if (ret == 0 || end == cp) -		return 0;  	if (endp)  		*endp = end;  	return 1; @@ -219,7 +179,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr,  	union nf_inet_addr addr;  	const char *aux = dptr; -	if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { +	if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) {  		pr_debug("ip: %s parse failed.!\n", dptr);  		return 0;  	} @@ -296,7 +256,7 @@ int ct_sip_parse_request(const struct nf_conn *ct,  		return 0;  	dptr += shift; -	if (!parse_addr(ct, dptr, &end, addr, limit)) +	if (!sip_parse_addr(ct, dptr, &end, addr, limit, true))  		return -1;  	if (end < limit && *end == ':') {  		end++; @@ -550,7 +510,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,  	if (ret == 0)  		return ret; -	if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) +	if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true))  		return -1;  	if (*c == ':') {  		c++; @@ -599,7 +559,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,  			       unsigned int dataoff, unsigned int datalen,  			       const char *name,  			       unsigned int *matchoff, unsigned int *matchlen, -			       union nf_inet_addr *addr) +			       union nf_inet_addr *addr, bool delim)  {  	const char *limit = dptr + datalen;  	const char *start, *end; @@ -613,7 +573,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,  		return 0;  	start += strlen(name); -	if (!parse_addr(ct, start, &end, addr, limit)) +	if (!sip_parse_addr(ct, start, &end, addr, limit, delim))  		return 0;  	*matchoff = start - dptr;  	*matchlen = end - start; @@ -675,6 +635,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,  	return 1;  } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, +			  const char **endp, union nf_inet_addr *addr, +			  const char *limit) +{ +	const char *end; +	int ret; + +	memset(addr, 0, sizeof(*addr)); +	switch (nf_ct_l3num(ct)) { +	case AF_INET: +		ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); +		break; +	case AF_INET6: +		ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); +		break; +	default: +		BUG(); +	} + +	if (ret == 0) +		return 0; +	if (endp) +		*endp = end; +	return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, +			const char *limit, int *shift) +{ +	union nf_inet_addr addr; +	const char *aux = dptr; + +	if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { +		pr_debug("ip: %s parse failed.!\n", dptr); +		return 0; +	} + +	return dptr - aux; +} +  /* SDP header parsing: a SDP session description contains an ordered set of   * headers, starting with a section containing general session parameters,   * optionally followed by multiple media descriptions. 
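
sip_parse_addr() above now insists that an IPv6 literal be enclosed in square brackets whenever a delimiter is required, matching SIP URI syntax such as sip:user@[2001:db8::1]:5060. A userspace sketch of the bracket handling using inet_pton(); names and buffer handling are illustrative:

	#include <stdio.h>
	#include <string.h>
	#include <arpa/inet.h>

	static int parse_v6(const char *cp, int delim, struct in6_addr *addr)
	{
		char buf[INET6_ADDRSTRLEN];
		const char *end;

		if (*cp == '[')
			cp++;
		else if (delim)
			return 0;	/* opening bracket required but missing */

		end = strchr(cp, ']');
		if (!end) {
			if (delim)
				return 0;	/* closing bracket required */
			end = cp + strlen(cp);
		}
		if ((size_t)(end - cp) >= sizeof(buf))
			return 0;
		memcpy(buf, cp, end - cp);
		buf[end - cp] = '\0';
		return inet_pton(AF_INET6, buf, addr) == 1;
	}

	int main(void)
	{
		struct in6_addr a;

		printf("bracketed: %d\n", parse_v6("[2001:db8::1]:5060", 1, &a));
		printf("bare, delimited: %d\n", parse_v6("2001:db8::1", 1, &a));
		return 0;
	}
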
@@ -684,13 +685,18 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,   * be tolerant and also accept records terminated with a single newline   * character". We handle both cases.   */ -static const struct sip_header ct_sdp_hdrs[] = { -	[SDP_HDR_VERSION]		= SDP_HDR("v=", NULL, digits_len), -	[SDP_HDR_OWNER_IP4]		= SDP_HDR("o=", "IN IP4 ", epaddr_len), -	[SDP_HDR_CONNECTION_IP4]	= SDP_HDR("c=", "IN IP4 ", epaddr_len), -	[SDP_HDR_OWNER_IP6]		= SDP_HDR("o=", "IN IP6 ", epaddr_len), -	[SDP_HDR_CONNECTION_IP6]	= SDP_HDR("c=", "IN IP6 ", epaddr_len), -	[SDP_HDR_MEDIA]			= SDP_HDR("m=", NULL, media_len), +static const struct sip_header ct_sdp_hdrs_v4[] = { +	[SDP_HDR_VERSION]	= SDP_HDR("v=", NULL, digits_len), +	[SDP_HDR_OWNER]		= SDP_HDR("o=", "IN IP4 ", sdp_addr_len), +	[SDP_HDR_CONNECTION]	= SDP_HDR("c=", "IN IP4 ", sdp_addr_len), +	[SDP_HDR_MEDIA]		= SDP_HDR("m=", NULL, media_len), +}; + +static const struct sip_header ct_sdp_hdrs_v6[] = { +	[SDP_HDR_VERSION]	= SDP_HDR("v=", NULL, digits_len), +	[SDP_HDR_OWNER]		= SDP_HDR("o=", "IN IP6 ", sdp_addr_len), +	[SDP_HDR_CONNECTION]	= SDP_HDR("c=", "IN IP6 ", sdp_addr_len), +	[SDP_HDR_MEDIA]		= SDP_HDR("m=", NULL, media_len),  };  /* Linear string search within SDP header values */ @@ -707,7 +713,7 @@ static const char *ct_sdp_header_search(const char *dptr, const char *limit,  }  /* Locate a SDP header (optionally a substring within the header value), - * optionally stopping at the first occurence of the term header, parse + * optionally stopping at the first occurrence of the term header, parse   * it and return the offset and length of the data we're interested in.   */  int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr, @@ -716,11 +722,14 @@ int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr,  			  enum sdp_header_types term,  			  unsigned int *matchoff, unsigned int *matchlen)  { -	const struct sip_header *hdr = &ct_sdp_hdrs[type]; -	const struct sip_header *thdr = &ct_sdp_hdrs[term]; +	const struct sip_header *hdrs, *hdr, *thdr;  	const char *start = dptr, *limit = dptr + datalen;  	int shift = 0; +	hdrs = nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
ct_sdp_hdrs_v4 : ct_sdp_hdrs_v6; +	hdr = &hdrs[type]; +	thdr = &hdrs[term]; +  	for (dptr += dataoff; dptr < limit; dptr++) {  		/* Find beginning of line */  		if (*dptr != '\r' && *dptr != '\n') @@ -775,8 +784,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr,  	if (ret <= 0)  		return ret; -	if (!parse_addr(ct, dptr + *matchoff, NULL, addr, -			dptr + *matchoff + *matchlen)) +	if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, +			    dptr + *matchoff + *matchlen))  		return -1;  	return 1;  } @@ -788,11 +797,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,  {  	struct nf_conn_help *help = nfct_help(ct);  	struct nf_conntrack_expect *exp; -	struct hlist_node *n, *next; +	struct hlist_node *next;  	int found = 0; -	spin_lock_bh(&nf_conntrack_lock); -	hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { +	spin_lock_bh(&nf_conntrack_expect_lock); +	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {  		if (exp->class != SIP_EXPECT_SIGNALLING ||  		    !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) ||  		    exp->tuple.dst.protonum != proto || @@ -806,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,  		found = 1;  		break;  	} -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  	return found;  } @@ -814,10 +823,10 @@ static void flush_expectations(struct nf_conn *ct, bool media)  {  	struct nf_conn_help *help = nfct_help(ct);  	struct nf_conntrack_expect *exp; -	struct hlist_node *n, *next; +	struct hlist_node *next; -	spin_lock_bh(&nf_conntrack_lock); -	hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { +	spin_lock_bh(&nf_conntrack_expect_lock); +	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {  		if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)  			continue;  		if (!del_timer(&exp->timeout)) @@ -827,10 +836,11 @@ static void flush_expectations(struct nf_conn *ct, bool media)  		if (!media)  			break;  	} -	spin_unlock_bh(&nf_conntrack_lock); +	spin_unlock_bh(&nf_conntrack_expect_lock);  } -static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, +static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, +				 unsigned int dataoff,  				 const char **dptr, unsigned int *datalen,  				 union nf_inet_addr *daddr, __be16 port,  				 enum sip_expectation_classes class, @@ -846,8 +856,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,  	int direct_rtp = 0, skip_expect = 0, ret = NF_DROP;  	u_int16_t base_port;  	__be16 rtp_port, rtcp_port; -	typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port; -	typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media; +	const struct nf_nat_sip_hooks *hooks;  	saddr = NULL;  	if (sip_direct_media) { @@ -886,34 +895,35 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,  		    exp->class != class)  			break;  #ifdef CONFIG_NF_NAT_NEEDED -		if (exp->tuple.src.l3num == AF_INET && !direct_rtp && -		    (exp->saved_ip != exp->tuple.dst.u3.ip || +		if (!direct_rtp && +		    (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) ||  		     exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&  		    ct->status & IPS_NAT_MASK) { -			daddr->ip		= exp->saved_ip; -			tuple.dst.u3.ip		= exp->saved_ip; +			*daddr			= exp->saved_addr; +			tuple.dst.u3		= exp->saved_addr;  			tuple.dst.u.udp.port	= exp->saved_proto.udp.port;  			direct_rtp = 1;  		} else  #endif  			skip_expect = 1;  	} while (!skip_expect); -	
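
set_expected_rtp_rtcp() above derives both media expectations from one signalled port: RTP conventionally uses the even port of a pair and RTCP the next odd one, so the port is rounded down to even first (base_port = ntohs(...) & ~1). The arithmetic in plain C:

	#include <stdio.h>

	int main(void)
	{
		unsigned int signalled = 30007;		/* from the SDP m= line */
		unsigned int base = signalled & ~1u;	/* round down to even */

		printf("rtp=%u rtcp=%u\n", base, base + 1);
		return 0;
	}
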
rcu_read_unlock();  	base_port = ntohs(tuple.dst.u.udp.port) & ~1;  	rtp_port = htons(base_port);  	rtcp_port = htons(base_port + 1);  	if (direct_rtp) { -		nf_nat_sdp_port = rcu_dereference(nf_nat_sdp_port_hook); -		if (nf_nat_sdp_port && -		    !nf_nat_sdp_port(skb, dataoff, dptr, datalen, +		hooks = rcu_dereference(nf_nat_sip_hooks); +		if (hooks && +		    !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen,  				     mediaoff, medialen, ntohs(rtp_port)))  			goto err1;  	} -	if (skip_expect) +	if (skip_expect) { +		rcu_read_unlock();  		return NF_ACCEPT; +	}  	rtp_exp = nf_ct_expect_alloc(ct);  	if (rtp_exp == NULL) @@ -927,10 +937,10 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,  	nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr,  			  IPPROTO_UDP, NULL, &rtcp_port); -	nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook); -	if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp) -		ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen, -				       rtp_exp, rtcp_exp, +	hooks = rcu_dereference(nf_nat_sip_hooks); +	if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp) +		ret = hooks->sdp_media(skb, protoff, dataoff, dptr, +				       datalen, rtp_exp, rtcp_exp,  				       mediaoff, medialen, daddr);  	else {  		if (nf_ct_expect_related(rtp_exp) == 0) { @@ -944,6 +954,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,  err2:  	nf_ct_expect_put(rtp_exp);  err1: +	rcu_read_unlock();  	return ret;  } @@ -970,7 +981,8 @@ static const struct sdp_media_type *sdp_media_type(const char *dptr,  	return NULL;  } -static int process_sdp(struct sk_buff *skb, unsigned int dataoff, +static int process_sdp(struct sk_buff *skb, unsigned int protoff, +		       unsigned int dataoff,  		       const char **dptr, unsigned int *datalen,  		       unsigned int cseq)  { @@ -982,16 +994,12 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,  	unsigned int caddr_len, maddr_len;  	unsigned int i;  	union nf_inet_addr caddr, maddr, rtp_addr; +	const struct nf_nat_sip_hooks *hooks;  	unsigned int port; -	enum sdp_header_types c_hdr;  	const struct sdp_media_type *t;  	int ret = NF_ACCEPT; -	typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr; -	typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session; -	nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook); -	c_hdr = nf_ct_l3num(ct) == AF_INET ? SDP_HDR_CONNECTION_IP4 : -					     SDP_HDR_CONNECTION_IP6; +	hooks = rcu_dereference(nf_nat_sip_hooks);  	/* Find beginning of session description */  	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, @@ -1005,7 +1013,7 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,  	 * the end of the session description. */  	caddr_len = 0;  	if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen, -				  c_hdr, SDP_HDR_MEDIA, +				  SDP_HDR_CONNECTION, SDP_HDR_MEDIA,  				  &matchoff, &matchlen, &caddr) > 0)  		caddr_len = matchlen; @@ -1029,111 +1037,129 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,  		port = simple_strtoul(*dptr + mediaoff, NULL, 10);  		if (port == 0)  			continue; -		if (port < 1024 || port > 65535) +		if (port < 1024 || port > 65535) { +			nf_ct_helper_log(skb, ct, "wrong port %u", port);  			return NF_DROP; +		}  		/* The media description overrides the session description. 
*/  		maddr_len = 0;  		if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen, -					  c_hdr, SDP_HDR_MEDIA, +					  SDP_HDR_CONNECTION, SDP_HDR_MEDIA,  					  &matchoff, &matchlen, &maddr) > 0) {  			maddr_len = matchlen;  			memcpy(&rtp_addr, &maddr, sizeof(rtp_addr));  		} else if (caddr_len)  			memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); -		else +		else { +			nf_ct_helper_log(skb, ct, "cannot parse SDP message");  			return NF_DROP; +		} -		ret = set_expected_rtp_rtcp(skb, dataoff, dptr, datalen, +		ret = set_expected_rtp_rtcp(skb, protoff, dataoff, +					    dptr, datalen,  					    &rtp_addr, htons(port), t->class,  					    mediaoff, medialen); -		if (ret != NF_ACCEPT) +		if (ret != NF_ACCEPT) { +			nf_ct_helper_log(skb, ct, +					 "cannot add expectation for voice");  			return ret; +		}  		/* Update media connection address if present */ -		if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) { -			ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen, -					      mediaoff, c_hdr, SDP_HDR_MEDIA, +		if (maddr_len && hooks && ct->status & IPS_NAT_MASK) { +			ret = hooks->sdp_addr(skb, protoff, dataoff, +					      dptr, datalen, mediaoff, +					      SDP_HDR_CONNECTION, +					      SDP_HDR_MEDIA,  					      &rtp_addr); -			if (ret != NF_ACCEPT) +			if (ret != NF_ACCEPT) { +				nf_ct_helper_log(skb, ct, "cannot mangle SDP");  				return ret; +			}  		}  		i++;  	}  	/* Update session connection and owner addresses */ -	nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook); -	if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK) -		ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff, +	hooks = rcu_dereference(nf_nat_sip_hooks); +	if (hooks && ct->status & IPS_NAT_MASK) +		ret = hooks->sdp_session(skb, protoff, dataoff, +					 dptr, datalen, sdpoff,  					 &rtp_addr);  	return ret;  } -static int process_invite_response(struct sk_buff *skb, unsigned int dataoff, +static int process_invite_response(struct sk_buff *skb, unsigned int protoff, +				   unsigned int dataoff,  				   const char **dptr, unsigned int *datalen,  				   unsigned int cseq, unsigned int code)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct nf_conn_help *help = nfct_help(ct); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);  	if ((code >= 100 && code <= 199) ||  	    (code >= 200 && code <= 299)) -		return process_sdp(skb, dataoff, dptr, datalen, cseq); -	else if (help->help.ct_sip_info.invite_cseq == cseq) +		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); +	else if (ct_sip_info->invite_cseq == cseq)  		flush_expectations(ct, true);  	return NF_ACCEPT;  } -static int process_update_response(struct sk_buff *skb, unsigned int dataoff, +static int process_update_response(struct sk_buff *skb, unsigned int protoff, +				   unsigned int dataoff,  				   const char **dptr, unsigned int *datalen,  				   unsigned int cseq, unsigned int code)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct nf_conn_help *help = nfct_help(ct); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);  	if ((code >= 100 && code <= 199) ||  	    (code >= 200 && code <= 299)) -		return process_sdp(skb, dataoff, dptr, datalen, cseq); -	else if (help->help.ct_sip_info.invite_cseq == cseq) +		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); +	else if (ct_sip_info->invite_cseq == cseq)  		flush_expectations(ct, true);  	return NF_ACCEPT;  } -static int process_prack_response(struct sk_buff 
*skb, unsigned int dataoff, +static int process_prack_response(struct sk_buff *skb, unsigned int protoff, +				  unsigned int dataoff,  				  const char **dptr, unsigned int *datalen,  				  unsigned int cseq, unsigned int code)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct nf_conn_help *help = nfct_help(ct); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);  	if ((code >= 100 && code <= 199) ||  	    (code >= 200 && code <= 299)) -		return process_sdp(skb, dataoff, dptr, datalen, cseq); -	else if (help->help.ct_sip_info.invite_cseq == cseq) +		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); +	else if (ct_sip_info->invite_cseq == cseq)  		flush_expectations(ct, true);  	return NF_ACCEPT;  } -static int process_invite_request(struct sk_buff *skb, unsigned int dataoff, +static int process_invite_request(struct sk_buff *skb, unsigned int protoff, +				  unsigned int dataoff,  				  const char **dptr, unsigned int *datalen,  				  unsigned int cseq)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct nf_conn_help *help = nfct_help(ct); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);  	unsigned int ret;  	flush_expectations(ct, true); -	ret = process_sdp(skb, dataoff, dptr, datalen, cseq); +	ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);  	if (ret == NF_ACCEPT) -		help->help.ct_sip_info.invite_cseq = cseq; +		ct_sip_info->invite_cseq = cseq;  	return ret;  } -static int process_bye_request(struct sk_buff *skb, unsigned int dataoff, +static int process_bye_request(struct sk_buff *skb, unsigned int protoff, +			       unsigned int dataoff,  			       const char **dptr, unsigned int *datalen,  			       unsigned int cseq)  { @@ -1148,22 +1174,23 @@ static int process_bye_request(struct sk_buff *skb, unsigned int dataoff,   * signalling connections. The expectation is marked inactive and is activated   * when receiving a response indicating success from the registrar.   */ -static int process_register_request(struct sk_buff *skb, unsigned int dataoff, +static int process_register_request(struct sk_buff *skb, unsigned int protoff, +				    unsigned int dataoff,  				    const char **dptr, unsigned int *datalen,  				    unsigned int cseq)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct nf_conn_help *help = nfct_help(ct); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);  	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);  	unsigned int matchoff, matchlen;  	struct nf_conntrack_expect *exp;  	union nf_inet_addr *saddr, daddr; +	const struct nf_nat_sip_hooks *hooks;  	__be16 port;  	u8 proto;  	unsigned int expires = 0;  	int ret; -	typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect;  	/* Expected connections can not register again. 
*/  	if (ct->status & IPS_EXPECTED) @@ -1184,9 +1211,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,  	ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,  				      SIP_HDR_CONTACT, NULL,  				      &matchoff, &matchlen, &daddr, &port); -	if (ret < 0) +	if (ret < 0) { +		nf_ct_helper_log(skb, ct, "cannot parse contact");  		return NF_DROP; -	else if (ret == 0) +	} else if (ret == 0)  		return NF_ACCEPT;  	/* We don't support third-party registrations */ @@ -1199,8 +1227,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,  	if (ct_sip_parse_numerical_param(ct, *dptr,  					 matchoff + matchlen, *datalen, -					 "expires=", NULL, NULL, &expires) < 0) +					 "expires=", NULL, NULL, &expires) < 0) { +		nf_ct_helper_log(skb, ct, "cannot parse expires");  		return NF_DROP; +	}  	if (expires == 0) {  		ret = NF_ACCEPT; @@ -1208,8 +1238,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,  	}  	exp = nf_ct_expect_alloc(ct); -	if (!exp) +	if (!exp) { +		nf_ct_helper_log(skb, ct, "cannot alloc expectation");  		return NF_DROP; +	}  	saddr = NULL;  	if (sip_direct_signalling) @@ -1221,31 +1253,33 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,  	exp->helper = nfct_help(ct)->helper;  	exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; -	nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook); -	if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK) -		ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp, -					matchoff, matchlen); +	hooks = rcu_dereference(nf_nat_sip_hooks); +	if (hooks && ct->status & IPS_NAT_MASK) +		ret = hooks->expect(skb, protoff, dataoff, dptr, datalen, +				    exp, matchoff, matchlen);  	else { -		if (nf_ct_expect_related(exp) != 0) +		if (nf_ct_expect_related(exp) != 0) { +			nf_ct_helper_log(skb, ct, "cannot add expectation");  			ret = NF_DROP; -		else +		} else  			ret = NF_ACCEPT;  	}  	nf_ct_expect_put(exp);  store_cseq:  	if (ret == NF_ACCEPT) -		help->help.ct_sip_info.register_cseq = cseq; +		ct_sip_info->register_cseq = cseq;  	return ret;  } -static int process_register_response(struct sk_buff *skb, unsigned int dataoff, +static int process_register_response(struct sk_buff *skb, unsigned int protoff, +				     unsigned int dataoff,  				     const char **dptr, unsigned int *datalen,  				     unsigned int cseq, unsigned int code)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct nf_conn_help *help = nfct_help(ct); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);  	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);  	union nf_inet_addr addr;  	__be16 port; @@ -1262,7 +1296,7 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,  	 * responses, so we store the sequence number of the last valid  	 * request and compare it here.  	 
*/ -	if (help->help.ct_sip_info.register_cseq != cseq) +	if (ct_sip_info->register_cseq != cseq)  		return NF_ACCEPT;  	if (code >= 100 && code <= 199) @@ -1281,9 +1315,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,  					      SIP_HDR_CONTACT, &in_contact,  					      &matchoff, &matchlen,  					      &addr, &port); -		if (ret < 0) +		if (ret < 0) { +			nf_ct_helper_log(skb, ct, "cannot parse contact");  			return NF_DROP; -		else if (ret == 0) +		} else if (ret == 0)  			break;  		/* We don't support third-party registrations */ @@ -1298,8 +1333,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,  						   matchoff + matchlen,  						   *datalen, "expires=",  						   NULL, NULL, &c_expires); -		if (ret < 0) +		if (ret < 0) { +			nf_ct_helper_log(skb, ct, "cannot parse expires");  			return NF_DROP; +		}  		if (c_expires == 0)  			break;  		if (refresh_signalling_expectation(ct, &addr, proto, port, @@ -1321,7 +1358,8 @@ static const struct sip_handler sip_handlers[] = {  	SIP_HANDLER("REGISTER", process_register_request, process_register_response),  }; -static int process_sip_response(struct sk_buff *skb, unsigned int dataoff, +static int process_sip_response(struct sk_buff *skb, unsigned int protoff, +				unsigned int dataoff,  				const char **dptr, unsigned int *datalen)  {  	enum ip_conntrack_info ctinfo; @@ -1332,15 +1370,21 @@ static int process_sip_response(struct sk_buff *skb, unsigned int dataoff,  	if (*datalen < strlen("SIP/2.0 200"))  		return NF_ACCEPT;  	code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); -	if (!code) +	if (!code) { +		nf_ct_helper_log(skb, ct, "cannot get code");  		return NF_DROP; +	}  	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, -			      &matchoff, &matchlen) <= 0) +			      &matchoff, &matchlen) <= 0) { +		nf_ct_helper_log(skb, ct, "cannot parse cseq");  		return NF_DROP; +	}  	cseq = simple_strtoul(*dptr + matchoff, NULL, 10); -	if (!cseq) +	if (!cseq) { +		nf_ct_helper_log(skb, ct, "cannot get cseq");  		return NF_DROP; +	}  	matchend = matchoff + matchlen + 1;  	for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { @@ -1352,19 +1396,37 @@ static int process_sip_response(struct sk_buff *skb, unsigned int dataoff,  		if (*datalen < matchend + handler->len ||  		    strnicmp(*dptr + matchend, handler->method, handler->len))  			continue; -		return handler->response(skb, dataoff, dptr, datalen, +		return handler->response(skb, protoff, dataoff, dptr, datalen,  					 cseq, code);  	}  	return NF_ACCEPT;  } -static int process_sip_request(struct sk_buff *skb, unsigned int dataoff, +static int process_sip_request(struct sk_buff *skb, unsigned int protoff, +			       unsigned int dataoff,  			       const char **dptr, unsigned int *datalen)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);  	unsigned int matchoff, matchlen;  	unsigned int cseq, i; +	union nf_inet_addr addr; +	__be16 port; + +	/* Many Cisco IP phones use a high source port for SIP requests, but +	 * listen for the response on port 5060.  If we are the local +	 * router for one of these phones, save the port number from the +	 * Via: header so that nf_nat_sip can redirect the responses to +	 * the correct port. 
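
process_sip_response() above keys everything off two numbers pulled from the message: the status code after the fixed "SIP/2.0 " prefix and the CSeq value, and a failure to parse either one drops the packet. A userspace sketch with strtoul standing in for the kernel's simple_strtoul():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		const char *msg = "SIP/2.0 200 OK";
		unsigned long code;

		if (strncmp(msg, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0)
			return 1;	/* a request, not a response */
		code = strtoul(msg + strlen("SIP/2.0 "), NULL, 10);
		if (!code)
			return 1;	/* would be NF_DROP in the helper */
		printf("response code %lu\n", code);
		return 0;
	}
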
+	 */ +	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, +				    SIP_HDR_VIA_UDP, NULL, &matchoff, +				    &matchlen, &addr, &port) > 0 && +	    port != ct->tuplehash[dir].tuple.src.u.udp.port && +	    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3)) +		ct_sip_info->forced_dport = port;  	for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {  		const struct sip_handler *handler; @@ -1377,33 +1439,41 @@ static int process_sip_request(struct sk_buff *skb, unsigned int dataoff,  			continue;  		if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, -				      &matchoff, &matchlen) <= 0) +				      &matchoff, &matchlen) <= 0) { +			nf_ct_helper_log(skb, ct, "cannot parse cseq");  			return NF_DROP; +		}  		cseq = simple_strtoul(*dptr + matchoff, NULL, 10); -		if (!cseq) +		if (!cseq) { +			nf_ct_helper_log(skb, ct, "cannot get cseq");  			return NF_DROP; +		} -		return handler->request(skb, dataoff, dptr, datalen, cseq); +		return handler->request(skb, protoff, dataoff, dptr, datalen, +					cseq);  	}  	return NF_ACCEPT;  }  static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct, -			   unsigned int dataoff, const char **dptr, -			   unsigned int *datalen) +			   unsigned int protoff, unsigned int dataoff, +			   const char **dptr, unsigned int *datalen)  { -	typeof(nf_nat_sip_hook) nf_nat_sip; +	const struct nf_nat_sip_hooks *hooks;  	int ret;  	if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0) -		ret = process_sip_request(skb, dataoff, dptr, datalen); +		ret = process_sip_request(skb, protoff, dataoff, dptr, datalen);  	else -		ret = process_sip_response(skb, dataoff, dptr, datalen); +		ret = process_sip_response(skb, protoff, dataoff, dptr, datalen);  	if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { -		nf_nat_sip = rcu_dereference(nf_nat_sip_hook); -		if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen)) +		hooks = rcu_dereference(nf_nat_sip_hooks); +		if (hooks && !hooks->msg(skb, protoff, dataoff, +					 dptr, datalen)) { +			nf_ct_helper_log(skb, ct, "cannot NAT SIP message");  			ret = NF_DROP; +		}  	}  	return ret; @@ -1419,10 +1489,10 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,  	const char *dptr, *end;  	s16 diff, tdiff = 0;  	int ret = NF_ACCEPT; -	typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; +	bool term;  	if (ctinfo != IP_CT_ESTABLISHED && -	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) +	    ctinfo != IP_CT_ESTABLISHED_REPLY)  		return NF_ACCEPT;  	/* No Data ? 
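
The Via handling added to process_sip_request() above saves a forced destination port only when the advertised Via: port differs from the connection's source port while the address matches, which is the Cisco-phone case described in the comment. A standalone sketch of that comparison with illustrative values:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *via_addr = "192.0.2.10";	/* parsed from Via: */
		unsigned int via_port = 5060;		/* parsed from Via: */
		const char *src_addr = "192.0.2.10";	/* connection source */
		unsigned int src_port = 49152;		/* high ephemeral port */
		unsigned int forced_dport = 0;

		if (via_port != src_port && strcmp(via_addr, src_addr) == 0)
			forced_dport = via_port;	/* NAT steers replies here */
		printf("forced_dport=%u\n", forced_dport);
		return 0;
	}
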
*/ @@ -1453,16 +1523,25 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,  		if (dptr + matchoff == end)  			break; -		if (end + strlen("\r\n\r\n") > dptr + datalen) -			break; -		if (end[0] != '\r' || end[1] != '\n' || -		    end[2] != '\r' || end[3] != '\n') +		term = false; +		for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) { +			if (end[0] == '\r' && end[1] == '\n' && +			    end[2] == '\r' && end[3] == '\n') { +				term = true; +				break; +			} +		} +		if (!term)  			break;  		end += strlen("\r\n\r\n") + clen;  		msglen = origlen = end - dptr; +		if (msglen > datalen) +			return NF_ACCEPT; -		ret = process_sip_msg(skb, ct, dataoff, &dptr, &msglen); +		ret = process_sip_msg(skb, ct, protoff, dataoff, +				      &dptr, &msglen); +		/* process_sip_* functions report why this packet is dropped */  		if (ret != NF_ACCEPT)  			break;  		diff     = msglen - origlen; @@ -1474,9 +1553,11 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,  	}  	if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { -		nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook); -		if (nf_nat_sip_seq_adjust) -			nf_nat_sip_seq_adjust(skb, tdiff); +		const struct nf_nat_sip_hooks *hooks; + +		hooks = rcu_dereference(nf_nat_sip_hooks); +		if (hooks) +			hooks->seq_adjust(skb, protoff, tdiff);  	}  	return ret; @@ -1503,11 +1584,10 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,  	if (datalen < strlen("SIP/2.0 200"))  		return NF_ACCEPT; -	return process_sip_msg(skb, ct, dataoff, &dptr, &datalen); +	return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen);  }  static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; -static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly;  static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {  	[SIP_EXPECT_SIGNALLING] = { @@ -1548,7 +1628,6 @@ static void nf_conntrack_sip_fini(void)  static int __init nf_conntrack_sip_init(void)  {  	int i, j, ret; -	char *tmpname;  	if (ports_c == 0)  		ports[ports_c++] = SIP_PORT; @@ -1571,17 +1650,16 @@ static int __init nf_conntrack_sip_init(void)  		sip[i][3].help = sip_help_tcp;  		for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { +			sip[i][j].data_len = sizeof(struct nf_ct_sip_master);  			sip[i][j].tuple.src.u.udp.port = htons(ports[i]);  			sip[i][j].expect_policy = sip_exp_policy;  			sip[i][j].expect_class_max = SIP_EXPECT_MAX;  			sip[i][j].me = THIS_MODULE; -			tmpname = &sip_names[i][j][0];  			if (ports[i] == SIP_PORT) -				sprintf(tmpname, "sip"); +				sprintf(sip[i][j].name, "sip");  			else -				sprintf(tmpname, "sip-%u", i); -			sip[i][j].name = tmpname; +				sprintf(sip[i][j].name, "sip-%u", i);  			pr_debug("port #%u: %u\n", i, ports[i]); diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c new file mode 100644 index 00000000000..87b95a2c270 --- /dev/null +++ b/net/netfilter/nf_conntrack_snmp.c @@ -0,0 +1,78 @@ +/* + *      SNMP service broadcast connection tracking helper + * + *      (c) 2011 Jiri Olsa <jolsa@redhat.com> + * + *      This program is free software; you can redistribute it and/or + *      modify it under the terms of the GNU General Public License + *      as published by the Free Software Foundation; either version + *      2 of the License, or (at your option) any later version. 
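
sip_help_tcp() above no longer requires the \r\n\r\n terminator at one fixed offset; it scans forward through the segment until the terminator is found, and only then sizes the message. A self-contained sketch of that scan:

	#include <stdio.h>
	#include <string.h>

	static const char *find_crlfcrlf(const char *p, const char *end)
	{
		for (; p + 4 <= end; p++) {
			if (p[0] == '\r' && p[1] == '\n' &&
			    p[2] == '\r' && p[3] == '\n')
				return p;
		}
		return NULL;
	}

	int main(void)
	{
		const char msg[] = "BYE sip:a@b SIP/2.0\r\nCSeq: 2 BYE\r\n\r\n";
		const char *t = find_crlfcrlf(msg, msg + sizeof(msg) - 1);

		printf("terminator at offset %ld\n", t ? (long)(t - msg) : -1L);
		return 0;
	}
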
+ */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/in.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_snmp.h> + +#define SNMP_PORT	161 + +MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>"); +MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFCT_HELPER("snmp"); + +static unsigned int timeout __read_mostly = 30; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +int (*nf_nat_snmp_hook)(struct sk_buff *skb, +			unsigned int protoff, +			struct nf_conn *ct, +			enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); + +static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, +		struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ +	typeof(nf_nat_snmp_hook) nf_nat_snmp; + +	nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + +	nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); +	if (nf_nat_snmp && ct->status & IPS_NAT_MASK) +		return nf_nat_snmp(skb, protoff, ct, ctinfo); + +	return NF_ACCEPT; +} + +static struct nf_conntrack_expect_policy exp_policy = { +	.max_expected	= 1, +}; + +static struct nf_conntrack_helper helper __read_mostly = { +	.name			= "snmp", +	.tuple.src.l3num	= NFPROTO_IPV4, +	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_PORT), +	.tuple.dst.protonum	= IPPROTO_UDP, +	.me			= THIS_MODULE, +	.help			= snmp_conntrack_help, +	.expect_policy		= &exp_policy, +}; + +static int __init nf_conntrack_snmp_init(void) +{ +	exp_policy.timeout = timeout; +	return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_snmp_fini(void) +{ +	nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_snmp_init); +module_exit(nf_conntrack_snmp_fini); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0fb65705b44..f641751dba9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,5 +1,6 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -29,10 +30,12 @@  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_acct.h>  #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <linux/rculist_nulls.h>  MODULE_LICENSE("GPL"); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_NF_CONNTRACK_PROCFS  int  print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,              const struct nf_conntrack_l3proto *l3proto, @@ -45,6 +48,7 @@ EXPORT_SYMBOL_GPL(print_tuple);  struct ct_iter_state {  	struct seq_net_private p;  	unsigned int bucket; +	u_int64_t time_now;  };  static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) @@ -56,7 +60,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)  	for (st->bucket = 0;  	     st->bucket < net->ct.htable_size;  	     st->bucket++) { -		n = rcu_dereference(net->ct.hash[st->bucket].first); +		n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));  		if (!is_a_nulls(n))  			return n;  	} @@ -69,13 +73,15 @@ static 
struct hlist_nulls_node *ct_get_next(struct seq_file *seq,  	struct net *net = seq_file_net(seq);  	struct ct_iter_state *st = seq->private; -	head = rcu_dereference(head->next); +	head = rcu_dereference(hlist_nulls_next_rcu(head));  	while (is_a_nulls(head)) {  		if (likely(get_nulls_value(head) == st->bucket)) {  			if (++st->bucket >= net->ct.htable_size)  				return NULL;  		} -		head = rcu_dereference(net->ct.hash[st->bucket].first); +		head = rcu_dereference( +				hlist_nulls_first_rcu( +					&net->ct.hash[st->bucket]));  	}  	return head;  } @@ -93,6 +99,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)  static void *ct_seq_start(struct seq_file *seq, loff_t *pos)  	__acquires(RCU)  { +	struct ct_iter_state *st = seq->private; + +	st->time_now = ktime_to_ns(ktime_get_real());  	rcu_read_lock();  	return ct_get_idx(seq, *pos);  } @@ -118,7 +127,7 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)  	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);  	if (ret) -		return ret; +		return 0;  	ret = seq_printf(s, "secctx=%s ", secctx); @@ -132,6 +141,34 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)  }  #endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ +	struct ct_iter_state *st = s->private; +	struct nf_conn_tstamp *tstamp; +	s64 delta_time; + +	tstamp = nf_conn_tstamp_find(ct); +	if (tstamp) { +		delta_time = st->time_now - tstamp->start; +		if (delta_time > 0) +			delta_time = div_s64(delta_time, NSEC_PER_SEC); +		else +			delta_time = 0; + +		return seq_printf(s, "delta-time=%llu ", +				  (unsigned long long)delta_time); +	} +	return 0; +} +#else +static inline int +ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ +	return 0; +} +#endif +  /* return 0 on success, 1 in case of error */  static int ct_seq_show(struct seq_file *s, void *v)  { @@ -200,13 +237,16 @@ static int ct_seq_show(struct seq_file *s, void *v)  		goto release;  #endif +	if (ct_show_delta_time(s, ct)) +		goto release; +  	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))  		goto release;  	ret = 0;  release:  	nf_ct_put(ct); -	return 0; +	return ret;  }  static const struct seq_operations ct_seq_ops = { @@ -327,7 +367,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)  {  	struct proc_dir_entry *pde; -	pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); +	pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);  	if (!pde)  		goto out_nf_conntrack; @@ -338,7 +378,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)  	return 0;  out_stat_nf_conntrack: -	proc_net_remove(net, "nf_conntrack"); +	remove_proc_entry("nf_conntrack", net->proc_net);  out_nf_conntrack:  	return -ENOMEM;  } @@ -346,7 +386,7 @@ out_nf_conntrack:  static void nf_conntrack_standalone_fini_proc(struct net *net)  {  	remove_proc_entry("nf_conntrack", net->proc_net_stat); -	proc_net_remove(net, "nf_conntrack"); +	remove_proc_entry("nf_conntrack", net->proc_net);  }  #else  static int nf_conntrack_standalone_init_proc(struct net *net) @@ -357,7 +397,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)  static void nf_conntrack_standalone_fini_proc(struct net *net)  {  } -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NF_CONNTRACK_PROCFS */  /* Sysctl support */ @@ -368,7 +408,7 @@ static int log_invalid_proto_max = 255;  static struct ctl_table_header 
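
ct_show_delta_time() above samples one wall-clock timestamp when the dump starts (in ct_seq_start()) and prints, per entry, the elapsed time since the flow's stored start stamp, clamped at zero and reduced to whole seconds. The arithmetic in isolation, with illustrative values:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000LL

	int main(void)
	{
		int64_t time_now = 75 * NSEC_PER_SEC;	/* sampled at dump start */
		int64_t start    = 30 * NSEC_PER_SEC;	/* flow's tstamp->start */
		int64_t delta    = time_now - start;

		if (delta < 0)
			delta = 0;
		printf("delta-time=%lld\n", (long long)(delta / NSEC_PER_SEC));
		return 0;
	}
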
*nf_ct_netfilter_header; -static ctl_table nf_ct_sysctl_table[] = { +static struct ctl_table nf_ct_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_max",  		.data		= &nf_conntrack_max, @@ -418,7 +458,7 @@ static ctl_table nf_ct_sysctl_table[] = {  #define NET_NF_CONNTRACK_MAX 2089 -static ctl_table nf_ct_netfilter_table[] = { +static struct ctl_table nf_ct_netfilter_table[] = {  	{  		.procname	= "nf_conntrack_max",  		.data		= &nf_conntrack_max, @@ -429,22 +469,10 @@ static ctl_table nf_ct_netfilter_table[] = {  	{ }  }; -static struct ctl_path nf_ct_path[] = { -	{ .procname = "net", }, -	{ } -}; -  static int nf_conntrack_standalone_init_sysctl(struct net *net)  {  	struct ctl_table *table; -	if (net_eq(net, &init_net)) { -		nf_ct_netfilter_header = -		       register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); -		if (!nf_ct_netfilter_header) -			goto out; -	} -  	table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),  			GFP_KERNEL);  	if (!table) @@ -455,8 +483,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)  	table[3].data = &net->ct.sysctl_checksum;  	table[4].data = &net->ct.sysctl_log_invalid; -	net->ct.sysctl_header = register_net_sysctl_table(net, -					nf_net_netfilter_sysctl_path, table); +	/* Don't export sysctls to unprivileged users */ +	if (net->user_ns != &init_user_ns) +		table[0].procname = NULL; + +	net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);  	if (!net->ct.sysctl_header)  		goto out_unregister_netfilter; @@ -465,10 +496,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)  out_unregister_netfilter:  	kfree(table);  out_kmemdup: -	if (net_eq(net, &init_net)) -		unregister_sysctl_table(nf_ct_netfilter_header); -out: -	printk(KERN_ERR "nf_conntrack: can't register to sysctl.\n");  	return -ENOMEM;  } @@ -476,8 +503,6 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)  {  	struct ctl_table *table; -	if (net_eq(net, &init_net)) -		unregister_sysctl_table(nf_ct_netfilter_header);  	table = net->ct.sysctl_header->ctl_table_arg;  	unregister_net_sysctl_table(net->ct.sysctl_header);  	kfree(table); @@ -493,51 +518,91 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)  }  #endif /* CONFIG_SYSCTL */ -static int nf_conntrack_net_init(struct net *net) +static int nf_conntrack_pernet_init(struct net *net)  {  	int ret; -	ret = nf_conntrack_init(net); +	ret = nf_conntrack_init_net(net);  	if (ret < 0)  		goto out_init; +  	ret = nf_conntrack_standalone_init_proc(net);  	if (ret < 0)  		goto out_proc; +  	net->ct.sysctl_checksum = 1;  	net->ct.sysctl_log_invalid = 0;  	ret = nf_conntrack_standalone_init_sysctl(net);  	if (ret < 0)  		goto out_sysctl; +  	return 0;  out_sysctl:  	nf_conntrack_standalone_fini_proc(net);  out_proc: -	nf_conntrack_cleanup(net); +	nf_conntrack_cleanup_net(net);  out_init:  	return ret;  } -static void nf_conntrack_net_exit(struct net *net) +static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)  { -	nf_conntrack_standalone_fini_sysctl(net); -	nf_conntrack_standalone_fini_proc(net); -	nf_conntrack_cleanup(net); +	struct net *net; + +	list_for_each_entry(net, net_exit_list, exit_list) { +		nf_conntrack_standalone_fini_sysctl(net); +		nf_conntrack_standalone_fini_proc(net); +	} +	nf_conntrack_cleanup_net_list(net_exit_list);  }  static struct pernet_operations nf_conntrack_net_ops = { -	.init = nf_conntrack_net_init, -	.exit = nf_conntrack_net_exit, +	.init		= nf_conntrack_pernet_init, +	.exit_batch	= nf_conntrack_pernet_exit,  
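
The per-netns sysctl setup above duplicates the template table, points each entry's .data at this namespace's variables, and hides the table from non-initial user namespaces by clearing .procname before registration. A userspace sketch of that duplicate-and-retarget pattern; ctl_entry is an illustrative stand-in for the kernel's ctl_table:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct ctl_entry {
		const char *procname;
		void *data;
	};

	static const struct ctl_entry template[] = {
		{ "nf_conntrack_checksum", NULL },
		{ NULL, NULL },			/* terminator */
	};

	int main(void)
	{
		int net_checksum = 1;		/* this netns's knob */
		int privileged = 0;		/* pretend: not init_user_ns */
		struct ctl_entry *table = malloc(sizeof(template));

		if (!table)
			return 1;
		memcpy(table, template, sizeof(template));
		table[0].data = &net_checksum;
		if (!privileged)
			table[0].procname = NULL;	/* entry not exported */

		printf("exported: %s\n",
		       table[0].procname ? table[0].procname : "(hidden)");
		free(table);
		return 0;
	}
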
};  static int __init nf_conntrack_standalone_init(void)  { -	return register_pernet_subsys(&nf_conntrack_net_ops); +	int ret = nf_conntrack_init_start(); +	if (ret < 0) +		goto out_start; + +#ifdef CONFIG_SYSCTL +	nf_ct_netfilter_header = +		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); +	if (!nf_ct_netfilter_header) { +		pr_err("nf_conntrack: can't register to sysctl.\n"); +		ret = -ENOMEM; +		goto out_sysctl; +	} +#endif + +	ret = register_pernet_subsys(&nf_conntrack_net_ops); +	if (ret < 0) +		goto out_pernet; + +	nf_conntrack_init_end(); +	return 0; + +out_pernet: +#ifdef CONFIG_SYSCTL +	unregister_net_sysctl_table(nf_ct_netfilter_header); +out_sysctl: +#endif +	nf_conntrack_cleanup_end(); +out_start: +	return ret;  }  static void __exit nf_conntrack_standalone_fini(void)  { +	nf_conntrack_cleanup_start();  	unregister_pernet_subsys(&nf_conntrack_net_ops); +#ifdef CONFIG_SYSCTL +	unregister_net_sysctl_table(nf_ct_netfilter_header); +#endif +	nf_conntrack_cleanup_end();  }  module_init(nf_conntrack_standalone_init); diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 75466fd72f4..e68ab4fbd71 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -1,5 +1,5 @@  /* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as   * published by the Free Software Foundation. @@ -60,8 +60,10 @@ static int tftp_help(struct sk_buff *skb,  		nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);  		exp = nf_ct_expect_alloc(ct); -		if (exp == NULL) +		if (exp == NULL) { +			nf_ct_helper_log(skb, ct, "cannot alloc expectation");  			return NF_DROP; +		}  		tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;  		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,  				  nf_ct_l3num(ct), @@ -74,8 +76,10 @@ static int tftp_help(struct sk_buff *skb,  		nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);  		if (nf_nat_tftp && ct->status & IPS_NAT_MASK)  			ret = nf_nat_tftp(skb, ctinfo, exp); -		else if (nf_ct_expect_related(exp) != 0) +		else if (nf_ct_expect_related(exp) != 0) { +			nf_ct_helper_log(skb, ct, "cannot add expectation");  			ret = NF_DROP; +		}  		nf_ct_expect_put(exp);  		break;  	case TFTP_OPCODE_DATA: @@ -92,7 +96,6 @@ static int tftp_help(struct sk_buff *skb,  }  static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; -static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly;  static const struct nf_conntrack_expect_policy tftp_exp_policy = {  	.max_expected	= 1, @@ -112,7 +115,6 @@ static void nf_conntrack_tftp_fini(void)  static int __init nf_conntrack_tftp_init(void)  {  	int i, j, ret; -	char *tmpname;  	if (ports_c == 0)  		ports[ports_c++] = TFTP_PORT; @@ -129,12 +131,10 @@ static int __init nf_conntrack_tftp_init(void)  			tftp[i][j].me = THIS_MODULE;  			tftp[i][j].help = tftp_help; -			tmpname = &tftp_names[i][j][0];  			if (ports[i] == TFTP_PORT) -				sprintf(tmpname, "tftp"); +				sprintf(tftp[i][j].name, "tftp");  			else -				sprintf(tmpname, "tftp-%u", i); -			tftp[i][j].name = tmpname; +				sprintf(tftp[i][j].name, "tftp-%u", i);  			ret = nf_conntrack_helper_register(&tftp[i][j]);  			if (ret) { diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c new file mode 100644 index 00000000000..93da609d9d2 --- /dev/null +++ 
b/net/netfilter/nf_conntrack_timeout.c @@ -0,0 +1,51 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +struct ctnl_timeout * +(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); + +void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); + +static struct nf_ct_ext_type timeout_extend __read_mostly = { +	.len	= sizeof(struct nf_conn_timeout), +	.align	= __alignof__(struct nf_conn_timeout), +	.id	= NF_CT_EXT_TIMEOUT, +}; + +int nf_conntrack_timeout_init(void) +{ +	int ret = nf_ct_extend_register(&timeout_extend); +	if (ret < 0) +		pr_err("nf_ct_timeout: Unable to register timeout extension.\n"); +	return ret; +} + +void nf_conntrack_timeout_fini(void) +{ +	nf_ct_extend_unregister(&timeout_extend); +} diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c new file mode 100644 index 00000000000..7a394df0deb --- /dev/null +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -0,0 +1,114 @@ +/* + * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
+ */ + +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timestamp.h> + +static bool nf_ct_tstamp __read_mostly; + +module_param_named(tstamp, nf_ct_tstamp, bool, 0644); +MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table tstamp_sysctl_table[] = { +	{ +		.procname	= "nf_conntrack_timestamp", +		.data		= &init_net.ct.sysctl_tstamp, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type tstamp_extend __read_mostly = { +	.len	= sizeof(struct nf_conn_tstamp), +	.align	= __alignof__(struct nf_conn_tstamp), +	.id	= NF_CT_EXT_TSTAMP, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ +	struct ctl_table *table; + +	table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), +			GFP_KERNEL); +	if (!table) +		goto out; + +	table[0].data = &net->ct.sysctl_tstamp; + +	/* Don't export sysctls to unprivileged users */ +	if (net->user_ns != &init_user_ns) +		table[0].procname = NULL; + +	net->ct.tstamp_sysctl_header = register_net_sysctl(net,	"net/netfilter", +							   table); +	if (!net->ct.tstamp_sysctl_header) { +		printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); +		goto out_register; +	} +	return 0; + +out_register: +	kfree(table); +out: +	return -ENOMEM; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +	struct ctl_table *table; + +	table = net->ct.tstamp_sysctl_header->ctl_table_arg; +	unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); +	kfree(table); +} +#else +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ +	return 0; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_tstamp_pernet_init(struct net *net) +{ +	net->ct.sysctl_tstamp = nf_ct_tstamp; +	return nf_conntrack_tstamp_init_sysctl(net); +} + +void nf_conntrack_tstamp_pernet_fini(struct net *net) +{ +	nf_conntrack_tstamp_fini_sysctl(net); +} + +int nf_conntrack_tstamp_init(void) +{ +	int ret; +	ret = nf_ct_extend_register(&tstamp_extend); +	if (ret < 0) +		pr_err("nf_ct_tstamp: Unable to register extension\n"); +	return ret; +} + +void nf_conntrack_tstamp_fini(void) +{ +	nf_ct_extend_unregister(&tstamp_extend); +} diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 770f76432ad..61a3c927e63 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,26 +13,20 @@  /* core.c */ -extern unsigned int nf_iterate(struct list_head *head, -				struct sk_buff *skb, -				unsigned int hook, -				const struct net_device *indev, -				const struct net_device *outdev, -				struct list_head **i, -				int (*okfn)(struct sk_buff *), -				int hook_thresh); +unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, +			unsigned int hook, const struct net_device *indev, +			const struct net_device *outdev, +			struct nf_hook_ops **elemp, +			int (*okfn)(struct sk_buff *), int hook_thresh);  /* nf_queue.c */ -extern int nf_queue(struct sk_buff *skb, -		    struct list_head *elem, -		    u_int8_t pf, unsigned int hook, -		    struct net_device *indev, -		    struct net_device *outdev, -		    int (*okfn)(struct sk_buff *), -		    unsigned int queuenum); -extern int __init 
netfilter_queue_init(void); +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, +	     unsigned int hook, struct net_device *indev, +	     struct net_device *outdev, int (*okfn)(struct sk_buff *), +	     unsigned int queuenum); +int __init netfilter_queue_init(void);  /* nf_log.c */ -extern int __init netfilter_log_init(void); +int __init netfilter_log_init(void);  #endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index b07393eab88..85296d4eac0 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,7 +16,6 @@  #define NF_LOG_PREFIXLEN		128  #define NFLOGGER_NAME_LEN		64 -static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;  static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;  static DEFINE_MUTEX(nf_log_mutex); @@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)  	return NULL;  } -/* return EEXIST if the same logger is registred, 0 on success. */ +void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +{ +	const struct nf_logger *log; + +	if (pf == NFPROTO_UNSPEC) +		return; + +	mutex_lock(&nf_log_mutex); +	log = rcu_dereference_protected(net->nf.nf_loggers[pf], +					lockdep_is_held(&nf_log_mutex)); +	if (log == NULL) +		rcu_assign_pointer(net->nf.nf_loggers[pf], logger); + +	mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_set); + +void nf_log_unset(struct net *net, const struct nf_logger *logger) +{ +	int i; +	const struct nf_logger *log; + +	mutex_lock(&nf_log_mutex); +	for (i = 0; i < NFPROTO_NUMPROTO; i++) { +		log = rcu_dereference_protected(net->nf.nf_loggers[i], +				lockdep_is_held(&nf_log_mutex)); +		if (log == logger) +			RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); +	} +	mutex_unlock(&nf_log_mutex); +	synchronize_rcu(); +} +EXPORT_SYMBOL(nf_log_unset); + +/* return EEXIST if the same logger is registered, 0 on success. 
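+ *
+ * A hedged usage sketch (hypothetical module; my_logfn stands in for a
+ * real nf_logfn implementation):
+ *
+ *	static struct nf_logger my_logger = {
+ *		.name	= "my-log",
+ *		.logfn	= my_logfn,
+ *		.me	= THIS_MODULE,
+ *	};
+ *
+ *	err = nf_log_register(NFPROTO_IPV4, &my_logger);
+ *	if (!err)
+ *		nf_log_set(net, NFPROTO_IPV4, &my_logger);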
*/  int nf_log_register(u_int8_t pf, struct nf_logger *logger)  { -	const struct nf_logger *llog;  	int i; -	if (pf >= ARRAY_SIZE(nf_loggers)) +	if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))  		return -EINVAL;  	for (i = 0; i < ARRAY_SIZE(logger->list); i++) @@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)  	} else {  		/* register at end of list to honor first register win */  		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); -		llog = rcu_dereference_protected(nf_loggers[pf], -						 lockdep_is_held(&nf_log_mutex)); -		if (llog == NULL) -			rcu_assign_pointer(nf_loggers[pf], logger);  	}  	mutex_unlock(&nf_log_mutex); @@ -66,45 +94,43 @@ EXPORT_SYMBOL(nf_log_register);  void nf_log_unregister(struct nf_logger *logger)  { -	const struct nf_logger *c_logger;  	int i;  	mutex_lock(&nf_log_mutex); -	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { -		c_logger = rcu_dereference_protected(nf_loggers[i], -						     lockdep_is_held(&nf_log_mutex)); -		if (c_logger == logger) -			rcu_assign_pointer(nf_loggers[i], NULL); +	for (i = 0; i < NFPROTO_NUMPROTO; i++)  		list_del(&logger->list[i]); -	}  	mutex_unlock(&nf_log_mutex); - -	synchronize_rcu();  }  EXPORT_SYMBOL(nf_log_unregister); -int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +int nf_log_bind_pf(struct net *net, u_int8_t pf, +		   const struct nf_logger *logger)  { +	if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) +		return -EINVAL;  	mutex_lock(&nf_log_mutex);  	if (__find_logger(pf, logger->name) == NULL) {  		mutex_unlock(&nf_log_mutex);  		return -ENOENT;  	} -	rcu_assign_pointer(nf_loggers[pf], logger); +	rcu_assign_pointer(net->nf.nf_loggers[pf], logger);  	mutex_unlock(&nf_log_mutex);  	return 0;  }  EXPORT_SYMBOL(nf_log_bind_pf); -void nf_log_unbind_pf(u_int8_t pf) +void nf_log_unbind_pf(struct net *net, u_int8_t pf)  { +	if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) +		return;  	mutex_lock(&nf_log_mutex); -	rcu_assign_pointer(nf_loggers[pf], NULL); +	RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);  	mutex_unlock(&nf_log_mutex);  }  EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_log_packet(u_int8_t pf, +void nf_log_packet(struct net *net, +		   u_int8_t pf,  		   unsigned int hooknum,  		   const struct sk_buff *skb,  		   const struct net_device *in, @@ -117,12 +143,12 @@ void nf_log_packet(u_int8_t pf,  	const struct nf_logger *logger;  	rcu_read_lock(); -	logger = rcu_dereference(nf_loggers[pf]); +	logger = rcu_dereference(net->nf.nf_loggers[pf]);  	if (logger) {  		va_start(args, fmt);  		vsnprintf(prefix, sizeof(prefix), fmt, args);  		va_end(args); -		logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); +		logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);  	}  	rcu_read_unlock();  } @@ -131,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet);  #ifdef CONFIG_PROC_FS  static void *seq_start(struct seq_file *seq, loff_t *pos)  { +	struct net *net = seq_file_net(seq); +  	mutex_lock(&nf_log_mutex); -	if (*pos >= ARRAY_SIZE(nf_loggers)) +	if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))  		return NULL;  	return pos; @@ -141,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)  static void *seq_next(struct seq_file *s, void *v, loff_t *pos)  { +	struct net *net = seq_file_net(s); +  	(*pos)++; -	if (*pos >= ARRAY_SIZE(nf_loggers)) +	if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))  		return NULL;  	return pos; @@ -160,8 +190,10 @@ static int seq_show(struct seq_file *s, void *v)  	const struct nf_logger *logger;  	struct nf_logger *t;  	int ret; +	struct net *net = 
seq_file_net(s); -	logger = nf_loggers[*pos]; +	logger = rcu_dereference_protected(net->nf.nf_loggers[*pos], +					   lockdep_is_held(&nf_log_mutex));  	if (!logger)  		ret = seq_printf(s, "%2lld NONE (", *pos); @@ -194,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = {  static int nflog_open(struct inode *inode, struct file *file)  { -	return seq_open(file, &nflog_seq_ops); +	return seq_open_net(inode, file, &nflog_seq_ops, +			    sizeof(struct seq_net_private));  }  static const struct file_operations nflog_file_ops = { @@ -202,25 +235,17 @@ static const struct file_operations nflog_file_ops = {  	.open	 = nflog_open,  	.read	 = seq_read,  	.llseek	 = seq_lseek, -	.release = seq_release, +	.release = seq_release_net,  };  #endif /* PROC_FS */  #ifdef CONFIG_SYSCTL -static struct ctl_path nf_log_sysctl_path[] = { -	{ .procname = "net", }, -	{ .procname = "netfilter", }, -	{ .procname = "nf_log", }, -	{ } -}; -  static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];  static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; -static struct ctl_table_header *nf_log_dir_header; -static int nf_log_proc_dostring(ctl_table *table, int write, +static int nf_log_proc_dostring(struct ctl_table *table, int write,  			 void __user *buffer, size_t *lenp, loff_t *ppos)  {  	const struct nf_logger *logger; @@ -228,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,  	size_t size = *lenp;  	int r = 0;  	int tindex = (unsigned long)table->extra1; +	struct net *net = current->nsproxy->net_ns;  	if (write) {  		if (size > sizeof(buf)) @@ -236,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,  			return -EFAULT;  		if (!strcmp(buf, "NONE")) { -			nf_log_unbind_pf(tindex); +			nf_log_unbind_pf(net, tindex);  			return 0;  		}  		mutex_lock(&nf_log_mutex); @@ -245,11 +271,12 @@ static int nf_log_proc_dostring(ctl_table *table, int write,  			mutex_unlock(&nf_log_mutex);  			return -ENOENT;  		} -		rcu_assign_pointer(nf_loggers[tindex], logger); +		rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);  		mutex_unlock(&nf_log_mutex);  	} else {  		mutex_lock(&nf_log_mutex); -		logger = nf_loggers[tindex]; +		logger = rcu_dereference_protected(net->nf.nf_loggers[tindex], +						   lockdep_is_held(&nf_log_mutex));  		if (!logger)  			table->data = "NONE";  		else @@ -261,49 +288,112 @@ static int nf_log_proc_dostring(ctl_table *table, int write,  	return r;  } -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net)  {  	int i; - -	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { -		snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); -		nf_log_sysctl_table[i].procname	= -			nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; -		nf_log_sysctl_table[i].data = NULL; -		nf_log_sysctl_table[i].maxlen = -			NFLOGGER_NAME_LEN * sizeof(char); -		nf_log_sysctl_table[i].mode = 0644; -		nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; -		nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; +	struct ctl_table *table; + +	table = nf_log_sysctl_table; +	if (!net_eq(net, &init_net)) { +		table = kmemdup(nf_log_sysctl_table, +				 sizeof(nf_log_sysctl_table), +				 GFP_KERNEL); +		if (!table) +			goto err_alloc; +	} else { +		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { +			snprintf(nf_log_sysctl_fnames[i], +				 3, "%d", i); +			nf_log_sysctl_table[i].procname	= +				nf_log_sysctl_fnames[i]; +			nf_log_sysctl_table[i].data = NULL; +			nf_log_sysctl_table[i].maxlen = +				
NFLOGGER_NAME_LEN * sizeof(char); +			nf_log_sysctl_table[i].mode = 0644; +			nf_log_sysctl_table[i].proc_handler = +				nf_log_proc_dostring; +			nf_log_sysctl_table[i].extra1 = +				(void *)(unsigned long) i; +		}  	} -	nf_log_dir_header = register_sysctl_paths(nf_log_sysctl_path, -				       nf_log_sysctl_table); -	if (!nf_log_dir_header) -		return -ENOMEM; +	net->nf.nf_log_dir_header = register_net_sysctl(net, +						"net/netfilter/nf_log", +						table); +	if (!net->nf.nf_log_dir_header) +		goto err_reg;  	return 0; + +err_reg: +	if (!net_eq(net, &init_net)) +		kfree(table); +err_alloc: +	return -ENOMEM; +} + +static void netfilter_log_sysctl_exit(struct net *net) +{ +	struct ctl_table *table; + +	table = net->nf.nf_log_dir_header->ctl_table_arg; +	unregister_net_sysctl_table(net->nf.nf_log_dir_header); +	if (!net_eq(net, &init_net)) +		kfree(table);  }  #else -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net)  {  	return 0;  } + +static void netfilter_log_sysctl_exit(struct net *net) +{ +}  #endif /* CONFIG_SYSCTL */ -int __init netfilter_log_init(void) +static int __net_init nf_log_net_init(struct net *net)  { -	int i, r; +	int ret = -ENOMEM; +  #ifdef CONFIG_PROC_FS  	if (!proc_create("nf_log", S_IRUGO, -			 proc_net_netfilter, &nflog_file_ops)) -		return -1; +			 net->nf.proc_netfilter, &nflog_file_ops)) +		return ret;  #endif +	ret = netfilter_log_sysctl_init(net); +	if (ret < 0) +		goto out_sysctl; -	/* Errors will trigger panic, unroll on error is unnecessary. */ -	r = netfilter_log_sysctl_init(); -	if (r < 0) -		return r; +	return 0; + +out_sysctl: +#ifdef CONFIG_PROC_FS +	remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif +	return ret; +} + +static void __net_exit nf_log_net_exit(struct net *net) +{ +	netfilter_log_sysctl_exit(net); +#ifdef CONFIG_PROC_FS +	remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nf_log_net_ops = { +	.init = nf_log_net_init, +	.exit = nf_log_net_exit, +}; + +int __init netfilter_log_init(void) +{ +	int i, ret; + +	ret = register_pernet_subsys(&nf_log_net_ops); +	if (ret < 0) +		return ret;  	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)  		INIT_LIST_HEAD(&(nf_loggers_l[i])); diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c new file mode 100644 index 00000000000..eb772380a20 --- /dev/null +++ b/net/netfilter/nf_nat_amanda.c @@ -0,0 +1,90 @@ +/* Amanda extension for TCP NAT alteration. + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on a copy of HW's ip_nat_irc.c as well as other modules + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/udp.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_amanda.h> + +MODULE_AUTHOR("Brian J. 
Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_amanda");
+
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct nf_conntrack_expect *exp)
+{
+	char buffer[sizeof("65535")];
+	u_int16_t port;
+	unsigned int ret;
+
+	/* Connection comes from client. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_ORIGINAL;
+
+	/* When you see the packet, we need to NAT it the same as
+	 * this one (ie. same IP: it will be TCP and master is UDP). */
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int res;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		res = nf_ct_expect_related(exp);
+		if (res == 0)
+			break;
+		else if (res != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0) {
+		nf_ct_helper_log(skb, exp->master, "all ports in use");
+		return NF_DROP;
+	}
+
+	sprintf(buffer, "%u", port);
+	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
+				       protoff, matchoff, matchlen,
+				       buffer, strlen(buffer));
+	if (ret != NF_ACCEPT) {
+		nf_ct_helper_log(skb, exp->master, "cannot mangle packet");
+		nf_ct_unexpect_related(exp);
+	}
+	return ret;
+}
+
+static void __exit nf_nat_amanda_fini(void)
+{
+	RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
+	synchronize_rcu();
+}
+
+static int __init nf_nat_amanda_init(void)
+{
+	BUG_ON(nf_nat_amanda_hook != NULL);
+	RCU_INIT_POINTER(nf_nat_amanda_hook, help);
+	return 0;
+}
+
+module_init(nf_nat_amanda_init);
+module_exit(nf_nat_amanda_fini); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c new file mode 100644 index 00000000000..a49907b1dab --- /dev/null +++ b/net/netfilter/nf_nat_core.c @@ -0,0 +1,898 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/gfp.h> +#include <net/xfrm.h> +#include <linux/jhash.h> +#include <linux/rtnetlink.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <linux/netfilter/nf_nat.h> + +static DEFINE_SPINLOCK(nf_nat_lock); + +static DEFINE_MUTEX(nf_nat_proto_mutex); +static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] +						__read_mostly; +static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] +						__read_mostly; + + +inline const struct nf_nat_l3proto * +__nf_nat_l3proto_find(u8 family) +{ +	return rcu_dereference(nf_nat_l3protos[family]); +} + +inline const struct nf_nat_l4proto * +__nf_nat_l4proto_find(u8 family, u8 protonum) +{ +	return rcu_dereference(nf_nat_l4protos[family][protonum]); +} +EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find); + +#ifdef CONFIG_XFRM +static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) +{ +	const struct nf_nat_l3proto *l3proto; +	const struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	enum ip_conntrack_dir dir; +	unsigned  long statusbit; +	u8 family; + +	ct = nf_ct_get(skb, &ctinfo); +	if (ct == NULL) +		return; + +	family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; +	rcu_read_lock(); +	l3proto = __nf_nat_l3proto_find(family); +	if (l3proto == NULL) +		goto out; + +	dir = CTINFO2DIR(ctinfo); +	if (dir == IP_CT_DIR_ORIGINAL) +		statusbit = IPS_DST_NAT; +	else +		statusbit = IPS_SRC_NAT; + +	l3proto->decode_session(skb, ct, dir, statusbit, fl); +out: +	rcu_read_unlock(); +} + +int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) +{ +	struct flowi fl; +	unsigned int hh_len; +	struct dst_entry *dst; +	int err; + +	err = xfrm_decode_session(skb, &fl, family); +	if (err < 0) +		return err; + +	dst = skb_dst(skb); +	if (dst->xfrm) +		dst = ((struct xfrm_dst *)dst)->route; +	dst_hold(dst); + +	dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); +	if (IS_ERR(dst)) +		return PTR_ERR(dst); + +	skb_dst_drop(skb); +	skb_dst_set(skb, dst); + +	/* Change in oif may mean change in hh_len. */ +	hh_len = skb_dst(skb)->dev->hard_header_len; +	if (skb_headroom(skb) < hh_len && +	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) +		return -ENOMEM; +	return 0; +} +EXPORT_SYMBOL(nf_xfrm_me_harder); +#endif /* CONFIG_XFRM */ + +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct net *net, u16 zone, +	    const struct nf_conntrack_tuple *tuple) +{ +	unsigned int hash; + +	/* Original src, to ensure we map it consistently if poss. */ +	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), +		      tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); +	return ((u64)hash * net->ct.nat_htable_size) >> 32; +} + +/* Is this tuple already taken? 
(not by us) */ +int +nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, +		  const struct nf_conn *ignored_conntrack) +{ +	/* Conntrack tracking doesn't keep track of outgoing tuples; only +	 * incoming ones.  NAT means they don't have a fixed mapping, +	 * so we invert the tuple and look for the incoming reply. +	 * +	 * We could keep a separate hash if this proves too slow. +	 */ +	struct nf_conntrack_tuple reply; + +	nf_ct_invert_tuplepr(&reply, tuple); +	return nf_conntrack_tuple_taken(&reply, ignored_conntrack); +} +EXPORT_SYMBOL(nf_nat_used_tuple); + +/* If we source map this tuple so reply looks like reply_tuple, will + * that meet the constraints of range. + */ +static int in_range(const struct nf_nat_l3proto *l3proto, +		    const struct nf_nat_l4proto *l4proto, +		    const struct nf_conntrack_tuple *tuple, +		    const struct nf_nat_range *range) +{ +	/* If we are supposed to map IPs, then we must be in the +	 * range specified, otherwise let this drag us onto a new src IP. +	 */ +	if (range->flags & NF_NAT_RANGE_MAP_IPS && +	    !l3proto->in_range(tuple, range)) +		return 0; + +	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) || +	    l4proto->in_range(tuple, NF_NAT_MANIP_SRC, +			      &range->min_proto, &range->max_proto)) +		return 1; + +	return 0; +} + +static inline int +same_src(const struct nf_conn *ct, +	 const struct nf_conntrack_tuple *tuple) +{ +	const struct nf_conntrack_tuple *t; + +	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; +	return (t->dst.protonum == tuple->dst.protonum && +		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && +		t->src.u.all == tuple->src.u.all); +} + +/* Only called for SRC manip */ +static int +find_appropriate_src(struct net *net, u16 zone, +		     const struct nf_nat_l3proto *l3proto, +		     const struct nf_nat_l4proto *l4proto, +		     const struct nf_conntrack_tuple *tuple, +		     struct nf_conntrack_tuple *result, +		     const struct nf_nat_range *range) +{ +	unsigned int h = hash_by_src(net, zone, tuple); +	const struct nf_conn_nat *nat; +	const struct nf_conn *ct; + +	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { +		ct = nat->ct; +		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { +			/* Copy source part from reply tuple. */ +			nf_ct_invert_tuplepr(result, +				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple); +			result->dst = tuple->dst; + +			if (in_range(l3proto, l4proto, result, range)) +				return 1; +		} +	} +	return 0; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus + * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + * 1-65535, we don't do pro-rata allocation based on ports; we choose + * the ip with the lowest src-ip/dst-ip/proto usage. + */ +static void +find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, +		    const struct nf_nat_range *range, +		    const struct nf_conn *ct, +		    enum nf_nat_manip_type maniptype) +{ +	union nf_inet_addr *var_ipp; +	unsigned int i, max; +	/* Host order */ +	u32 minip, maxip, j, dist; +	bool full_range; + +	/* No IP mapping?  Do nothing. */ +	if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) +		return; + +	if (maniptype == NF_NAT_MANIP_SRC) +		var_ipp = &tuple->src.u3; +	else +		var_ipp = &tuple->dst.u3; + +	/* Fast path: only one choice. 
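+	 * (e.g. an SNAT rule that maps onto the single address 203.0.113.1,
+	 * so min_addr == max_addr and no hashing is needed)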
*/ +	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { +		*var_ipp = range->min_addr; +		return; +	} + +	if (nf_ct_l3num(ct) == NFPROTO_IPV4) +		max = sizeof(var_ipp->ip) / sizeof(u32) - 1; +	else +		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; + +	/* Hashing source and destination IPs gives a fairly even +	 * spread in practice (if there are a small number of IPs +	 * involved, there usually aren't that many connections +	 * anyway).  The consistency means that servers see the same +	 * client coming from the same IP (some Internet Banking sites +	 * like this), even across reboots. +	 */ +	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), +		   range->flags & NF_NAT_RANGE_PERSISTENT ? +			0 : (__force u32)tuple->dst.u3.all[max] ^ zone); + +	full_range = false; +	for (i = 0; i <= max; i++) { +		/* If first bytes of the address are at the maximum, use the +		 * distance. Otherwise use the full range. +		 */ +		if (!full_range) { +			minip = ntohl((__force __be32)range->min_addr.all[i]); +			maxip = ntohl((__force __be32)range->max_addr.all[i]); +			dist  = maxip - minip + 1; +		} else { +			minip = 0; +			dist  = ~0; +		} + +		var_ipp->all[i] = (__force __u32) +			htonl(minip + (((u64)j * dist) >> 32)); +		if (var_ipp->all[i] != range->max_addr.all[i]) +			full_range = true; + +		if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) +			j ^= (__force u32)tuple->dst.u3.all[i]; +	} +} + +/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, + * we change the source to map into the range. For NF_INET_PRE_ROUTING + * and NF_INET_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void +get_unique_tuple(struct nf_conntrack_tuple *tuple, +		 const struct nf_conntrack_tuple *orig_tuple, +		 const struct nf_nat_range *range, +		 struct nf_conn *ct, +		 enum nf_nat_manip_type maniptype) +{ +	const struct nf_nat_l3proto *l3proto; +	const struct nf_nat_l4proto *l4proto; +	struct net *net = nf_ct_net(ct); +	u16 zone = nf_ct_zone(ct); + +	rcu_read_lock(); +	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); +	l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, +					orig_tuple->dst.protonum); + +	/* 1) If this srcip/proto/src-proto-part is currently mapped, +	 * and that same mapping gives a unique tuple within the given +	 * range, use that. +	 * +	 * This is only required for source (ie. NAT/masq) mappings. +	 * So far, we don't do local source mappings, so multiple +	 * manips not an issue. +	 */ +	if (maniptype == NF_NAT_MANIP_SRC && +	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { +		/* try the original tuple first */ +		if (in_range(l3proto, l4proto, orig_tuple, range)) { +			if (!nf_nat_used_tuple(orig_tuple, ct)) { +				*tuple = *orig_tuple; +				goto out; +			} +		} else if (find_appropriate_src(net, zone, l3proto, l4proto, +						orig_tuple, tuple, range)) { +			pr_debug("get_unique_tuple: Found current src map\n"); +			if (!nf_nat_used_tuple(tuple, ct)) +				goto out; +		} +	} + +	/* 2) Select the least-used IP/proto combination in the given range */ +	*tuple = *orig_tuple; +	find_best_ips_proto(zone, tuple, range, ct, maniptype); + +	/* 3) The per-protocol part of the manip is made to map into +	 * the range to make a unique tuple. 
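+	 *
+	 * Worked example (hypothetical addresses): masquerading
+	 * 10.0.0.5:4321 -> 198.51.100.7:80 onto 203.0.113.1. Step 1 reuses
+	 * an existing mapping for 10.0.0.5 if it still fits the range;
+	 * otherwise step 2 picks the address via find_best_ips_proto(),
+	 * and if 203.0.113.1:4321 is already taken, step 3 lets the
+	 * l4proto walk candidate source ports until the tuple is unique.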
+	 */
+
+	/* Only bother mapping if it's not already in range and unique */
+	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+			if (l4proto->in_range(tuple, maniptype,
+					      &range->min_proto,
+					      &range->max_proto) &&
+			    (range->min_proto.all == range->max_proto.all ||
+			     !nf_nat_used_tuple(tuple, ct)))
+				goto out;
+		} else if (!nf_nat_used_tuple(tuple, ct)) {
+			goto out;
+		}
+	}
+
+	/* Last chance: get protocol to try to obtain unique tuple. */
+	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
+out:
+	rcu_read_unlock();
+}
+
+struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	if (nat)
+		return nat;
+
+	if (!nf_ct_is_confirmed(ct))
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+
+	return nat;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
+
+unsigned int
+nf_nat_setup_info(struct nf_conn *ct,
+		  const struct nf_nat_range *range,
+		  enum nf_nat_manip_type maniptype)
+{
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_tuple curr_tuple, new_tuple;
+	struct nf_conn_nat *nat;
+
+	/* nat helper or nfctnetlink also setup binding */
+	nat = nf_ct_nat_ext_add(ct);
+	if (nat == NULL)
+		return NF_ACCEPT;
+
+	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
+		     maniptype == NF_NAT_MANIP_DST);
+	BUG_ON(nf_nat_initialized(ct, maniptype));
+
+	/* What we've got will look like inverse of reply. Normally
+	 * this is what is in the conntrack, except for prior
+	 * manipulations (future optimization: if num_manips == 0,
+	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+	 */
+	nf_ct_invert_tuplepr(&curr_tuple,
+			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+
+	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+		struct nf_conntrack_tuple reply;
+
+		/* Alter conntrack table so will recognize replies. */
+		nf_ct_invert_tuplepr(&reply, &new_tuple);
+		nf_conntrack_alter_reply(ct, &reply);
+
+		/* Non-atomic: we own this at the moment. */
+		if (maniptype == NF_NAT_MANIP_SRC)
+			ct->status |= IPS_SRC_NAT;
+		else
+			ct->status |= IPS_DST_NAT;
+
+		if (nfct_help(ct))
+			nfct_seqadj_ext_add(ct);
+	}
+
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		unsigned int srchash;
+
+		srchash = hash_by_src(net, nf_ct_zone(ct),
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		spin_lock_bh(&nf_nat_lock);
+		/* nf_conntrack_alter_reply might re-allocate extension area */
+		nat = nfct_nat(ct);
+		nat->ct = ct;
+		hlist_add_head_rcu(&nat->bysource,
+				   &net->ct.nat_bysource[srchash]);
+		spin_unlock_bh(&nf_nat_lock);
+	}
+
+	/* It's done. */
+	if (maniptype == NF_NAT_MANIP_DST)
+		ct->status |= IPS_DST_NAT_DONE;
+	else
+		ct->status |= IPS_SRC_NAT_DONE;
+
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL(nf_nat_setup_info);
+
+static unsigned int
+__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
+{
+	/* Force range to this IP; let proto decide mapping for
+	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	 * Use reply in case it's already been mangled (eg local packet).
+	 */
+	union nf_inet_addr ip =
+		(manip == NF_NAT_MANIP_SRC ? 
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
+	struct nf_nat_range range = {
+		.flags		= NF_NAT_RANGE_MAP_IPS,
+		.min_addr	= ip,
+		.max_addr	= ip,
+	};
+	return nf_nat_setup_info(ct, &range, manip);
+}
+
+unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
+}
+EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+
+/* Do packet manipulations according to nf_nat_setup_info. */
+unsigned int nf_nat_packet(struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned int hooknum,
+			   struct sk_buff *skb)
+{
+	const struct nf_nat_l3proto *l3proto;
+	const struct nf_nat_l4proto *l4proto;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned long statusbit;
+	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+	if (mtype == NF_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	/* Non-atomic: these bits don't change. */
+	if (ct->status & statusbit) {
+		struct nf_conntrack_tuple target;
+
+		/* We are aiming to look like inverse of other direction. */
+		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+		l3proto = __nf_nat_l3proto_find(target.src.l3num);
+		l4proto = __nf_nat_l4proto_find(target.src.l3num,
+						target.dst.protonum);
+		if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+			return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+struct nf_nat_proto_clean {
+	u8	l3proto;
+	u8	l4proto;
+};
+
+/* kill conntracks with affected NAT section */
+static int nf_nat_proto_remove(struct nf_conn *i, void *data)
+{
+	const struct nf_nat_proto_clean *clean = data;
+	struct nf_conn_nat *nat = nfct_nat(i);
+
+	if (!nat)
+		return 0;
+
+	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
+	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
+		return 0;
+
+	return i->status & IPS_NAT_MASK ? 1 : 0;
+}
+
+static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+
+	if (nf_nat_proto_remove(ct, data))
+		return 1;
+
+	if (!nat || !nat->ct)
+		return 0;
+
+	/* This netns is being destroyed, and conntrack has nat null binding.
+	 * Remove it from bysource hash, as the table will be freed soon.
+	 *
+	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
+	 * will delete entry from already-freed table.
+	 */
+	if (!del_timer(&ct->timeout))
+		return 1;
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&nat->bysource);
+	ct->status &= ~IPS_NAT_DONE_MASK;
+	nat->ct = NULL;
+	spin_unlock_bh(&nf_nat_lock);
+
+	add_timer(&ct->timeout);
+
+	/* don't delete conntrack.  Although that would make things a lot
+	 * simpler, we'd end up flushing all conntracks on nat rmmod.
+	 */
+	return 0;
+}
+
+static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
+{
+	struct nf_nat_proto_clean clean = {
+		.l3proto = l3proto,
+		.l4proto = l4proto,
+	};
+	struct net *net;
+
+	rtnl_lock();
+	for_each_net(net)
+		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
+	rtnl_unlock();
+}
+
+static void nf_nat_l3proto_clean(u8 l3proto)
+{
+	struct nf_nat_proto_clean clean = {
+		.l3proto = l3proto,
+	};
+	struct net *net;
+
+	rtnl_lock();
+
+	for_each_net(net)
+		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
+	rtnl_unlock();
+}
+
+/* Protocol registration. 
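+ *
+ * A hedged sketch of the expected call from an l4proto module's init
+ * (the DCCP helper whose diff appears further below registers this way):
+ *
+ *	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+ *	if (err < 0)
+ *		return err;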
*/
+int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+	const struct nf_nat_l4proto **l4protos;
+	unsigned int i;
+	int ret = 0;
+
+	mutex_lock(&nf_nat_proto_mutex);
+	if (nf_nat_l4protos[l3proto] == NULL) {
+		l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
+				   GFP_KERNEL);
+		if (l4protos == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		for (i = 0; i < IPPROTO_MAX; i++)
+			RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);
+
+		/* Before making proto_array visible to lockless readers,
+		 * we must make sure its content is committed to memory.
+		 */
+		smp_wmb();
+
+		nf_nat_l4protos[l3proto] = l4protos;
+	}
+
+	if (rcu_dereference_protected(
+			nf_nat_l4protos[l3proto][l4proto->l4proto],
+			lockdep_is_held(&nf_nat_proto_mutex)
+			) != &nf_nat_l4proto_unknown) {
+		ret = -EBUSY;
+		goto out;
+	}
+	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
+ out:
+	mutex_unlock(&nf_nat_proto_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
+
+/* No one stores the protocol anywhere; simply delete it. */
+void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+	mutex_lock(&nf_nat_proto_mutex);
+	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
+			 &nf_nat_l4proto_unknown);
+	mutex_unlock(&nf_nat_proto_mutex);
+	synchronize_rcu();
+
+	nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
+
+int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
+{
+	int err;
+
+	err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
+	if (err < 0)
+		return err;
+
+	mutex_lock(&nf_nat_proto_mutex);
+	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
+			 &nf_nat_l4proto_tcp);
+	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
+			 &nf_nat_l4proto_udp);
+	mutex_unlock(&nf_nat_proto_mutex);
+
+	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);
+
+void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
+{
+	mutex_lock(&nf_nat_proto_mutex);
+	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
+	mutex_unlock(&nf_nat_proto_mutex);
+	synchronize_rcu();
+
+	nf_nat_l3proto_clean(l3proto->l3proto);
+	nf_ct_l3proto_module_put(l3proto->l3proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
+
+/* No one is using the conntrack by the time this is called. 
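+ * This is the NF_CT_EXT_NAT extension's ->destroy hook (wired up in
+ * nat_extend below); it only unlinks the bysource entry, while the hash
+ * table itself is freed by nf_nat_net_exit().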
*/ +static void nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ +	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); + +	if (nat == NULL || nat->ct == NULL) +		return; + +	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); + +	spin_lock_bh(&nf_nat_lock); +	hlist_del_rcu(&nat->bysource); +	spin_unlock_bh(&nf_nat_lock); +} + +static void nf_nat_move_storage(void *new, void *old) +{ +	struct nf_conn_nat *new_nat = new; +	struct nf_conn_nat *old_nat = old; +	struct nf_conn *ct = old_nat->ct; + +	if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) +		return; + +	spin_lock_bh(&nf_nat_lock); +	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); +	spin_unlock_bh(&nf_nat_lock); +} + +static struct nf_ct_ext_type nat_extend __read_mostly = { +	.len		= sizeof(struct nf_conn_nat), +	.align		= __alignof__(struct nf_conn_nat), +	.destroy	= nf_nat_cleanup_conntrack, +	.move		= nf_nat_move_storage, +	.id		= NF_CT_EXT_NAT, +	.flags		= NF_CT_EXT_F_PREALLOC, +}; + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { +	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 }, +	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 }, +}; + +static int nfnetlink_parse_nat_proto(struct nlattr *attr, +				     const struct nf_conn *ct, +				     struct nf_nat_range *range) +{ +	struct nlattr *tb[CTA_PROTONAT_MAX+1]; +	const struct nf_nat_l4proto *l4proto; +	int err; + +	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); +	if (err < 0) +		return err; + +	l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); +	if (l4proto->nlattr_to_range) +		err = l4proto->nlattr_to_range(tb, range); + +	return err; +} + +static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { +	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 }, +	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 }, +	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) }, +	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) }, +	[CTA_NAT_PROTO]		= { .type = NLA_NESTED }, +}; + +static int +nfnetlink_parse_nat(const struct nlattr *nat, +		    const struct nf_conn *ct, struct nf_nat_range *range, +		    const struct nf_nat_l3proto *l3proto) +{ +	struct nlattr *tb[CTA_NAT_MAX+1]; +	int err; + +	memset(range, 0, sizeof(*range)); + +	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); +	if (err < 0) +		return err; + +	err = l3proto->nlattr_to_range(tb, range); +	if (err < 0) +		return err; + +	if (!tb[CTA_NAT_PROTO]) +		return 0; + +	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); +} + +/* This function is called under rcu_read_lock() */ +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, +			  enum nf_nat_manip_type manip, +			  const struct nlattr *attr) +{ +	struct nf_nat_range range; +	const struct nf_nat_l3proto *l3proto; +	int err; + +	/* Should not happen, restricted to creating new conntracks +	 * via ctnetlink. +	 */ +	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) +		return -EEXIST; + +	/* Make sure that L3 NAT is there by when we call nf_nat_setup_info to +	 * attach the null binding, otherwise this may oops. 
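+	 * (ctnetlink treats the -EAGAIN below as a cue to try loading the
+	 * missing L3 NAT module and then retry the request.)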
+	 */ +	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); +	if (l3proto == NULL) +		return -EAGAIN; + +	/* No NAT information has been passed, allocate the null-binding */ +	if (attr == NULL) +		return __nf_nat_alloc_null_binding(ct, manip); + +	err = nfnetlink_parse_nat(attr, ct, &range, l3proto); +	if (err < 0) +		return err; + +	return nf_nat_setup_info(ct, &range, manip); +} +#else +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, +			  enum nf_nat_manip_type manip, +			  const struct nlattr *attr) +{ +	return -EOPNOTSUPP; +} +#endif + +static int __net_init nf_nat_net_init(struct net *net) +{ +	/* Leave them the same for the moment. */ +	net->ct.nat_htable_size = net->ct.htable_size; +	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); +	if (!net->ct.nat_bysource) +		return -ENOMEM; +	return 0; +} + +static void __net_exit nf_nat_net_exit(struct net *net) +{ +	struct nf_nat_proto_clean clean = {}; + +	nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); +	synchronize_rcu(); +	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); +} + +static struct pernet_operations nf_nat_net_ops = { +	.init = nf_nat_net_init, +	.exit = nf_nat_net_exit, +}; + +static struct nf_ct_helper_expectfn follow_master_nat = { +	.name		= "nat-follow-master", +	.expectfn	= nf_nat_follow_master, +}; + +static int __init nf_nat_init(void) +{ +	int ret; + +	ret = nf_ct_extend_register(&nat_extend); +	if (ret < 0) { +		printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); +		return ret; +	} + +	ret = register_pernet_subsys(&nf_nat_net_ops); +	if (ret < 0) +		goto cleanup_extend; + +	nf_ct_helper_expectfn_register(&follow_master_nat); + +	/* Initialize fake conntrack so that NAT will skip it */ +	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); + +	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); +	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, +			   nfnetlink_parse_nat_setup); +#ifdef CONFIG_XFRM +	BUG_ON(nf_nat_decode_session_hook != NULL); +	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); +#endif +	return 0; + + cleanup_extend: +	nf_ct_extend_unregister(&nat_extend); +	return ret; +} + +static void __exit nf_nat_cleanup(void) +{ +	unsigned int i; + +	unregister_pernet_subsys(&nf_nat_net_ops); +	nf_ct_extend_unregister(&nat_extend); +	nf_ct_helper_expectfn_unregister(&follow_master_nat); +	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); +#ifdef CONFIG_XFRM +	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); +#endif +	for (i = 0; i < NFPROTO_NUMPROTO; i++) +		kfree(nf_nat_l4protos[i]); +	synchronize_net(); +} + +MODULE_LICENSE("GPL"); + +module_init(nf_nat_init); +module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c new file mode 100644 index 00000000000..e84a578dbe3 --- /dev/null +++ b/net/netfilter/nf_nat_ftp.c @@ -0,0 +1,146 @@ +/* FTP extension for TCP NAT alteration. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+MODULE_ALIAS("ip_nat_ftp");
+
+/* FIXME: Time out? --RR */
+
+static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type,
+			      char *buffer, size_t buflen,
+			      union nf_inet_addr *addr, u16 port)
+{
+	switch (type) {
+	case NF_CT_FTP_PORT:
+	case NF_CT_FTP_PASV:
+		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+				((unsigned char *)&addr->ip)[0],
+				((unsigned char *)&addr->ip)[1],
+				((unsigned char *)&addr->ip)[2],
+				((unsigned char *)&addr->ip)[3],
+				port >> 8,
+				port & 0xFF);
+	case NF_CT_FTP_EPRT:
+		if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+			return snprintf(buffer, buflen, "|1|%pI4|%u|",
+					&addr->ip, port);
+		else
+			return snprintf(buffer, buflen, "|2|%pI6|%u|",
+					&addr->ip6, port);
+	case NF_CT_FTP_EPSV:
+		return snprintf(buffer, buflen, "|||%u|", port);
+	}
+
+	return 0;
+}
+
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version. */
+static unsigned int nf_nat_ftp(struct sk_buff *skb,
+			       enum ip_conntrack_info ctinfo,
+			       enum nf_ct_ftp_type type,
+			       unsigned int protoff,
+			       unsigned int matchoff,
+			       unsigned int matchlen,
+			       struct nf_conntrack_expect *exp)
+{
+	union nf_inet_addr newaddr;
+	u_int16_t port;
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conn *ct = exp->master;
+	char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
+	unsigned int buflen;
+
+	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+	/* Connection will come from wherever this packet goes, hence !dir */
+	newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = !dir;
+
+	/* When you see the packet, we need to NAT it the same as
+	 * this one. */
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0) {
+		nf_ct_helper_log(skb, ct, "all ports in use");
+		return NF_DROP;
+	}
+
+	buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer),
+				    &newaddr, port);
+	if (!buflen)
+		goto out;
+
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+				      matchlen, buffer, buflen))
+		goto out;
+
+	return NF_ACCEPT;
+
+out:
+	nf_ct_helper_log(skb, ct, "cannot mangle packet");
+	nf_ct_unexpect_related(exp);
+	return NF_DROP;
+}
+
+static void __exit nf_nat_ftp_fini(void)
+{
+	RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
+	synchronize_rcu();
+}
+
+static int __init nf_nat_ftp_init(void)
+{
+	BUG_ON(nf_nat_ftp_hook != NULL);
+	RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
+	return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. 
*/ +static int warn_set(const char *val, struct kernel_param *kp) +{ +	printk(KERN_INFO KBUILD_MODNAME +	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); +	return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_ftp_init); +module_exit(nf_nat_ftp_fini); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c new file mode 100644 index 00000000000..2840abb5bb9 --- /dev/null +++ b/net/netfilter/nf_nat_helper.c @@ -0,0 +1,212 @@ +/* nf_nat_helper.c - generic support functions for NAT helpers + * + * (C) 2000-2002 Harald Welte <laforge@netfilter.org> + * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007-2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> + +/* Frobs data inside this packet, which is linear. */ +static void mangle_contents(struct sk_buff *skb, +			    unsigned int dataoff, +			    unsigned int match_offset, +			    unsigned int match_len, +			    const char *rep_buffer, +			    unsigned int rep_len) +{ +	unsigned char *data; + +	BUG_ON(skb_is_nonlinear(skb)); +	data = skb_network_header(skb) + dataoff; + +	/* move post-replacement */ +	memmove(data + match_offset + rep_len, +		data + match_offset + match_len, +		skb_tail_pointer(skb) - (skb_network_header(skb) + dataoff + +			     match_offset + match_len)); + +	/* insert data from buffer */ +	memcpy(data + match_offset, rep_buffer, rep_len); + +	/* update skb info */ +	if (rep_len > match_len) { +		pr_debug("nf_nat_mangle_packet: Extending packet by " +			 "%u from %u bytes\n", rep_len - match_len, skb->len); +		skb_put(skb, rep_len - match_len); +	} else { +		pr_debug("nf_nat_mangle_packet: Shrinking packet from " +			 "%u from %u bytes\n", match_len - rep_len, skb->len); +		__skb_trim(skb, skb->len + rep_len - match_len); +	} + +	if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) { +		/* fix IP hdr checksum information */ +		ip_hdr(skb)->tot_len = htons(skb->len); +		ip_send_check(ip_hdr(skb)); +	} else +		ipv6_hdr(skb)->payload_len = +			htons(skb->len - sizeof(struct ipv6hdr)); +} + +/* Unusual, but possible case. */ +static int enlarge_skb(struct sk_buff *skb, unsigned int extra) +{ +	if (skb->len + extra > 65535) +		return 0; + +	if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) +		return 0; + +	return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX + * command in FTP). + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... 
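+ *
+ * A hedged usage sketch, mirroring the FTP helper in this series
+ * (buffer/buflen are built by the caller):
+ *
+ *	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ *				      matchoff, matchlen, buffer, buflen))
+ *		return NF_DROP;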
+ * + * */ +int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, +			       struct nf_conn *ct, +			       enum ip_conntrack_info ctinfo, +			       unsigned int protoff, +			       unsigned int match_offset, +			       unsigned int match_len, +			       const char *rep_buffer, +			       unsigned int rep_len, bool adjust) +{ +	const struct nf_nat_l3proto *l3proto; +	struct tcphdr *tcph; +	int oldlen, datalen; + +	if (!skb_make_writable(skb, skb->len)) +		return 0; + +	if (rep_len > match_len && +	    rep_len - match_len > skb_tailroom(skb) && +	    !enlarge_skb(skb, rep_len - match_len)) +		return 0; + +	SKB_LINEAR_ASSERT(skb); + +	tcph = (void *)skb->data + protoff; + +	oldlen = skb->len - protoff; +	mangle_contents(skb, protoff + tcph->doff*4, +			match_offset, match_len, rep_buffer, rep_len); + +	datalen = skb->len - protoff; + +	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); +	l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check, +			     datalen, oldlen); + +	if (adjust && rep_len != match_len) +		nf_ct_seqadj_set(ct, ctinfo, tcph->seq, +				 (int)rep_len - (int)match_len); + +	return 1; +} +EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); + +/* Generic function for mangling variable-length address changes inside + * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX + * command in the Amanda protocol) + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * XXX - This function could be merged with nf_nat_mangle_tcp_packet which + *       should be fairly easy to do. + */ +int +nf_nat_mangle_udp_packet(struct sk_buff *skb, +			 struct nf_conn *ct, +			 enum ip_conntrack_info ctinfo, +			 unsigned int protoff, +			 unsigned int match_offset, +			 unsigned int match_len, +			 const char *rep_buffer, +			 unsigned int rep_len) +{ +	const struct nf_nat_l3proto *l3proto; +	struct udphdr *udph; +	int datalen, oldlen; + +	if (!skb_make_writable(skb, skb->len)) +		return 0; + +	if (rep_len > match_len && +	    rep_len - match_len > skb_tailroom(skb) && +	    !enlarge_skb(skb, rep_len - match_len)) +		return 0; + +	udph = (void *)skb->data + protoff; + +	oldlen = skb->len - protoff; +	mangle_contents(skb, protoff + sizeof(*udph), +			match_offset, match_len, rep_buffer, rep_len); + +	/* update the length of the UDP packet */ +	datalen = skb->len - protoff; +	udph->len = htons(datalen); + +	/* fix udp checksum if udp checksum was previously calculated */ +	if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) +		return 1; + +	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); +	l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check, +			     datalen, oldlen); + +	return 1; +} +EXPORT_SYMBOL(nf_nat_mangle_udp_packet); + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void nf_nat_follow_master(struct nf_conn *ct, +			  struct nf_conntrack_expect *exp) +{ +	struct nf_nat_range range; + +	/* This must be a fresh one. */ +	BUG_ON(ct->status & IPS_NAT_DONE_MASK); + +	/* Change src to where master sends to */ +	range.flags = NF_NAT_RANGE_MAP_IPS; +	range.min_addr = range.max_addr +		= ct->master->tuplehash[!exp->dir].tuple.dst.u3; +	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); + +	/* For DST manip, map port here to where it's expected. 
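+	 * (exp->saved_proto holds the pre-NAT port the helper stashed
+	 * before the payload was rewritten)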
*/
+	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+	range.min_proto = range.max_proto = exp->saved_proto;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.src.u3;
+	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c new file mode 100644 index 00000000000..1fb2258c353 --- /dev/null +++ b/net/netfilter/nf_nat_irc.c @@ -0,0 +1,119 @@
+/* IRC extension for TCP NAT alteration.
+ *
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_irc");
+
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct nf_conntrack_expect *exp)
+{
+	char buffer[sizeof("4294967295 65535")];
+	struct nf_conn *ct = exp->master;
+	union nf_inet_addr newaddr;
+	u_int16_t port;
+	unsigned int ret;
+
+	/* Reply comes from server. */
+	newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3;
+
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_REPLY;
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0) {
+		nf_ct_helper_log(skb, ct, "all ports in use");
+		return NF_DROP;
+	}
+
+	/* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+	 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+	 * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+	 * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+	 * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+	 *
+	 * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+	 *                        255.255.255.255==4294967295, 10 digits)
+	 * P:         bound port (min 1 d, max 5d (65535))
+	 * F:         filename   (min 1 d )
+	 * S:         size       (min 1 d )
+	 * 0x01, \n:  terminators
+	 */
+	/* AAA = "us", ie. where server normally talks to. 
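+	 * (worked example, hypothetical values: newaddr 192.0.2.1 with
+	 * port 59 is emitted as "3221225985 59", since 192.0.2.1 is
+	 * 0xC0000201 == 3221225985 in host byte order)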
*/ +	snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port); +	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", +		 buffer, &newaddr.ip, port); + +	ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, +				       matchlen, buffer, strlen(buffer)); +	if (ret != NF_ACCEPT) { +		nf_ct_helper_log(skb, ct, "cannot mangle packet"); +		nf_ct_unexpect_related(exp); +	} + +	return ret; +} + +static void __exit nf_nat_irc_fini(void) +{ +	RCU_INIT_POINTER(nf_nat_irc_hook, NULL); +	synchronize_rcu(); +} + +static int __init nf_nat_irc_init(void) +{ +	BUG_ON(nf_nat_irc_hook != NULL); +	RCU_INIT_POINTER(nf_nat_irc_hook, help); +	return 0; +} + +/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ +	printk(KERN_INFO KBUILD_MODNAME +	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); +	return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_irc_init); +module_exit(nf_nat_irc_fini); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c new file mode 100644 index 00000000000..83a72a235ca --- /dev/null +++ b/net/netfilter/nf_nat_proto_common.c @@ -0,0 +1,114 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/random.h> +#include <linux/netfilter.h> +#include <linux/export.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, +			     enum nf_nat_manip_type maniptype, +			     const union nf_conntrack_man_proto *min, +			     const union nf_conntrack_man_proto *max) +{ +	__be16 port; + +	if (maniptype == NF_NAT_MANIP_SRC) +		port = tuple->src.u.all; +	else +		port = tuple->dst.u.all; + +	return ntohs(port) >= ntohs(min->all) && +	       ntohs(port) <= ntohs(max->all); +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); + +void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, +				 struct nf_conntrack_tuple *tuple, +				 const struct nf_nat_range *range, +				 enum nf_nat_manip_type maniptype, +				 const struct nf_conn *ct, +				 u16 *rover) +{ +	unsigned int range_size, min, i; +	__be16 *portptr; +	u_int16_t off; + +	if (maniptype == NF_NAT_MANIP_SRC) +		portptr = &tuple->src.u.all; +	else +		portptr = &tuple->dst.u.all; + +	/* If no range specified... */ +	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { +		/* If it's dst rewrite, can't change port */ +		if (maniptype == NF_NAT_MANIP_DST) +			return; + +		if (ntohs(*portptr) < 1024) { +			/* Loose convention: >> 512 is credential passing */ +			if (ntohs(*portptr) < 512) { +				min = 1; +				range_size = 511 - min + 1; +			} else { +				min = 600; +				range_size = 1023 - min + 1; +			} +		} else { +			min = 1024; +			range_size = 65535 - 1024 + 1; +		} +	} else { +		min = ntohs(range->min_proto.all); +		range_size = ntohs(range->max_proto.all) - min + 1; +	} + +	if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) { +		off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC +						  ? 
tuple->dst.u.all +						  : tuple->src.u.all); +	} else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { +		off = prandom_u32(); +	} else { +		off = *rover; +	} + +	for (i = 0; ; ++off) { +		*portptr = htons(min + off % range_size); +		if (++i != range_size && nf_nat_used_tuple(tuple, ct)) +			continue; +		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) +			*rover = off; +		return; +	} +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], +				   struct nf_nat_range *range) +{ +	if (tb[CTA_PROTONAT_PORT_MIN]) { +		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); +		range->max_proto.all = range->min_proto.all; +		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +	} +	if (tb[CTA_PROTONAT_PORT_MAX]) { +		range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); +		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range); +#endif diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c new file mode 100644 index 00000000000..c8be2cdac0b --- /dev/null +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -0,0 +1,116 @@ +/* + * DCCP NAT protocol helper + * + * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/dccp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u_int16_t dccp_port_rover; + +static void +dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, +		  struct nf_conntrack_tuple *tuple, +		  const struct nf_nat_range *range, +		  enum nf_nat_manip_type maniptype, +		  const struct nf_conn *ct) +{ +	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, +				    &dccp_port_rover); +} + +static bool +dccp_manip_pkt(struct sk_buff *skb, +	       const struct nf_nat_l3proto *l3proto, +	       unsigned int iphdroff, unsigned int hdroff, +	       const struct nf_conntrack_tuple *tuple, +	       enum nf_nat_manip_type maniptype) +{ +	struct dccp_hdr *hdr; +	__be16 *portptr, oldport, newport; +	int hdrsize = 8; /* DCCP connection tracking guarantees this much */ + +	if (skb->len >= hdroff + sizeof(struct dccp_hdr)) +		hdrsize = sizeof(struct dccp_hdr); + +	if (!skb_make_writable(skb, hdroff + hdrsize)) +		return false; + +	hdr = (struct dccp_hdr *)(skb->data + hdroff); + +	if (maniptype == NF_NAT_MANIP_SRC) { +		newport = tuple->src.u.dccp.port; +		portptr = &hdr->dccph_sport; +	} else { +		newport = tuple->dst.u.dccp.port; +		portptr = &hdr->dccph_dport; +	} + +	oldport = *portptr; +	*portptr = newport; + +	if (hdrsize < sizeof(*hdr)) +		return true; + +	l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, +			     tuple, maniptype); +	inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, +				 0); +	return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { +	.l4proto		= IPPROTO_DCCP, +	.manip_pkt		= dccp_manip_pkt, +	.in_range		= nf_nat_l4proto_in_range, +	.unique_tuple		= dccp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || 
defined(CONFIG_NF_CT_NETLINK_MODULE) +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_dccp_init(void) +{ +	int err; + +	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp); +	if (err < 0) +		goto err1; +	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp); +	if (err < 0) +		goto err2; +	return 0; + +err2: +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); +err1: +	return err; +} + +static void __exit nf_nat_proto_dccp_fini(void) +{ +	nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp); +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); + +} + +module_init(nf_nat_proto_dccp_init); +module_exit(nf_nat_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("DCCP NAT protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c new file mode 100644 index 00000000000..754536f2c67 --- /dev/null +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sctp.h> +#include <linux/module.h> +#include <net/sctp/checksum.h> + +#include <net/netfilter/nf_nat_l4proto.h> + +static u_int16_t nf_sctp_port_rover; + +static void +sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, +		  struct nf_conntrack_tuple *tuple, +		  const struct nf_nat_range *range, +		  enum nf_nat_manip_type maniptype, +		  const struct nf_conn *ct) +{ +	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, +				    &nf_sctp_port_rover); +} + +static bool +sctp_manip_pkt(struct sk_buff *skb, +	       const struct nf_nat_l3proto *l3proto, +	       unsigned int iphdroff, unsigned int hdroff, +	       const struct nf_conntrack_tuple *tuple, +	       enum nf_nat_manip_type maniptype) +{ +	sctp_sctphdr_t *hdr; + +	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) +		return false; + +	hdr = (struct sctphdr *)(skb->data + hdroff); + +	if (maniptype == NF_NAT_MANIP_SRC) { +		/* Get rid of src port */ +		hdr->source = tuple->src.u.sctp.port; +	} else { +		/* Get rid of dst port */ +		hdr->dest = tuple->dst.u.sctp.port; +	} + +	hdr->checksum = sctp_compute_cksum(skb, hdroff); + +	return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { +	.l4proto		= IPPROTO_SCTP, +	.manip_pkt		= sctp_manip_pkt, +	.in_range		= nf_nat_l4proto_in_range, +	.unique_tuple		= sctp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_sctp_init(void) +{ +	int err; + +	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +	if (err < 0) +		goto err1; +	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp); +	if (err < 0) +		goto err2; +	return 0; + +err2: +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +err1: +	return err; +} + +static void __exit nf_nat_proto_sctp_exit(void) +{ +	nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp); +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +} + +module_init(nf_nat_proto_sctp_init); +module_exit(nf_nat_proto_sctp_exit); + +MODULE_LICENSE("GPL"); 
+MODULE_DESCRIPTION("SCTP NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c new file mode 100644 index 00000000000..83ec8a6e4c3 --- /dev/null +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -0,0 +1,85 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> + +static u16 tcp_port_rover; + +static void +tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, +		 struct nf_conntrack_tuple *tuple, +		 const struct nf_nat_range *range, +		 enum nf_nat_manip_type maniptype, +		 const struct nf_conn *ct) +{ +	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, +				    &tcp_port_rover); +} + +static bool +tcp_manip_pkt(struct sk_buff *skb, +	      const struct nf_nat_l3proto *l3proto, +	      unsigned int iphdroff, unsigned int hdroff, +	      const struct nf_conntrack_tuple *tuple, +	      enum nf_nat_manip_type maniptype) +{ +	struct tcphdr *hdr; +	__be16 *portptr, newport, oldport; +	int hdrsize = 8; /* TCP connection tracking guarantees this much */ + +	/* this could be a inner header returned in icmp packet; in such +	   cases we cannot update the checksum field since it is outside of +	   the 8 bytes of transport layer headers we are guaranteed */ +	if (skb->len >= hdroff + sizeof(struct tcphdr)) +		hdrsize = sizeof(struct tcphdr); + +	if (!skb_make_writable(skb, hdroff + hdrsize)) +		return false; + +	hdr = (struct tcphdr *)(skb->data + hdroff); + +	if (maniptype == NF_NAT_MANIP_SRC) { +		/* Get rid of src port */ +		newport = tuple->src.u.tcp.port; +		portptr = &hdr->source; +	} else { +		/* Get rid of dst port */ +		newport = tuple->dst.u.tcp.port; +		portptr = &hdr->dest; +	} + +	oldport = *portptr; +	*portptr = newport; + +	if (hdrsize < sizeof(*hdr)) +		return true; + +	l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); +	inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); +	return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_tcp = { +	.l4proto		= IPPROTO_TCP, +	.manip_pkt		= tcp_manip_pkt, +	.in_range		= nf_nat_l4proto_in_range, +	.unique_tuple		= tcp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c new file mode 100644 index 00000000000..7df613fb34a --- /dev/null +++ b/net/netfilter/nf_nat_proto_udp.c @@ -0,0 +1,76 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u16 udp_port_rover; + +static void +udp_unique_tuple(const struct nf_nat_l3proto *l3proto, +		 struct nf_conntrack_tuple *tuple, +		 const struct nf_nat_range *range, +		 enum nf_nat_manip_type maniptype, +		 const struct nf_conn *ct) +{ +	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, +				    &udp_port_rover); +} + +static bool +udp_manip_pkt(struct sk_buff *skb, +	      const struct nf_nat_l3proto *l3proto, +	      unsigned int iphdroff, unsigned int hdroff, +	      const struct nf_conntrack_tuple *tuple, +	      enum nf_nat_manip_type maniptype) +{ +	struct udphdr *hdr; +	__be16 *portptr, newport; + +	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) +		return false; +	hdr = (struct udphdr *)(skb->data + hdroff); + +	if (maniptype == NF_NAT_MANIP_SRC) { +		/* Get rid of src port */ +		newport = tuple->src.u.udp.port; +		portptr = &hdr->source; +	} else { +		/* Get rid of dst port */ +		newport = tuple->dst.u.udp.port; +		portptr = &hdr->dest; +	} +	if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { +		l3proto->csum_update(skb, iphdroff, &hdr->check, +				     tuple, maniptype); +		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, +					 0); +		if (!hdr->check) +			hdr->check = CSUM_MANGLED_0; +	} +	*portptr = newport; +	return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_udp = { +	.l4proto		= IPPROTO_UDP, +	.manip_pkt		= udp_manip_pkt, +	.in_range		= nf_nat_l4proto_in_range, +	.unique_tuple		= udp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c new file mode 100644 index 00000000000..776a0d1317b --- /dev/null +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -0,0 +1,106 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
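+ *
+ * Note (editorial): unlike plain UDP, the UDP-Lite checksum is mandatory
+ * (RFC 3828), so udplite_manip_pkt() below always updates it; zero is
+ * never a valid "no checksum" marker here, hence the unconditional
+ * CSUM_MANGLED_0 substitution.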
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/module.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u16 udplite_port_rover; + +static void +udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, +		     struct nf_conntrack_tuple *tuple, +		     const struct nf_nat_range *range, +		     enum nf_nat_manip_type maniptype, +		     const struct nf_conn *ct) +{ +	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, +				    &udplite_port_rover); +} + +static bool +udplite_manip_pkt(struct sk_buff *skb, +		  const struct nf_nat_l3proto *l3proto, +		  unsigned int iphdroff, unsigned int hdroff, +		  const struct nf_conntrack_tuple *tuple, +		  enum nf_nat_manip_type maniptype) +{ +	struct udphdr *hdr; +	__be16 *portptr, newport; + +	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) +		return false; + +	hdr = (struct udphdr *)(skb->data + hdroff); + +	if (maniptype == NF_NAT_MANIP_SRC) { +		/* Get rid of source port */ +		newport = tuple->src.u.udp.port; +		portptr = &hdr->source; +	} else { +		/* Get rid of dst port */ +		newport = tuple->dst.u.udp.port; +		portptr = &hdr->dest; +	} + +	l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); +	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); +	if (!hdr->check) +		hdr->check = CSUM_MANGLED_0; + +	*portptr = newport; +	return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { +	.l4proto		= IPPROTO_UDPLITE, +	.manip_pkt		= udplite_manip_pkt, +	.in_range		= nf_nat_l4proto_in_range, +	.unique_tuple		= udplite_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_udplite_init(void) +{ +	int err; + +	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +	if (err < 0) +		goto err1; +	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite); +	if (err < 0) +		goto err2; +	return 0; + +err2: +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +err1: +	return err; +} + +static void __exit nf_nat_proto_udplite_fini(void) +{ +	nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite); +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +} + +module_init(nf_nat_proto_udplite_init); +module_exit(nf_nat_proto_udplite_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c new file mode 100644 index 00000000000..6e494d58441 --- /dev/null +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -0,0 +1,54 @@ +/* The "unknown" protocol.  This is what is used for protocols we + * don't understand.  It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
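+ *
+ * Note (editorial): with no known header layout there are no ports to
+ * rewrite and no checksum to fix, so the callbacks below succeed without
+ * touching the packet; only the network-layer addresses of such flows
+ * are translated.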
+ */ + +#include <linux/types.h> +#include <linux/init.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, +			     enum nf_nat_manip_type manip_type, +			     const union nf_conntrack_man_proto *min, +			     const union nf_conntrack_man_proto *max) +{ +	return true; +} + +static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, +				 struct nf_conntrack_tuple *tuple, +				 const struct nf_nat_range *range, +				 enum nf_nat_manip_type maniptype, +				 const struct nf_conn *ct) +{ +	/* Sorry: we can't help you; if it's not unique, we can't frob +	 * anything. +	 */ +	return; +} + +static bool +unknown_manip_pkt(struct sk_buff *skb, +		  const struct nf_nat_l3proto *l3proto, +		  unsigned int iphdroff, unsigned int hdroff, +		  const struct nf_conntrack_tuple *tuple, +		  enum nf_nat_manip_type maniptype) +{ +	return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_unknown = { +	.manip_pkt		= unknown_manip_pkt, +	.in_range		= unknown_in_range, +	.unique_tuple		= unknown_unique_tuple, +}; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c new file mode 100644 index 00000000000..b4d691db955 --- /dev/null +++ b/net/netfilter/nf_nat_sip.c @@ -0,0 +1,653 @@ +/* SIP extension for NAT alteration. + * + * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> + * based on RR's ip_nat_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
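+ *
+ * Overview (editorial): the helper below rewrites addresses and ports
+ * embedded in the SIP payload: the request URI, the topmost Via header
+ * and its maddr=, received= and rport= parameters, the Contact, From
+ * and To headers, and the SDP owner/connection/media lines, updating
+ * Content-Length to match afterwards.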
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet.h> +#include <linux/udp.h> +#include <linux/tcp.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); +MODULE_DESCRIPTION("SIP NAT helper"); +MODULE_ALIAS("ip_nat_sip"); + + +static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff, +				  unsigned int dataoff, +				  const char **dptr, unsigned int *datalen, +				  unsigned int matchoff, unsigned int matchlen, +				  const char *buffer, unsigned int buflen) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	struct tcphdr *th; +	unsigned int baseoff; + +	if (nf_ct_protonum(ct) == IPPROTO_TCP) { +		th = (struct tcphdr *)(skb->data + protoff); +		baseoff = protoff + th->doff * 4; +		matchoff += dataoff - baseoff; + +		if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, +						protoff, matchoff, matchlen, +						buffer, buflen, false)) +			return 0; +	} else { +		baseoff = protoff + sizeof(struct udphdr); +		matchoff += dataoff - baseoff; + +		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, +					      protoff, matchoff, matchlen, +					      buffer, buflen)) +			return 0; +	} + +	/* Reload data pointer and adjust datalen value */ +	*dptr = skb->data + dataoff; +	*datalen += buflen - matchlen; +	return 1; +} + +static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer, +			    const union nf_inet_addr *addr, bool delim) +{ +	if (nf_ct_l3num(ct) == NFPROTO_IPV4) +		return sprintf(buffer, "%pI4", &addr->ip); +	else { +		if (delim) +			return sprintf(buffer, "[%pI6c]", &addr->ip6); +		else +			return sprintf(buffer, "%pI6c", &addr->ip6); +	} +} + +static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer, +				 const union nf_inet_addr *addr, u16 port) +{ +	if (nf_ct_l3num(ct) == NFPROTO_IPV4) +		return sprintf(buffer, "%pI4:%u", &addr->ip, port); +	else +		return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port); +} + +static int map_addr(struct sk_buff *skb, unsigned int protoff, +		    unsigned int dataoff, +		    const char **dptr, unsigned int *datalen, +		    unsigned int matchoff, unsigned int matchlen, +		    union nf_inet_addr *addr, __be16 port) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); +	char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; +	unsigned int buflen; +	union nf_inet_addr newaddr; +	__be16 newport; + +	if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) && +	    ct->tuplehash[dir].tuple.src.u.udp.port == port) { +		newaddr = ct->tuplehash[!dir].tuple.dst.u3; +		newport = ct->tuplehash[!dir].tuple.dst.u.udp.port; +	} else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) && +		   ct->tuplehash[dir].tuple.dst.u.udp.port == port) { +		newaddr = ct->tuplehash[!dir].tuple.src.u3; +		newport = ct_sip_info->forced_dport ? 
:
+			  ct->tuplehash[!dir].tuple.src.u.udp.port;
+	} else
+		return 1;
+
+	if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
+		return 1;
+
+	buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+			     matchoff, matchlen, buffer, buflen);
+}
+
+static int map_sip_addr(struct sk_buff *skb, unsigned int protoff,
+			unsigned int dataoff,
+			const char **dptr, unsigned int *datalen,
+			enum sip_header_types type)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int matchlen, matchoff;
+	union nf_inet_addr addr;
+	__be16 port;
+
+	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
+				    &matchoff, &matchlen, &addr, &port) <= 0)
+		return 1;
+	return map_addr(skb, protoff, dataoff, dptr, datalen,
+			matchoff, matchlen, &addr, port);
+}
+
+static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
+			       unsigned int dataoff,
+			       const char **dptr, unsigned int *datalen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+	unsigned int coff, matchoff, matchlen;
+	enum sip_header_types hdr;
+	union nf_inet_addr addr;
+	__be16 port;
+	int request, in_header;
+
+	/* Basic rules: requests and responses. */
+	if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
+		if (ct_sip_parse_request(ct, *dptr, *datalen,
+					 &matchoff, &matchlen,
+					 &addr, &port) > 0 &&
+		    !map_addr(skb, protoff, dataoff, dptr, datalen,
+			      matchoff, matchlen, &addr, port)) {
+			nf_ct_helper_log(skb, ct, "cannot mangle SIP message");
+			return NF_DROP;
+		}
+		request = 1;
+	} else
+		request = 0;
+
+	if (nf_ct_protonum(ct) == IPPROTO_TCP)
+		hdr = SIP_HDR_VIA_TCP;
+	else
+		hdr = SIP_HDR_VIA_UDP;
+
+	/* Translate topmost Via header and parameters */
+	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+				    hdr, NULL, &matchoff, &matchlen,
+				    &addr, &port) > 0) {
+		unsigned int olen, matchend, poff, plen, buflen, n;
+		char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+
+		/* We're only interested in headers related to this
+		 * connection */
+		if (request) {
+			if (!nf_inet_addr_cmp(&addr,
+					&ct->tuplehash[dir].tuple.src.u3) ||
+			    port != ct->tuplehash[dir].tuple.src.u.udp.port)
+				goto next;
+		} else {
+			if (!nf_inet_addr_cmp(&addr,
+					&ct->tuplehash[dir].tuple.dst.u3) ||
+			    port != ct->tuplehash[dir].tuple.dst.u.udp.port)
+				goto next;
+		}
+
+		olen = *datalen;
+		if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+			      matchoff, matchlen, &addr, port)) {
+			nf_ct_helper_log(skb, ct, "cannot mangle Via header");
+			return NF_DROP;
+		}
+
+		matchend = matchoff + matchlen + *datalen - olen;
+
+		/* The maddr= parameter (RFC 3261) specifies where to send
+		 * the reply.
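+		 *
+		 * Illustrative example: given
+		 *	Via: SIP/2.0/UDP 10.0.0.2:5060;maddr=10.0.0.2
+		 * from a client mapped 10.0.0.2 -> 192.0.2.1, the maddr=
+		 * value is rewritten to the mapped address so replies are
+		 * not sent to the unroutable internal address.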
*/
+		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+					       "maddr=", &poff, &plen,
+					       &addr, true) > 0 &&
+		    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
+		    !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
+			buflen = sip_sprintf_addr(ct, buffer,
+					&ct->tuplehash[!dir].tuple.dst.u3,
+					true);
+			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen)) {
+				nf_ct_helper_log(skb, ct, "cannot mangle maddr");
+				return NF_DROP;
+			}
+		}
+
+		/* The received= parameter (RFC 3261) contains the address
+		 * from which the server received the request. */
+		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+					       "received=", &poff, &plen,
+					       &addr, false) > 0 &&
+		    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
+		    !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
+			buflen = sip_sprintf_addr(ct, buffer,
+					&ct->tuplehash[!dir].tuple.src.u3,
+					false);
+			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen)) {
+				nf_ct_helper_log(skb, ct, "cannot mangle received");
+				return NF_DROP;
+			}
+		}
+
+		/* The rport= parameter (RFC 3581) contains the port number
+		 * from which the server received the request. */
+		if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
+						 "rport=", &poff, &plen,
+						 &n) > 0 &&
+		    htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
+		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
+			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
+			buflen = sprintf(buffer, "%u", ntohs(p));
+			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen)) {
+				nf_ct_helper_log(skb, ct, "cannot mangle rport");
+				return NF_DROP;
+			}
+		}
+	}
+
+next:
+	/* Translate Contact headers */
+	coff = 0;
+	in_header = 0;
+	while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+				       SIP_HDR_CONTACT, &in_header,
+				       &matchoff, &matchlen,
+				       &addr, &port) > 0) {
+		if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+			      matchoff, matchlen,
+			      &addr, port)) {
+			nf_ct_helper_log(skb, ct, "cannot mangle contact");
+			return NF_DROP;
+		}
+	}
+
+	if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+	    !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) {
+		nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to");
+		return NF_DROP;
+	}
+
+	/* Mangle destination port for Cisco phones, then fix up checksums */
+	if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
+		struct udphdr *uh;
+
+		if (!skb_make_writable(skb, skb->len)) {
+			nf_ct_helper_log(skb, ct, "cannot mangle packet");
+			return NF_DROP;
+		}
+
+		uh = (void *)skb->data + protoff;
+		uh->dest = ct_sip_info->forced_dport;
+
+		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff,
+					      0, 0, NULL, 0)) {
+			nf_ct_helper_log(skb, ct, "cannot mangle packet");
+			return NF_DROP;
+		}
+	}
+
+	return NF_ACCEPT;
+}
+
+static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
+				  s16 off)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	const struct tcphdr *th;
+
+	if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+		return;
+
+	th = (struct tcphdr *)(skb->data + protoff);
+	nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+
+/* Handles expected signalling connections and
media streams */ +static void nf_nat_sip_expected(struct nf_conn *ct, +				struct nf_conntrack_expect *exp) +{ +	struct nf_nat_range range; + +	/* This must be a fresh one. */ +	BUG_ON(ct->status & IPS_NAT_DONE_MASK); + +	/* For DST manip, map port here to where it's expected. */ +	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); +	range.min_proto = range.max_proto = exp->saved_proto; +	range.min_addr = range.max_addr = exp->saved_addr; +	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); + +	/* Change src to where master sends to, but only if the connection +	 * actually came from the same source. */ +	if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, +			     &ct->master->tuplehash[exp->dir].tuple.src.u3)) { +		range.flags = NF_NAT_RANGE_MAP_IPS; +		range.min_addr = range.max_addr +			= ct->master->tuplehash[!exp->dir].tuple.dst.u3; +		nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); +	} +} + +static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, +				      unsigned int dataoff, +				      const char **dptr, unsigned int *datalen, +				      struct nf_conntrack_expect *exp, +				      unsigned int matchoff, +				      unsigned int matchlen) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); +	union nf_inet_addr newaddr; +	u_int16_t port; +	__be16 srcport; +	char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; +	unsigned int buflen; + +	/* Connection will come from reply */ +	if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, +			     &ct->tuplehash[!dir].tuple.dst.u3)) +		newaddr = exp->tuple.dst.u3; +	else +		newaddr = ct->tuplehash[!dir].tuple.dst.u3; + +	/* If the signalling port matches the connection's source port in the +	 * original direction, try to use the destination port in the opposite +	 * direction. */ +	srcport = ct_sip_info->forced_dport ? 
: +		  ct->tuplehash[dir].tuple.src.u.udp.port; +	if (exp->tuple.dst.u.udp.port == srcport) +		port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); +	else +		port = ntohs(exp->tuple.dst.u.udp.port); + +	exp->saved_addr = exp->tuple.dst.u3; +	exp->tuple.dst.u3 = newaddr; +	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; +	exp->dir = !dir; +	exp->expectfn = nf_nat_sip_expected; + +	for (; port != 0; port++) { +		int ret; + +		exp->tuple.dst.u.udp.port = htons(port); +		ret = nf_ct_expect_related(exp); +		if (ret == 0) +			break; +		else if (ret != -EBUSY) { +			port = 0; +			break; +		} +	} + +	if (port == 0) { +		nf_ct_helper_log(skb, ct, "all ports in use for SIP"); +		return NF_DROP; +	} + +	if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) || +	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { +		buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port); +		if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, +				   matchoff, matchlen, buffer, buflen)) { +			nf_ct_helper_log(skb, ct, "cannot mangle packet"); +			goto err; +		} +	} +	return NF_ACCEPT; + +err: +	nf_ct_unexpect_related(exp); +	return NF_DROP; +} + +static int mangle_content_len(struct sk_buff *skb, unsigned int protoff, +			      unsigned int dataoff, +			      const char **dptr, unsigned int *datalen) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	unsigned int matchoff, matchlen; +	char buffer[sizeof("65536")]; +	int buflen, c_len; + +	/* Get actual SDP length */ +	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, +				  SDP_HDR_VERSION, SDP_HDR_UNSPEC, +				  &matchoff, &matchlen) <= 0) +		return 0; +	c_len = *datalen - matchoff + strlen("v="); + +	/* Now, update SDP length */ +	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, +			      &matchoff, &matchlen) <= 0) +		return 0; + +	buflen = sprintf(buffer, "%u", c_len); +	return mangle_packet(skb, protoff, dataoff, dptr, datalen, +			     matchoff, matchlen, buffer, buflen); +} + +static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff, +			     unsigned int dataoff, +			     const char **dptr, unsigned int *datalen, +			     unsigned int sdpoff, +			     enum sdp_header_types type, +			     enum sdp_header_types term, +			     char *buffer, int buflen) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	unsigned int matchlen, matchoff; + +	if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, +				  &matchoff, &matchlen) <= 0) +		return -ENOENT; +	return mangle_packet(skb, protoff, dataoff, dptr, datalen, +			     matchoff, matchlen, buffer, buflen) ? 
0 : -EINVAL; +} + +static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff, +				    unsigned int dataoff, +				    const char **dptr, unsigned int *datalen, +				    unsigned int sdpoff, +				    enum sdp_header_types type, +				    enum sdp_header_types term, +				    const union nf_inet_addr *addr) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	char buffer[INET6_ADDRSTRLEN]; +	unsigned int buflen; + +	buflen = sip_sprintf_addr(ct, buffer, addr, false); +	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, +			      sdpoff, type, term, buffer, buflen)) +		return 0; + +	return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff, +				    unsigned int dataoff, +				    const char **dptr, unsigned int *datalen, +				    unsigned int matchoff, +				    unsigned int matchlen, +				    u_int16_t port) +{ +	char buffer[sizeof("nnnnn")]; +	unsigned int buflen; + +	buflen = sprintf(buffer, "%u", port); +	if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, +			   matchoff, matchlen, buffer, buflen)) +		return 0; + +	return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff, +				       unsigned int dataoff, +				       const char **dptr, unsigned int *datalen, +				       unsigned int sdpoff, +				       const union nf_inet_addr *addr) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	char buffer[INET6_ADDRSTRLEN]; +	unsigned int buflen; + +	/* Mangle session description owner and contact addresses */ +	buflen = sip_sprintf_addr(ct, buffer, addr, false); +	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, +			      SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen)) +		return 0; + +	switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, +				  SDP_HDR_CONNECTION, SDP_HDR_MEDIA, +				  buffer, buflen)) { +	case 0: +	/* +	 * RFC 2327: +	 * +	 * Session description +	 * +	 * c=* (connection information - not required if included in all media) +	 */ +	case -ENOENT: +		break; +	default: +		return 0; +	} + +	return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +/* So, this packet has hit the connection tracking matching code. +   Mangle it, and change the expectation to match the new version. 
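+
+   Illustrative example: for an SDP body containing
+	c=IN IP4 10.0.0.2
+	m=audio 16384 RTP/AVP 0
+   the code below expects the RTP/RTCP pair (16384, 16385) and, if that
+   pair is taken, retries at the next pair before rewriting the m= port.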
*/ +static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff, +				     unsigned int dataoff, +				     const char **dptr, unsigned int *datalen, +				     struct nf_conntrack_expect *rtp_exp, +				     struct nf_conntrack_expect *rtcp_exp, +				     unsigned int mediaoff, +				     unsigned int medialen, +				     union nf_inet_addr *rtp_addr) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	u_int16_t port; + +	/* Connection will come from reply */ +	if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, +			     &ct->tuplehash[!dir].tuple.dst.u3)) +		*rtp_addr = rtp_exp->tuple.dst.u3; +	else +		*rtp_addr = ct->tuplehash[!dir].tuple.dst.u3; + +	rtp_exp->saved_addr = rtp_exp->tuple.dst.u3; +	rtp_exp->tuple.dst.u3 = *rtp_addr; +	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; +	rtp_exp->dir = !dir; +	rtp_exp->expectfn = nf_nat_sip_expected; + +	rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3; +	rtcp_exp->tuple.dst.u3 = *rtp_addr; +	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; +	rtcp_exp->dir = !dir; +	rtcp_exp->expectfn = nf_nat_sip_expected; + +	/* Try to get same pair of ports: if not, try to change them. */ +	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); +	     port != 0; port += 2) { +		int ret; + +		rtp_exp->tuple.dst.u.udp.port = htons(port); +		ret = nf_ct_expect_related(rtp_exp); +		if (ret == -EBUSY) +			continue; +		else if (ret < 0) { +			port = 0; +			break; +		} +		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); +		ret = nf_ct_expect_related(rtcp_exp); +		if (ret == 0) +			break; +		else if (ret == -EBUSY) { +			nf_ct_unexpect_related(rtp_exp); +			continue; +		} else if (ret < 0) { +			nf_ct_unexpect_related(rtp_exp); +			port = 0; +			break; +		} +	} + +	if (port == 0) { +		nf_ct_helper_log(skb, ct, "all ports in use for SDP media"); +		goto err1; +	} + +	/* Update media port. 
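+	 * Only the RTP port appears in the SDP; the RTCP expectation
+	 * implicitly uses port + 1, so just the m= port needs mangling,
+	 * and only when the expectation had to move off the offered port.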
*/ +	if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && +	    !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen, +			     mediaoff, medialen, port)) { +		nf_ct_helper_log(skb, ct, "cannot mangle SDP message"); +		goto err2; +	} + +	return NF_ACCEPT; + +err2: +	nf_ct_unexpect_related(rtp_exp); +	nf_ct_unexpect_related(rtcp_exp); +err1: +	return NF_DROP; +} + +static struct nf_ct_helper_expectfn sip_nat = { +	.name		= "sip", +	.expectfn	= nf_nat_sip_expected, +}; + +static void __exit nf_nat_sip_fini(void) +{ +	RCU_INIT_POINTER(nf_nat_sip_hooks, NULL); + +	nf_ct_helper_expectfn_unregister(&sip_nat); +	synchronize_rcu(); +} + +static const struct nf_nat_sip_hooks sip_hooks = { +	.msg		= nf_nat_sip, +	.seq_adjust	= nf_nat_sip_seq_adjust, +	.expect		= nf_nat_sip_expect, +	.sdp_addr	= nf_nat_sdp_addr, +	.sdp_port	= nf_nat_sdp_port, +	.sdp_session	= nf_nat_sdp_session, +	.sdp_media	= nf_nat_sdp_media, +}; + +static int __init nf_nat_sip_init(void) +{ +	BUG_ON(nf_nat_sip_hooks != NULL); +	RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks); +	nf_ct_helper_expectfn_register(&sip_nat); +	return 0; +} + +module_init(nf_nat_sip_init); +module_exit(nf_nat_sip_fini); diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c new file mode 100644 index 00000000000..7f67e1d5310 --- /dev/null +++ b/net/netfilter/nf_nat_tftp.c @@ -0,0 +1,52 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/udp.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_tftp.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("TFTP NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_tftp"); + +static unsigned int help(struct sk_buff *skb, +			 enum ip_conntrack_info ctinfo, +			 struct nf_conntrack_expect *exp) +{ +	const struct nf_conn *ct = exp->master; + +	exp->saved_proto.udp.port +		= ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; +	exp->dir = IP_CT_DIR_REPLY; +	exp->expectfn = nf_nat_follow_master; +	if (nf_ct_expect_related(exp) != 0) { +		nf_ct_helper_log(skb, exp->master, "cannot add expectation"); +		return NF_DROP; +	} +	return NF_ACCEPT; +} + +static void __exit nf_nat_tftp_fini(void) +{ +	RCU_INIT_POINTER(nf_nat_tftp_hook, NULL); +	synchronize_rcu(); +} + +static int __init nf_nat_tftp_init(void) +{ +	BUG_ON(nf_nat_tftp_hook != NULL); +	RCU_INIT_POINTER(nf_nat_tftp_hook, help); +	return 0; +} + +module_init(nf_nat_tftp_init); +module_exit(nf_nat_tftp_fini); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 74aebed5bd2..5d24b1fdb59 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -1,3 +1,8 @@ +/* + * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 + */ +  #include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/init.h> @@ -14,75 +19,33 @@  #include "nf_internals.h"  /* - * A queue handler may be registered for each protocol.  Each is protected by - * long term mutex.  The handler must provide an an outfn() to accept packets - * for queueing and must reinject all packets it receives, no matter what. + * Hook for nfnetlink_queue to register its queue handler. 
+ * We do this so that most of the NFQUEUE code can be modular. + * + * Once the queue is registered it must reinject all packets it + * receives, no matter what.   */ -static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; - -static DEFINE_MUTEX(queue_handler_mutex); +static const struct nf_queue_handler __rcu *queue_handler __read_mostly;  /* return EBUSY when somebody else is registered, return EEXIST if the   * same handler is registered, return 0 in case of success. */ -int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh)  { -	int ret; - -	if (pf >= ARRAY_SIZE(queue_handler)) -		return -EINVAL; - -	mutex_lock(&queue_handler_mutex); -	if (queue_handler[pf] == qh) -		ret = -EEXIST; -	else if (queue_handler[pf]) -		ret = -EBUSY; -	else { -		rcu_assign_pointer(queue_handler[pf], qh); -		ret = 0; -	} -	mutex_unlock(&queue_handler_mutex); - -	return ret; +	/* should never happen, we only have one queueing backend in kernel */ +	WARN_ON(rcu_access_pointer(queue_handler)); +	rcu_assign_pointer(queue_handler, qh);  }  EXPORT_SYMBOL(nf_register_queue_handler);  /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +void nf_unregister_queue_handler(void)  { -	if (pf >= ARRAY_SIZE(queue_handler)) -		return -EINVAL; - -	mutex_lock(&queue_handler_mutex); -	if (queue_handler[pf] && queue_handler[pf] != qh) { -		mutex_unlock(&queue_handler_mutex); -		return -EINVAL; -	} - -	rcu_assign_pointer(queue_handler[pf], NULL); -	mutex_unlock(&queue_handler_mutex); - +	RCU_INIT_POINTER(queue_handler, NULL);  	synchronize_rcu(); - -	return 0;  }  EXPORT_SYMBOL(nf_unregister_queue_handler); -void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) -{ -	u_int8_t pf; - -	mutex_lock(&queue_handler_mutex); -	for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++)  { -		if (queue_handler[pf] == qh) -			rcu_assign_pointer(queue_handler[pf], NULL); -	} -	mutex_unlock(&queue_handler_mutex); - -	synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); - -static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +void nf_queue_entry_release_refs(struct nf_queue_entry *entry)  {  	/* Release those devices we held, or Alexey will kill me. */  	if (entry->indev) @@ -102,75 +65,87 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)  	/* Drop reference to owner of hook which queued us. */  	module_put(entry->elem->owner);  } +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +/* Bump dev refs so they don't vanish while packet is out */ +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) +{ +	if (!try_module_get(entry->elem->owner)) +		return false; + +	if (entry->indev) +		dev_hold(entry->indev); +	if (entry->outdev) +		dev_hold(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER +	if (entry->skb->nf_bridge) { +		struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; +		struct net_device *physdev; + +		physdev = nf_bridge->physindev; +		if (physdev) +			dev_hold(physdev); +		physdev = nf_bridge->physoutdev; +		if (physdev) +			dev_hold(physdev); +	} +#endif + +	return true; +} +EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);  /*   * Any packet that leaves via this function must come back   * through nf_reinject().   
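 *
 * Illustrative sketch (mirrors nfnetlink_queue): the single in-kernel
 * backend registers itself as
 *
 *	static const struct nf_queue_handler nfqh = {
 *		.outfn	= nfqnl_enqueue_packet,
 *	};
 *	nf_register_queue_handler(&nfqh);
 *
 * and later completes each verdict with nf_reinject(entry, verdict),
 * which releases the references taken by nf_queue_entry_get_refs().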
*/ -static int __nf_queue(struct sk_buff *skb, -		      struct list_head *elem, +int nf_queue(struct sk_buff *skb, +		      struct nf_hook_ops *elem,  		      u_int8_t pf, unsigned int hook,  		      struct net_device *indev,  		      struct net_device *outdev,  		      int (*okfn)(struct sk_buff *),  		      unsigned int queuenum)  { -	int status; +	int status = -ENOENT;  	struct nf_queue_entry *entry = NULL; -#ifdef CONFIG_BRIDGE_NETFILTER -	struct net_device *physindev; -	struct net_device *physoutdev; -#endif  	const struct nf_afinfo *afinfo;  	const struct nf_queue_handler *qh; -	/* QUEUE == DROP if noone is waiting, to be safe. */ +	/* QUEUE == DROP if no one is waiting, to be safe. */  	rcu_read_lock(); -	qh = rcu_dereference(queue_handler[pf]); -	if (!qh) +	qh = rcu_dereference(queue_handler); +	if (!qh) { +		status = -ESRCH;  		goto err_unlock; +	}  	afinfo = nf_get_afinfo(pf);  	if (!afinfo)  		goto err_unlock;  	entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); -	if (!entry) +	if (!entry) { +		status = -ENOMEM;  		goto err_unlock; +	}  	*entry = (struct nf_queue_entry) {  		.skb	= skb, -		.elem	= list_entry(elem, struct nf_hook_ops, list), +		.elem	= elem,  		.pf	= pf,  		.hook	= hook,  		.indev	= indev,  		.outdev	= outdev,  		.okfn	= okfn, +		.size	= sizeof(*entry) + afinfo->route_key_size,  	}; -	/* If it's going away, ignore hook. */ -	if (!try_module_get(entry->elem->owner)) { -		rcu_read_unlock(); -		kfree(entry); -		return 0; -	} - -	/* Bump dev refs so they don't vanish while packet is out */ -	if (indev) -		dev_hold(indev); -	if (outdev) -		dev_hold(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER -	if (skb->nf_bridge) { -		physindev = skb->nf_bridge->physindev; -		if (physindev) -			dev_hold(physindev); -		physoutdev = skb->nf_bridge->physoutdev; -		if (physoutdev) -			dev_hold(physoutdev); +	if (!nf_queue_entry_get_refs(entry)) { +		status = -ECANCELED; +		goto err_unlock;  	} -#endif  	skb_dst_force(skb);  	afinfo->saveroute(skb, entry);  	status = qh->outfn(entry, queuenum); @@ -182,61 +157,21 @@ static int __nf_queue(struct sk_buff *skb,  		goto err;  	} -	return 1; +	return 0;  err_unlock:  	rcu_read_unlock();  err: -	kfree_skb(skb);  	kfree(entry); -	return 1; -} - -int nf_queue(struct sk_buff *skb, -	     struct list_head *elem, -	     u_int8_t pf, unsigned int hook, -	     struct net_device *indev, -	     struct net_device *outdev, -	     int (*okfn)(struct sk_buff *), -	     unsigned int queuenum) -{ -	struct sk_buff *segs; - -	if (!skb_is_gso(skb)) -		return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, -				  queuenum); - -	switch (pf) { -	case NFPROTO_IPV4: -		skb->protocol = htons(ETH_P_IP); -		break; -	case NFPROTO_IPV6: -		skb->protocol = htons(ETH_P_IPV6); -		break; -	} - -	segs = skb_gso_segment(skb, 0); -	kfree_skb(skb); -	if (IS_ERR(segs)) -		return 1; - -	do { -		struct sk_buff *nskb = segs->next; - -		segs->next = NULL; -		if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn, -				queuenum)) -			kfree_skb(segs); -		segs = nskb; -	} while (segs); -	return 1; +	return status;  }  void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)  {  	struct sk_buff *skb = entry->skb; -	struct list_head *elem = &entry->elem->list; +	struct nf_hook_ops *elem = entry->elem;  	const struct nf_afinfo *afinfo; +	int err;  	rcu_read_lock(); @@ -244,7 +179,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)  	/* Continue traversal iff userspace said ok... 
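 	 * (NF_REPEAT rewinds one hook entry below so the hook that queued
 	 * the packet runs again, then continues as NF_ACCEPT.)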
*/  	if (verdict == NF_REPEAT) { -		elem = elem->prev; +		elem = list_entry(elem->list.prev, struct nf_hook_ops, list);  		verdict = NF_ACCEPT;  	} @@ -270,12 +205,20 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)  		local_bh_enable();  		break;  	case NF_QUEUE: -		if (!__nf_queue(skb, elem, entry->pf, entry->hook, +		err = nf_queue(skb, elem, entry->pf, entry->hook,  				entry->indev, entry->outdev, entry->okfn, -				verdict >> NF_VERDICT_BITS)) -			goto next_hook; +				verdict >> NF_VERDICT_QBITS); +		if (err < 0) { +			if (err == -ECANCELED) +				goto next_hook; +			if (err == -ESRCH && +			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) +				goto next_hook; +			kfree_skb(skb); +		}  		break;  	case NF_STOLEN: +		break;  	default:  		kfree_skb(skb);  	} @@ -283,77 +226,3 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)  	kfree(entry);  }  EXPORT_SYMBOL(nf_reinject); - -#ifdef CONFIG_PROC_FS -static void *seq_start(struct seq_file *seq, loff_t *pos) -{ -	if (*pos >= ARRAY_SIZE(queue_handler)) -		return NULL; - -	return pos; -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ -	(*pos)++; - -	if (*pos >= ARRAY_SIZE(queue_handler)) -		return NULL; - -	return pos; -} - -static void seq_stop(struct seq_file *s, void *v) -{ - -} - -static int seq_show(struct seq_file *s, void *v) -{ -	int ret; -	loff_t *pos = v; -	const struct nf_queue_handler *qh; - -	rcu_read_lock(); -	qh = rcu_dereference(queue_handler[*pos]); -	if (!qh) -		ret = seq_printf(s, "%2lld NONE\n", *pos); -	else -		ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); -	rcu_read_unlock(); - -	return ret; -} - -static const struct seq_operations nfqueue_seq_ops = { -	.start	= seq_start, -	.next	= seq_next, -	.stop	= seq_stop, -	.show	= seq_show, -}; - -static int nfqueue_open(struct inode *inode, struct file *file) -{ -	return seq_open(file, &nfqueue_seq_ops); -} - -static const struct file_operations nfqueue_file_ops = { -	.owner	 = THIS_MODULE, -	.open	 = nfqueue_open, -	.read	 = seq_read, -	.llseek	 = seq_lseek, -	.release = seq_release, -}; -#endif /* PROC_FS */ - - -int __init netfilter_queue_init(void) -{ -#ifdef CONFIG_PROC_FS -	if (!proc_create("nf_queue", S_IRUGO, -			 proc_net_netfilter, &nfqueue_file_ops)) -		return -1; -#endif -	return 0; -} - diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c new file mode 100644 index 00000000000..52e20c9a46a --- /dev/null +++ b/net/netfilter/nf_synproxy_core.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
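+ *
+ * Note (editorial) on the scheme implemented below: synproxy encodes
+ * negotiated TCP options in the low bits of the timestamp value it
+ * sends (tcp_time_stamp & ~0x3f):
+ *	bits 0-3: peer window scale (0xf = none)
+ *	bit  4:   SACK permitted
+ *	bit  5:   ECN
+ * synproxy_init_timestamp_cookie() stores these and
+ * synproxy_check_timestamp_cookie() recovers them from the echoed
+ * timestamp.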
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> +#include <net/tcp.h> +#include <net/netns/generic.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +int synproxy_net_id; +EXPORT_SYMBOL_GPL(synproxy_net_id); + +bool +synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, +		       const struct tcphdr *th, struct synproxy_options *opts) +{ +	int length = (th->doff * 4) - sizeof(*th); +	u8 buf[40], *ptr; + +	ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); +	if (ptr == NULL) +		return false; + +	opts->options = 0; +	while (length > 0) { +		int opcode = *ptr++; +		int opsize; + +		switch (opcode) { +		case TCPOPT_EOL: +			return true; +		case TCPOPT_NOP: +			length--; +			continue; +		default: +			opsize = *ptr++; +			if (opsize < 2) +				return true; +			if (opsize > length) +				return true; + +			switch (opcode) { +			case TCPOPT_MSS: +				if (opsize == TCPOLEN_MSS) { +					opts->mss = get_unaligned_be16(ptr); +					opts->options |= XT_SYNPROXY_OPT_MSS; +				} +				break; +			case TCPOPT_WINDOW: +				if (opsize == TCPOLEN_WINDOW) { +					opts->wscale = *ptr; +					if (opts->wscale > 14) +						opts->wscale = 14; +					opts->options |= XT_SYNPROXY_OPT_WSCALE; +				} +				break; +			case TCPOPT_TIMESTAMP: +				if (opsize == TCPOLEN_TIMESTAMP) { +					opts->tsval = get_unaligned_be32(ptr); +					opts->tsecr = get_unaligned_be32(ptr + 4); +					opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; +				} +				break; +			case TCPOPT_SACK_PERM: +				if (opsize == TCPOLEN_SACK_PERM) +					opts->options |= XT_SYNPROXY_OPT_SACK_PERM; +				break; +			} + +			ptr += opsize - 2; +			length -= opsize; +		} +	} +	return true; +} +EXPORT_SYMBOL_GPL(synproxy_parse_options); + +unsigned int synproxy_options_size(const struct synproxy_options *opts) +{ +	unsigned int size = 0; + +	if (opts->options & XT_SYNPROXY_OPT_MSS) +		size += TCPOLEN_MSS_ALIGNED; +	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) +		size += TCPOLEN_TSTAMP_ALIGNED; +	else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) +		size += TCPOLEN_SACKPERM_ALIGNED; +	if (opts->options & XT_SYNPROXY_OPT_WSCALE) +		size += TCPOLEN_WSCALE_ALIGNED; + +	return size; +} +EXPORT_SYMBOL_GPL(synproxy_options_size); + +void +synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) +{ +	__be32 *ptr = (__be32 *)(th + 1); +	u8 options = opts->options; + +	if (options & XT_SYNPROXY_OPT_MSS) +		*ptr++ = htonl((TCPOPT_MSS << 24) | +			       (TCPOLEN_MSS << 16) | +			       opts->mss); + +	if (options & XT_SYNPROXY_OPT_TIMESTAMP) { +		if (options & XT_SYNPROXY_OPT_SACK_PERM) +			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) | +				       (TCPOLEN_SACK_PERM << 16) | +				       (TCPOPT_TIMESTAMP << 8) | +				       TCPOLEN_TIMESTAMP); +		else +			*ptr++ = htonl((TCPOPT_NOP << 24) | +				       (TCPOPT_NOP << 16) | +				       (TCPOPT_TIMESTAMP << 8) | +				       TCPOLEN_TIMESTAMP); + +		*ptr++ = htonl(opts->tsval); +		*ptr++ = htonl(opts->tsecr); +	} else if (options & XT_SYNPROXY_OPT_SACK_PERM) +		*ptr++ = htonl((TCPOPT_NOP << 24) | +			       (TCPOPT_NOP << 16) | +			       (TCPOPT_SACK_PERM << 8) | +			       TCPOLEN_SACK_PERM); + +	if (options & XT_SYNPROXY_OPT_WSCALE) +		*ptr++ = 
htonl((TCPOPT_NOP << 24) | +			       (TCPOPT_WINDOW << 16) | +			       (TCPOLEN_WINDOW << 8) | +			       opts->wscale); +} +EXPORT_SYMBOL_GPL(synproxy_build_options); + +void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, +				    struct synproxy_options *opts) +{ +	opts->tsecr = opts->tsval; +	opts->tsval = tcp_time_stamp & ~0x3f; + +	if (opts->options & XT_SYNPROXY_OPT_WSCALE) { +		opts->tsval |= opts->wscale; +		opts->wscale = info->wscale; +	} else +		opts->tsval |= 0xf; + +	if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) +		opts->tsval |= 1 << 4; + +	if (opts->options & XT_SYNPROXY_OPT_ECN) +		opts->tsval |= 1 << 5; +} +EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); + +void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +{ +	opts->wscale = opts->tsecr & 0xf; +	if (opts->wscale != 0xf) +		opts->options |= XT_SYNPROXY_OPT_WSCALE; + +	opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + +	opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; +} +EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); + +unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, +				    unsigned int protoff, +				    struct tcphdr *th, +				    struct nf_conn *ct, +				    enum ip_conntrack_info ctinfo, +				    const struct nf_conn_synproxy *synproxy) +{ +	unsigned int optoff, optend; +	u32 *ptr, old; + +	if (synproxy->tsoff == 0) +		return 1; + +	optoff = protoff + sizeof(struct tcphdr); +	optend = protoff + th->doff * 4; + +	if (!skb_make_writable(skb, optend)) +		return 0; + +	while (optoff < optend) { +		unsigned char *op = skb->data + optoff; + +		switch (op[0]) { +		case TCPOPT_EOL: +			return 1; +		case TCPOPT_NOP: +			optoff++; +			continue; +		default: +			if (optoff + 1 == optend || +			    optoff + op[1] > optend || +			    op[1] < 2) +				return 0; +			if (op[0] == TCPOPT_TIMESTAMP && +			    op[1] == TCPOLEN_TIMESTAMP) { +				if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { +					ptr = (u32 *)&op[2]; +					old = *ptr; +					*ptr = htonl(ntohl(*ptr) - +						     synproxy->tsoff); +				} else { +					ptr = (u32 *)&op[6]; +					old = *ptr; +					*ptr = htonl(ntohl(*ptr) + +						     synproxy->tsoff); +				} +				inet_proto_csum_replace4(&th->check, skb, +							 old, *ptr, 0); +				return 1; +			} +			optoff += op[1]; +		} +	} +	return 1; +} +EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); + +static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { +	.len		= sizeof(struct nf_conn_synproxy), +	.align		= __alignof__(struct nf_conn_synproxy), +	.id		= NF_CT_EXT_SYNPROXY, +}; + +#ifdef CONFIG_PROC_FS +static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ +	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); +	int cpu; + +	if (*pos == 0) +		return SEQ_START_TOKEN; + +	for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { +		if (!cpu_possible(cpu)) +			continue; +		*pos = cpu + 1; +		return per_cpu_ptr(snet->stats, cpu); +	} + +	return NULL; +} + +static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); +	int cpu; + +	for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { +		if (!cpu_possible(cpu)) +			continue; +		*pos = cpu + 1; +		return per_cpu_ptr(snet->stats, cpu); +	} + +	return NULL; +} + +static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) +{ +	return; +} + +static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) +{ +	struct synproxy_stats *stats = v; + +	if (v == SEQ_START_TOKEN) { +		
seq_printf(seq, "entries\t\tsyn_received\t" +				"cookie_invalid\tcookie_valid\t" +				"cookie_retrans\tconn_reopened\n"); +		return 0; +	} + +	seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, +		   stats->syn_received, +		   stats->cookie_invalid, +		   stats->cookie_valid, +		   stats->cookie_retrans, +		   stats->conn_reopened); + +	return 0; +} + +static const struct seq_operations synproxy_cpu_seq_ops = { +	.start		= synproxy_cpu_seq_start, +	.next		= synproxy_cpu_seq_next, +	.stop		= synproxy_cpu_seq_stop, +	.show		= synproxy_cpu_seq_show, +}; + +static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &synproxy_cpu_seq_ops, +			    sizeof(struct seq_net_private)); +} + +static const struct file_operations synproxy_cpu_seq_fops = { +	.owner		= THIS_MODULE, +	.open		= synproxy_cpu_seq_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_net, +}; + +static int __net_init synproxy_proc_init(struct net *net) +{ +	if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, +			 &synproxy_cpu_seq_fops)) +		return -ENOMEM; +	return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ +	remove_proc_entry("synproxy", net->proc_net_stat); +} +#else +static int __net_init synproxy_proc_init(struct net *net) +{ +	return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ +	return; +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init synproxy_net_init(struct net *net) +{ +	struct synproxy_net *snet = synproxy_pernet(net); +	struct nf_conntrack_tuple t; +	struct nf_conn *ct; +	int err = -ENOMEM; + +	memset(&t, 0, sizeof(t)); +	ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); +	if (IS_ERR(ct)) { +		err = PTR_ERR(ct); +		goto err1; +	} + +	if (!nfct_seqadj_ext_add(ct)) +		goto err2; +	if (!nfct_synproxy_ext_add(ct)) +		goto err2; + +	nf_conntrack_tmpl_insert(net, ct); +	snet->tmpl = ct; + +	snet->stats = alloc_percpu(struct synproxy_stats); +	if (snet->stats == NULL) +		goto err2; + +	err = synproxy_proc_init(net); +	if (err < 0) +		goto err3; + +	return 0; + +err3: +	free_percpu(snet->stats); +err2: +	nf_conntrack_free(ct); +err1: +	return err; +} + +static void __net_exit synproxy_net_exit(struct net *net) +{ +	struct synproxy_net *snet = synproxy_pernet(net); + +	nf_ct_put(snet->tmpl); +	synproxy_proc_exit(net); +	free_percpu(snet->stats); +} + +static struct pernet_operations synproxy_net_ops = { +	.init		= synproxy_net_init, +	.exit		= synproxy_net_exit, +	.id		= &synproxy_net_id, +	.size		= sizeof(struct synproxy_net), +}; + +static int __init synproxy_core_init(void) +{ +	int err; + +	err = nf_ct_extend_register(&nf_ct_synproxy_extend); +	if (err < 0) +		goto err1; + +	err = register_pernet_subsys(&synproxy_net_ops); +	if (err < 0) +		goto err2; + +	return 0; + +err2: +	nf_ct_extend_unregister(&nf_ct_synproxy_extend); +err1: +	return err; +} + +static void __exit synproxy_core_exit(void) +{ +	unregister_pernet_subsys(&synproxy_net_ops); +	nf_ct_extend_unregister(&nf_ct_synproxy_extend); +} + +module_init(synproxy_core_init); +module_exit(synproxy_core_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c new file mode 100644 index 00000000000..8746ff9a835 --- /dev/null +++ b/net/netfilter/nf_tables_api.c @@ -0,0 +1,4041 @@ +/* + * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or 
modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +static LIST_HEAD(nf_tables_expressions); + +/** + *	nft_register_afinfo - register nf_tables address family info + * + *	@afi: address family info to register + * + *	Register the address family for use with nf_tables. Returns zero on + *	success or a negative errno code otherwise. + */ +int nft_register_afinfo(struct net *net, struct nft_af_info *afi) +{ +	INIT_LIST_HEAD(&afi->tables); +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_add_tail_rcu(&afi->list, &net->nft.af_info); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +	return 0; +} +EXPORT_SYMBOL_GPL(nft_register_afinfo); + +/** + *	nft_unregister_afinfo - unregister nf_tables address family info + * + *	@afi: address family info to unregister + * + *	Unregister the address family for use with nf_tables. + */ +void nft_unregister_afinfo(struct nft_af_info *afi) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_del_rcu(&afi->list); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_afinfo); + +static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family) +{ +	struct nft_af_info *afi; + +	list_for_each_entry(afi, &net->nft.af_info, list) { +		if (afi->family == family) +			return afi; +	} +	return NULL; +} + +static struct nft_af_info * +nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) +{ +	struct nft_af_info *afi; + +	afi = nft_afinfo_lookup(net, family); +	if (afi != NULL) +		return afi; +#ifdef CONFIG_MODULES +	if (autoload) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-afinfo-%u", family); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		afi = nft_afinfo_lookup(net, family); +		if (afi != NULL) +			return ERR_PTR(-EAGAIN); +	} +#endif +	return ERR_PTR(-EAFNOSUPPORT); +} + +static void nft_ctx_init(struct nft_ctx *ctx, +			 const struct sk_buff *skb, +			 const struct nlmsghdr *nlh, +			 struct nft_af_info *afi, +			 struct nft_table *table, +			 struct nft_chain *chain, +			 const struct nlattr * const *nla) +{ +	ctx->net	= sock_net(skb->sk); +	ctx->afi	= afi; +	ctx->table	= table; +	ctx->chain	= chain; +	ctx->nla   	= nla; +	ctx->portid	= NETLINK_CB(skb).portid; +	ctx->report	= nlmsg_report(nlh); +	ctx->seq	= nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, +					 u32 size) +{ +	struct nft_trans *trans; + +	trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); +	if (trans == NULL) +		return NULL; + +	trans->msg_type = msg_type; +	trans->ctx	= *ctx; + +	return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ +	list_del(&trans->list); +	kfree(trans); +} + +/* + * Tables + */ + +static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, +					  const struct nlattr *nla) +{ +	struct nft_table *table; + +	list_for_each_entry(table, &afi->tables, list) { +		if (!nla_strcmp(nla, table->name)) +			return table; +	} +	return NULL; +} + +static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, +						const 
struct nlattr *nla) +{ +	struct nft_table *table; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	table = nft_table_lookup(afi, nla); +	if (table != NULL) +		return table; + +	return ERR_PTR(-ENOENT); +} + +static inline u64 nf_tables_alloc_handle(struct nft_table *table) +{ +	return ++table->hgenerator; +} + +static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; + +static const struct nf_chain_type * +__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +{ +	int i; + +	for (i = 0; i < NFT_CHAIN_T_MAX; i++) { +		if (chain_type[family][i] != NULL && +		    !nla_strcmp(nla, chain_type[family][i]->name)) +			return chain_type[family][i]; +	} +	return NULL; +} + +static const struct nf_chain_type * +nf_tables_chain_type_lookup(const struct nft_af_info *afi, +			    const struct nlattr *nla, +			    bool autoload) +{ +	const struct nf_chain_type *type; + +	type = __nf_tables_chain_type_lookup(afi->family, nla); +	if (type != NULL) +		return type; +#ifdef CONFIG_MODULES +	if (autoload) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-chain-%u-%.*s", afi->family, +			       nla_len(nla), (const char *)nla_data(nla)); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		type = __nf_tables_chain_type_lookup(afi->family, nla); +		if (type != NULL) +			return ERR_PTR(-EAGAIN); +	} +#endif +	return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { +	[NFTA_TABLE_NAME]	= { .type = NLA_STRING }, +	[NFTA_TABLE_FLAGS]	= { .type = NLA_U32 }, +}; + +static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, +				     int event, u32 flags, int family, +				     const struct nft_table *table) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || +	    nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || +	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +{ +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, +					ctx->afi->family, ctx->table); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, +			     ctx->report, GFP_KERNEL); +err: +	if (err < 0) { +		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, +				  err); +	} +	return err; +} + +static int nf_tables_dump_tables(struct sk_buff *skb, +				 struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	unsigned int idx = 0, s_idx = cb->args[0]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (family != NFPROTO_UNSPEC && family != afi->family) +			continue; + 
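+		/* Walk this family's tables; idx skips the cb->args[0]
+		 * entries already dumped, so an interrupted netlink dump
+		 * resumes where it stopped.
+		 */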
+		list_for_each_entry_rcu(table, &afi->tables, list) { +			if (idx < s_idx) +				goto cont; +			if (idx > s_idx) +				memset(&cb->args[1], 0, +				       sizeof(cb->args) - sizeof(cb->args[0])); +			if (nf_tables_fill_table_info(skb, +						      NETLINK_CB(cb->skb).portid, +						      cb->nlh->nlmsg_seq, +						      NFT_MSG_NEWTABLE, +						      NLM_F_MULTI, +						      afi->family, table) < 0) +				goto done; + +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +	} +done: +	rcu_read_unlock(); +	cb->args[0] = idx; +	return skb->len; +} + +/* Internal table flags */ +#define NFT_TABLE_INACTIVE	(1 << 15) + +static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	struct sk_buff *skb2; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	int err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_tables, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb2) +		return -ENOMEM; + +	err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, +					nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, +					family, table); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static int nf_tables_table_enable(const struct nft_af_info *afi, +				  struct nft_table *table) +{ +	struct nft_chain *chain; +	int err, i = 0; + +	list_for_each_entry(chain, &table->chains, list) { +		if (!(chain->flags & NFT_BASE_CHAIN)) +			continue; + +		err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); +		if (err < 0) +			goto err; + +		i++; +	} +	return 0; +err: +	list_for_each_entry(chain, &table->chains, list) { +		if (!(chain->flags & NFT_BASE_CHAIN)) +			continue; + +		if (i-- <= 0) +			break; + +		nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); +	} +	return err; +} + +static void nf_tables_table_disable(const struct nft_af_info *afi, +				   struct nft_table *table) +{ +	struct nft_chain *chain; + +	list_for_each_entry(chain, &table->chains, list) { +		if (chain->flags & NFT_BASE_CHAIN) +			nf_unregister_hooks(nft_base_chain(chain)->ops, +					    afi->nops); +	} +} + +static int nf_tables_updtable(struct nft_ctx *ctx) +{ +	struct nft_trans *trans; +	u32 flags; +	int ret = 0; + +	if (!ctx->nla[NFTA_TABLE_FLAGS]) +		return 0; + +	flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); +	if (flags & ~NFT_TABLE_F_DORMANT) +		return -EINVAL; + +	if (flags == ctx->table->flags) +		return 0; + +	trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, +				sizeof(struct nft_trans_table)); +	if (trans == NULL) +		return -ENOMEM; + +	if ((flags & NFT_TABLE_F_DORMANT) && +	    !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { +		nft_trans_table_enable(trans) = false; +	} else if (!(flags & NFT_TABLE_F_DORMANT) && +		   ctx->table->flags & NFT_TABLE_F_DORMANT) { +		ret = nf_tables_table_enable(ctx->afi, ctx->table); +		if (ret >= 0) { +			ctx->table->flags &= ~NFT_TABLE_F_DORMANT; +			
nft_trans_table_enable(trans) = true; +		} +	} +	if (ret < 0) +		goto err; + +	nft_trans_table_update(trans) = true; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; +err: +	nft_trans_destroy(trans); +	return ret; +} + +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); +	if (trans == NULL) +		return -ENOMEM; + +	if (msg_type == NFT_MSG_NEWTABLE) +		ctx->table->flags |= NFT_TABLE_INACTIVE; + +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; +} + +static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nlattr *name; +	struct nft_af_info *afi; +	struct nft_table *table; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	u32 flags = 0; +	struct nft_ctx ctx; +	int err; + +	afi = nf_tables_afinfo_lookup(net, family, true); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	name = nla[NFTA_TABLE_NAME]; +	table = nf_tables_table_lookup(afi, name); +	if (IS_ERR(table)) { +		if (PTR_ERR(table) != -ENOENT) +			return PTR_ERR(table); +		table = NULL; +	} + +	if (table != NULL) { +		if (table->flags & NFT_TABLE_INACTIVE) +			return -ENOENT; +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EOPNOTSUPP; + +		nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); +		return nf_tables_updtable(&ctx); +	} + +	if (nla[NFTA_TABLE_FLAGS]) { +		flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); +		if (flags & ~NFT_TABLE_F_DORMANT) +			return -EINVAL; +	} + +	if (!try_module_get(afi->owner)) +		return -EAFNOSUPPORT; + +	table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); +	if (table == NULL) { +		module_put(afi->owner); +		return -ENOMEM; +	} + +	nla_strlcpy(table->name, name, nla_len(name)); +	INIT_LIST_HEAD(&table->chains); +	INIT_LIST_HEAD(&table->sets); +	table->flags = flags; + +	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); +	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); +	if (err < 0) { +		kfree(table); +		module_put(afi->owner); +		return err; +	} +	list_add_tail_rcu(&table->list, &afi->tables); +	return 0; +} + +static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family, err; +	struct nft_ctx ctx; + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; +	if (table->use > 0) +		return -EBUSY; + +	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); +	err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); +	if (err < 0) +		return err; + +	list_del_rcu(&table->list); +	return 0; +} + +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ +	BUG_ON(ctx->table->use > 0); + +	kfree(ctx->table); +	module_put(ctx->afi->owner); +} + +int nft_register_chain_type(const struct nf_chain_type *ctype) +{ +	int err = 0; + +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	if (chain_type[ctype->family][ctype->type] != NULL) { +		
err = -EBUSY; +		goto out; +	} +	chain_type[ctype->family][ctype->type] = ctype; +out: +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +	return err; +} +EXPORT_SYMBOL_GPL(nft_register_chain_type); + +void nft_unregister_chain_type(const struct nf_chain_type *ctype) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	chain_type[ctype->family][ctype->type] = NULL; +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_chain_type); + +/* + * Chains + */ + +static struct nft_chain * +nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) +{ +	struct nft_chain *chain; + +	list_for_each_entry(chain, &table->chains, list) { +		if (chain->handle == handle) +			return chain; +	} + +	return ERR_PTR(-ENOENT); +} + +static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, +						const struct nlattr *nla) +{ +	struct nft_chain *chain; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	list_for_each_entry(chain, &table->chains, list) { +		if (!nla_strcmp(nla, chain->name)) +			return chain; +	} + +	return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { +	[NFTA_CHAIN_TABLE]	= { .type = NLA_STRING }, +	[NFTA_CHAIN_HANDLE]	= { .type = NLA_U64 }, +	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING, +				    .len = NFT_CHAIN_MAXNAMELEN - 1 }, +	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED }, +	[NFTA_CHAIN_POLICY]	= { .type = NLA_U32 }, +	[NFTA_CHAIN_TYPE]	= { .type = NLA_STRING }, +	[NFTA_CHAIN_COUNTERS]	= { .type = NLA_NESTED }, +}; + +static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { +	[NFTA_HOOK_HOOKNUM]	= { .type = NLA_U32 }, +	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 }, +}; + +static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +{ +	struct nft_stats *cpu_stats, total; +	struct nlattr *nest; +	unsigned int seq; +	u64 pkts, bytes; +	int cpu; + +	memset(&total, 0, sizeof(total)); +	for_each_possible_cpu(cpu) { +		cpu_stats = per_cpu_ptr(stats, cpu); +		do { +			seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); +			pkts = cpu_stats->pkts; +			bytes = cpu_stats->bytes; +		} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); +		total.pkts += pkts; +		total.bytes += bytes; +	} +	nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); +	if (nest == NULL) +		goto nla_put_failure; + +	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) || +	    nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes))) +		goto nla_put_failure; + +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, +				     int event, u32 flags, int family, +				     const struct nft_table *table, +				     const struct nft_chain *chain) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) +		goto nla_put_failure; +	if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle))) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) +		goto nla_put_failure; + +	if (chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = nft_base_chain(chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; +		struct 
nlattr *nest; + +		nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); +		if (nest == NULL) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) +			goto nla_put_failure; +		nla_nest_end(skb, nest); + +		if (nla_put_be32(skb, NFTA_CHAIN_POLICY, +				 htonl(basechain->policy))) +			goto nla_put_failure; + +		if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) +			goto nla_put_failure; + +		if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) +			goto nla_put_failure; +	} + +	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +{ +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, +					ctx->afi->family, ctx->table, +					ctx->chain); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, +			     ctx->report, GFP_KERNEL); +err: +	if (err < 0) { +		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, +				  err); +	} +	return err; +} + +static int nf_tables_dump_chains(struct sk_buff *skb, +				 struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	unsigned int idx = 0, s_idx = cb->args[0]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (family != NFPROTO_UNSPEC && family != afi->family) +			continue; + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			list_for_each_entry_rcu(chain, &table->chains, list) { +				if (idx < s_idx) +					goto cont; +				if (idx > s_idx) +					memset(&cb->args[1], 0, +					       sizeof(cb->args) - sizeof(cb->args[0])); +				if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, +							      cb->nlh->nlmsg_seq, +							      NFT_MSG_NEWCHAIN, +							      NLM_F_MULTI, +							      afi->family, table, chain) < 0) +					goto done; + +				nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +				idx++; +			} +		} +	} +done: +	rcu_read_unlock(); +	cb->args[0] = idx; +	return skb->len; +} + +static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	struct sk_buff *skb2; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	int err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_chains, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		
return -ENOENT; + +	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); +	if (chain->flags & NFT_CHAIN_INACTIVE) +		return -ENOENT; + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb2) +		return -ENOMEM; + +	err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, +					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, +					family, table, chain); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { +	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 }, +	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 }, +}; + +static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) +{ +	struct nlattr *tb[NFTA_COUNTER_MAX+1]; +	struct nft_stats __percpu *newstats; +	struct nft_stats *stats; +	int err; + +	err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); +	if (err < 0) +		return ERR_PTR(err); + +	if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) +		return ERR_PTR(-EINVAL); + +	newstats = netdev_alloc_pcpu_stats(struct nft_stats); +	if (newstats == NULL) +		return ERR_PTR(-ENOMEM); + +	/* Restore old counters on this cpu, no problem. Per-cpu statistics +	 * are not exposed to userspace. +	 */ +	stats = this_cpu_ptr(newstats); +	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); +	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + +	return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, +				    struct nft_stats __percpu *newstats) +{ +	if (chain->stats) { +		struct nft_stats __percpu *oldstats = +				nft_dereference(chain->stats); + +		rcu_assign_pointer(chain->stats, newstats); +		synchronize_rcu(); +		free_percpu(oldstats); +	} else +		rcu_assign_pointer(chain->stats, newstats); +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); +	if (trans == NULL) +		return -ENOMEM; + +	if (msg_type == NFT_MSG_NEWCHAIN) +		ctx->chain->flags |= NFT_CHAIN_INACTIVE; + +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; +} + +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ +	BUG_ON(chain->use > 0); + +	if (chain->flags & NFT_BASE_CHAIN) { +		module_put(nft_base_chain(chain)->type->owner); +		free_percpu(nft_base_chain(chain)->stats); +		kfree(nft_base_chain(chain)); +	} else { +		kfree(chain); +	} +} + +static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, +			      const struct nlmsghdr *nlh, +			      const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nlattr * uninitialized_var(name); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct nft_chain *chain; +	struct nft_base_chain *basechain = NULL; +	struct nlattr *ha[NFTA_HOOK_MAX + 1]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	u8 policy = NF_ACCEPT; +	u64 handle = 0; +	unsigned int i; +	struct nft_stats __percpu *stats; +	int err; +	bool create; +	struct nft_ctx ctx; + +	create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + +	afi = nf_tables_afinfo_lookup(net, family, true); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); + +	chain = NULL; +	name = nla[NFTA_CHAIN_NAME]; + +	if (nla[NFTA_CHAIN_HANDLE]) { +		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); +		chain = nf_tables_chain_lookup_byhandle(table, handle); +		if (IS_ERR(chain)) +			return PTR_ERR(chain); +	} else { +		chain = nf_tables_chain_lookup(table, name); +		if (IS_ERR(chain)) { +			if (PTR_ERR(chain) != -ENOENT) +				return PTR_ERR(chain); +			chain = NULL; +		} +	} + +	if (nla[NFTA_CHAIN_POLICY]) { +		if ((chain != NULL && +		    !(chain->flags & NFT_BASE_CHAIN)) || +		    nla[NFTA_CHAIN_HOOK] == NULL) +			return -EOPNOTSUPP; + +		policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); +		switch (policy) { +		case NF_DROP: +		case NF_ACCEPT: +			break; +		default: +			return -EINVAL; +		} +	} + +	if (chain != NULL) { +		struct nft_stats *stats = NULL; +		struct nft_trans *trans; + +		if (chain->flags & NFT_CHAIN_INACTIVE) +			return -ENOENT; +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EOPNOTSUPP; + +		if (nla[NFTA_CHAIN_HANDLE] && name && +		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) +			return -EEXIST; + +		if (nla[NFTA_CHAIN_COUNTERS]) { +			if (!(chain->flags & NFT_BASE_CHAIN)) +				return -EOPNOTSUPP; + +			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); +			if (IS_ERR(stats)) +				return PTR_ERR(stats); +		} + +		nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); +		trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, +					sizeof(struct nft_trans_chain)); +		if (trans == NULL) +			return -ENOMEM; + +		nft_trans_chain_stats(trans) = stats; +		nft_trans_chain_update(trans) = true; + +		if (nla[NFTA_CHAIN_POLICY]) +			nft_trans_chain_policy(trans) = policy; +		else +			nft_trans_chain_policy(trans) = -1; + +		if (nla[NFTA_CHAIN_HANDLE] && name) { +			nla_strlcpy(nft_trans_chain_name(trans), name, +				    NFT_CHAIN_MAXNAMELEN); +		} +		list_add_tail(&trans->list, &net->nft.commit_list); +		return 0; +	} + +	if (table->use == UINT_MAX) +		return -EOVERFLOW; + +	if (nla[NFTA_CHAIN_HOOK]) { +		const struct nf_chain_type *type; +		struct nf_hook_ops *ops; +		nf_hookfn *hookfn; +		u32 hooknum, priority; + +		type = chain_type[family][NFT_CHAIN_T_DEFAULT]; +		if (nla[NFTA_CHAIN_TYPE]) { +			type = nf_tables_chain_type_lookup(afi, +							   nla[NFTA_CHAIN_TYPE], +							   create); +			if (IS_ERR(type)) +				return PTR_ERR(type); +		} + +		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], +				       nft_hook_policy); +		if (err < 0) +			return err; +		if (ha[NFTA_HOOK_HOOKNUM] == NULL || +		    ha[NFTA_HOOK_PRIORITY] == NULL) +			return -EINVAL; + +		hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); +		if (hooknum >= afi->nhooks) +			return -EINVAL; +		priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + +		if (!(type->hook_mask & (1 << hooknum))) +			return -EOPNOTSUPP; +		if (!try_module_get(type->owner)) +			return -ENOENT; +		hookfn = type->hooks[hooknum]; + +		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); +		if (basechain == NULL) +			return -ENOMEM; + +		if (nla[NFTA_CHAIN_COUNTERS]) { +			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); +			if (IS_ERR(stats)) { +				module_put(type->owner); +				kfree(basechain); +				return PTR_ERR(stats); +			} +			basechain->stats = stats; +		} else { +		
	stats = netdev_alloc_pcpu_stats(struct nft_stats);
+			if (stats == NULL) {
+				module_put(type->owner);
+				kfree(basechain);
+				return -ENOMEM;
+			}
+			rcu_assign_pointer(basechain->stats, stats);
+		}
+
+		basechain->type = type;
+		chain = &basechain->chain;
+
+		for (i = 0; i < afi->nops; i++) {
+			ops = &basechain->ops[i];
+			ops->pf		= family;
+			ops->owner	= afi->owner;
+			ops->hooknum	= hooknum;
+			ops->priority	= priority;
+			ops->priv	= chain;
+			ops->hook	= afi->hooks[ops->hooknum];
+			if (hookfn)
+				ops->hook = hookfn;
+			if (afi->hook_ops_init)
+				afi->hook_ops_init(ops, i);
+		}
+
+		chain->flags |= NFT_BASE_CHAIN;
+		basechain->policy = policy;
+	} else {
+		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+		if (chain == NULL)
+			return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&chain->rules);
+	chain->handle = nf_tables_alloc_handle(table);
+	chain->net = net;
+	chain->table = table;
+	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+		if (err < 0)
+			goto err1;
+	}
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+	err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
+	if (err < 0)
+		goto err2;
+
+	table->use++;
+	list_add_tail_rcu(&chain->list, &table->chains);
+	return 0;
+err2:
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    chain->flags & NFT_BASE_CHAIN) {
+		nf_unregister_hooks(nft_base_chain(chain)->ops,
+				    afi->nops);
+	}
+err1:
+	nf_tables_chain_destroy(chain);
+	return err;
+}
+
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	struct nft_ctx ctx;
+	int err;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+	if (chain->flags & NFT_CHAIN_INACTIVE)
+		return -ENOENT;
+	if (chain->use > 0)
+		return -EBUSY;
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+	err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN);
+	if (err < 0)
+		return err;
+
+	table->use--;
+	list_del_rcu(&chain->list);
+	return 0;
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ *	nft_register_expr - register nf_tables expr type
+ *	@type: expr type
+ *
+ *	Registers the expr type for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	if (type->family == NFPROTO_UNSPEC)
+		list_add_tail_rcu(&type->list, &nf_tables_expressions);
+	else
+		list_add_rcu(&type->list, &nf_tables_expressions);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ *	nft_unregister_expr - unregister nf_tables expr type
+ *	@type: expr type
+ *
+ * 	Unregisters the expr type for use with nf_tables. 
+ */ +void nft_unregister_expr(struct nft_expr_type *type) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_del_rcu(&type->list); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_expr); + +static const struct nft_expr_type *__nft_expr_type_get(u8 family, +						       struct nlattr *nla) +{ +	const struct nft_expr_type *type; + +	list_for_each_entry(type, &nf_tables_expressions, list) { +		if (!nla_strcmp(nla, type->name) && +		    (!type->family || type->family == family)) +			return type; +	} +	return NULL; +} + +static const struct nft_expr_type *nft_expr_type_get(u8 family, +						     struct nlattr *nla) +{ +	const struct nft_expr_type *type; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	type = __nft_expr_type_get(family, nla); +	if (type != NULL && try_module_get(type->owner)) +		return type; + +#ifdef CONFIG_MODULES +	if (type == NULL) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-expr-%u-%.*s", family, +			       nla_len(nla), (char *)nla_data(nla)); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		if (__nft_expr_type_get(family, nla)) +			return ERR_PTR(-EAGAIN); + +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-expr-%.*s", +			       nla_len(nla), (char *)nla_data(nla)); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		if (__nft_expr_type_get(family, nla)) +			return ERR_PTR(-EAGAIN); +	} +#endif +	return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { +	[NFTA_EXPR_NAME]	= { .type = NLA_STRING }, +	[NFTA_EXPR_DATA]	= { .type = NLA_NESTED }, +}; + +static int nf_tables_fill_expr_info(struct sk_buff *skb, +				    const struct nft_expr *expr) +{ +	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) +		goto nla_put_failure; + +	if (expr->ops->dump) { +		struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA); +		if (data == NULL) +			goto nla_put_failure; +		if (expr->ops->dump(skb, expr) < 0) +			goto nla_put_failure; +		nla_nest_end(skb, data); +	} + +	return skb->len; + +nla_put_failure: +	return -1; +}; + +struct nft_expr_info { +	const struct nft_expr_ops	*ops; +	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nf_tables_expr_parse(const struct nft_ctx *ctx, +				const struct nlattr *nla, +				struct nft_expr_info *info) +{ +	const struct nft_expr_type *type; +	const struct nft_expr_ops *ops; +	struct nlattr *tb[NFTA_EXPR_MAX + 1]; +	int err; + +	err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy); +	if (err < 0) +		return err; + +	type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); +	if (IS_ERR(type)) +		return PTR_ERR(type); + +	if (tb[NFTA_EXPR_DATA]) { +		err = nla_parse_nested(info->tb, type->maxattr, +				       tb[NFTA_EXPR_DATA], type->policy); +		if (err < 0) +			goto err1; +	} else +		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1)); + +	if (type->select_ops != NULL) { +		ops = type->select_ops(ctx, +				       (const struct nlattr * const *)info->tb); +		if (IS_ERR(ops)) { +			err = PTR_ERR(ops); +			goto err1; +		} +	} else +		ops = type->ops; + +	info->ops = ops; +	return 0; + +err1: +	module_put(type->owner); +	return err; +} + +static int nf_tables_newexpr(const struct nft_ctx *ctx, +			     const struct nft_expr_info *info, +			     struct nft_expr *expr) +{ +	const struct nft_expr_ops *ops = info->ops; +	int err; + +	expr->ops = ops; +	if (ops->init) { +		err = ops->init(ctx, expr, (const struct nlattr **)info->tb); +		if (err < 0) +			goto err1; +	} + +	return 0; + +err1: +	expr->ops = NULL; +	return err; +} + 
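+/* Illustrative sketch, not part of this changeset: how a module might
+ * plug an expression type into the registration API above. The nft_foo
+ * names and NFTA_FOO_MAX are hypothetical placeholders.
+ *
+ *	static struct nft_expr_type nft_foo_type __read_mostly = {
+ *		.name		= "foo",
+ *		.ops		= &nft_foo_ops,
+ *		.policy		= nft_foo_policy,
+ *		.maxattr	= NFTA_FOO_MAX,
+ *		.owner		= THIS_MODULE,
+ *	};
+ *
+ * A module_init() would then call nft_register_expr(&nft_foo_type);
+ * afterwards nf_tables_expr_parse() can resolve "foo" from
+ * NFTA_EXPR_NAME and nf_tables_newexpr() invokes the type's init hook.
+ */
+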
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx, +				   struct nft_expr *expr) +{ +	if (expr->ops->destroy) +		expr->ops->destroy(ctx, expr); +	module_put(expr->ops->type->owner); +} + +/* + * Rules + */ + +static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, +						u64 handle) +{ +	struct nft_rule *rule; + +	// FIXME: this sucks +	list_for_each_entry(rule, &chain->rules, list) { +		if (handle == rule->handle) +			return rule; +	} + +	return ERR_PTR(-ENOENT); +} + +static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, +					      const struct nlattr *nla) +{ +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); +} + +static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { +	[NFTA_RULE_TABLE]	= { .type = NLA_STRING }, +	[NFTA_RULE_CHAIN]	= { .type = NLA_STRING, +				    .len = NFT_CHAIN_MAXNAMELEN - 1 }, +	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 }, +	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED }, +	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED }, +	[NFTA_RULE_POSITION]	= { .type = NLA_U64 }, +	[NFTA_RULE_USERDATA]	= { .type = NLA_BINARY, +				    .len = NFT_USERDATA_MAXLEN }, +}; + +static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, +				    int event, u32 flags, int family, +				    const struct nft_table *table, +				    const struct nft_chain *chain, +				    const struct nft_rule *rule) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	const struct nft_expr *expr, *next; +	struct nlattr *list; +	const struct nft_rule *prule; +	int type = event | NFNL_SUBSYS_NFTABLES << 8; + +	nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), +			flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) +		goto nla_put_failure; +	if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle))) +		goto nla_put_failure; + +	if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { +		prule = list_entry(rule->list.prev, struct nft_rule, list); +		if (nla_put_be64(skb, NFTA_RULE_POSITION, +				 cpu_to_be64(prule->handle))) +			goto nla_put_failure; +	} + +	list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS); +	if (list == NULL) +		goto nla_put_failure; +	nft_rule_for_each_expr(expr, next, rule) { +		struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM); +		if (elem == NULL) +			goto nla_put_failure; +		if (nf_tables_fill_expr_info(skb, expr) < 0) +			goto nla_put_failure; +		nla_nest_end(skb, elem); +	} +	nla_nest_end(skb, list); + +	if (rule->ulen && +	    nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_rule_notify(const struct nft_ctx *ctx, +				 const struct nft_rule *rule, +				 int event) +{ +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, +				       ctx->afi->family, ctx->table, +				       ctx->chain, rule); +	if (err < 0) { +		
kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, +			     ctx->report, GFP_KERNEL); +err: +	if (err < 0) { +		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, +				  err); +	} +	return err; +} + +static inline bool +nft_rule_is_active(struct net *net, const struct nft_rule *rule) +{ +	return (rule->genmask & (1 << net->nft.gencursor)) == 0; +} + +static inline int gencursor_next(struct net *net) +{ +	return net->nft.gencursor+1 == 1 ? 1 : 0; +} + +static inline int +nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) +{ +	return (rule->genmask & (1 << gencursor_next(net))) == 0; +} + +static inline void +nft_rule_activate_next(struct net *net, struct nft_rule *rule) +{ +	/* Now inactive, will be active in the future */ +	rule->genmask = (1 << net->nft.gencursor); +} + +static inline void +nft_rule_disactivate_next(struct net *net, struct nft_rule *rule) +{ +	rule->genmask = (1 << gencursor_next(net)); +} + +static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) +{ +	rule->genmask = 0; +} + +static int nf_tables_dump_rules(struct sk_buff *skb, +				struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	const struct nft_rule *rule; +	unsigned int idx = 0, s_idx = cb->args[0]; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (family != NFPROTO_UNSPEC && family != afi->family) +			continue; + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			list_for_each_entry_rcu(chain, &table->chains, list) { +				list_for_each_entry_rcu(rule, &chain->rules, list) { +					if (!nft_rule_is_active(net, rule)) +						goto cont; +					if (idx < s_idx) +						goto cont; +					if (idx > s_idx) +						memset(&cb->args[1], 0, +						       sizeof(cb->args) - sizeof(cb->args[0])); +					if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, +								      cb->nlh->nlmsg_seq, +								      NFT_MSG_NEWRULE, +								      NLM_F_MULTI | NLM_F_APPEND, +								      afi->family, table, chain, rule) < 0) +						goto done; + +					nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +					idx++; +				} +			} +		} +	} +done: +	rcu_read_unlock(); + +	cb->args[0] = idx; +	return skb->len; +} + +static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_af_info *afi; +	const struct nft_table *table; +	const struct nft_chain *chain; +	const struct nft_rule *rule; +	struct sk_buff *skb2; +	struct net *net = sock_net(skb->sk); +	int family = nfmsg->nfgen_family; +	int err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_rules, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); +	if (chain->flags & NFT_CHAIN_INACTIVE) +		return -ENOENT; + +	
rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); +	if (IS_ERR(rule)) +		return PTR_ERR(rule); + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb2) +		return -ENOMEM; + +	err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, +				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, +				       family, table, chain, rule); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static void nf_tables_rule_destroy(const struct nft_ctx *ctx, +				   struct nft_rule *rule) +{ +	struct nft_expr *expr; + +	/* +	 * Careful: some expressions might not be initialized in case this +	 * is called on error from nf_tables_newrule(). +	 */ +	expr = nft_expr_first(rule); +	while (expr->ops && expr != nft_expr_last(rule)) { +		nf_tables_expr_destroy(ctx, expr); +		expr = nft_expr_next(expr); +	} +	kfree(rule); +} + +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, +					    struct nft_rule *rule) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); +	if (trans == NULL) +		return NULL; + +	nft_trans_rule(trans) = rule; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); + +	return trans; +} + +#define NFT_RULE_MAXEXPRS	128 + +static struct nft_expr_info *info; + +static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct net *net = sock_net(skb->sk); +	struct nft_table *table; +	struct nft_chain *chain; +	struct nft_rule *rule, *old_rule = NULL; +	struct nft_trans *trans = NULL; +	struct nft_expr *expr; +	struct nft_ctx ctx; +	struct nlattr *tmp; +	unsigned int size, i, n, ulen = 0; +	int err, rem; +	bool create; +	u64 handle, pos_handle; + +	create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + +	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); + +	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); +	if (IS_ERR(chain)) +		return PTR_ERR(chain); + +	if (nla[NFTA_RULE_HANDLE]) { +		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); +		rule = __nf_tables_rule_lookup(chain, handle); +		if (IS_ERR(rule)) +			return PTR_ERR(rule); + +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			old_rule = rule; +		else +			return -EOPNOTSUPP; +	} else { +		if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EINVAL; +		handle = nf_tables_alloc_handle(table); + +		if (chain->use == UINT_MAX) +			return -EOVERFLOW; +	} + +	if (nla[NFTA_RULE_POSITION]) { +		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) +			return -EOPNOTSUPP; + +		pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); +		old_rule = __nf_tables_rule_lookup(chain, pos_handle); +		if (IS_ERR(old_rule)) +			return PTR_ERR(old_rule); +	} + +	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + +	n = 0; +	size = 0; +	if (nla[NFTA_RULE_EXPRESSIONS]) { +		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { +			err = -EINVAL; +			if (nla_type(tmp) != NFTA_LIST_ELEM) +				goto err1; +			if (n == NFT_RULE_MAXEXPRS) +				goto err1; +			err = nf_tables_expr_parse(&ctx, tmp, &info[n]); +			if (err < 0) +				goto err1; +			size += info[n].ops->size; +			n++; +		} +	} + +	if (nla[NFTA_RULE_USERDATA]) +		ulen = nla_len(nla[NFTA_RULE_USERDATA]); + +	err = -ENOMEM; +	rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); +	if (rule == NULL) +		goto err1; + +	nft_rule_activate_next(net, rule); + +	rule->handle = handle; +	rule->dlen   = size; +	rule->ulen   = ulen; + +	if (ulen) +		nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + +	expr = nft_expr_first(rule); +	for (i = 0; i < n; i++) { +		err = nf_tables_newexpr(&ctx, &info[i], expr); +		if (err < 0) +			goto err2; +		info[i].ops = NULL; +		expr = nft_expr_next(expr); +	} + +	if (nlh->nlmsg_flags & NLM_F_REPLACE) { +		if (nft_rule_is_active_next(net, old_rule)) { +			trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, +						   old_rule); +			if (trans == NULL) { +				err = -ENOMEM; +				goto err2; +			} +			nft_rule_disactivate_next(net, old_rule); +			chain->use--; +			list_add_tail_rcu(&rule->list, &old_rule->list); +		} else { +			err = -ENOENT; +			goto err2; +		} +	} else if (nlh->nlmsg_flags & NLM_F_APPEND) +		if (old_rule) +			list_add_rcu(&rule->list, &old_rule->list); +		else +			list_add_tail_rcu(&rule->list, &chain->rules); +	else { +		if (old_rule) +			list_add_tail_rcu(&rule->list, &old_rule->list); +		else +			list_add_rcu(&rule->list, &chain->rules); +	} + +	if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { +		err = -ENOMEM; +		goto err3; +	} +	chain->use++; +	return 0; + +err3: +	list_del_rcu(&rule->list); +	if (trans) { +		list_del_rcu(&nft_trans_rule(trans)->list); +		nft_rule_clear(net, nft_trans_rule(trans)); +		nft_trans_destroy(trans); +		chain->use++; +	} +err2: +	nf_tables_rule_destroy(&ctx, rule); +err1: +	for (i = 0; i < n; i++) { +		if (info[i].ops != NULL) +			module_put(info[i].ops->type->owner); +	} +	return err; +} + +static int +nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) +{ +	/* You cannot delete the same rule twice */ +	if 
(nft_rule_is_active_next(ctx->net, rule)) { +		if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) +			return -ENOMEM; +		nft_rule_disactivate_next(ctx->net, rule); +		ctx->chain->use--; +		return 0; +	} +	return -ENOENT; +} + +static int nf_table_delrule_by_chain(struct nft_ctx *ctx) +{ +	struct nft_rule *rule; +	int err; + +	list_for_each_entry(rule, &ctx->chain->rules, list) { +		err = nf_tables_delrule_one(ctx, rule); +		if (err < 0) +			return err; +	} +	return 0; +} + +static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, +			     const struct nlmsghdr *nlh, +			     const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct net *net = sock_net(skb->sk); +	struct nft_table *table; +	struct nft_chain *chain = NULL; +	struct nft_rule *rule; +	int family = nfmsg->nfgen_family, err = 0; +	struct nft_ctx ctx; + +	afi = nf_tables_afinfo_lookup(net, family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (table->flags & NFT_TABLE_INACTIVE) +		return -ENOENT; + +	if (nla[NFTA_RULE_CHAIN]) { +		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); +		if (IS_ERR(chain)) +			return PTR_ERR(chain); +	} + +	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + +	if (chain) { +		if (nla[NFTA_RULE_HANDLE]) { +			rule = nf_tables_rule_lookup(chain, +						     nla[NFTA_RULE_HANDLE]); +			if (IS_ERR(rule)) +				return PTR_ERR(rule); + +			err = nf_tables_delrule_one(&ctx, rule); +		} else { +			err = nf_table_delrule_by_chain(&ctx); +		} +	} else { +		list_for_each_entry(chain, &table->chains, list) { +			ctx.chain = chain; +			err = nf_table_delrule_by_chain(&ctx); +			if (err < 0) +				break; +		} +	} + +	return err; +} + +/* + * Sets + */ + +static LIST_HEAD(nf_tables_set_ops); + +int nft_register_set(struct nft_set_ops *ops) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_add_tail_rcu(&ops->list, &nf_tables_set_ops); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +	return 0; +} +EXPORT_SYMBOL_GPL(nft_register_set); + +void nft_unregister_set(struct nft_set_ops *ops) +{ +	nfnl_lock(NFNL_SUBSYS_NFTABLES); +	list_del_rcu(&ops->list); +	nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_set); + +/* + * Select a set implementation based on the data characteristics and the + * given policy. The total memory use might not be known if no size is + * given, in that case the amount of memory per element is used. 
+ */ +static const struct nft_set_ops * +nft_select_set_ops(const struct nlattr * const nla[], +		   const struct nft_set_desc *desc, +		   enum nft_set_policies policy) +{ +	const struct nft_set_ops *ops, *bops; +	struct nft_set_estimate est, best; +	u32 features; + +#ifdef CONFIG_MODULES +	if (list_empty(&nf_tables_set_ops)) { +		nfnl_unlock(NFNL_SUBSYS_NFTABLES); +		request_module("nft-set"); +		nfnl_lock(NFNL_SUBSYS_NFTABLES); +		if (!list_empty(&nf_tables_set_ops)) +			return ERR_PTR(-EAGAIN); +	} +#endif +	features = 0; +	if (nla[NFTA_SET_FLAGS] != NULL) { +		features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); +		features &= NFT_SET_INTERVAL | NFT_SET_MAP; +	} + +	bops	   = NULL; +	best.size  = ~0; +	best.class = ~0; + +	list_for_each_entry(ops, &nf_tables_set_ops, list) { +		if ((ops->features & features) != features) +			continue; +		if (!ops->estimate(desc, features, &est)) +			continue; + +		switch (policy) { +		case NFT_SET_POL_PERFORMANCE: +			if (est.class < best.class) +				break; +			if (est.class == best.class && est.size < best.size) +				break; +			continue; +		case NFT_SET_POL_MEMORY: +			if (est.size < best.size) +				break; +			if (est.size == best.size && est.class < best.class) +				break; +			continue; +		default: +			break; +		} + +		if (!try_module_get(ops->owner)) +			continue; +		if (bops != NULL) +			module_put(bops->owner); + +		bops = ops; +		best = est; +	} + +	if (bops != NULL) +		return bops; + +	return ERR_PTR(-EOPNOTSUPP); +} + +static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { +	[NFTA_SET_TABLE]		= { .type = NLA_STRING }, +	[NFTA_SET_NAME]			= { .type = NLA_STRING, +					    .len = IFNAMSIZ - 1 }, +	[NFTA_SET_FLAGS]		= { .type = NLA_U32 }, +	[NFTA_SET_KEY_TYPE]		= { .type = NLA_U32 }, +	[NFTA_SET_KEY_LEN]		= { .type = NLA_U32 }, +	[NFTA_SET_DATA_TYPE]		= { .type = NLA_U32 }, +	[NFTA_SET_DATA_LEN]		= { .type = NLA_U32 }, +	[NFTA_SET_POLICY]		= { .type = NLA_U32 }, +	[NFTA_SET_DESC]			= { .type = NLA_NESTED }, +	[NFTA_SET_ID]			= { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { +	[NFTA_SET_DESC_SIZE]		= { .type = NLA_U32 }, +}; + +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, +				     const struct sk_buff *skb, +				     const struct nlmsghdr *nlh, +				     const struct nlattr * const nla[]) +{ +	struct net *net = sock_net(skb->sk); +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi = NULL; +	struct nft_table *table = NULL; + +	if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { +		afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); +		if (IS_ERR(afi)) +			return PTR_ERR(afi); +	} + +	if (nla[NFTA_SET_TABLE] != NULL) { +		if (afi == NULL) +			return -EAFNOSUPPORT; + +		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); +		if (IS_ERR(table)) +			return PTR_ERR(table); +		if (table->flags & NFT_TABLE_INACTIVE) +			return -ENOENT; +	} + +	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); +	return 0; +} + +struct nft_set *nf_tables_set_lookup(const struct nft_table *table, +				     const struct nlattr *nla) +{ +	struct nft_set *set; + +	if (nla == NULL) +		return ERR_PTR(-EINVAL); + +	list_for_each_entry(set, &table->sets, list) { +		if (!nla_strcmp(nla, set->name)) +			return set; +	} +	return ERR_PTR(-ENOENT); +} + +struct nft_set *nf_tables_set_lookup_byid(const struct net *net, +					  const struct nlattr *nla) +{ +	struct nft_trans *trans; +	u32 id = ntohl(nla_get_be32(nla)); + +	list_for_each_entry(trans, 
&net->nft.commit_list, list) { +		if (trans->msg_type == NFT_MSG_NEWSET && +		    id == nft_trans_set_id(trans)) +			return nft_trans_set(trans); +	} +	return ERR_PTR(-ENOENT); +} + +static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, +				    const char *name) +{ +	const struct nft_set *i; +	const char *p; +	unsigned long *inuse; +	unsigned int n = 0, min = 0; + +	p = strnchr(name, IFNAMSIZ, '%'); +	if (p != NULL) { +		if (p[1] != 'd' || strchr(p + 2, '%')) +			return -EINVAL; + +		inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); +		if (inuse == NULL) +			return -ENOMEM; +cont: +		list_for_each_entry(i, &ctx->table->sets, list) { +			int tmp; + +			if (!sscanf(i->name, name, &tmp)) +				continue; +			if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) +				continue; + +			set_bit(tmp - min, inuse); +		} + +		n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); +		if (n >= BITS_PER_BYTE * PAGE_SIZE) { +			min += BITS_PER_BYTE * PAGE_SIZE; +			memset(inuse, 0, PAGE_SIZE); +			goto cont; +		} +		free_page((unsigned long)inuse); +	} + +	snprintf(set->name, sizeof(set->name), name, min + n); +	list_for_each_entry(i, &ctx->table->sets, list) { +		if (!strcmp(set->name, i->name)) +			return -ENFILE; +	} +	return 0; +} + +static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, +			      const struct nft_set *set, u16 event, u16 flags) +{ +	struct nfgenmsg *nfmsg; +	struct nlmsghdr *nlh; +	struct nlattr *desc; +	u32 portid = ctx->portid; +	u32 seq = ctx->seq; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), +			flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= ctx->afi->family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_SET_NAME, set->name)) +		goto nla_put_failure; +	if (set->flags != 0) +		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) +			goto nla_put_failure; + +	if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen))) +		goto nla_put_failure; +	if (set->flags & NFT_SET_MAP) { +		if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) +			goto nla_put_failure; +	} + +	desc = nla_nest_start(skb, NFTA_SET_DESC); +	if (desc == NULL) +		goto nla_put_failure; +	if (set->size && +	    nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) +		goto nla_put_failure; +	nla_nest_end(skb, desc); + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_set_notify(const struct nft_ctx *ctx, +				const struct nft_set *set, +				int event, gfp_t gfp_flags) +{ +	struct sk_buff *skb; +	u32 portid = ctx->portid; +	int err; + +	if (!ctx->report && +	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_set(skb, ctx, set, event, 0); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, +			     ctx->report, gfp_flags); +err: +	if (err < 0) +		nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); +	return err; +} + +static int 
nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, +				     struct netlink_callback *cb) +{ +	const struct nft_set *set; +	unsigned int idx = 0, s_idx = cb->args[0]; + +	if (cb->args[1]) +		return skb->len; + +	rcu_read_lock(); +	cb->seq = ctx->net->nft.base_seq; + +	list_for_each_entry_rcu(set, &ctx->table->sets, list) { +		if (idx < s_idx) +			goto cont; +		if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, +				       NLM_F_MULTI) < 0) { +			cb->args[0] = idx; +			goto done; +		} +		nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +		idx++; +	} +	cb->args[1] = 1; +done: +	rcu_read_unlock(); +	return skb->len; +} + +static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, +				      struct netlink_callback *cb) +{ +	const struct nft_set *set; +	unsigned int idx, s_idx = cb->args[0]; +	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + +	if (cb->args[1]) +		return skb->len; + +	rcu_read_lock(); +	cb->seq = ctx->net->nft.base_seq; + +	list_for_each_entry_rcu(table, &ctx->afi->tables, list) { +		if (cur_table) { +			if (cur_table != table) +				continue; + +			cur_table = NULL; +		} +		ctx->table = table; +		idx = 0; +		list_for_each_entry_rcu(set, &ctx->table->sets, list) { +			if (idx < s_idx) +				goto cont; +			if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, +					       NLM_F_MULTI) < 0) { +				cb->args[0] = idx; +				cb->args[2] = (unsigned long) table; +				goto done; +			} +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +	} +	cb->args[1] = 1; +done: +	rcu_read_unlock(); +	return skb->len; +} + +static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, +				   struct netlink_callback *cb) +{ +	const struct nft_set *set; +	unsigned int idx, s_idx = cb->args[0]; +	struct nft_af_info *afi; +	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; +	struct net *net = sock_net(skb->sk); +	int cur_family = cb->args[3]; + +	if (cb->args[1]) +		return skb->len; + +	rcu_read_lock(); +	cb->seq = net->nft.base_seq; + +	list_for_each_entry_rcu(afi, &net->nft.af_info, list) { +		if (cur_family) { +			if (afi->family != cur_family) +				continue; + +			cur_family = 0; +		} + +		list_for_each_entry_rcu(table, &afi->tables, list) { +			if (cur_table) { +				if (cur_table != table) +					continue; + +				cur_table = NULL; +			} + +			ctx->table = table; +			ctx->afi = afi; +			idx = 0; +			list_for_each_entry_rcu(set, &ctx->table->sets, list) { +				if (idx < s_idx) +					goto cont; +				if (nf_tables_fill_set(skb, ctx, set, +						       NFT_MSG_NEWSET, +						       NLM_F_MULTI) < 0) { +					cb->args[0] = idx; +					cb->args[2] = (unsigned long) table; +					cb->args[3] = afi->family; +					goto done; +				} +				nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +				idx++; +			} +			if (s_idx) +				s_idx = 0; +		} +	} +	cb->args[1] = 1; +done: +	rcu_read_unlock(); +	return skb->len; +} + +static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); +	struct nlattr *nla[NFTA_SET_MAX + 1]; +	struct nft_ctx ctx; +	int err, ret; + +	err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX, +			  nft_set_policy); +	if (err < 0) +		return err; + +	err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla); +	if (err < 0) +		return err; + +	if (ctx.table == NULL) { +		if (ctx.afi == NULL) +			ret = nf_tables_dump_sets_all(&ctx, skb, cb); +		else +			ret = 
nf_tables_dump_sets_family(&ctx, skb, cb); +	} else +		ret = nf_tables_dump_sets_table(&ctx, skb, cb); + +	return ret; +} + +#define NFT_SET_INACTIVE	(1 << 15)	/* Internal set flag */ + +static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, +			    const struct nlmsghdr *nlh, +			    const struct nlattr * const nla[]) +{ +	const struct nft_set *set; +	struct nft_ctx ctx; +	struct sk_buff *skb2; +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	int err; + +	/* Verify existence before starting dump */ +	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); +	if (err < 0) +		return err; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_sets, +		}; +		return netlink_dump_start(nlsk, skb, nlh, &c); +	} + +	/* Only accept unspec with dump */ +	if (nfmsg->nfgen_family == NFPROTO_UNSPEC) +		return -EAFNOSUPPORT; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		return -ENOENT; + +	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); +	if (err < 0) +		goto err; + +	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: +	kfree_skb(skb2); +	return err; +} + +static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, +				    struct nft_set_desc *desc, +				    const struct nlattr *nla) +{ +	struct nlattr *da[NFTA_SET_DESC_MAX + 1]; +	int err; + +	err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy); +	if (err < 0) +		return err; + +	if (da[NFTA_SET_DESC_SIZE] != NULL) +		desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + +	return 0; +} + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, +			     struct nft_set *set) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); +	if (trans == NULL) +		return -ENOMEM; + +	if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { +		nft_trans_set_id(trans) = +			ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); +		set->flags |= NFT_SET_INACTIVE; +	} +	nft_trans_set(trans) = set; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); + +	return 0; +} + +static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, +			    const struct nlmsghdr *nlh, +			    const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	const struct nft_set_ops *ops; +	struct nft_af_info *afi; +	struct net *net = sock_net(skb->sk); +	struct nft_table *table; +	struct nft_set *set; +	struct nft_ctx ctx; +	char name[IFNAMSIZ]; +	unsigned int size; +	bool create; +	u32 ktype, dtype, flags, policy; +	struct nft_set_desc desc; +	int err; + +	if (nla[NFTA_SET_TABLE] == NULL || +	    nla[NFTA_SET_NAME] == NULL || +	    nla[NFTA_SET_KEY_LEN] == NULL || +	    nla[NFTA_SET_ID] == NULL) +		return -EINVAL; + +	memset(&desc, 0, sizeof(desc)); + +	ktype = NFT_DATA_VALUE; +	if (nla[NFTA_SET_KEY_TYPE] != NULL) { +		ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); +		if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) +			return -EINVAL; +	} + +	desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); +	if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) +		return -EINVAL; + +	flags = 0; +	if (nla[NFTA_SET_FLAGS] != NULL) { +		flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); +		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | +			      NFT_SET_INTERVAL
| NFT_SET_MAP)) +			return -EINVAL; +	} + +	dtype = 0; +	if (nla[NFTA_SET_DATA_TYPE] != NULL) { +		if (!(flags & NFT_SET_MAP)) +			return -EINVAL; + +		dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); +		if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && +		    dtype != NFT_DATA_VERDICT) +			return -EINVAL; + +		if (dtype != NFT_DATA_VERDICT) { +			if (nla[NFTA_SET_DATA_LEN] == NULL) +				return -EINVAL; +			desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); +			if (desc.dlen == 0 || +			    desc.dlen > FIELD_SIZEOF(struct nft_data, data)) +				return -EINVAL; +		} else +			desc.dlen = sizeof(struct nft_data); +	} else if (flags & NFT_SET_MAP) +		return -EINVAL; + +	policy = NFT_SET_POL_PERFORMANCE; +	if (nla[NFTA_SET_POLICY] != NULL) +		policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + +	if (nla[NFTA_SET_DESC] != NULL) { +		err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); +		if (err < 0) +			return err; +	} + +	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + +	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); + +	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + +	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); +	if (IS_ERR(set)) { +		if (PTR_ERR(set) != -ENOENT) +			return PTR_ERR(set); +		set = NULL; +	} + +	if (set != NULL) { +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; +		if (nlh->nlmsg_flags & NLM_F_REPLACE) +			return -EOPNOTSUPP; +		return 0; +	} + +	if (!(nlh->nlmsg_flags & NLM_F_CREATE)) +		return -ENOENT; + +	ops = nft_select_set_ops(nla, &desc, policy); +	if (IS_ERR(ops)) +		return PTR_ERR(ops); + +	size = 0; +	if (ops->privsize != NULL) +		size = ops->privsize(nla); + +	err = -ENOMEM; +	set = kzalloc(sizeof(*set) + size, GFP_KERNEL); +	if (set == NULL) +		goto err1; + +	nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); +	err = nf_tables_set_alloc_name(&ctx, set, name); +	if (err < 0) +		goto err2; + +	INIT_LIST_HEAD(&set->bindings); +	set->ops   = ops; +	set->ktype = ktype; +	set->klen  = desc.klen; +	set->dtype = dtype; +	set->dlen  = desc.dlen; +	set->flags = flags; +	set->size  = desc.size; + +	err = ops->init(set, &desc, nla); +	if (err < 0) +		goto err2; + +	err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); +	if (err < 0) +		goto err2; + +	list_add_tail_rcu(&set->list, &table->sets); +	table->use++; +	return 0; + +err2: +	kfree(set); +err1: +	module_put(ops->owner); +	return err; +} + +static void nft_set_destroy(struct nft_set *set) +{ +	set->ops->destroy(set); +	module_put(set->ops->owner); +	kfree(set); +} + +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ +	list_del_rcu(&set->list); +	nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); +	nft_set_destroy(set); +} + +static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, +			    const struct nlmsghdr *nlh, +			    const struct nlattr * const nla[]) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_set *set; +	struct nft_ctx ctx; +	int err; + +	if (nfmsg->nfgen_family == NFPROTO_UNSPEC) +		return -EAFNOSUPPORT; +	if (nla[NFTA_SET_TABLE] == NULL) +		return -EINVAL; + +	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		
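/* created in a still-uncommitted transaction, not visible yet */ +		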
return -ENOENT; +	if (!list_empty(&set->bindings)) +		return -EBUSY; + +	err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); +	if (err < 0) +		return err; + +	list_del_rcu(&set->list); +	ctx.table->use--; +	return 0; +} + +static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, +					const struct nft_set *set, +					const struct nft_set_iter *iter, +					const struct nft_set_elem *elem) +{ +	enum nft_registers dreg; + +	dreg = nft_type_to_reg(set->dtype); +	return nft_validate_data_load(ctx, dreg, &elem->data, +				      set->dtype == NFT_DATA_VERDICT ? +				      NFT_DATA_VERDICT : NFT_DATA_VALUE); +} + +int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, +		       struct nft_set_binding *binding) +{ +	struct nft_set_binding *i; +	struct nft_set_iter iter; + +	if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) +		return -EBUSY; + +	if (set->flags & NFT_SET_MAP) { +		/* If the set is already bound to the same chain all +		 * jumps are already validated for that chain. +		 */ +		list_for_each_entry(i, &set->bindings, list) { +			if (i->chain == binding->chain) +				goto bind; +		} + +		iter.skip 	= 0; +		iter.count	= 0; +		iter.err	= 0; +		iter.fn		= nf_tables_bind_check_setelem; + +		set->ops->walk(ctx, set, &iter); +		if (iter.err < 0) { +			/* Destroy anonymous sets if binding fails */ +			if (set->flags & NFT_SET_ANONYMOUS) +				nf_tables_set_destroy(ctx, set); + +			return iter.err; +		} +	} +bind: +	binding->chain = ctx->chain; +	list_add_tail_rcu(&binding->list, &set->bindings); +	return 0; +} + +void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, +			  struct nft_set_binding *binding) +{ +	list_del_rcu(&binding->list); + +	if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && +	    !(set->flags & NFT_SET_INACTIVE)) +		nf_tables_set_destroy(ctx, set); +} + +/* + * Set elements + */ + +static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { +	[NFTA_SET_ELEM_KEY]		= { .type = NLA_NESTED }, +	[NFTA_SET_ELEM_DATA]		= { .type = NLA_NESTED }, +	[NFTA_SET_ELEM_FLAGS]		= { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { +	[NFTA_SET_ELEM_LIST_TABLE]	= { .type = NLA_STRING }, +	[NFTA_SET_ELEM_LIST_SET]	= { .type = NLA_STRING }, +	[NFTA_SET_ELEM_LIST_ELEMENTS]	= { .type = NLA_NESTED }, +	[NFTA_SET_ELEM_LIST_SET_ID]	= { .type = NLA_U32 }, +}; + +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +				      const struct sk_buff *skb, +				      const struct nlmsghdr *nlh, +				      const struct nlattr * const nla[], +				      bool trans) +{ +	const struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nft_af_info *afi; +	struct nft_table *table; +	struct net *net = sock_net(skb->sk); + +	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); +	if (IS_ERR(afi)) +		return PTR_ERR(afi); + +	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); +	if (IS_ERR(table)) +		return PTR_ERR(table); +	if (!trans && (table->flags & NFT_TABLE_INACTIVE)) +		return -ENOENT; + +	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); +	return 0; +} + +static int nf_tables_fill_setelem(struct sk_buff *skb, +				  const struct nft_set *set, +				  const struct nft_set_elem *elem) +{ +	unsigned char *b = skb_tail_pointer(skb); +	struct nlattr *nest; + +	nest = nla_nest_start(skb, NFTA_LIST_ELEM); +	if (nest == NULL) +		goto nla_put_failure; + +	if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE, +			  
set->klen) < 0) +		goto nla_put_failure; + +	if (set->flags & NFT_SET_MAP && +	    !(elem->flags & NFT_SET_ELEM_INTERVAL_END) && +	    nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data, +			  set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, +			  set->dlen) < 0) +		goto nla_put_failure; + +	if (elem->flags != 0) +		if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags))) +			goto nla_put_failure; + +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	nlmsg_trim(skb, b); +	return -EMSGSIZE; +} + +struct nft_set_dump_args { +	const struct netlink_callback	*cb; +	struct nft_set_iter		iter; +	struct sk_buff			*skb; +}; + +static int nf_tables_dump_setelem(const struct nft_ctx *ctx, +				  const struct nft_set *set, +				  const struct nft_set_iter *iter, +				  const struct nft_set_elem *elem) +{ +	struct nft_set_dump_args *args; + +	args = container_of(iter, struct nft_set_dump_args, iter); +	return nf_tables_fill_setelem(args->skb, set, elem); +} + +static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) +{ +	const struct nft_set *set; +	struct nft_set_dump_args args; +	struct nft_ctx ctx; +	struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; +	struct nfgenmsg *nfmsg; +	struct nlmsghdr *nlh; +	struct nlattr *nest; +	u32 portid, seq; +	int event, err; + +	err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, +			  NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy); +	if (err < 0) +		return err; + +	err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, +					 false); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		return -ENOENT; + +	event  = NFT_MSG_NEWSETELEM; +	event |= NFNL_SUBSYS_NFTABLES << 8; +	portid = NETLINK_CB(cb->skb).portid; +	seq    = cb->nlh->nlmsg_seq; + +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), +			NLM_F_MULTI); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = ctx.afi->family; +	nfmsg->version      = NFNETLINK_V0; +	nfmsg->res_id       = 0; + +	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) +		goto nla_put_failure; + +	nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); +	if (nest == NULL) +		goto nla_put_failure; + +	args.cb		= cb; +	args.skb	= skb; +	args.iter.skip	= cb->args[0]; +	args.iter.count	= 0; +	args.iter.err   = 0; +	args.iter.fn	= nf_tables_dump_setelem; +	set->ops->walk(&ctx, set, &args.iter); + +	nla_nest_end(skb, nest); +	nlmsg_end(skb, nlh); + +	if (args.iter.err && args.iter.err != -EMSGSIZE) +		return args.iter.err; +	if (args.iter.count == cb->args[0]) +		return 0; + +	cb->args[0] = args.iter.count; +	return skb->len; + +nla_put_failure: +	return -ENOSPC; +} + +static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				const struct nlattr * const nla[]) +{ +	const struct nft_set *set; +	struct nft_ctx ctx; +	int err; + +	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (set->flags & NFT_SET_INACTIVE) +		return -ENOENT; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nf_tables_dump_set, +		}; +		return 
netlink_dump_start(nlsk, skb, nlh, &c); +	} +	return -EOPNOTSUPP; +} + +static int nf_tables_fill_setelem_info(struct sk_buff *skb, +				       const struct nft_ctx *ctx, u32 seq, +				       u32 portid, int event, u16 flags, +				       const struct nft_set *set, +				       const struct nft_set_elem *elem) +{ +	struct nfgenmsg *nfmsg; +	struct nlmsghdr *nlh; +	struct nlattr *nest; +	int err; + +	event |= NFNL_SUBSYS_NFTABLES << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), +			flags); +	if (nlh == NULL) +		goto nla_put_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family	= ctx->afi->family; +	nfmsg->version		= NFNETLINK_V0; +	nfmsg->res_id		= 0; + +	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) +		goto nla_put_failure; +	if (nla_put_string(skb, NFTA_SET_NAME, set->name)) +		goto nla_put_failure; + +	nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); +	if (nest == NULL) +		goto nla_put_failure; + +	err = nf_tables_fill_setelem(skb, set, elem); +	if (err < 0) +		goto nla_put_failure; + +	nla_nest_end(skb, nest); + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_trim(skb, nlh); +	return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, +				    const struct nft_set *set, +				    const struct nft_set_elem *elem, +				    int event, u16 flags) +{ +	struct net *net = ctx->net; +	u32 portid = ctx->portid; +	struct sk_buff *skb; +	int err; + +	if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) +		return 0; + +	err = -ENOBUFS; +	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +	if (skb == NULL) +		goto err; + +	err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, +					  set, elem); +	if (err < 0) { +		kfree_skb(skb); +		goto err; +	} + +	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, +			     GFP_KERNEL); +err: +	if (err < 0) +		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); +	return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, +					      int msg_type, +					      struct nft_set *set) +{ +	struct nft_trans *trans; + +	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); +	if (trans == NULL) +		return NULL; + +	nft_trans_elem_set(trans) = set; +	return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, +			    const struct nlattr *attr) +{ +	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; +	struct nft_data_desc d1, d2; +	struct nft_set_elem elem; +	struct nft_set_binding *binding; +	enum nft_registers dreg; +	struct nft_trans *trans; +	int err; + +	if (set->size && set->nelems == set->size) +		return -ENFILE; + +	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, +			       nft_set_elem_policy); +	if (err < 0) +		return err; + +	if (nla[NFTA_SET_ELEM_KEY] == NULL) +		return -EINVAL; + +	elem.flags = 0; +	if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { +		elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); +		if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END) +			return -EINVAL; +	} + +	if (set->flags & NFT_SET_MAP) { +		if (nla[NFTA_SET_ELEM_DATA] == NULL && +		    !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) +			return -EINVAL; +		if (nla[NFTA_SET_ELEM_DATA] != NULL && +		    elem.flags & NFT_SET_ELEM_INTERVAL_END) +			return -EINVAL; +	} else { +		if (nla[NFTA_SET_ELEM_DATA] != NULL) +			return -EINVAL; +	} + +	err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); +	if (err < 0) +		goto err1; +	err = -EINVAL; +	if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) +		
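/* element keys must be plain values of the set's key length */ +		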
goto err2; + +	err = -EEXIST; +	if (set->ops->get(set, &elem) == 0) +		goto err2; + +	if (nla[NFTA_SET_ELEM_DATA] != NULL) { +		err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]); +		if (err < 0) +			goto err2; + +		err = -EINVAL; +		if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) +			goto err3; + +		dreg = nft_type_to_reg(set->dtype); +		list_for_each_entry(binding, &set->bindings, list) { +			struct nft_ctx bind_ctx = { +				.afi	= ctx->afi, +				.table	= ctx->table, +				.chain	= (struct nft_chain *)binding->chain, +			}; + +			err = nft_validate_data_load(&bind_ctx, dreg, +						     &elem.data, d2.type); +			if (err < 0) +				goto err3; +		} +	} + +	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); +	if (trans == NULL) +		goto err3; + +	err = set->ops->insert(set, &elem); +	if (err < 0) +		goto err4; + +	nft_trans_elem(trans) = elem; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); +	return 0; + +err4: +	kfree(trans); +err3: +	if (nla[NFTA_SET_ELEM_DATA] != NULL) +		nft_data_uninit(&elem.data, d2.type); +err2: +	nft_data_uninit(&elem.key, d1.type); +err1: +	return err; +} + +static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				const struct nlattr * const nla[]) +{ +	struct net *net = sock_net(skb->sk); +	const struct nlattr *attr; +	struct nft_set *set; +	struct nft_ctx ctx; +	int rem, err = 0; + +	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) { +		if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { +			set = nf_tables_set_lookup_byid(net, +					nla[NFTA_SET_ELEM_LIST_SET_ID]); +		} +		if (IS_ERR(set)) +			return PTR_ERR(set); +	} + +	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) +		return -EBUSY; + +	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { +		err = nft_add_set_elem(&ctx, set, attr); +		if (err < 0) +			break; + +		set->nelems++; +	} +	return err; +} + +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, +			   const struct nlattr *attr) +{ +	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; +	struct nft_data_desc desc; +	struct nft_set_elem elem; +	struct nft_trans *trans; +	int err; + +	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, +			       nft_set_elem_policy); +	if (err < 0) +		goto err1; + +	err = -EINVAL; +	if (nla[NFTA_SET_ELEM_KEY] == NULL) +		goto err1; + +	err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]); +	if (err < 0) +		goto err1; + +	err = -EINVAL; +	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) +		goto err2; + +	err = set->ops->get(set, &elem); +	if (err < 0) +		goto err2; + +	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); +	if (trans == NULL) +		goto err2; + +	nft_trans_elem(trans) = elem; +	list_add_tail(&trans->list, &ctx->net->nft.commit_list); + +	nft_data_uninit(&elem.key, NFT_DATA_VALUE); +	if (set->flags & NFT_SET_MAP) +		nft_data_uninit(&elem.data, set->dtype); + +err2: +	nft_data_uninit(&elem.key, desc.type); +err1: +	return err; +} + +static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				const struct nlattr * const nla[]) +{ +	const struct nlattr *attr; +	struct nft_set *set; +	struct nft_ctx ctx; +	int rem, err = 0; + +	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); +	if (err < 0) +		return err; + +	set = nf_tables_set_lookup(ctx.table, 
nla[NFTA_SET_ELEM_LIST_SET]); +	if (IS_ERR(set)) +		return PTR_ERR(set); +	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) +		return -EBUSY; + +	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { +		err = nft_del_setelem(&ctx, set, attr); +		if (err < 0) +			break; + +		set->nelems--; +	} +	return err; +} + +static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { +	[NFT_MSG_NEWTABLE] = { +		.call_batch	= nf_tables_newtable, +		.attr_count	= NFTA_TABLE_MAX, +		.policy		= nft_table_policy, +	}, +	[NFT_MSG_GETTABLE] = { +		.call		= nf_tables_gettable, +		.attr_count	= NFTA_TABLE_MAX, +		.policy		= nft_table_policy, +	}, +	[NFT_MSG_DELTABLE] = { +		.call_batch	= nf_tables_deltable, +		.attr_count	= NFTA_TABLE_MAX, +		.policy		= nft_table_policy, +	}, +	[NFT_MSG_NEWCHAIN] = { +		.call_batch	= nf_tables_newchain, +		.attr_count	= NFTA_CHAIN_MAX, +		.policy		= nft_chain_policy, +	}, +	[NFT_MSG_GETCHAIN] = { +		.call		= nf_tables_getchain, +		.attr_count	= NFTA_CHAIN_MAX, +		.policy		= nft_chain_policy, +	}, +	[NFT_MSG_DELCHAIN] = { +		.call_batch	= nf_tables_delchain, +		.attr_count	= NFTA_CHAIN_MAX, +		.policy		= nft_chain_policy, +	}, +	[NFT_MSG_NEWRULE] = { +		.call_batch	= nf_tables_newrule, +		.attr_count	= NFTA_RULE_MAX, +		.policy		= nft_rule_policy, +	}, +	[NFT_MSG_GETRULE] = { +		.call		= nf_tables_getrule, +		.attr_count	= NFTA_RULE_MAX, +		.policy		= nft_rule_policy, +	}, +	[NFT_MSG_DELRULE] = { +		.call_batch	= nf_tables_delrule, +		.attr_count	= NFTA_RULE_MAX, +		.policy		= nft_rule_policy, +	}, +	[NFT_MSG_NEWSET] = { +		.call_batch	= nf_tables_newset, +		.attr_count	= NFTA_SET_MAX, +		.policy		= nft_set_policy, +	}, +	[NFT_MSG_GETSET] = { +		.call		= nf_tables_getset, +		.attr_count	= NFTA_SET_MAX, +		.policy		= nft_set_policy, +	}, +	[NFT_MSG_DELSET] = { +		.call_batch	= nf_tables_delset, +		.attr_count	= NFTA_SET_MAX, +		.policy		= nft_set_policy, +	}, +	[NFT_MSG_NEWSETELEM] = { +		.call_batch	= nf_tables_newsetelem, +		.attr_count	= NFTA_SET_ELEM_LIST_MAX, +		.policy		= nft_set_elem_list_policy, +	}, +	[NFT_MSG_GETSETELEM] = { +		.call		= nf_tables_getsetelem, +		.attr_count	= NFTA_SET_ELEM_LIST_MAX, +		.policy		= nft_set_elem_list_policy, +	}, +	[NFT_MSG_DELSETELEM] = { +		.call_batch	= nf_tables_delsetelem, +		.attr_count	= NFTA_SET_ELEM_LIST_MAX, +		.policy		= nft_set_elem_list_policy, +	}, +}; + +static void nft_chain_commit_update(struct nft_trans *trans) +{ +	struct nft_base_chain *basechain; + +	if (nft_trans_chain_name(trans)[0]) +		strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); + +	if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN)) +		return; + +	basechain = nft_base_chain(trans->ctx.chain); +	nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans)); + +	switch (nft_trans_chain_policy(trans)) { +	case NF_DROP: +	case NF_ACCEPT: +		basechain->policy = nft_trans_chain_policy(trans); +		break; +	} +} + +/* Schedule objects for release via rcu to make sure no packets are accessing + * removed rules.
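+ * Packet processing walks the ruleset under rcu_read_lock(), so the + * objects unlinked by the commit may still be referenced; their release + * is deferred via call_rcu() until a grace period has elapsed.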
+ */ +static void nf_tables_commit_release_rcu(struct rcu_head *rt) +{ +	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + +	switch (trans->msg_type) { +	case NFT_MSG_DELTABLE: +		nf_tables_table_destroy(&trans->ctx); +		break; +	case NFT_MSG_DELCHAIN: +		nf_tables_chain_destroy(trans->ctx.chain); +		break; +	case NFT_MSG_DELRULE: +		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); +		break; +	case NFT_MSG_DELSET: +		nft_set_destroy(nft_trans_set(trans)); +		break; +	} +	kfree(trans); +} + +static int nf_tables_commit(struct sk_buff *skb) +{ +	struct net *net = sock_net(skb->sk); +	struct nft_trans *trans, *next; +	struct nft_set *set; + +	/* Bump generation counter, invalidate any dump in progress */ +	while (++net->nft.base_seq == 0); + +	/* A new generation has just started */ +	net->nft.gencursor = gencursor_next(net); + +	/* Make sure all packets have left the previous generation before +	 * purging old rules. +	 */ +	synchronize_rcu(); + +	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { +		switch (trans->msg_type) { +		case NFT_MSG_NEWTABLE: +			if (nft_trans_table_update(trans)) { +				if (!nft_trans_table_enable(trans)) { +					nf_tables_table_disable(trans->ctx.afi, +								trans->ctx.table); +					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; +				} +			} else { +				trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; +			} +			nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_DELTABLE: +			nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); +			break; +		case NFT_MSG_NEWCHAIN: +			if (nft_trans_chain_update(trans)) +				nft_chain_commit_update(trans); +			else +				trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + +			nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_DELCHAIN: +			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); +			if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && +			    trans->ctx.chain->flags & NFT_BASE_CHAIN) { +				nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, +						    trans->ctx.afi->nops); +			} +			break; +		case NFT_MSG_NEWRULE: +			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); +			nf_tables_rule_notify(&trans->ctx, +					      nft_trans_rule(trans), +					      NFT_MSG_NEWRULE); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_DELRULE: +			list_del_rcu(&nft_trans_rule(trans)->list); +			nf_tables_rule_notify(&trans->ctx, +					      nft_trans_rule(trans), +					      NFT_MSG_DELRULE); +			break; +		case NFT_MSG_NEWSET: +			nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; +			/* This avoids hitting -EBUSY when deleting the table +			 * from the transaction. 
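+			 * A bound anonymous set is released together with +			 * the rule it is bound to, so the table use counter +			 * taken when the set was created is dropped here.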
+			 */ +			if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS && +			    !list_empty(&nft_trans_set(trans)->bindings)) +				trans->ctx.table->use--; + +			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), +					     NFT_MSG_NEWSET, GFP_KERNEL); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_DELSET: +			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), +					     NFT_MSG_DELSET, GFP_KERNEL); +			break; +		case NFT_MSG_NEWSETELEM: +			nf_tables_setelem_notify(&trans->ctx, +						 nft_trans_elem_set(trans), +						 &nft_trans_elem(trans), +						 NFT_MSG_NEWSETELEM, 0); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_DELSETELEM: +			nf_tables_setelem_notify(&trans->ctx, +						 nft_trans_elem_set(trans), +						 &nft_trans_elem(trans), +						 NFT_MSG_DELSETELEM, 0); +			set = nft_trans_elem_set(trans); +			set->ops->get(set, &nft_trans_elem(trans)); +			set->ops->remove(set, &nft_trans_elem(trans)); +			nft_trans_destroy(trans); +			break; +		} +	} + +	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { +		list_del(&trans->list); +		trans->ctx.nla = NULL; +		call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); +	} + +	return 0; +} + +/* Schedule objects for release via rcu to make sure no packets are accessing + * aborted rules. + */ +static void nf_tables_abort_release_rcu(struct rcu_head *rt) +{ +	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + +	switch (trans->msg_type) { +	case NFT_MSG_NEWTABLE: +		nf_tables_table_destroy(&trans->ctx); +		break; +	case NFT_MSG_NEWCHAIN: +		nf_tables_chain_destroy(trans->ctx.chain); +		break; +	case NFT_MSG_NEWRULE: +		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); +		break; +	case NFT_MSG_NEWSET: +		nft_set_destroy(nft_trans_set(trans)); +		break; +	} +	kfree(trans); +} + +static int nf_tables_abort(struct sk_buff *skb) +{ +	struct net *net = sock_net(skb->sk); +	struct nft_trans *trans, *next; +	struct nft_set *set; + +	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { +		switch (trans->msg_type) { +		case NFT_MSG_NEWTABLE: +			if (nft_trans_table_update(trans)) { +				if (nft_trans_table_enable(trans)) { +					nf_tables_table_disable(trans->ctx.afi, +								trans->ctx.table); +					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; +				} +				nft_trans_destroy(trans); +			} else { +				list_del_rcu(&trans->ctx.table->list); +			} +			break; +		case NFT_MSG_DELTABLE: +			list_add_tail_rcu(&trans->ctx.table->list, +					  &trans->ctx.afi->tables); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_NEWCHAIN: +			if (nft_trans_chain_update(trans)) { +				if (nft_trans_chain_stats(trans)) +					free_percpu(nft_trans_chain_stats(trans)); + +				nft_trans_destroy(trans); +			} else { +				trans->ctx.table->use--; +				list_del_rcu(&trans->ctx.chain->list); +				if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && +				    trans->ctx.chain->flags & NFT_BASE_CHAIN) { +					nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, +							    trans->ctx.afi->nops); +				} +			} +			break; +		case NFT_MSG_DELCHAIN: +			trans->ctx.table->use++; +			list_add_tail_rcu(&trans->ctx.chain->list, +					  &trans->ctx.table->chains); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_NEWRULE: +			trans->ctx.chain->use--; +			list_del_rcu(&nft_trans_rule(trans)->list); +			break; +		case NFT_MSG_DELRULE: +			trans->ctx.chain->use++; +			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); +			nft_trans_destroy(trans); +			break; +		case
NFT_MSG_NEWSET: +			trans->ctx.table->use--; +			list_del_rcu(&nft_trans_set(trans)->list); +			break; +		case NFT_MSG_DELSET: +			trans->ctx.table->use++; +			list_add_tail_rcu(&nft_trans_set(trans)->list, +					  &trans->ctx.table->sets); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_NEWSETELEM: +			nft_trans_elem_set(trans)->nelems--; +			set = nft_trans_elem_set(trans); +			set->ops->get(set, &nft_trans_elem(trans)); +			set->ops->remove(set, &nft_trans_elem(trans)); +			nft_trans_destroy(trans); +			break; +		case NFT_MSG_DELSETELEM: +			nft_trans_elem_set(trans)->nelems++; +			nft_trans_destroy(trans); +			break; +		} +	} + +	list_for_each_entry_safe_reverse(trans, next, +					 &net->nft.commit_list, list) { +		list_del(&trans->list); +		trans->ctx.nla = NULL; +		call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); +	} + +	return 0; +} + +static const struct nfnetlink_subsystem nf_tables_subsys = { +	.name		= "nf_tables", +	.subsys_id	= NFNL_SUBSYS_NFTABLES, +	.cb_count	= NFT_MSG_MAX, +	.cb		= nf_tables_cb, +	.commit		= nf_tables_commit, +	.abort		= nf_tables_abort, +}; + +/* + * Loop detection - walk through the ruleset beginning at the destination chain + * of a new jump until either the source chain is reached (loop) or all + * reachable chains have been traversed. + * + * The loop check is performed whenever a new jump verdict is added to an + * expression or verdict map or a verdict map is bound to a new chain. + */ + +static int nf_tables_check_loops(const struct nft_ctx *ctx, +				 const struct nft_chain *chain); + +static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, +					const struct nft_set *set, +					const struct nft_set_iter *iter, +					const struct nft_set_elem *elem) +{ +	if (elem->flags & NFT_SET_ELEM_INTERVAL_END) +		return 0; + +	switch (elem->data.verdict) { +	case NFT_JUMP: +	case NFT_GOTO: +		return nf_tables_check_loops(ctx, elem->data.chain); +	default: +		return 0; +	} +} + +static int nf_tables_check_loops(const struct nft_ctx *ctx, +				 const struct nft_chain *chain) +{ +	const struct nft_rule *rule; +	const struct nft_expr *expr, *last; +	const struct nft_set *set; +	struct nft_set_binding *binding; +	struct nft_set_iter iter; + +	if (ctx->chain == chain) +		return -ELOOP; + +	list_for_each_entry(rule, &chain->rules, list) { +		nft_rule_for_each_expr(expr, last, rule) { +			const struct nft_data *data = NULL; +			int err; + +			if (!expr->ops->validate) +				continue; + +			err = expr->ops->validate(ctx, expr, &data); +			if (err < 0) +				return err; + +			if (data == NULL) +				continue; + +			switch (data->verdict) { +			case NFT_JUMP: +			case NFT_GOTO: +				err = nf_tables_check_loops(ctx, data->chain); +				if (err < 0) +					return err; +			default: +				break; +			} +		} +	} + +	list_for_each_entry(set, &ctx->table->sets, list) { +		if (!(set->flags & NFT_SET_MAP) || +		    set->dtype != NFT_DATA_VERDICT) +			continue; + +		list_for_each_entry(binding, &set->bindings, list) { +			if (binding->chain != chain) +				continue; + +			iter.skip 	= 0; +			iter.count	= 0; +			iter.err	= 0; +			iter.fn		= nf_tables_loop_check_setelem; + +			set->ops->walk(ctx, set, &iter); +			if (iter.err < 0) +				return iter.err; +		} +	} + +	return 0; +} + +/** + *	nft_validate_input_register - validate an expression's input register + * + *	@reg: the register number + * + * 	Validate that the input register is one of the general purpose + * 	registers.
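+ * + * 	The verdict register (NFT_REG_VERDICT) is not a valid source; only + * 	the general purpose registers NFT_REG_1 through NFT_REG_MAX are + * 	accepted.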
+ */ +int nft_validate_input_register(enum nft_registers reg) +{ +	if (reg <= NFT_REG_VERDICT) +		return -EINVAL; +	if (reg > NFT_REG_MAX) +		return -ERANGE; +	return 0; +} +EXPORT_SYMBOL_GPL(nft_validate_input_register); + +/** + *	nft_validate_output_register - validate an expression's output register + * + *	@reg: the register number + * + * 	Validate that the output register is one of the general purpose + * 	registers or the verdict register. + */ +int nft_validate_output_register(enum nft_registers reg) +{ +	if (reg < NFT_REG_VERDICT) +		return -EINVAL; +	if (reg > NFT_REG_MAX) +		return -ERANGE; +	return 0; +} +EXPORT_SYMBOL_GPL(nft_validate_output_register); + +/** + *	nft_validate_data_load - validate an expression's data load + * + *	@ctx: context of the expression performing the load + * 	@reg: the destination register number + * 	@data: the data to load + * 	@type: the data type + * + * 	Validate that a data load uses the appropriate data type for + * 	the destination register. A value of NULL for the data means + * 	that it is runtime-gathered data, which is always of type + * 	NFT_DATA_VALUE. + */ +int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg, +			   const struct nft_data *data, +			   enum nft_data_types type) +{ +	int err; + +	switch (reg) { +	case NFT_REG_VERDICT: +		if (data == NULL || type != NFT_DATA_VERDICT) +			return -EINVAL; + +		if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) { +			err = nf_tables_check_loops(ctx, data->chain); +			if (err < 0) +				return err; + +			if (ctx->chain->level + 1 > data->chain->level) { +				if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE) +					return -EMLINK; +				data->chain->level = ctx->chain->level + 1; +			} +		} + +		return 0; +	default: +		if (data != NULL && type != NFT_DATA_VALUE) +			return -EINVAL; +		return 0; +	} +} +EXPORT_SYMBOL_GPL(nft_validate_data_load); + +static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { +	[NFTA_VERDICT_CODE]	= { .type = NLA_U32 }, +	[NFTA_VERDICT_CHAIN]	= { .type = NLA_STRING, +				    .len = NFT_CHAIN_MAXNAMELEN - 1 }, +}; + +static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, +			    struct nft_data_desc *desc, const struct nlattr *nla) +{ +	struct nlattr *tb[NFTA_VERDICT_MAX + 1]; +	struct nft_chain *chain; +	int err; + +	err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy); +	if (err < 0) +		return err; + +	if (!tb[NFTA_VERDICT_CODE]) +		return -EINVAL; +	data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); + +	switch (data->verdict) { +	default: +		switch (data->verdict & NF_VERDICT_MASK) { +		case NF_ACCEPT: +		case NF_DROP: +		case NF_QUEUE: +			break; +		default: +			return -EINVAL; +		} +		/* fall through */ +	case NFT_CONTINUE: +	case NFT_BREAK: +	case NFT_RETURN: +		desc->len = sizeof(data->verdict); +		break; +	case NFT_JUMP: +	case NFT_GOTO: +		if (!tb[NFTA_VERDICT_CHAIN]) +			return -EINVAL; +		chain = nf_tables_chain_lookup(ctx->table, +					       tb[NFTA_VERDICT_CHAIN]); +		if (IS_ERR(chain)) +			return PTR_ERR(chain); +		if (chain->flags & NFT_BASE_CHAIN) +			return -EOPNOTSUPP; + +		chain->use++; +		data->chain = chain; +		desc->len = sizeof(data); +		break; +	} + +	desc->type = NFT_DATA_VERDICT; +	return 0; +} + +static void nft_verdict_uninit(const struct nft_data *data) +{ +	switch (data->verdict) { +	case NFT_JUMP: +	case NFT_GOTO: +		data->chain->use--; +		break; +	} +} + +static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data) +{ +	
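/* Nest the verdict code and, for jumps and gotos, the target chain +	 * name under an NFTA_DATA_VERDICT attribute. +	 */ +	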
struct nlattr *nest; + +	nest = nla_nest_start(skb, NFTA_DATA_VERDICT); +	if (!nest) +		goto nla_put_failure; + +	if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict))) +		goto nla_put_failure; + +	switch (data->verdict) { +	case NFT_JUMP: +	case NFT_GOTO: +		if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name)) +			goto nla_put_failure; +	} +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, +			  struct nft_data_desc *desc, const struct nlattr *nla) +{ +	unsigned int len; + +	len = nla_len(nla); +	if (len == 0) +		return -EINVAL; +	if (len > sizeof(data->data)) +		return -EOVERFLOW; + +	nla_memcpy(data->data, nla, sizeof(data->data)); +	desc->type = NFT_DATA_VALUE; +	desc->len  = len; +	return 0; +} + +static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data, +			  unsigned int len) +{ +	return nla_put(skb, NFTA_DATA_VALUE, len, data->data); +} + +static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { +	[NFTA_DATA_VALUE]	= { .type = NLA_BINARY, +				    .len  = FIELD_SIZEOF(struct nft_data, data) }, +	[NFTA_DATA_VERDICT]	= { .type = NLA_NESTED }, +}; + +/** + *	nft_data_init - parse nf_tables data netlink attributes + * + *	@ctx: context of the expression using the data + *	@data: destination struct nft_data + *	@desc: data description + *	@nla: netlink attribute containing data + * + *	Parse the netlink data attributes and initialize a struct nft_data. + *	The type and length of data are returned in the data description. + * + *	The caller can indicate that it only wants to accept data of type + *	NFT_DATA_VALUE by passing NULL for the ctx argument. + */ +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, +		  struct nft_data_desc *desc, const struct nlattr *nla) +{ +	struct nlattr *tb[NFTA_DATA_MAX + 1]; +	int err; + +	err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy); +	if (err < 0) +		return err; + +	if (tb[NFTA_DATA_VALUE]) +		return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); +	if (tb[NFTA_DATA_VERDICT] && ctx != NULL) +		return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); +	return -EINVAL; +} +EXPORT_SYMBOL_GPL(nft_data_init); + +/** + *	nft_data_uninit - release a nft_data item + * + *	@data: struct nft_data to release + *	@type: type of data + * + *	Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, + *	all others need to be released by calling this function. 
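+ * + *	For NFT_DATA_VERDICT this drops the use count that a jump or goto + *	verdict holds on its destination chain.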
+ */ +void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +{ +	switch (type) { +	case NFT_DATA_VALUE: +		return; +	case NFT_DATA_VERDICT: +		return nft_verdict_uninit(data); +	default: +		WARN_ON(1); +	} +} +EXPORT_SYMBOL_GPL(nft_data_uninit); + +int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, +		  enum nft_data_types type, unsigned int len) +{ +	struct nlattr *nest; +	int err; + +	nest = nla_nest_start(skb, attr); +	if (nest == NULL) +		return -1; + +	switch (type) { +	case NFT_DATA_VALUE: +		err = nft_value_dump(skb, data, len); +		break; +	case NFT_DATA_VERDICT: +		err = nft_verdict_dump(skb, data); +		break; +	default: +		err = -EINVAL; +		WARN_ON(1); +	} + +	nla_nest_end(skb, nest); +	return err; +} +EXPORT_SYMBOL_GPL(nft_data_dump); + +static int nf_tables_init_net(struct net *net) +{ +	INIT_LIST_HEAD(&net->nft.af_info); +	INIT_LIST_HEAD(&net->nft.commit_list); +	net->nft.base_seq = 1; +	return 0; +} + +static struct pernet_operations nf_tables_net_ops = { +	.init	= nf_tables_init_net, +}; + +static int __init nf_tables_module_init(void) +{ +	int err; + +	info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, +		       GFP_KERNEL); +	if (info == NULL) { +		err = -ENOMEM; +		goto err1; +	} + +	err = nf_tables_core_module_init(); +	if (err < 0) +		goto err2; + +	err = nfnetlink_subsys_register(&nf_tables_subsys); +	if (err < 0) +		goto err3; + +	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n"); +	return register_pernet_subsys(&nf_tables_net_ops); +err3: +	nf_tables_core_module_exit(); +err2: +	kfree(info); +err1: +	return err; +} + +static void __exit nf_tables_module_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_net_ops); +	nfnetlink_subsys_unregister(&nf_tables_subsys); +	nf_tables_core_module_exit(); +	kfree(info); +} + +module_init(nf_tables_module_init); +module_exit(nf_tables_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c new file mode 100644 index 00000000000..3b90eb2b2c5 --- /dev/null +++ b/net/netfilter/nf_tables_core.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> + +static void nft_cmp_fast_eval(const struct nft_expr *expr, +			      struct nft_data data[NFT_REG_MAX + 1]) +{ +	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); +	u32 mask = nft_cmp_fast_mask(priv->len); + +	if ((data[priv->sreg].data[0] & mask) == priv->data) +		return; +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static bool nft_payload_fast_eval(const struct nft_expr *expr, +				  struct nft_data data[NFT_REG_MAX + 1], +				  const struct nft_pktinfo *pkt) +{ +	const struct nft_payload *priv = nft_expr_priv(expr); +	const struct sk_buff *skb = pkt->skb; +	struct nft_data *dest = &data[priv->dreg]; +	unsigned char *ptr; + +	if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) +		ptr = skb_network_header(skb); +	else +		ptr = skb_network_header(skb) + pkt->xt.thoff; + +	ptr += priv->offset; + +	if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) +		return false; + +	if (priv->len == 2) +		*(u16 *)dest->data = *(u16 *)ptr; +	else if (priv->len == 4) +		*(u32 *)dest->data = *(u32 *)ptr; +	else +		*(u8 *)dest->data = *(u8 *)ptr; +	return true; +} + +struct nft_jumpstack { +	const struct nft_chain	*chain; +	const struct nft_rule	*rule; +	int			rulenum; +}; + +enum nft_trace { +	NFT_TRACE_RULE, +	NFT_TRACE_RETURN, +	NFT_TRACE_POLICY, +}; + +static const char *const comments[] = { +	[NFT_TRACE_RULE]	= "rule", +	[NFT_TRACE_RETURN]	= "return", +	[NFT_TRACE_POLICY]	= "policy", +}; + +static struct nf_loginfo trace_loginfo = { +	.type = NF_LOG_TYPE_LOG, +	.u = { +		.log = { +			.level = 4, +			.logflags = NF_LOG_MASK, +	        }, +	}, +}; + +static void nft_trace_packet(const struct nft_pktinfo *pkt, +			     const struct nft_chain *chain, +			     int rulenum, enum nft_trace type) +{ +	struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); + +	nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, +		      pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", +		      chain->table->name, chain->name, comments[type], +		      rulenum); +} + +unsigned int +nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +{ +	const struct nft_chain *chain = ops->priv, *basechain = chain; +	const struct nft_rule *rule; +	const struct nft_expr *expr, *last; +	struct nft_data data[NFT_REG_MAX + 1]; +	unsigned int stackptr = 0; +	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; +	struct nft_stats *stats; +	int rulenum; +	/* +	 * Cache cursor to avoid problems in case that the cursor is updated +	 * while traversing the ruleset. +	 */ +	unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); + +do_chain: +	rulenum = 0; +	rule = list_entry(&chain->rules, struct nft_rule, list); +next_rule: +	data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; +	list_for_each_entry_continue_rcu(rule, &chain->rules, list) { + +		/* This rule is not active, skip. 
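Its genmask bit for the current generation is set, i.e. the rule was either deleted in this generation or added but not yet committed. 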
*/ +		if (unlikely(rule->genmask & (1 << gencursor))) +			continue; + +		rulenum++; + +		nft_rule_for_each_expr(expr, last, rule) { +			if (expr->ops == &nft_cmp_fast_ops) +				nft_cmp_fast_eval(expr, data); +			else if (expr->ops != &nft_payload_fast_ops || +				 !nft_payload_fast_eval(expr, data, pkt)) +				expr->ops->eval(expr, data, pkt); + +			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE) +				break; +		} + +		switch (data[NFT_REG_VERDICT].verdict) { +		case NFT_BREAK: +			data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; +			continue; +		case NFT_CONTINUE: +			if (unlikely(pkt->skb->nf_trace)) +				nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); +			continue; +		} +		break; +	} + +	switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) { +	case NF_ACCEPT: +	case NF_DROP: +	case NF_QUEUE: +		if (unlikely(pkt->skb->nf_trace)) +			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + +		return data[NFT_REG_VERDICT].verdict; +	} + +	switch (data[NFT_REG_VERDICT].verdict) { +	case NFT_JUMP: +		if (unlikely(pkt->skb->nf_trace)) +			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + +		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); +		jumpstack[stackptr].chain = chain; +		jumpstack[stackptr].rule  = rule; +		jumpstack[stackptr].rulenum = rulenum; +		stackptr++; +		chain = data[NFT_REG_VERDICT].chain; +		goto do_chain; +	case NFT_GOTO: +		if (unlikely(pkt->skb->nf_trace)) +			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + +		chain = data[NFT_REG_VERDICT].chain; +		goto do_chain; +	case NFT_RETURN: +		if (unlikely(pkt->skb->nf_trace)) +			nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); +		break; +	case NFT_CONTINUE: +		if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) +			nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); +		break; +	default: +		WARN_ON(1); +	} + +	if (stackptr > 0) { +		stackptr--; +		chain = jumpstack[stackptr].chain; +		rule  = jumpstack[stackptr].rule; +		rulenum = jumpstack[stackptr].rulenum; +		goto next_rule; +	} + +	if (unlikely(pkt->skb->nf_trace)) +		nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + +	rcu_read_lock_bh(); +	stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); +	u64_stats_update_begin(&stats->syncp); +	stats->pkts++; +	stats->bytes += pkt->skb->len; +	u64_stats_update_end(&stats->syncp); +	rcu_read_unlock_bh(); + +	return nft_base_chain(basechain)->policy; +} +EXPORT_SYMBOL_GPL(nft_do_chain); + +int __init nf_tables_core_module_init(void) +{ +	int err; + +	err = nft_immediate_module_init(); +	if (err < 0) +		goto err1; + +	err = nft_cmp_module_init(); +	if (err < 0) +		goto err2; + +	err = nft_lookup_module_init(); +	if (err < 0) +		goto err3; + +	err = nft_bitwise_module_init(); +	if (err < 0) +		goto err4; + +	err = nft_byteorder_module_init(); +	if (err < 0) +		goto err5; + +	err = nft_payload_module_init(); +	if (err < 0) +		goto err6; + +	return 0; + +err6: +	nft_byteorder_module_exit(); +err5: +	nft_bitwise_module_exit(); +err4: +	nft_lookup_module_exit(); +err3: +	nft_cmp_module_exit(); +err2: +	nft_immediate_module_exit(); +err1: +	return err; +} + +void nf_tables_core_module_exit(void) +{ +	nft_payload_module_exit(); +	nft_byteorder_module_exit(); +	nft_bitwise_module_exit(); +	nft_lookup_module_exit(); +	nft_cmp_module_exit(); +	nft_immediate_module_exit(); +} diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c new file mode 100644 index 00000000000..9dd2d216cfc --- /dev/null +++ b/net/netfilter/nf_tables_inet.c @@ -0,0 +1,104 @@ 
+/* + * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/ip.h> + +static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n) +{ +	struct nft_af_info *afi; + +	if (n == 1) +		afi = &nft_af_ipv4; +	else +		afi = &nft_af_ipv6; + +	ops->pf = afi->family; +	if (afi->hooks[ops->hooknum]) +		ops->hook = afi->hooks[ops->hooknum]; +} + +static struct nft_af_info nft_af_inet __read_mostly = { +	.family		= NFPROTO_INET, +	.nhooks		= NF_INET_NUMHOOKS, +	.owner		= THIS_MODULE, +	.nops		= 2, +	.hook_ops_init	= nft_inet_hook_ops_init, +}; + +static int __net_init nf_tables_inet_init_net(struct net *net) +{ +	net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); +	if (net->nft.inet == NULL) +		return -ENOMEM; +	memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); + +	if (nft_register_afinfo(net, net->nft.inet) < 0) +		goto err; + +	return 0; + +err: +	kfree(net->nft.inet); +	return -ENOMEM; +} + +static void __net_exit nf_tables_inet_exit_net(struct net *net) +{ +	nft_unregister_afinfo(net->nft.inet); +	kfree(net->nft.inet); +} + +static struct pernet_operations nf_tables_inet_net_ops = { +	.init	= nf_tables_inet_init_net, +	.exit	= nf_tables_inet_exit_net, +}; + +static const struct nf_chain_type filter_inet = { +	.name		= "filter", +	.type		= NFT_CHAIN_T_DEFAULT, +	.family		= NFPROTO_INET, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_LOCAL_IN) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_FORWARD) | +			  (1 << NF_INET_PRE_ROUTING) | +			  (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_inet_init(void) +{ +	int ret; + +	nft_register_chain_type(&filter_inet); +	ret = register_pernet_subsys(&nf_tables_inet_net_ops); +	if (ret < 0) +		nft_unregister_chain_type(&filter_inet); + +	return ret; +} + +static void __exit nf_tables_inet_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_inet_net_ops); +	nft_unregister_chain_type(&filter_inet); +} + +module_init(nf_tables_inet_init); +module_exit(nf_tables_inet_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(1); diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c deleted file mode 100644 index 4d87befb04c..00000000000 --- a/net/netfilter/nf_tproxy_core.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Transparent proxy support for Linux/iptables - * - * Copyright (c) 2006-2007 BalaBit IT Ltd. - * Author: Balazs Scheidler, Krisztian Kovacs - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
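The NFPROTO_INET family registered above owns no hooks of its own: nft_af_inet declares nops = 2, so each base chain is registered twice, and nft_inet_hook_ops_init() stamps the op with index 1 with the IPv4 family and the other with IPv6, borrowing the per-family hook functions. A rough sketch of that fan-out, with a simplified stand-in for struct nf_hook_ops and illustrative NFPROTO values:

#include <stdio.h>

/* Stand-in for struct nf_hook_ops; pf mirrors NFPROTO_IPV4/NFPROTO_IPV6. */
struct sketch_hook_ops {
	int pf;
	unsigned int hooknum;
};

int main(void)
{
	struct sketch_hook_ops ops[2];
	unsigned int n;

	for (n = 0; n < 2; n++) {
		ops[n].hooknum = 1;            /* e.g. NF_INET_LOCAL_IN */
		ops[n].pf = (n == 1) ? 2 : 10; /* NFPROTO_IPV4 : NFPROTO_IPV6 */
		printf("op %u registered for family %d\n", n, ops[n].pf);
	}
	return 0;
}

One rule set therefore sees both address families, which is what lets a single inet filter chain stand in for parallel ip and ip6 chains.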
- * - */ - -#include <linux/module.h> - -#include <linux/net.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <net/udp.h> -#include <net/netfilter/nf_tproxy_core.h> - - -static void -nf_tproxy_destructor(struct sk_buff *skb) -{ -	struct sock *sk = skb->sk; - -	skb->sk = NULL; -	skb->destructor = NULL; - -	if (sk) -		nf_tproxy_put_sock(sk); -} - -/* consumes sk */ -int -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ -	bool transparent = (sk->sk_state == TCP_TIME_WAIT) ? -				inet_twsk(sk)->tw_transparent : -				inet_sk(sk)->transparent; - -	if (transparent) { -		skb_orphan(skb); -		skb->sk = sk; -		skb->destructor = nf_tproxy_destructor; -		return 1; -	} else -		nf_tproxy_put_sock(sk); - -	return 0; -} -EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); - -static int __init nf_tproxy_init(void) -{ -	pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); -	pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); -	return 0; -} - -module_init(nf_tproxy_init); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Krisztian Kovacs"); -MODULE_DESCRIPTION("Transparent proxy support core routines"); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index b4a4532823e..c138b8fbe28 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -23,12 +23,10 @@  #include <linux/net.h>  #include <linux/skbuff.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <net/sock.h> -#include <net/netlink.h>  #include <linux/init.h> -#include <linux/netlink.h> +#include <net/netlink.h>  #include <linux/netfilter/nfnetlink.h>  MODULE_LICENSE("GPL"); @@ -37,30 +35,49 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);  static char __initdata nfversion[] = "0.30"; -static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; -static DEFINE_MUTEX(nfnl_mutex); +static struct { +	struct mutex				mutex; +	const struct nfnetlink_subsystem __rcu	*subsys; +} table[NFNL_SUBSYS_COUNT]; + +static const int nfnl_group2type[NFNLGRP_MAX+1] = { +	[NFNLGRP_CONNTRACK_NEW]		= NFNL_SUBSYS_CTNETLINK, +	[NFNLGRP_CONNTRACK_UPDATE]	= NFNL_SUBSYS_CTNETLINK, +	[NFNLGRP_CONNTRACK_DESTROY]	= NFNL_SUBSYS_CTNETLINK, +	[NFNLGRP_CONNTRACK_EXP_NEW]	= NFNL_SUBSYS_CTNETLINK_EXP, +	[NFNLGRP_CONNTRACK_EXP_UPDATE]	= NFNL_SUBSYS_CTNETLINK_EXP, +	[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, +}; -void nfnl_lock(void) +void nfnl_lock(__u8 subsys_id)  { -	mutex_lock(&nfnl_mutex); +	mutex_lock(&table[subsys_id].mutex);  }  EXPORT_SYMBOL_GPL(nfnl_lock); -void nfnl_unlock(void) +void nfnl_unlock(__u8 subsys_id)  { -	mutex_unlock(&nfnl_mutex); +	mutex_unlock(&table[subsys_id].mutex);  }  EXPORT_SYMBOL_GPL(nfnl_unlock); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(u8 subsys_id) +{ +	return lockdep_is_held(&table[subsys_id].mutex); +} +EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); +#endif +  int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)  { -	nfnl_lock(); -	if (subsys_table[n->subsys_id]) { -		nfnl_unlock(); +	nfnl_lock(n->subsys_id); +	if (table[n->subsys_id].subsys) { +		nfnl_unlock(n->subsys_id);  		return -EBUSY;  	} -	subsys_table[n->subsys_id] = n; -	nfnl_unlock(); +	rcu_assign_pointer(table[n->subsys_id].subsys, n); +	nfnl_unlock(n->subsys_id);  	return 0;  } @@ -68,10 +85,10 @@ EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);  int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)  { -	nfnl_lock(); -	subsys_table[n->subsys_id] = NULL; -	nfnl_unlock(); - +	nfnl_lock(n->subsys_id); +	
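The nfnetlink.c rework above replaces the single global nfnl_mutex with a per-subsystem pair of a mutex and an RCU-published subsystem pointer: lookups on the receive path run under rcu_read_lock(), register/unregister serialise on the per-subsystem mutex, and unregistration waits out a grace period with synchronize_rcu() before the caller may free the structure. Below is a userspace model of the pattern; it substitutes C11 atomics for rcu_assign_pointer()/rcu_dereference() and, being only a sketch, omits the grace-period machinery entirely.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define SUBSYS_COUNT 4

struct subsys { const char *name; };

static struct {
	pthread_mutex_t mutex;
	_Atomic(const struct subsys *) subsys;
} table[SUBSYS_COUNT];

static int subsys_register(int id, const struct subsys *n)
{
	pthread_mutex_lock(&table[id].mutex);
	if (atomic_load(&table[id].subsys)) {
		pthread_mutex_unlock(&table[id].mutex);
		return -1;	/* -EBUSY: slot already taken */
	}
	/* release store plays the role of rcu_assign_pointer() */
	atomic_store_explicit(&table[id].subsys, n, memory_order_release);
	pthread_mutex_unlock(&table[id].mutex);
	return 0;
}

static const struct subsys *subsys_lookup(int id)
{
	/* acquire load plays the role of rcu_dereference() */
	return atomic_load_explicit(&table[id].subsys, memory_order_acquire);
}

int main(void)
{
	static const struct subsys acct = { "acct" };
	int i;

	for (i = 0; i < SUBSYS_COUNT; i++)
		pthread_mutex_init(&table[i].mutex, NULL);
	subsys_register(1, &acct);
	printf("subsys 1: %s\n", subsys_lookup(1)->name);
	return 0;
}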
table[n->subsys_id].subsys = NULL; +	nfnl_unlock(n->subsys_id); +	synchronize_rcu();  	return 0;  }  EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); @@ -83,7 +100,7 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t  	if (subsys_id >= NFNL_SUBSYS_COUNT)  		return NULL; -	return subsys_table[subsys_id]; +	return rcu_dereference(table[subsys_id].subsys);  }  static inline const struct nfnl_callback * @@ -103,22 +120,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)  }  EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, -		   unsigned group, int echo, gfp_t flags) +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, +				    u32 dst_portid, gfp_t gfp_mask) +{ +	return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); +} +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, +		   unsigned int group, int echo, gfp_t flags)  { -	return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); +	return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);  }  EXPORT_SYMBOL_GPL(nfnetlink_send); -int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)  { -	return netlink_set_err(net->nfnl, pid, group, error); +	return netlink_set_err(net->nfnl, portid, group, error);  }  EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, +		      int flags)  { -	return netlink_unicast(net->nfnl, skb, pid, flags); +	return netlink_unicast(net->nfnl, skb, portid, flags);  }  EXPORT_SYMBOL_GPL(nfnetlink_unicast); @@ -130,63 +155,276 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	const struct nfnetlink_subsystem *ss;  	int type, err; -	if (security_netlink_recv(skb, CAP_NET_ADMIN)) -		return -EPERM; -  	/* All the messages must at least contain nfgenmsg */ -	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg))) +	if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))  		return 0;  	type = nlh->nlmsg_type;  replay: +	rcu_read_lock();  	ss = nfnetlink_get_subsys(type);  	if (!ss) {  #ifdef CONFIG_MODULES -		nfnl_unlock(); +		rcu_read_unlock();  		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); -		nfnl_lock(); +		rcu_read_lock();  		ss = nfnetlink_get_subsys(type);  		if (!ss)  #endif +		{ +			rcu_read_unlock();  			return -EINVAL; +		}  	}  	nc = nfnetlink_find_client(type, ss); -	if (!nc) +	if (!nc) { +		rcu_read_unlock();  		return -EINVAL; +	}  	{ -		int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); +		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));  		u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);  		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];  		struct nlattr *attr = (void *)nlh + min_len;  		int attrlen = nlh->nlmsg_len - min_len; +		__u8 subsys_id = NFNL_SUBSYS_ID(type);  		err = nla_parse(cda, ss->cb[cb_id].attr_count,  				attr, attrlen, ss->cb[cb_id].policy); -		if (err < 0) +		if (err < 0) { +			rcu_read_unlock();  			return err; - -		err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda); +		} + +		if (nc->call_rcu) { +			err = nc->call_rcu(net->nfnl, skb, nlh, +					   (const struct nlattr **)cda); +			rcu_read_unlock(); +		} else { +			rcu_read_unlock(); +			nfnl_lock(subsys_id); +			if 
(rcu_dereference_protected(table[subsys_id].subsys, +				lockdep_is_held(&table[subsys_id].mutex)) != ss || +			    nfnetlink_find_client(type, ss) != nc) +				err = -EAGAIN; +			else if (nc->call) +				err = nc->call(net->nfnl, skb, nlh, +						   (const struct nlattr **)cda); +			else +				err = -EINVAL; +			nfnl_unlock(subsys_id); +		}  		if (err == -EAGAIN)  			goto replay;  		return err;  	}  } +static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, +				u_int16_t subsys_id) +{ +	struct sk_buff *nskb, *oskb = skb; +	struct net *net = sock_net(skb->sk); +	const struct nfnetlink_subsystem *ss; +	const struct nfnl_callback *nc; +	bool success = true, done = false; +	int err; + +	if (subsys_id >= NFNL_SUBSYS_COUNT) +		return netlink_ack(skb, nlh, -EINVAL); +replay: +	nskb = netlink_skb_clone(oskb, GFP_KERNEL); +	if (!nskb) +		return netlink_ack(oskb, nlh, -ENOMEM); + +	nskb->sk = oskb->sk; +	skb = nskb; + +	nfnl_lock(subsys_id); +	ss = rcu_dereference_protected(table[subsys_id].subsys, +				       lockdep_is_held(&table[subsys_id].mutex)); +	if (!ss) { +#ifdef CONFIG_MODULES +		nfnl_unlock(subsys_id); +		request_module("nfnetlink-subsys-%d", subsys_id); +		nfnl_lock(subsys_id); +		ss = rcu_dereference_protected(table[subsys_id].subsys, +					       lockdep_is_held(&table[subsys_id].mutex)); +		if (!ss) +#endif +		{ +			nfnl_unlock(subsys_id); +			netlink_ack(skb, nlh, -EOPNOTSUPP); +			return kfree_skb(nskb); +		} +	} + +	if (!ss->commit || !ss->abort) { +		nfnl_unlock(subsys_id); +		netlink_ack(skb, nlh, -EOPNOTSUPP); +		return kfree_skb(skb); +	} + +	while (skb->len >= nlmsg_total_size(0)) { +		int msglen, type; + +		nlh = nlmsg_hdr(skb); +		err = 0; + +		if (nlh->nlmsg_len < NLMSG_HDRLEN) { +			err = -EINVAL; +			goto ack; +		} + +		/* Only requests are handled by the kernel */ +		if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { +			err = -EINVAL; +			goto ack; +		} + +		type = nlh->nlmsg_type; +		if (type == NFNL_MSG_BATCH_BEGIN) { +			/* Malformed: Batch begin twice */ +			success = false; +			goto done; +		} else if (type == NFNL_MSG_BATCH_END) { +			done = true; +			goto done; +		} else if (type < NLMSG_MIN_TYPE) { +			err = -EINVAL; +			goto ack; +		} + +		/* We only accept a batch with messages for the same +		 * subsystem. +		 */ +		if (NFNL_SUBSYS_ID(type) != subsys_id) { +			err = -EINVAL; +			goto ack; +		} + +		nc = nfnetlink_find_client(type, ss); +		if (!nc) { +			err = -EINVAL; +			goto ack; +		} + +		{ +			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); +			u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); +			struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; +			struct nlattr *attr = (void *)nlh + min_len; +			int attrlen = nlh->nlmsg_len - min_len; + +			err = nla_parse(cda, ss->cb[cb_id].attr_count, +					attr, attrlen, ss->cb[cb_id].policy); +			if (err < 0) +				goto ack; + +			if (nc->call_batch) { +				err = nc->call_batch(net->nfnl, skb, nlh, +						     (const struct nlattr **)cda); +			} + +			/* The lock was released to autoload some module, we +			 * have to abort and start from scratch using the +			 * original skb. +			 */ +			if (err == -EAGAIN) { +				ss->abort(skb); +				nfnl_unlock(subsys_id); +				kfree_skb(nskb); +				goto replay; +			} +		} +ack: +		if (nlh->nlmsg_flags & NLM_F_ACK || err) { +			/* We don't stop processing the batch on errors, thus, +			 * userspace gets all the errors that the batch +			 * triggers. 
+			 */ +			netlink_ack(skb, nlh, err); +			if (err) +				success = false; +		} + +		msglen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (msglen > skb->len) +			msglen = skb->len; +		skb_pull(skb, msglen); +	} +done: +	if (success && done) +		ss->commit(skb); +	else +		ss->abort(skb); + +	nfnl_unlock(subsys_id); +	kfree_skb(nskb); +} +  static void nfnetlink_rcv(struct sk_buff *skb)  { -	nfnl_lock(); -	netlink_rcv_skb(skb, &nfnetlink_rcv_msg); -	nfnl_unlock(); +	struct nlmsghdr *nlh = nlmsg_hdr(skb); +	int msglen; + +	if (nlh->nlmsg_len < NLMSG_HDRLEN || +	    skb->len < nlh->nlmsg_len) +		return; + +	if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { +		netlink_ack(skb, nlh, -EPERM); +		return; +	} + +	if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { +		struct nfgenmsg *nfgenmsg; + +		msglen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (msglen > skb->len) +			msglen = skb->len; + +		if (nlh->nlmsg_len < NLMSG_HDRLEN || +		    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) +			return; + +		nfgenmsg = nlmsg_data(nlh); +		skb_pull(skb, msglen); +		nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); +	} else { +		netlink_rcv_skb(skb, &nfnetlink_rcv_msg); +	}  } +#ifdef CONFIG_MODULES +static int nfnetlink_bind(int group) +{ +	const struct nfnetlink_subsystem *ss; +	int type = nfnl_group2type[group]; + +	rcu_read_lock(); +	ss = nfnetlink_get_subsys(type); +	rcu_read_unlock(); +	if (!ss) +		request_module("nfnetlink-subsys-%d", type); +	return 0; +} +#endif +  static int __net_init nfnetlink_net_init(struct net *net)  {  	struct sock *nfnl; +	struct netlink_kernel_cfg cfg = { +		.groups	= NFNLGRP_MAX, +		.input	= nfnetlink_rcv, +#ifdef CONFIG_MODULES +		.bind	= nfnetlink_bind, +#endif +	}; -	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, -				     nfnetlink_rcv, NULL, THIS_MODULE); +	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);  	if (!nfnl)  		return -ENOMEM;  	net->nfnl_stash = nfnl; @@ -199,7 +437,7 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)  	struct net *net;  	list_for_each_entry(net, net_exit_list, exit_list) -		rcu_assign_pointer(net->nfnl, NULL); +		RCU_INIT_POINTER(net->nfnl, NULL);  	synchronize_net();  	list_for_each_entry(net, net_exit_list, exit_list)  		netlink_kernel_release(net->nfnl_stash); @@ -212,6 +450,11 @@ static struct pernet_operations nfnetlink_net_ops = {  static int __init nfnetlink_init(void)  { +	int i; + +	for (i=0; i<NFNL_SUBSYS_COUNT; i++) +		mutex_init(&table[i].mutex); +  	pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);  	return register_pernet_subsys(&nfnetlink_net_ops);  } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c new file mode 100644 index 00000000000..2baa125c2e8 --- /dev/null +++ b/net/netfilter/nfnetlink_acct.c @@ -0,0 +1,454 @@ +/* + * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2011 Intra2net AG <http://www.intra2net.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
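The batch receive path above expects a specific on-the-wire shape: an NFNL_MSG_BATCH_BEGIN header whose nfgenmsg res_id names the target subsystem, the subsystem messages themselves, and a closing NFNL_MSG_BATCH_END; ss->commit() runs only if every message succeeded and the end marker was seen, otherwise ss->abort() rolls everything back. The sketch below lays such a batch out in a buffer. The 0x10/0x11 type values and the nfgenmsg mirror are assumptions based on the uapi headers of this period; verify against <linux/netfilter/nfnetlink.h>.

#include <arpa/inet.h>
#include <linux/netlink.h>
#include <stdio.h>
#include <string.h>

struct nfgenmsg_sketch {	/* mirrors struct nfgenmsg */
	unsigned char  nfgen_family;
	unsigned char  version;
	unsigned short res_id;	/* big-endian subsystem id in BATCH_BEGIN */
};

#define SKETCH_MSG_BATCH_BEGIN 0x10	/* assumed NFNL_MSG_BATCH_BEGIN */
#define SKETCH_MSG_BATCH_END   0x11	/* assumed NFNL_MSG_BATCH_END */

static size_t put_msg(char *buf, unsigned short type, unsigned short subsys)
{
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct nfgenmsg_sketch body = { .res_id = htons(subsys) };

	nlh->nlmsg_len   = NLMSG_LENGTH(sizeof(body));
	nlh->nlmsg_type  = type;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	nlh->nlmsg_seq   = 0;
	nlh->nlmsg_pid   = 0;
	memcpy(NLMSG_DATA(nlh), &body, sizeof(body));
	return NLMSG_ALIGN(nlh->nlmsg_len);
}

int main(void)
{
	char buf[256];
	size_t off = 0;

	off += put_msg(buf + off, SKETCH_MSG_BATCH_BEGIN, 10); /* e.g. nf_tables */
	/* ... the actual subsystem messages would be appended here ... */
	off += put_msg(buf + off, SKETCH_MSG_BATCH_END, 10);
	printf("batch payload: %zu bytes\n", off);
	return 0;
}

Note also that the receive path clones the whole batch up front with netlink_skb_clone(), so an -EAGAIN raised by module autoloading mid-batch can abort and replay from the original, untouched skb.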
+ */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/atomic.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_acct.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); + +static LIST_HEAD(nfnl_acct_list); + +struct nf_acct { +	atomic64_t		pkts; +	atomic64_t		bytes; +	unsigned long		flags; +	struct list_head	head; +	atomic_t		refcnt; +	char			name[NFACCT_NAME_MAX]; +	struct rcu_head		rcu_head; +	char			data[0]; +}; + +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) + +static int +nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, +	     const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	struct nf_acct *nfacct, *matching = NULL; +	char *acct_name; +	unsigned int size = 0; +	u32 flags = 0; + +	if (!tb[NFACCT_NAME]) +		return -EINVAL; + +	acct_name = nla_data(tb[NFACCT_NAME]); +	if (strlen(acct_name) == 0) +		return -EINVAL; + +	list_for_each_entry(nfacct, &nfnl_acct_list, head) { +		if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) +			continue; + +                if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; + +		matching = nfacct; +		break; +        } + +	if (matching) { +		if (nlh->nlmsg_flags & NLM_F_REPLACE) { +			/* reset counters if you request a replacement. */ +			atomic64_set(&matching->pkts, 0); +			atomic64_set(&matching->bytes, 0); +			smp_mb__before_atomic(); +			/* reset overquota flag if quota is enabled. */ +			if ((matching->flags & NFACCT_F_QUOTA)) +				clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); +			return 0; +		} +		return -EBUSY; +	} + +	if (tb[NFACCT_FLAGS]) { +		flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); +		if (flags & ~NFACCT_F_QUOTA) +			return -EOPNOTSUPP; +		if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) +			return -EINVAL; +		if (flags & NFACCT_F_OVERQUOTA) +			return -EINVAL; + +		size += sizeof(u64); +	} + +	nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); +	if (nfacct == NULL) +		return -ENOMEM; + +	if (flags & NFACCT_F_QUOTA) { +		u64 *quota = (u64 *)nfacct->data; + +		*quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); +		nfacct->flags = flags; +	} + +	strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); + +	if (tb[NFACCT_BYTES]) { +		atomic64_set(&nfacct->bytes, +			     be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES]))); +	} +	if (tb[NFACCT_PKTS]) { +		atomic64_set(&nfacct->pkts, +			     be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); +	} +	atomic_set(&nfacct->refcnt, 1); +	list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); +	return 0; +} + +static int +nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +		   int event, struct nf_acct *acct) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? 
NLM_F_MULTI : 0; +	u64 pkts, bytes; + +	event |= NFNL_SUBSYS_ACCT << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_string(skb, NFACCT_NAME, acct->name)) +		goto nla_put_failure; + +	if (type == NFNL_MSG_ACCT_GET_CTRZERO) { +		pkts = atomic64_xchg(&acct->pkts, 0); +		bytes = atomic64_xchg(&acct->bytes, 0); +		smp_mb__before_atomic(); +		if (acct->flags & NFACCT_F_QUOTA) +			clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); +	} else { +		pkts = atomic64_read(&acct->pkts); +		bytes = atomic64_read(&acct->bytes); +	} +	if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) || +	    nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || +	    nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) +		goto nla_put_failure; +	if (acct->flags & NFACCT_F_QUOTA) { +		u64 *quota = (u64 *)acct->data; + +		if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || +		    nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) +			goto nla_put_failure; +	} +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct nf_acct *cur, *last; + +	if (cb->args[2]) +		return 0; + +	last = (struct nf_acct *)cb->args[1]; +	if (cb->args[1]) +		cb->args[1] = 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { +		if (last) { +			if (cur != last) +				continue; + +			last = NULL; +		} +		if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, +				       cb->nlh->nlmsg_seq, +				       NFNL_MSG_TYPE(cb->nlh->nlmsg_type), +				       NFNL_MSG_ACCT_NEW, cur) < 0) { +			cb->args[1] = (unsigned long)cur; +			break; +		} +	} +	if (!cb->args[1]) +		cb->args[2] = 1; +	rcu_read_unlock(); +	return skb->len; +} + +static int +nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, +	     const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	int ret = -ENOENT; +	struct nf_acct *cur; +	char *acct_name; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nfnl_acct_dump, +		}; +		return netlink_dump_start(nfnl, skb, nlh, &c); +	} + +	if (!tb[NFACCT_NAME]) +		return -EINVAL; +	acct_name = nla_data(tb[NFACCT_NAME]); + +	list_for_each_entry(cur, &nfnl_acct_list, head) { +		struct sk_buff *skb2; + +		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) +			continue; + +		skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +		if (skb2 == NULL) { +			ret = -ENOMEM; +			break; +		} + +		ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, +					 nlh->nlmsg_seq, +					 NFNL_MSG_TYPE(nlh->nlmsg_type), +					 NFNL_MSG_ACCT_NEW, cur); +		if (ret <= 0) { +			kfree_skb(skb2); +			break; +		} +		ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, +					MSG_DONTWAIT); +		if (ret > 0) +			ret = 0; + +		/* this avoids a loop in nfnetlink. */ +		return ret == -EAGAIN ? -ENOBUFS : ret; +	} +	return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int nfnl_acct_try_del(struct nf_acct *cur) +{ +	int ret = 0; + +	/* we want to avoid races with nfnl_acct_find_get. */ +	if (atomic_dec_and_test(&cur->refcnt)) { +		/* We are protected by nfnl mutex. */ +		list_del_rcu(&cur->head); +		kfree_rcu(cur, rcu_head); +	} else { +		/* still in use, restore reference counter. 
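nfnl_acct_dump() above uses the standard netlink dump-resumption trick: cb->args[1] parks a pointer to the entry that no longer fit in the current skb, the next dump call skips forward until it meets that entry again and re-emits it, and cb->args[2] flags a completed dump. A plain userspace model of the same control flow, with hypothetical names and a pretend limit of two entries per "skb":

#include <stdio.h>

#define N_ENTRIES 5
#define PER_CALL  2	/* pretend only two entries fit per skb */

static const char *entries[N_ENTRIES] = { "a", "b", "c", "d", "e" };

static int dump(unsigned long *args)
{
	const char **last = (const char **)args[1];
	int emitted = 0, i;

	if (args[2])		/* dump already complete */
		return 0;
	args[1] = 0;

	for (i = 0; i < N_ENTRIES; i++) {
		if (last) {
			if (&entries[i] != last)
				continue;
			last = NULL;	/* resume point found, emit from here */
		}
		if (emitted == PER_CALL) {	/* "skb full": park and stop */
			args[1] = (unsigned long)&entries[i];
			return emitted;
		}
		printf("entry %s\n", entries[i]);
		emitted++;
	}
	args[2] = 1;		/* mark dump as done */
	return emitted;
}

int main(void)
{
	unsigned long args[3] = { 0, 0, 0 };

	while (dump(args) > 0)
		puts("-- next netlink dump call --");
	return 0;
}

The same pattern reappears below in nfnl_cthelper_dump_table() and ctnl_timeout_dump(), with an extra args[0] bucket index for the hash-table walk.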
*/ +		atomic_inc(&cur->refcnt); +		ret = -EBUSY; +	} +	return ret; +} + +static int +nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, +	     const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	char *acct_name; +	struct nf_acct *cur; +	int ret = -ENOENT; + +	if (!tb[NFACCT_NAME]) { +		list_for_each_entry(cur, &nfnl_acct_list, head) +			nfnl_acct_try_del(cur); + +		return 0; +	} +	acct_name = nla_data(tb[NFACCT_NAME]); + +	list_for_each_entry(cur, &nfnl_acct_list, head) { +		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) +			continue; + +		ret = nfnl_acct_try_del(cur); +		if (ret < 0) +			return ret; + +		break; +	} +	return ret; +} + +static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { +	[NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, +	[NFACCT_BYTES] = { .type = NLA_U64 }, +	[NFACCT_PKTS] = { .type = NLA_U64 }, +	[NFACCT_FLAGS] = { .type = NLA_U32 }, +	[NFACCT_QUOTA] = { .type = NLA_U64 }, +}; + +static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { +	[NFNL_MSG_ACCT_NEW]		= { .call = nfnl_acct_new, +					    .attr_count = NFACCT_MAX, +					    .policy = nfnl_acct_policy }, +	[NFNL_MSG_ACCT_GET] 		= { .call = nfnl_acct_get, +					    .attr_count = NFACCT_MAX, +					    .policy = nfnl_acct_policy }, +	[NFNL_MSG_ACCT_GET_CTRZERO] 	= { .call = nfnl_acct_get, +					    .attr_count = NFACCT_MAX, +					    .policy = nfnl_acct_policy }, +	[NFNL_MSG_ACCT_DEL]		= { .call = nfnl_acct_del, +					    .attr_count = NFACCT_MAX, +					    .policy = nfnl_acct_policy }, +}; + +static const struct nfnetlink_subsystem nfnl_acct_subsys = { +	.name				= "acct", +	.subsys_id			= NFNL_SUBSYS_ACCT, +	.cb_count			= NFNL_MSG_ACCT_MAX, +	.cb				= nfnl_acct_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); + +struct nf_acct *nfnl_acct_find_get(const char *acct_name) +{ +	struct nf_acct *cur, *acct = NULL; + +	rcu_read_lock(); +	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { +		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) +			continue; + +		if (!try_module_get(THIS_MODULE)) +			goto err; + +		if (!atomic_inc_not_zero(&cur->refcnt)) { +			module_put(THIS_MODULE); +			goto err; +		} + +		acct = cur; +		break; +	} +err: +	rcu_read_unlock(); +	return acct; +} +EXPORT_SYMBOL_GPL(nfnl_acct_find_get); + +void nfnl_acct_put(struct nf_acct *acct) +{ +	atomic_dec(&acct->refcnt); +	module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(nfnl_acct_put); + +void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) +{ +	atomic64_inc(&nfacct->pkts); +	atomic64_add(skb->len, &nfacct->bytes); +} +EXPORT_SYMBOL_GPL(nfnl_acct_update); + +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ +	int ret; +	struct sk_buff *skb; + +	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); +	if (skb == NULL) +		return; + +	ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, +				  nfacct); +	if (ret <= 0) { +		kfree_skb(skb); +		return; +	} +	netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, +			  GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ +	u64 now; +	u64 *quota; +	int ret = NFACCT_UNDERQUOTA; + +	/* no place here if we don't have a quota */ +	if (!(nfacct->flags & NFACCT_F_QUOTA)) +		return NFACCT_NO_QUOTA; + +	quota = (u64 *)nfacct->data; +	now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? 
+	       atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + +	ret = now > *quota; + +	if (now >= *quota && +	    !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { +		nfnl_overquota_report(nfacct); +	} + +	return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); + +static int __init nfnl_acct_init(void) +{ +	int ret; + +	pr_info("nfnl_acct: registering with nfnetlink.\n"); +	ret = nfnetlink_subsys_register(&nfnl_acct_subsys); +	if (ret < 0) { +		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); +		goto err_out; +	} +	return 0; +err_out: +	return ret; +} + +static void __exit nfnl_acct_exit(void) +{ +	struct nf_acct *cur, *tmp; + +	pr_info("nfnl_acct: unregistering from nfnetlink.\n"); +	nfnetlink_subsys_unregister(&nfnl_acct_subsys); + +	list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { +		list_del_rcu(&cur->head); +		/* We are sure that our objects have no clients at this point, +		 * it's safe to release them all without checking refcnt. */ +		kfree_rcu(cur, rcu_head); +	} +} + +module_init(nfnl_acct_init); +module_exit(nfnl_acct_exit); diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c new file mode 100644 index 00000000000..9e287cb56a0 --- /dev/null +++ b/net/netfilter/nfnetlink_cthelper.c @@ -0,0 +1,680 @@ +/* + * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + * + * This software has been sponsored by Vyatta Inc. <http://www.vyatta.com> + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/errno.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <linux/netfilter/nfnetlink_cthelper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); + +static int +nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, +			struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ +	const struct nf_conn_help *help; +	struct nf_conntrack_helper *helper; + +	help = nfct_help(ct); +	if (help == NULL) +		return NF_DROP; + +	/* rcu_read_lock()ed by nf_hook_slow */ +	helper = rcu_dereference(help->helper); +	if (helper == NULL) +		return NF_DROP; + +	/* This is a user-space helper not yet configured, skip. */ +	if ((helper->flags & +	    (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == +	     NF_CT_HELPER_F_USERSPACE) +		return NF_ACCEPT; + +	/* If the user-space helper is not available, don't block traffic. 
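The verdict returned immediately below routes the packet to the helper's userspace queue while setting the queue-bypass flag, so traffic is accepted rather than dropped when nothing is bound to that queue. A sketch of how the queue number and flag pack into the verdict word; the exact constants are assumptions based on the <linux/netfilter.h> of this era.

#include <stdio.h>

#define SK_NF_QUEUE               3		/* NF_QUEUE */
#define SK_NF_VERDICT_QMASK       0xffff0000	/* queue number bits */
#define SK_NF_VERDICT_FLAG_BYPASS 0x00008000	/* NF_VERDICT_FLAG_QUEUE_BYPASS */
#define SK_NF_QUEUE_NR(x) ((((x) << 16) & SK_NF_VERDICT_QMASK) | SK_NF_QUEUE)

int main(void)
{
	/* queue 42, accept instead of drop if no listener is attached */
	unsigned int verdict = SK_NF_QUEUE_NR(42) | SK_NF_VERDICT_FLAG_BYPASS;

	printf("verdict word: 0x%08x (queue %u)\n", verdict, verdict >> 16);
	return 0;
}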
*/ +	return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS; +} + +static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = { +	[NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, }, +	[NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, }, +}; + +static int +nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, +			  const struct nlattr *attr) +{ +	int err; +	struct nlattr *tb[NFCTH_TUPLE_MAX+1]; + +	err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol); +	if (err < 0) +		return err; + +	if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) +		return -EINVAL; + +	tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); +	tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); + +	return 0; +} + +static int +nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) +{ +	const struct nf_conn_help *help = nfct_help(ct); + +	if (attr == NULL) +		return -EINVAL; + +	if (help->helper->data_len == 0) +		return -EINVAL; + +	memcpy(&help->data, nla_data(attr), help->helper->data_len); +	return 0; +} + +static int +nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) +{ +	const struct nf_conn_help *help = nfct_help(ct); + +	if (help->helper->data_len && +	    nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data)) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = { +	[NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING, +				.len = NF_CT_HELPER_NAME_LEN-1 }, +	[NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, }, +	[NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, }, +}; + +static int +nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, +			    const struct nlattr *attr) +{ +	int err; +	struct nlattr *tb[NFCTH_POLICY_MAX+1]; + +	err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol); +	if (err < 0) +		return err; + +	if (!tb[NFCTH_POLICY_NAME] || +	    !tb[NFCTH_POLICY_EXPECT_MAX] || +	    !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) +		return -EINVAL; + +	strncpy(expect_policy->name, +		nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); +	expect_policy->max_expected = +		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); +	expect_policy->timeout = +		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); + +	return 0; +} + +static const struct nla_policy +nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = { +	[NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, }, +}; + +static int +nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, +				  const struct nlattr *attr) +{ +	int i, ret; +	struct nf_conntrack_expect_policy *expect_policy; +	struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; + +	ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, +			       nfnl_cthelper_expect_policy_set); +	if (ret < 0) +		return ret; + +	if (!tb[NFCTH_POLICY_SET_NUM]) +		return -EINVAL; + +	helper->expect_class_max = +		ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); + +	if (helper->expect_class_max != 0 && +	    helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES) +		return -EOVERFLOW; + +	expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * +				helper->expect_class_max, GFP_KERNEL); +	if (expect_policy == NULL) +		return -ENOMEM; + +	for (i=0; i<helper->expect_class_max; i++) { +		if (!tb[NFCTH_POLICY_SET+i]) +			goto err; + +		ret = nfnl_cthelper_expect_policy(&expect_policy[i], +						  tb[NFCTH_POLICY_SET+i]); +		if (ret < 0) +			goto 
err; +	} +	helper->expect_policy = expect_policy; +	return 0; +err: +	kfree(expect_policy); +	return -EINVAL; +} + +static int +nfnl_cthelper_create(const struct nlattr * const tb[], +		     struct nf_conntrack_tuple *tuple) +{ +	struct nf_conntrack_helper *helper; +	int ret; + +	if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) +		return -EINVAL; + +	helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL); +	if (helper == NULL) +		return -ENOMEM; + +	ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); +	if (ret < 0) +		goto err; + +	strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); +	helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); +	helper->flags |= NF_CT_HELPER_F_USERSPACE; +	memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); + +	helper->me = THIS_MODULE; +	helper->help = nfnl_userspace_cthelper; +	helper->from_nlattr = nfnl_cthelper_from_nlattr; +	helper->to_nlattr = nfnl_cthelper_to_nlattr; + +	/* Default to queue number zero, this can be updated at any time. */ +	if (tb[NFCTH_QUEUE_NUM]) +		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); + +	if (tb[NFCTH_STATUS]) { +		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); + +		switch(status) { +		case NFCT_HELPER_STATUS_ENABLED: +			helper->flags |= NF_CT_HELPER_F_CONFIGURED; +			break; +		case NFCT_HELPER_STATUS_DISABLED: +			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; +			break; +		} +	} + +	ret = nf_conntrack_helper_register(helper); +	if (ret < 0) +		goto err; + +	return 0; +err: +	kfree(helper); +	return ret; +} + +static int +nfnl_cthelper_update(const struct nlattr * const tb[], +		     struct nf_conntrack_helper *helper) +{ +	int ret; + +	if (tb[NFCTH_PRIV_DATA_LEN]) +		return -EBUSY; + +	if (tb[NFCTH_POLICY]) { +		ret = nfnl_cthelper_parse_expect_policy(helper, +							tb[NFCTH_POLICY]); +		if (ret < 0) +			return ret; +	} +	if (tb[NFCTH_QUEUE_NUM]) +		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); + +	if (tb[NFCTH_STATUS]) { +		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); + +		switch(status) { +		case NFCT_HELPER_STATUS_ENABLED: +			helper->flags |= NF_CT_HELPER_F_CONFIGURED; +			break; +		case NFCT_HELPER_STATUS_DISABLED: +			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; +			break; +		} +	} +	return 0; +} + +static int +nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, +		  const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	const char *helper_name; +	struct nf_conntrack_helper *cur, *helper = NULL; +	struct nf_conntrack_tuple tuple; +	int ret = 0, i; + +	if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) +		return -EINVAL; + +	helper_name = nla_data(tb[NFCTH_NAME]); + +	ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); +	if (ret < 0) +		return ret; + +	rcu_read_lock(); +	for (i = 0; i < nf_ct_helper_hsize && !helper; i++) { +		hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { + +			/* skip non-userspace conntrack helpers. 
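nfnl_cthelper_parse_expect_policy() above decodes a nested attribute whose first member is a count and whose remaining members are themselves nested policies numbered upward from a base type, NFCTH_POLICY_SET + i. The sketch below only prints the attribute layout a sender would produce; the enum ordering mirrors how the parser indexes tb[], but treat the concrete values as illustrative.

#include <stdio.h>

enum {			/* mirrors the NFCTH_POLICY_SET_* usage above */
	SKETCH_POLICY_SET_UNSPEC,
	SKETCH_POLICY_SET_NUM,	/* u32: number of policies that follow */
	SKETCH_POLICY_SET,	/* first nested policy */
};

int main(void)
{
	unsigned int nclasses = 3, i;

	printf("attr %d = %u\n", SKETCH_POLICY_SET_NUM, nclasses);
	for (i = 0; i < nclasses; i++)
		printf("attr %d = nested {name, max_expected, timeout}\n",
		       SKETCH_POLICY_SET + i);
	return 0;
}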
*/ +			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) +				continue; + +			if (strncmp(cur->name, helper_name, +					NF_CT_HELPER_NAME_LEN) != 0) +				continue; + +			if ((tuple.src.l3num != cur->tuple.src.l3num || +			     tuple.dst.protonum != cur->tuple.dst.protonum)) +				continue; + +			if (nlh->nlmsg_flags & NLM_F_EXCL) { +				ret = -EEXIST; +				goto err; +			} +			helper = cur; +			break; +		} +	} +	rcu_read_unlock(); + +	if (helper == NULL) +		ret = nfnl_cthelper_create(tb, &tuple); +	else +		ret = nfnl_cthelper_update(tb, helper); + +	return ret; +err: +	rcu_read_unlock(); +	return ret; +} + +static int +nfnl_cthelper_dump_tuple(struct sk_buff *skb, +			 struct nf_conntrack_helper *helper) +{ +	struct nlattr *nest_parms; + +	nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED); +	if (nest_parms == NULL) +		goto nla_put_failure; + +	if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM, +			 htons(helper->tuple.src.l3num))) +		goto nla_put_failure; + +	if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum)) +		goto nla_put_failure; + +	nla_nest_end(skb, nest_parms); +	return 0; + +nla_put_failure: +	return -1; +} + +static int +nfnl_cthelper_dump_policy(struct sk_buff *skb, +			struct nf_conntrack_helper *helper) +{ +	int i; +	struct nlattr *nest_parms1, *nest_parms2; + +	nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED); +	if (nest_parms1 == NULL) +		goto nla_put_failure; + +	if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, +			 htonl(helper->expect_class_max))) +		goto nla_put_failure; + +	for (i=0; i<helper->expect_class_max; i++) { +		nest_parms2 = nla_nest_start(skb, +				(NFCTH_POLICY_SET+i) | NLA_F_NESTED); +		if (nest_parms2 == NULL) +			goto nla_put_failure; + +		if (nla_put_string(skb, NFCTH_POLICY_NAME, +				   helper->expect_policy[i].name)) +			goto nla_put_failure; + +		if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX, +				 htonl(helper->expect_policy[i].max_expected))) +			goto nla_put_failure; + +		if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT, +				 htonl(helper->expect_policy[i].timeout))) +			goto nla_put_failure; + +		nla_nest_end(skb, nest_parms2); +	} +	nla_nest_end(skb, nest_parms1); +	return 0; + +nla_put_failure: +	return -1; +} + +static int +nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +			int event, struct nf_conntrack_helper *helper) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? 
NLM_F_MULTI : 0; +	int status; + +	event |= NFNL_SUBSYS_CTHELPER << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_string(skb, NFCTH_NAME, helper->name)) +		goto nla_put_failure; + +	if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num))) +		goto nla_put_failure; + +	if (nfnl_cthelper_dump_tuple(skb, helper) < 0) +		goto nla_put_failure; + +	if (nfnl_cthelper_dump_policy(skb, helper) < 0) +		goto nla_put_failure; + +	if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len))) +		goto nla_put_failure; + +	if (helper->flags & NF_CT_HELPER_F_CONFIGURED) +		status = NFCT_HELPER_STATUS_ENABLED; +	else +		status = NFCT_HELPER_STATUS_DISABLED; + +	if (nla_put_be32(skb, NFCTH_STATUS, htonl(status))) +		goto nla_put_failure; + +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct nf_conntrack_helper *cur, *last; + +	rcu_read_lock(); +	last = (struct nf_conntrack_helper *)cb->args[1]; +	for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { +restart: +		hlist_for_each_entry_rcu(cur, +				&nf_ct_helper_hash[cb->args[0]], hnode) { + +			/* skip non-userspace conntrack helpers. */ +			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) +				continue; + +			if (cb->args[1]) { +				if (cur != last) +					continue; +				cb->args[1] = 0; +			} +			if (nfnl_cthelper_fill_info(skb, +					    NETLINK_CB(cb->skb).portid, +					    cb->nlh->nlmsg_seq, +					    NFNL_MSG_TYPE(cb->nlh->nlmsg_type), +					    NFNL_MSG_CTHELPER_NEW, cur) < 0) { +				cb->args[1] = (unsigned long)cur; +				goto out; +			} +		} +	} +	if (cb->args[1]) { +		cb->args[1] = 0; +		goto restart; +	} +out: +	rcu_read_unlock(); +	return skb->len; +} + +static int +nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, +		  const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	int ret = -ENOENT, i; +	struct nf_conntrack_helper *cur; +	struct sk_buff *skb2; +	char *helper_name = NULL; +	struct nf_conntrack_tuple tuple; +	bool tuple_set = false; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = nfnl_cthelper_dump_table, +		}; +		return netlink_dump_start(nfnl, skb, nlh, &c); +	} + +	if (tb[NFCTH_NAME]) +		helper_name = nla_data(tb[NFCTH_NAME]); + +	if (tb[NFCTH_TUPLE]) { +		ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); +		if (ret < 0) +			return ret; + +		tuple_set = true; +	} + +	for (i = 0; i < nf_ct_helper_hsize; i++) { +		hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { + +			/* skip non-userspace conntrack helpers. 
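All the fill functions in this section build the nlmsg type the same way, as in the "event |= NFNL_SUBSYS_CTHELPER << 8" above: the subsystem id lives in the high byte and the per-subsystem message in the low byte, which is also how the receive path picked the subsystem earlier in nfnetlink.c. A minimal encode/decode sketch; the macro shapes are assumed from the nfnetlink header and the id value is illustrative.

#include <stdio.h>

#define SK_SUBSYS_ID(type) ((type) >> 8)	/* NFNL_SUBSYS_ID() */
#define SK_MSG_TYPE(type)  ((type) & 0xff)	/* NFNL_MSG_TYPE() */

int main(void)
{
	unsigned int subsys = 9;	/* e.g. NFNL_SUBSYS_CTHELPER (assumed) */
	unsigned int msg = 0;		/* e.g. NFNL_MSG_CTHELPER_NEW */
	unsigned int type = (subsys << 8) | msg;

	printf("type 0x%04x -> subsys %u, msg %u\n",
	       type, SK_SUBSYS_ID(type), SK_MSG_TYPE(type));
	return 0;
}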
*/ +			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) +				continue; + +			if (helper_name && strncmp(cur->name, helper_name, +						NF_CT_HELPER_NAME_LEN) != 0) { +				continue; +			} +			if (tuple_set && +			    (tuple.src.l3num != cur->tuple.src.l3num || +			     tuple.dst.protonum != cur->tuple.dst.protonum)) +				continue; + +			skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +			if (skb2 == NULL) { +				ret = -ENOMEM; +				break; +			} + +			ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, +						nlh->nlmsg_seq, +						NFNL_MSG_TYPE(nlh->nlmsg_type), +						NFNL_MSG_CTHELPER_NEW, cur); +			if (ret <= 0) { +				kfree_skb(skb2); +				break; +			} + +			ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, +						MSG_DONTWAIT); +			if (ret > 0) +				ret = 0; + +			/* this avoids a loop in nfnetlink. */ +			return ret == -EAGAIN ? -ENOBUFS : ret; +		} +	} +	return ret; +} + +static int +nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, +	     const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	char *helper_name = NULL; +	struct nf_conntrack_helper *cur; +	struct hlist_node *tmp; +	struct nf_conntrack_tuple tuple; +	bool tuple_set = false, found = false; +	int i, j = 0, ret; + +	if (tb[NFCTH_NAME]) +		helper_name = nla_data(tb[NFCTH_NAME]); + +	if (tb[NFCTH_TUPLE]) { +		ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); +		if (ret < 0) +			return ret; + +		tuple_set = true; +	} + +	for (i = 0; i < nf_ct_helper_hsize; i++) { +		hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], +								hnode) { +			/* skip non-userspace conntrack helpers. */ +			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) +				continue; + +			j++; + +			if (helper_name && strncmp(cur->name, helper_name, +						NF_CT_HELPER_NAME_LEN) != 0) { +				continue; +			} +			if (tuple_set && +			    (tuple.src.l3num != cur->tuple.src.l3num || +			     tuple.dst.protonum != cur->tuple.dst.protonum)) +				continue; + +			found = true; +			nf_conntrack_helper_unregister(cur); +		} +	} +	/* Make sure we return success if we flush and there are no helpers */ +	return (found || j == 0) ? 
0 : -ENOENT; +} + +static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { +	[NFCTH_NAME] = { .type = NLA_NUL_STRING, +			 .len = NF_CT_HELPER_NAME_LEN-1 }, +	[NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, +}; + +static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { +	[NFNL_MSG_CTHELPER_NEW]		= { .call = nfnl_cthelper_new, +					    .attr_count = NFCTH_MAX, +					    .policy = nfnl_cthelper_policy }, +	[NFNL_MSG_CTHELPER_GET]		= { .call = nfnl_cthelper_get, +					    .attr_count = NFCTH_MAX, +					    .policy = nfnl_cthelper_policy }, +	[NFNL_MSG_CTHELPER_DEL]		= { .call = nfnl_cthelper_del, +					    .attr_count = NFCTH_MAX, +					    .policy = nfnl_cthelper_policy }, +}; + +static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { +	.name				= "cthelper", +	.subsys_id			= NFNL_SUBSYS_CTHELPER, +	.cb_count			= NFNL_MSG_CTHELPER_MAX, +	.cb				= nfnl_cthelper_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER); + +static int __init nfnl_cthelper_init(void) +{ +	int ret; + +	ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys); +	if (ret < 0) { +		pr_err("nfnl_cthelper: cannot register with nfnetlink.\n"); +		goto err_out; +	} +	return 0; +err_out: +	return ret; +} + +static void __exit nfnl_cthelper_exit(void) +{ +	struct nf_conntrack_helper *cur; +	struct hlist_node *tmp; +	int i; + +	nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); + +	for (i=0; i<nf_ct_helper_hsize; i++) { +		hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], +									hnode) { +			/* skip non-userspace conntrack helpers. */ +			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) +				continue; + +			nf_conntrack_helper_unregister(cur); +		} +	} +} + +module_init(nfnl_cthelper_init); +module_exit(nfnl_cthelper_exit); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c new file mode 100644 index 00000000000..476accd1714 --- /dev/null +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -0,0 +1,585 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
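The MODULE_ALIAS_NFNL_SUBSYS() line above is the other half of the autoload handshake seen earlier in nfnetlink.c: a message for an unregistered subsystem triggers request_module("nfnetlink-subsys-%d", id), and modprobe resolves that string against the alias each subsystem module exports. A tiny sketch of the string both sides agree on; the id value is an assumption.

#include <stdio.h>

int main(void)
{
	char alias[32];
	int subsys_id = 9;	/* e.g. the cthelper subsystem (assumed id) */

	snprintf(alias, sizeof(alias), "nfnetlink-subsys-%d", subsys_id);
	printf("modprobe resolves: %s\n", alias);
	return 0;
}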
+ */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/rculist.h> +#include <linux/rculist_nulls.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/security.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); + +static LIST_HEAD(cttimeout_list); + +static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { +	[CTA_TIMEOUT_NAME]	= { .type = NLA_NUL_STRING, +				    .len  = CTNL_TIMEOUT_NAME_MAX - 1}, +	[CTA_TIMEOUT_L3PROTO]	= { .type = NLA_U16 }, +	[CTA_TIMEOUT_L4PROTO]	= { .type = NLA_U8 }, +	[CTA_TIMEOUT_DATA]	= { .type = NLA_NESTED }, +}; + +static int +ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, +			  struct net *net, const struct nlattr *attr) +{ +	int ret = 0; + +	if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) { +		struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1]; + +		ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, +				       attr, l4proto->ctnl_timeout.nla_policy); +		if (ret < 0) +			return ret; + +		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); +	} +	return ret; +} + +static int +cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb, +		      const struct nlmsghdr *nlh, +		      const struct nlattr * const cda[]) +{ +	__u16 l3num; +	__u8 l4num; +	struct nf_conntrack_l4proto *l4proto; +	struct ctnl_timeout *timeout, *matching = NULL; +	struct net *net = sock_net(skb->sk); +	char *name; +	int ret; + +	if (!cda[CTA_TIMEOUT_NAME] || +	    !cda[CTA_TIMEOUT_L3PROTO] || +	    !cda[CTA_TIMEOUT_L4PROTO] || +	    !cda[CTA_TIMEOUT_DATA]) +		return -EINVAL; + +	name = nla_data(cda[CTA_TIMEOUT_NAME]); +	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); +	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + +	list_for_each_entry(timeout, &cttimeout_list, head) { +		if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) +			continue; + +		if (nlh->nlmsg_flags & NLM_F_EXCL) +			return -EEXIST; + +		matching = timeout; +		break; +	} + +	l4proto = nf_ct_l4proto_find_get(l3num, l4num); + +	/* This protocol is not supported, skip. */ +	if (l4proto->l4proto != l4num) { +		ret = -EOPNOTSUPP; +		goto err_proto_put; +	} + +	if (matching) { +		if (nlh->nlmsg_flags & NLM_F_REPLACE) { +			/* You cannot replace one timeout policy by another of +			 * different kind, sorry. 
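cttimeout_new_timeout() above follows the same create contract as nfnl_acct_new() and nfnl_cthelper_new() earlier in this section: NLM_F_EXCL fails with -EEXIST when a matching object exists, NLM_F_REPLACE updates the match in place, and a bare create against an existing object yields -EBUSY. Restated as a small sketch with a hypothetical helper; the flag values come from the netlink uapi.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define SK_F_EXCL    0x200	/* NLM_F_EXCL */
#define SK_F_REPLACE 0x100	/* NLM_F_REPLACE */

static int new_object(bool exists, unsigned int flags)
{
	if (exists && (flags & SK_F_EXCL))
		return -EEXIST;
	if (exists && (flags & SK_F_REPLACE))
		return 0;	/* update the matching entry in place */
	if (exists)
		return -EBUSY;	/* exists, caller did not ask to replace */
	return 0;		/* plain create */
}

int main(void)
{
	printf("%d %d %d\n",
	       new_object(true, SK_F_EXCL),
	       new_object(true, SK_F_REPLACE),
	       new_object(true, 0));
	return 0;
}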
+			 */ +			if (matching->l3num != l3num || +			    matching->l4proto->l4proto != l4num) { +				ret = -EINVAL; +				goto err_proto_put; +			} + +			ret = ctnl_timeout_parse_policy(&matching->data, +							l4proto, net, +							cda[CTA_TIMEOUT_DATA]); +			return ret; +		} +		ret = -EBUSY; +		goto err_proto_put; +	} + +	timeout = kzalloc(sizeof(struct ctnl_timeout) + +			  l4proto->ctnl_timeout.obj_size, GFP_KERNEL); +	if (timeout == NULL) { +		ret = -ENOMEM; +		goto err_proto_put; +	} + +	ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net, +					cda[CTA_TIMEOUT_DATA]); +	if (ret < 0) +		goto err; + +	strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); +	timeout->l3num = l3num; +	timeout->l4proto = l4proto; +	atomic_set(&timeout->refcnt, 1); +	list_add_tail_rcu(&timeout->head, &cttimeout_list); + +	return 0; +err: +	kfree(timeout); +err_proto_put: +	nf_ct_l4proto_put(l4proto); +	return ret; +} + +static int +ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +		       int event, struct ctnl_timeout *timeout) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? NLM_F_MULTI : 0; +	struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + +	event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || +	    nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || +	    nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || +	    nla_put_be32(skb, CTA_TIMEOUT_USE, +			 htonl(atomic_read(&timeout->refcnt)))) +		goto nla_put_failure; + +	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { +		struct nlattr *nest_parms; +		int ret; + +		nest_parms = nla_nest_start(skb, +					    CTA_TIMEOUT_DATA | NLA_F_NESTED); +		if (!nest_parms) +			goto nla_put_failure; + +		ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); +		if (ret < 0) +			goto nla_put_failure; + +		nla_nest_end(skb, nest_parms); +	} + +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct ctnl_timeout *cur, *last; + +	if (cb->args[2]) +		return 0; + +	last = (struct ctnl_timeout *)cb->args[1]; +	if (cb->args[1]) +		cb->args[1] = 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(cur, &cttimeout_list, head) { +		if (last) { +			if (cur != last) +				continue; + +			last = NULL; +		} +		if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, +					   cb->nlh->nlmsg_seq, +					   NFNL_MSG_TYPE(cb->nlh->nlmsg_type), +					   IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { +			cb->args[1] = (unsigned long)cur; +			break; +		} +	} +	if (!cb->args[1]) +		cb->args[2] = 1; +	rcu_read_unlock(); +	return skb->len; +} + +static int +cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, +		      const struct nlmsghdr *nlh, +		      const struct nlattr * const cda[]) +{ +	int ret = -ENOENT; +	char *name; +	struct ctnl_timeout *cur; + +	if (nlh->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = ctnl_timeout_dump, +		}; +		return netlink_dump_start(ctnl, skb, nlh, &c); +	} + +	if (!cda[CTA_TIMEOUT_NAME]) +		return -EINVAL; +	name = nla_data(cda[CTA_TIMEOUT_NAME]); + +	list_for_each_entry(cur, 
&cttimeout_list, head) { +		struct sk_buff *skb2; + +		if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) +			continue; + +		skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +		if (skb2 == NULL) { +			ret = -ENOMEM; +			break; +		} + +		ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, +					     nlh->nlmsg_seq, +					     NFNL_MSG_TYPE(nlh->nlmsg_type), +					     IPCTNL_MSG_TIMEOUT_NEW, cur); +		if (ret <= 0) { +			kfree_skb(skb2); +			break; +		} +		ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, +					MSG_DONTWAIT); +		if (ret > 0) +			ret = 0; + +		/* this avoids a loop in nfnetlink. */ +		return ret == -EAGAIN ? -ENOBUFS : ret; +	} +	return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) +{ +	int ret = 0; + +	/* we want to avoid races with nf_ct_timeout_find_get. */ +	if (atomic_dec_and_test(&timeout->refcnt)) { +		/* We are protected by nfnl mutex. */ +		list_del_rcu(&timeout->head); +		nf_ct_l4proto_put(timeout->l4proto); +		kfree_rcu(timeout, rcu_head); +	} else { +		/* still in use, restore reference counter. */ +		atomic_inc(&timeout->refcnt); +		ret = -EBUSY; +	} +	return ret; +} + +static int +cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb, +		      const struct nlmsghdr *nlh, +		      const struct nlattr * const cda[]) +{ +	char *name; +	struct ctnl_timeout *cur; +	int ret = -ENOENT; + +	if (!cda[CTA_TIMEOUT_NAME]) { +		list_for_each_entry(cur, &cttimeout_list, head) +			ctnl_timeout_try_del(cur); + +		return 0; +	} +	name = nla_data(cda[CTA_TIMEOUT_NAME]); + +	list_for_each_entry(cur, &cttimeout_list, head) { +		if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) +			continue; + +		ret = ctnl_timeout_try_del(cur); +		if (ret < 0) +			return ret; + +		break; +	} +	return ret; +} + +static int +cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb, +		      const struct nlmsghdr *nlh, +		      const struct nlattr * const cda[]) +{ +	__u16 l3num; +	__u8 l4num; +	struct nf_conntrack_l4proto *l4proto; +	struct net *net = sock_net(skb->sk); +	unsigned int *timeouts; +	int ret; + +	if (!cda[CTA_TIMEOUT_L3PROTO] || +	    !cda[CTA_TIMEOUT_L4PROTO] || +	    !cda[CTA_TIMEOUT_DATA]) +		return -EINVAL; + +	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); +	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); +	l4proto = nf_ct_l4proto_find_get(l3num, l4num); + +	/* This protocol is not supported, skip. */ +	if (l4proto->l4proto != l4num) { +		ret = -EOPNOTSUPP; +		goto err; +	} + +	timeouts = l4proto->get_timeouts(net); + +	ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, +					cda[CTA_TIMEOUT_DATA]); +	if (ret < 0) +		goto err; + +	nf_ct_l4proto_put(l4proto); +	return 0; +err: +	nf_ct_l4proto_put(l4proto); +	return ret; +} + +static int +cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, +			    u32 seq, u32 type, int event, +			    struct nf_conntrack_l4proto *l4proto) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? 
NLM_F_MULTI : 0; + +	event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = AF_UNSPEC; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || +	    nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) +		goto nla_put_failure; + +	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { +		struct nlattr *nest_parms; +		unsigned int *timeouts = l4proto->get_timeouts(net); +		int ret; + +		nest_parms = nla_nest_start(skb, +					    CTA_TIMEOUT_DATA | NLA_F_NESTED); +		if (!nest_parms) +			goto nla_put_failure; + +		ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); +		if (ret < 0) +			goto nla_put_failure; + +		nla_nest_end(skb, nest_parms); +	} + +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb, +				 const struct nlmsghdr *nlh, +				 const struct nlattr * const cda[]) +{ +	__u16 l3num; +	__u8 l4num; +	struct nf_conntrack_l4proto *l4proto; +	struct net *net = sock_net(skb->sk); +	struct sk_buff *skb2; +	int ret, err; + +	if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) +		return -EINVAL; + +	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); +	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); +	l4proto = nf_ct_l4proto_find_get(l3num, l4num); + +	/* This protocol is not supported, skip. */ +	if (l4proto->l4proto != l4num) { +		err = -EOPNOTSUPP; +		goto err; +	} + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) { +		err = -ENOMEM; +		goto err; +	} + +	ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, +					  nlh->nlmsg_seq, +					  NFNL_MSG_TYPE(nlh->nlmsg_type), +					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET, +					  l4proto); +	if (ret <= 0) { +		kfree_skb(skb2); +		err = -ENOMEM; +		goto err; +	} +	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); +	if (ret > 0) +		ret = 0; + +	/* this avoids a loop in nfnetlink. */ +	return ret == -EAGAIN ? 
-ENOBUFS : ret; +err: +	nf_ct_l4proto_put(l4proto); +	return err; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static struct ctnl_timeout *ctnl_timeout_find_get(const char *name) +{ +	struct ctnl_timeout *timeout, *matching = NULL; + +	rcu_read_lock(); +	list_for_each_entry_rcu(timeout, &cttimeout_list, head) { +		if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) +			continue; + +		if (!try_module_get(THIS_MODULE)) +			goto err; + +		if (!atomic_inc_not_zero(&timeout->refcnt)) { +			module_put(THIS_MODULE); +			goto err; +		} +		matching = timeout; +		break; +	} +err: +	rcu_read_unlock(); +	return matching; +} + +static void ctnl_timeout_put(struct ctnl_timeout *timeout) +{ +	atomic_dec(&timeout->refcnt); +	module_put(THIS_MODULE); +} +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + +static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { +	[IPCTNL_MSG_TIMEOUT_NEW]	= { .call = cttimeout_new_timeout, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy }, +	[IPCTNL_MSG_TIMEOUT_GET]	= { .call = cttimeout_get_timeout, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy }, +	[IPCTNL_MSG_TIMEOUT_DELETE]	= { .call = cttimeout_del_timeout, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy }, +	[IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy }, +	[IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, +					    .attr_count = CTA_TIMEOUT_MAX, +					    .policy = cttimeout_nla_policy }, +}; + +static const struct nfnetlink_subsystem cttimeout_subsys = { +	.name				= "conntrack_timeout", +	.subsys_id			= NFNL_SUBSYS_CTNETLINK_TIMEOUT, +	.cb_count			= IPCTNL_MSG_TIMEOUT_MAX, +	.cb				= cttimeout_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); + +static int __init cttimeout_init(void) +{ +	int ret; + +	ret = nfnetlink_subsys_register(&cttimeout_subsys); +	if (ret < 0) { +		pr_err("cttimeout_init: cannot register cttimeout with " +			"nfnetlink.\n"); +		goto err_out; +	} +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +	RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); +	RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ +	return 0; + +err_out: +	return ret; +} + +static void __exit cttimeout_exit(void) +{ +	struct ctnl_timeout *cur, *tmp; + +	pr_info("cttimeout: unregistering from nfnetlink.\n"); + +	nfnetlink_subsys_unregister(&cttimeout_subsys); +	list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) { +		list_del_rcu(&cur->head); +		/* We are sure that our objects have no clients at this point, +		 * it's safe to release them all without checking refcnt. +		 */ +		nf_ct_l4proto_put(cur->l4proto); +		kfree_rcu(cur, rcu_head); +	} +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +	RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); +	RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ +} + +module_init(cttimeout_init); +module_exit(cttimeout_exit); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6a1572b0ab4..d292c8d286e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -3,6 +3,7 @@   * nfetlink.   
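The ctnl_timeout_find_get() helper above is the standard RCU lookup-with-reference idiom: walk the list under rcu_read_lock(), pin the module, and take a reference only if the object is still live, via atomic_inc_not_zero(). A minimal self-contained sketch of the same idiom follows, using a hypothetical demo_obj in place of struct ctnl_timeout:

	#include <linux/atomic.h>
	#include <linux/list.h>
	#include <linux/rcupdate.h>
	#include <linux/string.h>

	struct demo_obj {
		struct list_head head;
		atomic_t	 refcnt;
		char		 name[32];
	};

	static LIST_HEAD(demo_list);

	static struct demo_obj *demo_find_get(const char *name)
	{
		struct demo_obj *obj, *found = NULL;

		rcu_read_lock();
		list_for_each_entry_rcu(obj, &demo_list, head) {
			if (strcmp(obj->name, name) != 0)
				continue;
			/* A 0->1 transition is never allowed: refcnt == 0
			 * means a writer already unlinked the object and
			 * queued it for kfree_rcu(), so we may still look
			 * at it under RCU but must not keep it. */
			if (atomic_inc_not_zero(&obj->refcnt))
				found = obj;
			break;
		}
		rcu_read_unlock();
		return found;
	}

	static void demo_put(struct demo_obj *obj)
	{
		atomic_dec(&obj->refcnt);
	}

This is also why ctnl_timeout_try_del() earlier in the file uses atomic_dec_and_test() and restores the count on failure: the deleter drops the last reference only when no concurrent find_get() can have taken one.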
*   * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * Based on the old ipv4-only ipt_ULOG.c:   * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> @@ -13,12 +14,13 @@   */  #include <linux/module.h>  #include <linux/skbuff.h> +#include <linux/if_arp.h>  #include <linux/init.h>  #include <linux/ip.h>  #include <linux/ipv6.h>  #include <linux/netdevice.h>  #include <linux/netfilter.h> -#include <linux/netlink.h> +#include <net/netlink.h>  #include <linux/netfilter/nfnetlink.h>  #include <linux/netfilter/nfnetlink_log.h>  #include <linux/spinlock.h> @@ -26,14 +28,13 @@  #include <linux/proc_fs.h>  #include <linux/security.h>  #include <linux/list.h> -#include <linux/jhash.h> -#include <linux/random.h>  #include <linux/slab.h>  #include <net/sock.h>  #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h>  #include <net/netfilter/nfnetlink_log.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #ifdef CONFIG_BRIDGE_NETFILTER  #include "../bridge/br_private.h" @@ -55,7 +56,9 @@ struct nfulnl_instance {  	unsigned int qlen;		/* number of nlmsgs in skb */  	struct sk_buff *skb;		/* pre-allocatd skb */  	struct timer_list timer; -	int peer_pid;			/* PID of the peer process */ +	struct net *net; +	struct user_namespace *peer_user_ns;	/* User namespace of the peer process */ +	int peer_portid;			/* PORTID of the peer process */  	/* configurable parameters */  	unsigned int flushtimeout;	/* timeout until queue flush */ @@ -69,12 +72,20 @@ struct nfulnl_instance {  	struct rcu_head rcu;  }; -static DEFINE_SPINLOCK(instances_lock); -static atomic_t global_seq; -  #define INSTANCE_BUCKETS	16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; -static unsigned int hash_init; + +static int nfnl_log_net_id __read_mostly; + +struct nfnl_log_net { +	spinlock_t instances_lock; +	struct hlist_head instance_table[INSTANCE_BUCKETS]; +	atomic_t global_seq; +}; + +static struct nfnl_log_net *nfnl_log_pernet(struct net *net) +{ +	return net_generic(net, nfnl_log_net_id); +}  static inline u_int8_t instance_hashfn(u_int16_t group_num)  { @@ -82,14 +93,13 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num)  }  static struct nfulnl_instance * -__instance_lookup(u_int16_t group_num) +__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)  {  	struct hlist_head *head; -	struct hlist_node *pos;  	struct nfulnl_instance *inst; -	head = &instance_table[instance_hashfn(group_num)]; -	hlist_for_each_entry_rcu(inst, pos, head, hlist) { +	head = &log->instance_table[instance_hashfn(group_num)]; +	hlist_for_each_entry_rcu(inst, head, hlist) {  		if (inst->group_num == group_num)  			return inst;  	} @@ -103,12 +113,12 @@ instance_get(struct nfulnl_instance *inst)  }  static struct nfulnl_instance * -instance_lookup_get(u_int16_t group_num) +instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)  {  	struct nfulnl_instance *inst;  	rcu_read_lock_bh(); -	inst = __instance_lookup(group_num); +	inst = __instance_lookup(log, group_num);  	if (inst && !atomic_inc_not_zero(&inst->use))  		inst = NULL;  	rcu_read_unlock_bh(); @@ -118,7 +128,11 @@ instance_lookup_get(u_int16_t group_num)  static void nfulnl_instance_free_rcu(struct rcu_head *head)  { -	kfree(container_of(head, struct nfulnl_instance, rcu)); +	struct nfulnl_instance *inst = +		container_of(head, struct nfulnl_instance, rcu); + +	put_net(inst->net); +	kfree(inst);  	module_put(THIS_MODULE);  } @@ -132,13 +146,15 @@ instance_put(struct nfulnl_instance 
*inst)  static void nfulnl_timer(unsigned long data);  static struct nfulnl_instance * -instance_create(u_int16_t group_num, int pid) +instance_create(struct net *net, u_int16_t group_num, +		int portid, struct user_namespace *user_ns)  {  	struct nfulnl_instance *inst; +	struct nfnl_log_net *log = nfnl_log_pernet(net);  	int err; -	spin_lock_bh(&instances_lock); -	if (__instance_lookup(group_num)) { +	spin_lock_bh(&log->instances_lock); +	if (__instance_lookup(log, group_num)) {  		err = -EEXIST;  		goto out_unlock;  	} @@ -162,7 +178,9 @@ instance_create(u_int16_t group_num, int pid)  	setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); -	inst->peer_pid = pid; +	inst->net = get_net(net); +	inst->peer_user_ns = user_ns; +	inst->peer_portid = portid;  	inst->group_num = group_num;  	inst->qthreshold 	= NFULNL_QTHRESH_DEFAULT; @@ -172,14 +190,15 @@ instance_create(u_int16_t group_num, int pid)  	inst->copy_range 	= NFULNL_COPY_RANGE_MAX;  	hlist_add_head_rcu(&inst->hlist, -		       &instance_table[instance_hashfn(group_num)]); +		       &log->instance_table[instance_hashfn(group_num)]); + -	spin_unlock_bh(&instances_lock); +	spin_unlock_bh(&log->instances_lock);  	return inst;  out_unlock: -	spin_unlock_bh(&instances_lock); +	spin_unlock_bh(&log->instances_lock);  	return ERR_PTR(err);  } @@ -208,11 +227,12 @@ __instance_destroy(struct nfulnl_instance *inst)  }  static inline void -instance_destroy(struct nfulnl_instance *inst) +instance_destroy(struct nfnl_log_net *log, +		 struct nfulnl_instance *inst)  { -	spin_lock_bh(&instances_lock); +	spin_lock_bh(&log->instances_lock);  	__instance_destroy(inst); -	spin_unlock_bh(&instances_lock); +	spin_unlock_bh(&log->instances_lock);  }  static int @@ -296,7 +316,8 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)  }  static struct sk_buff * -nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, +		 unsigned int pkt_size)  {  	struct sk_buff *skb;  	unsigned int n; @@ -305,19 +326,17 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)  	 * message.  
WARNING: has to be <= 128k due to slab restrictions */  	n = max(inst_size, pkt_size); -	skb = alloc_skb(n, GFP_ATOMIC); +	skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);  	if (!skb) { -		pr_notice("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", -			inst_size); -  		if (n > pkt_size) {  			/* try to allocate only as much as we need for current  			 * packet */ -			skb = alloc_skb(pkt_size, GFP_ATOMIC); +			skb = nfnetlink_alloc_skb(net, pkt_size, +						  peer_portid, GFP_ATOMIC);  			if (!skb) -				pr_err("nfnetlink_log: can't even alloc %u " -				       "bytes\n", pkt_size); +				pr_err("nfnetlink_log: can't even alloc %u bytes\n", +				       pkt_size);  		}  	} @@ -329,18 +348,20 @@ __nfulnl_send(struct nfulnl_instance *inst)  {  	int status = -1; -	if (inst->qlen > 1) -		NLMSG_PUT(inst->skb, 0, 0, -			  NLMSG_DONE, -			  sizeof(struct nfgenmsg)); - -	status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid, +	if (inst->qlen > 1) { +		struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, +						 NLMSG_DONE, +						 sizeof(struct nfgenmsg), +						 0); +		if (!nlh) +			goto out; +	} +	status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,  				   MSG_DONTWAIT);  	inst->qlen = 0;  	inst->skb = NULL; - -nlmsg_failure: +out:  	return status;  } @@ -369,111 +390,139 @@ nfulnl_timer(unsigned long data)  /* This is an inline function, we don't really care about a long   * list of arguments */  static inline int -__build_packet_message(struct nfulnl_instance *inst, +__build_packet_message(struct nfnl_log_net *log, +			struct nfulnl_instance *inst,  			const struct sk_buff *skb,  			unsigned int data_len,  			u_int8_t pf,  			unsigned int hooknum,  			const struct net_device *indev,  			const struct net_device *outdev, -			const struct nf_loginfo *li,  			const char *prefix, unsigned int plen)  {  	struct nfulnl_msg_packet_hdr pmsg;  	struct nlmsghdr *nlh;  	struct nfgenmsg *nfmsg; -	__be32 tmp_uint;  	sk_buff_data_t old_tail = inst->skb->tail; +	struct sock *sk; +	const unsigned char *hwhdrp; -	nlh = NLMSG_PUT(inst->skb, 0, 0, +	nlh = nlmsg_put(inst->skb, 0, 0,  			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, -			sizeof(struct nfgenmsg)); -	nfmsg = NLMSG_DATA(nlh); +			sizeof(struct nfgenmsg), 0); +	if (!nlh) +		return -1; +	nfmsg = nlmsg_data(nlh);  	nfmsg->nfgen_family = pf;  	nfmsg->version = NFNETLINK_V0;  	nfmsg->res_id = htons(inst->group_num); +	memset(&pmsg, 0, sizeof(pmsg));  	pmsg.hw_protocol	= skb->protocol;  	pmsg.hook		= hooknum; -	NLA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); +	if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg)) +		goto nla_put_failure; -	if (prefix) -		NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix); +	if (prefix && +	    nla_put(inst->skb, NFULA_PREFIX, plen, prefix)) +		goto nla_put_failure;  	if (indev) {  #ifndef CONFIG_BRIDGE_NETFILTER -		NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, -			     htonl(indev->ifindex)); +		if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, +				 htonl(indev->ifindex))) +			goto nla_put_failure;  #else  		if (pf == PF_BRIDGE) {  			/* Case 1: outdev is physical input device, we need to  			 * look for bridge group (when called from  			 * netfilter_bridge) */ -			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, -				     htonl(indev->ifindex)); +			if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, +					 htonl(indev->ifindex)) ||  			/* this is the bridge group "brX" */  			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ -			NLA_PUT_BE32(inst->skb, 
NFULA_IFINDEX_INDEV, -				     htonl(br_port_get_rcu(indev)->br->dev->ifindex)); +			    nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, +					 htonl(br_port_get_rcu(indev)->br->dev->ifindex))) +				goto nla_put_failure;  		} else {  			/* Case 2: indev is bridge group, we need to look for  			 * physical device (when called from ipv4) */ -			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, -				     htonl(indev->ifindex)); -			if (skb->nf_bridge && skb->nf_bridge->physindev) -				NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, -					     htonl(skb->nf_bridge->physindev->ifindex)); +			if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, +					 htonl(indev->ifindex))) +				goto nla_put_failure; +			if (skb->nf_bridge && skb->nf_bridge->physindev && +			    nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, +					 htonl(skb->nf_bridge->physindev->ifindex))) +				goto nla_put_failure;  		}  #endif  	}  	if (outdev) { -		tmp_uint = htonl(outdev->ifindex);  #ifndef CONFIG_BRIDGE_NETFILTER -		NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, -			     htonl(outdev->ifindex)); +		if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, +				 htonl(outdev->ifindex))) +			goto nla_put_failure;  #else  		if (pf == PF_BRIDGE) {  			/* Case 1: outdev is physical output device, we need to  			 * look for bridge group (when called from  			 * netfilter_bridge) */ -			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, -				     htonl(outdev->ifindex)); +			if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, +					 htonl(outdev->ifindex)) ||  			/* this is the bridge group "brX" */  			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ -			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, -				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); +			    nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, +					 htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) +				goto nla_put_failure;  		} else {  			/* Case 2: indev is a bridge group, we need to look  			 * for physical device (when called from ipv4) */ -			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, -				     htonl(outdev->ifindex)); -			if (skb->nf_bridge && skb->nf_bridge->physoutdev) -				NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, -					     htonl(skb->nf_bridge->physoutdev->ifindex)); +			if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, +					 htonl(outdev->ifindex))) +				goto nla_put_failure; +			if (skb->nf_bridge && skb->nf_bridge->physoutdev && +			    nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, +					 htonl(skb->nf_bridge->physoutdev->ifindex))) +				goto nla_put_failure;  		}  #endif  	} -	if (skb->mark) -		NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark)); +	if (skb->mark && +	    nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark))) +		goto nla_put_failure; -	if (indev && skb->dev) { +	if (indev && skb->dev && +	    skb->mac_header != skb->network_header) {  		struct nfulnl_msg_packet_hw phw; -		int len = dev_parse_header(skb, phw.hw_addr); +		int len; + +		memset(&phw, 0, sizeof(phw)); +		len = dev_parse_header(skb, phw.hw_addr);  		if (len > 0) {  			phw.hw_addrlen = htons(len); -			NLA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); +			if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) +				goto nla_put_failure;  		}  	}  	if (indev && skb_mac_header_was_set(skb)) { -		NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)); -		NLA_PUT_BE16(inst->skb, NFULA_HWLEN, -			     htons(skb->dev->hard_header_len)); -		NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, -			skb_mac_header(skb)); +		if 
(nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || +		    nla_put_be16(inst->skb, NFULA_HWLEN, +				 htons(skb->dev->hard_header_len))) +			goto nla_put_failure; + +		hwhdrp = skb_mac_header(skb); + +		if (skb->dev->type == ARPHRD_SIT) +			hwhdrp -= ETH_HLEN; + +		if (hwhdrp >= skb->head && +		    nla_put(inst->skb, NFULA_HWHEADER, +			    skb->dev->hard_header_len, hwhdrp)) +			goto nla_put_failure;  	}  	if (skb->tstamp.tv64) { @@ -482,32 +531,38 @@ __build_packet_message(struct nfulnl_instance *inst,  		ts.sec = cpu_to_be64(tv.tv_sec);  		ts.usec = cpu_to_be64(tv.tv_usec); -		NLA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); +		if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) +			goto nla_put_failure;  	}  	/* UID */ -	if (skb->sk) { -		read_lock_bh(&skb->sk->sk_callback_lock); -		if (skb->sk->sk_socket && skb->sk->sk_socket->file) { -			struct file *file = skb->sk->sk_socket->file; -			__be32 uid = htonl(file->f_cred->fsuid); -			__be32 gid = htonl(file->f_cred->fsgid); -			/* need to unlock here since NLA_PUT may goto */ -			read_unlock_bh(&skb->sk->sk_callback_lock); -			NLA_PUT_BE32(inst->skb, NFULA_UID, uid); -			NLA_PUT_BE32(inst->skb, NFULA_GID, gid); +	sk = skb->sk; +	if (sk && sk->sk_state != TCP_TIME_WAIT) { +		read_lock_bh(&sk->sk_callback_lock); +		if (sk->sk_socket && sk->sk_socket->file) { +			struct file *file = sk->sk_socket->file; +			const struct cred *cred = file->f_cred; +			struct user_namespace *user_ns = inst->peer_user_ns; +			__be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); +			__be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); +			read_unlock_bh(&sk->sk_callback_lock); +			if (nla_put_be32(inst->skb, NFULA_UID, uid) || +			    nla_put_be32(inst->skb, NFULA_GID, gid)) +				goto nla_put_failure;  		} else -			read_unlock_bh(&skb->sk->sk_callback_lock); +			read_unlock_bh(&sk->sk_callback_lock);  	}  	/* local sequence number */ -	if (inst->flags & NFULNL_CFG_F_SEQ) -		NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++)); +	if ((inst->flags & NFULNL_CFG_F_SEQ) && +	    nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++))) +		goto nla_put_failure;  	/* global sequence number */ -	if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) -		NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL, -			     htonl(atomic_inc_return(&global_seq))); +	if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && +	    nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, +			 htonl(atomic_inc_return(&log->global_seq)))) +		goto nla_put_failure;  	if (data_len) {  		struct nlattr *nla; @@ -515,7 +570,7 @@ __build_packet_message(struct nfulnl_instance *inst,  		if (skb_tailroom(inst->skb) < nla_total_size(data_len)) {  			printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); -			goto nlmsg_failure; +			return -1;  		}  		nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len)); @@ -529,7 +584,6 @@ __build_packet_message(struct nfulnl_instance *inst,  	nlh->nlmsg_len = inst->skb->tail - old_tail;  	return 0; -nlmsg_failure:  nla_put_failure:  	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");  	return -1; @@ -550,7 +604,8 @@ static struct nf_loginfo default_loginfo = {  /* log handler for internal netfilter logging api */  void -nfulnl_log_packet(u_int8_t pf, +nfulnl_log_packet(struct net *net, +		  u_int8_t pf,  		  unsigned int hooknum,  		  const struct sk_buff *skb,  		  const struct net_device *in, @@ -563,13 +618,14 @@ nfulnl_log_packet(u_int8_t pf,  	const struct nf_loginfo *li;  	unsigned int qthreshold;  	unsigned int plen; +	struct nfnl_log_net 
*log = nfnl_log_pernet(net);  	if (li_user && li_user->type == NF_LOG_TYPE_ULOG)  		li = li_user;  	else  		li = &default_loginfo; -	inst = instance_lookup_get(li->u.ulog.group); +	inst = instance_lookup_get(log, li->u.ulog.group);  	if (!inst)  		return; @@ -580,7 +636,7 @@ nfulnl_log_packet(u_int8_t pf,  	/* FIXME: do we want to make the size calculation conditional based on  	 * what is actually present?  way more branches and checks, but more  	 * memory efficient... */ -	size =    NLMSG_SPACE(sizeof(struct nfgenmsg)) +	size =    nlmsg_total_size(sizeof(struct nfgenmsg))  		+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))  		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */  		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ @@ -644,15 +700,16 @@ nfulnl_log_packet(u_int8_t pf,  	}  	if (!inst->skb) { -		inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); +		inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, +					     inst->nlbufsiz, size);  		if (!inst->skb)  			goto alloc_failure;  	}  	inst->qlen++; -	__build_packet_message(inst, skb, data_len, pf, -				hooknum, in, out, li, prefix, plen); +	__build_packet_message(log, inst, skb, data_len, pf, +				hooknum, in, out, prefix, plen);  	if (inst->qlen >= qthreshold)  		__nfulnl_flush(inst); @@ -680,24 +737,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this,  		   unsigned long event, void *ptr)  {  	struct netlink_notify *n = ptr; +	struct nfnl_log_net *log = nfnl_log_pernet(n->net);  	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {  		int i; -		/* destroy all instances for this pid */ -		spin_lock_bh(&instances_lock); +		/* destroy all instances for this portid */ +		spin_lock_bh(&log->instances_lock);  		for  (i = 0; i < INSTANCE_BUCKETS; i++) { -			struct hlist_node *tmp, *t2; +			struct hlist_node *t2;  			struct nfulnl_instance *inst; -			struct hlist_head *head = &instance_table[i]; +			struct hlist_head *head = &log->instance_table[i]; -			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { -				if ((net_eq(n->net, &init_net)) && -				    (n->pid == inst->peer_pid)) +			hlist_for_each_entry_safe(inst, t2, head, hlist) { +				if (n->portid == inst->peer_portid)  					__instance_destroy(inst);  			}  		} -		spin_unlock_bh(&instances_lock); +		spin_unlock_bh(&log->instances_lock);  	}  	return NOTIFY_DONE;  } @@ -734,10 +791,12 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,  		   const struct nlmsghdr *nlh,  		   const struct nlattr * const nfula[])  { -	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); +	struct nfgenmsg *nfmsg = nlmsg_data(nlh);  	u_int16_t group_num = ntohs(nfmsg->res_id);  	struct nfulnl_instance *inst;  	struct nfulnl_msg_config_cmd *cmd = NULL; +	struct net *net = sock_net(ctnl); +	struct nfnl_log_net *log = nfnl_log_pernet(net);  	int ret = 0;  	if (nfula[NFULA_CFG_CMD]) { @@ -747,15 +806,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,  		/* Commands without queue context */  		switch (cmd->command) {  		case NFULNL_CFG_CMD_PF_BIND: -			return nf_log_bind_pf(pf, &nfulnl_logger); +			return nf_log_bind_pf(net, pf, &nfulnl_logger);  		case NFULNL_CFG_CMD_PF_UNBIND: -			nf_log_unbind_pf(pf); +			nf_log_unbind_pf(net, pf);  			return 0;  		}  	} -	inst = instance_lookup_get(group_num); -	if (inst && inst->peer_pid != NETLINK_CB(skb).pid) { +	inst = instance_lookup_get(log, group_num); +	if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {  		ret = -EPERM;  		goto out_put;  	} @@ -768,8 +827,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff 
*skb,  				goto out_put;  			} -			inst = instance_create(group_num, -					       NETLINK_CB(skb).pid); +			inst = instance_create(net, group_num, +					       NETLINK_CB(skb).portid, +					       sk_user_ns(NETLINK_CB(skb).sk));  			if (IS_ERR(inst)) {  				ret = PTR_ERR(inst);  				goto out; @@ -781,7 +841,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,  				goto out;  			} -			instance_destroy(inst); +			instance_destroy(log, inst);  			goto out_put;  		default:  			ret = -ENOTSUPP; @@ -864,55 +924,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = {  #ifdef CONFIG_PROC_FS  struct iter_state { +	struct seq_net_private p;  	unsigned int bucket;  }; -static struct hlist_node *get_first(struct iter_state *st) +static struct hlist_node *get_first(struct net *net, struct iter_state *st)  { +	struct nfnl_log_net *log;  	if (!st)  		return NULL; +	log = nfnl_log_pernet(net); +  	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { -		if (!hlist_empty(&instance_table[st->bucket])) -			return rcu_dereference_bh(instance_table[st->bucket].first); +		struct hlist_head *head = &log->instance_table[st->bucket]; + +		if (!hlist_empty(head)) +			return rcu_dereference_bh(hlist_first_rcu(head));  	}  	return NULL;  } -static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +static struct hlist_node *get_next(struct net *net, struct iter_state *st, +				   struct hlist_node *h)  { -	h = rcu_dereference_bh(h->next); +	h = rcu_dereference_bh(hlist_next_rcu(h));  	while (!h) { +		struct nfnl_log_net *log; +		struct hlist_head *head; +  		if (++st->bucket >= INSTANCE_BUCKETS)  			return NULL; -		h = rcu_dereference_bh(instance_table[st->bucket].first); +		log = nfnl_log_pernet(net); +		head = &log->instance_table[st->bucket]; +		h = rcu_dereference_bh(hlist_first_rcu(head));  	}  	return h;  } -static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +static struct hlist_node *get_idx(struct net *net, struct iter_state *st, +				  loff_t pos)  {  	struct hlist_node *head; -	head = get_first(st); +	head = get_first(net, st);  	if (head) -		while (pos && (head = get_next(st, head))) +		while (pos && (head = get_next(net, st, head)))  			pos--;  	return pos ? 
NULL : head;  } -static void *seq_start(struct seq_file *seq, loff_t *pos) +static void *seq_start(struct seq_file *s, loff_t *pos)  	__acquires(rcu_bh)  {  	rcu_read_lock_bh(); -	return get_idx(seq->private, *pos); +	return get_idx(seq_file_net(s), s->private, *pos);  }  static void *seq_next(struct seq_file *s, void *v, loff_t *pos)  {  	(*pos)++; -	return get_next(s->private, v); +	return get_next(seq_file_net(s), s->private, v);  }  static void seq_stop(struct seq_file *s, void *v) @@ -927,7 +1000,7 @@ static int seq_show(struct seq_file *s, void *v)  	return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",  			  inst->group_num, -			  inst->peer_pid, inst->qlen, +			  inst->peer_portid, inst->qlen,  			  inst->copy_mode, inst->copy_range,  			  inst->flushtimeout, atomic_read(&inst->use));  } @@ -941,8 +1014,8 @@ static const struct seq_operations nful_seq_ops = {  static int nful_open(struct inode *inode, struct file *file)  { -	return seq_open_private(file, &nful_seq_ops, -			sizeof(struct iter_state)); +	return seq_open_net(inode, file, &nful_seq_ops, +			    sizeof(struct iter_state));  }  static const struct file_operations nful_file_ops = { @@ -950,47 +1023,69 @@ static const struct file_operations nful_file_ops = {  	.open	 = nful_open,  	.read	 = seq_read,  	.llseek	 = seq_lseek, -	.release = seq_release_private, +	.release = seq_release_net,  };  #endif /* PROC_FS */ -static int __init nfnetlink_log_init(void) +static int __net_init nfnl_log_net_init(struct net *net)  { -	int i, status = -ENOMEM; +	unsigned int i; +	struct nfnl_log_net *log = nfnl_log_pernet(net);  	for (i = 0; i < INSTANCE_BUCKETS; i++) -		INIT_HLIST_HEAD(&instance_table[i]); +		INIT_HLIST_HEAD(&log->instance_table[i]); +	spin_lock_init(&log->instances_lock); + +#ifdef CONFIG_PROC_FS +	if (!proc_create("nfnetlink_log", 0440, +			 net->nf.proc_netfilter, &nful_file_ops)) +		return -ENOMEM; +#endif +	return 0; +} -	/* it's not really all that important to have a random value, so -	 * we can do this from the init function, even if there hasn't -	 * been that much entropy yet */ -	get_random_bytes(&hash_init, sizeof(hash_init)); +static void __net_exit nfnl_log_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS +	remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +#endif +	nf_log_unset(net, &nfulnl_logger); +} + +static struct pernet_operations nfnl_log_net_ops = { +	.init	= nfnl_log_net_init, +	.exit	= nfnl_log_net_exit, +	.id	= &nfnl_log_net_id, +	.size	= sizeof(struct nfnl_log_net), +}; + +static int __init nfnetlink_log_init(void) +{ +	int status = -ENOMEM;  	netlink_register_notifier(&nfulnl_rtnl_notifier);  	status = nfnetlink_subsys_register(&nfulnl_subsys);  	if (status < 0) { -		printk(KERN_ERR "log: failed to create netlink socket\n"); +		pr_err("log: failed to create netlink socket\n");  		goto cleanup_netlink_notifier;  	}  	status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);  	if (status < 0) { -		printk(KERN_ERR "log: failed to register logger\n"); +		pr_err("log: failed to register logger\n");  		goto cleanup_subsys;  	} -#ifdef CONFIG_PROC_FS -	if (!proc_create("nfnetlink_log", 0440, -			 proc_net_netfilter, &nful_file_ops)) +	status = register_pernet_subsys(&nfnl_log_net_ops); +	if (status < 0) { +		pr_err("log: failed to register pernet ops\n");  		goto cleanup_logger; -#endif +	}  	return status; -#ifdef CONFIG_PROC_FS  cleanup_logger:  	nf_log_unregister(&nfulnl_logger); -#endif  cleanup_subsys:  	nfnetlink_subsys_unregister(&nfulnl_subsys);  cleanup_netlink_notifier: @@ 
-1000,10 +1095,8 @@ cleanup_netlink_notifier:  static void __exit nfnetlink_log_fini(void)  { +	unregister_pernet_subsys(&nfnl_log_net_ops);  	nf_log_unregister(&nfulnl_logger); -#ifdef CONFIG_PROC_FS -	remove_proc_entry("nfnetlink_log", proc_net_netfilter); -#endif  	nfnetlink_subsys_unregister(&nfulnl_subsys);  	netlink_unregister_notifier(&nfulnl_rtnl_notifier);  } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c deleted file mode 100644 index 68e67d19724..00000000000 --- a/net/netfilter/nfnetlink_queue.c +++ /dev/null @@ -1,943 +0,0 @@ -/* - * This is a module which is used for queueing packets and communicating with - * userspace via nfnetlink. - * - * (C) 2005 by Harald Welte <laforge@netfilter.org> - * (C) 2007 by Patrick McHardy <kaber@trash.net> - * - * Based on the old ipv4-only ip_queue.c: - * (C) 2000-2002 James Morris <jmorris@intercode.com.au> - * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/slab.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/proc_fs.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_queue.h> -#include <linux/list.h> -#include <net/sock.h> -#include <net/netfilter/nf_queue.h> - -#include <asm/atomic.h> - -#ifdef CONFIG_BRIDGE_NETFILTER -#include "../bridge/br_private.h" -#endif - -#define NFQNL_QMAX_DEFAULT 1024 - -struct nfqnl_instance { -	struct hlist_node hlist;		/* global list of queues */ -	struct rcu_head rcu; - -	int peer_pid; -	unsigned int queue_maxlen; -	unsigned int copy_range; -	unsigned int queue_dropped; -	unsigned int queue_user_dropped; - - -	u_int16_t queue_num;			/* number of this queue */ -	u_int8_t copy_mode; -/* - * Following fields are dirtied for each queued packet, - * keep them in same cache line if possible. 
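The struct comment here (and its twin in the replacement file further below) asks that the fields dirtied for every queued packet share a cache line, relying purely on declaration order. One way to state that intent explicitly is the annotation from <linux/cache.h>; a sketch on a hypothetical struct, not how the file itself does it:

	#include <linux/cache.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct demo_queue {
		/* read-mostly configuration, shared freely between CPUs */
		unsigned int	 queue_maxlen;
		unsigned int	 copy_range;

		/* dirtied for every queued packet: start them on a fresh
		 * cache line so they do not false-share with the config
		 * fields above */
		spinlock_t	 lock ____cacheline_aligned_in_smp;
		unsigned int	 queue_total;
		struct list_head queue_list;
	};
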
- */ -	spinlock_t	lock; -	unsigned int	queue_total; -	atomic_t	id_sequence;		/* 'sequence' of pkt ids */ -	struct list_head queue_list;		/* packets in queue */ -}; - -typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); - -static DEFINE_SPINLOCK(instances_lock); - -#define INSTANCE_BUCKETS	16 -static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; - -static inline u_int8_t instance_hashfn(u_int16_t queue_num) -{ -	return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; -} - -static struct nfqnl_instance * -instance_lookup(u_int16_t queue_num) -{ -	struct hlist_head *head; -	struct hlist_node *pos; -	struct nfqnl_instance *inst; - -	head = &instance_table[instance_hashfn(queue_num)]; -	hlist_for_each_entry_rcu(inst, pos, head, hlist) { -		if (inst->queue_num == queue_num) -			return inst; -	} -	return NULL; -} - -static struct nfqnl_instance * -instance_create(u_int16_t queue_num, int pid) -{ -	struct nfqnl_instance *inst; -	unsigned int h; -	int err; - -	spin_lock(&instances_lock); -	if (instance_lookup(queue_num)) { -		err = -EEXIST; -		goto out_unlock; -	} - -	inst = kzalloc(sizeof(*inst), GFP_ATOMIC); -	if (!inst) { -		err = -ENOMEM; -		goto out_unlock; -	} - -	inst->queue_num = queue_num; -	inst->peer_pid = pid; -	inst->queue_maxlen = NFQNL_QMAX_DEFAULT; -	inst->copy_range = 0xfffff; -	inst->copy_mode = NFQNL_COPY_NONE; -	spin_lock_init(&inst->lock); -	INIT_LIST_HEAD(&inst->queue_list); - -	if (!try_module_get(THIS_MODULE)) { -		err = -EAGAIN; -		goto out_free; -	} - -	h = instance_hashfn(queue_num); -	hlist_add_head_rcu(&inst->hlist, &instance_table[h]); - -	spin_unlock(&instances_lock); - -	return inst; - -out_free: -	kfree(inst); -out_unlock: -	spin_unlock(&instances_lock); -	return ERR_PTR(err); -} - -static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, -			unsigned long data); - -static void -instance_destroy_rcu(struct rcu_head *head) -{ -	struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, -						   rcu); - -	nfqnl_flush(inst, NULL, 0); -	kfree(inst); -	module_put(THIS_MODULE); -} - -static void -__instance_destroy(struct nfqnl_instance *inst) -{ -	hlist_del_rcu(&inst->hlist); -	call_rcu(&inst->rcu, instance_destroy_rcu); -} - -static void -instance_destroy(struct nfqnl_instance *inst) -{ -	spin_lock(&instances_lock); -	__instance_destroy(inst); -	spin_unlock(&instances_lock); -} - -static inline void -__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) -{ -       list_add_tail(&entry->list, &queue->queue_list); -       queue->queue_total++; -} - -static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) -{ -	struct nf_queue_entry *entry = NULL, *i; - -	spin_lock_bh(&queue->lock); - -	list_for_each_entry(i, &queue->queue_list, list) { -		if (i->id == id) { -			entry = i; -			break; -		} -	} - -	if (entry) { -		list_del(&entry->list); -		queue->queue_total--; -	} - -	spin_unlock_bh(&queue->lock); - -	return entry; -} - -static void -nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) -{ -	struct nf_queue_entry *entry, *next; - -	spin_lock_bh(&queue->lock); -	list_for_each_entry_safe(entry, next, &queue->queue_list, list) { -		if (!cmpfn || cmpfn(entry, data)) { -			list_del(&entry->list); -			queue->queue_total--; -			nf_reinject(entry, NF_DROP); -		} -	} -	spin_unlock_bh(&queue->lock); -} - -static struct sk_buff * -nfqnl_build_packet_message(struct nfqnl_instance *queue, -			   struct nf_queue_entry *entry) -{ -	
sk_buff_data_t old_tail; -	size_t size; -	size_t data_len = 0; -	struct sk_buff *skb; -	struct nfqnl_msg_packet_hdr pmsg; -	struct nlmsghdr *nlh; -	struct nfgenmsg *nfmsg; -	struct sk_buff *entskb = entry->skb; -	struct net_device *indev; -	struct net_device *outdev; - -	size =    NLMSG_SPACE(sizeof(struct nfgenmsg)) -		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) -		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ -		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ -#ifdef CONFIG_BRIDGE_NETFILTER -		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ -		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ -#endif -		+ nla_total_size(sizeof(u_int32_t))	/* mark */ -		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) -		+ nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); - -	outdev = entry->outdev; - -	switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { -	case NFQNL_COPY_META: -	case NFQNL_COPY_NONE: -		break; - -	case NFQNL_COPY_PACKET: -		if (entskb->ip_summed == CHECKSUM_PARTIAL && -		    skb_checksum_help(entskb)) -			return NULL; - -		data_len = ACCESS_ONCE(queue->copy_range); -		if (data_len == 0 || data_len > entskb->len) -			data_len = entskb->len; - -		size += nla_total_size(data_len); -		break; -	} - - -	skb = alloc_skb(size, GFP_ATOMIC); -	if (!skb) -		goto nlmsg_failure; - -	old_tail = skb->tail; -	nlh = NLMSG_PUT(skb, 0, 0, -			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, -			sizeof(struct nfgenmsg)); -	nfmsg = NLMSG_DATA(nlh); -	nfmsg->nfgen_family = entry->pf; -	nfmsg->version = NFNETLINK_V0; -	nfmsg->res_id = htons(queue->queue_num); - -	entry->id = atomic_inc_return(&queue->id_sequence); -	pmsg.packet_id 		= htonl(entry->id); -	pmsg.hw_protocol	= entskb->protocol; -	pmsg.hook		= entry->hook; - -	NLA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); - -	indev = entry->indev; -	if (indev) { -#ifndef CONFIG_BRIDGE_NETFILTER -		NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)); -#else -		if (entry->pf == PF_BRIDGE) { -			/* Case 1: indev is physical input device, we need to -			 * look for bridge group (when called from -			 * netfilter_bridge) */ -			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, -				     htonl(indev->ifindex)); -			/* this is the bridge group "brX" */ -			/* rcu_read_lock()ed by __nf_queue */ -			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, -				     htonl(br_port_get_rcu(indev)->br->dev->ifindex)); -		} else { -			/* Case 2: indev is bridge group, we need to look for -			 * physical device (when called from ipv4) */ -			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, -				     htonl(indev->ifindex)); -			if (entskb->nf_bridge && entskb->nf_bridge->physindev) -				NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, -					     htonl(entskb->nf_bridge->physindev->ifindex)); -		} -#endif -	} - -	if (outdev) { -#ifndef CONFIG_BRIDGE_NETFILTER -		NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)); -#else -		if (entry->pf == PF_BRIDGE) { -			/* Case 1: outdev is physical output device, we need to -			 * look for bridge group (when called from -			 * netfilter_bridge) */ -			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, -				     htonl(outdev->ifindex)); -			/* this is the bridge group "brX" */ -			/* rcu_read_lock()ed by __nf_queue */ -			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, -				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); -		} else { -			/* Case 2: outdev is bridge group, we need to look for -			 * physical output device (when called from ipv4) */ -			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, -				     htonl(outdev->ifindex)); -			if 
(entskb->nf_bridge && entskb->nf_bridge->physoutdev) -				NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, -					     htonl(entskb->nf_bridge->physoutdev->ifindex)); -		} -#endif -	} - -	if (entskb->mark) -		NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark)); - -	if (indev && entskb->dev) { -		struct nfqnl_msg_packet_hw phw; -		int len = dev_parse_header(entskb, phw.hw_addr); -		if (len) { -			phw.hw_addrlen = htons(len); -			NLA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); -		} -	} - -	if (entskb->tstamp.tv64) { -		struct nfqnl_msg_packet_timestamp ts; -		struct timeval tv = ktime_to_timeval(entskb->tstamp); -		ts.sec = cpu_to_be64(tv.tv_sec); -		ts.usec = cpu_to_be64(tv.tv_usec); - -		NLA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); -	} - -	if (data_len) { -		struct nlattr *nla; -		int sz = nla_attr_size(data_len); - -		if (skb_tailroom(skb) < nla_total_size(data_len)) { -			printk(KERN_WARNING "nf_queue: no tailroom!\n"); -			goto nlmsg_failure; -		} - -		nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len)); -		nla->nla_type = NFQA_PAYLOAD; -		nla->nla_len = sz; - -		if (skb_copy_bits(entskb, 0, nla_data(nla), data_len)) -			BUG(); -	} - -	nlh->nlmsg_len = skb->tail - old_tail; -	return skb; - -nlmsg_failure: -nla_put_failure: -	if (skb) -		kfree_skb(skb); -	if (net_ratelimit()) -		printk(KERN_ERR "nf_queue: error creating packet message\n"); -	return NULL; -} - -static int -nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ -	struct sk_buff *nskb; -	struct nfqnl_instance *queue; -	int err; - -	/* rcu_read_lock()ed by nf_hook_slow() */ -	queue = instance_lookup(queuenum); -	if (!queue) -		goto err_out; - -	if (queue->copy_mode == NFQNL_COPY_NONE) -		goto err_out; - -	nskb = nfqnl_build_packet_message(queue, entry); -	if (nskb == NULL) -		goto err_out; - -	spin_lock_bh(&queue->lock); - -	if (!queue->peer_pid) -		goto err_out_free_nskb; - -	if (queue->queue_total >= queue->queue_maxlen) { -		queue->queue_dropped++; -		if (net_ratelimit()) -			  printk(KERN_WARNING "nf_queue: full at %d entries, " -				 "dropping packets(s).\n", -				 queue->queue_total); -		goto err_out_free_nskb; -	} - -	/* nfnetlink_unicast will either free the nskb or add it to a socket */ -	err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT); -	if (err < 0) { -		queue->queue_user_dropped++; -		goto err_out_unlock; -	} - -	__enqueue_entry(queue, entry); - -	spin_unlock_bh(&queue->lock); -	return 0; - -err_out_free_nskb: -	kfree_skb(nskb); -err_out_unlock: -	spin_unlock_bh(&queue->lock); -err_out: -	return -1; -} - -static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e) -{ -	struct sk_buff *nskb; -	int diff; - -	diff = data_len - e->skb->len; -	if (diff < 0) { -		if (pskb_trim(e->skb, data_len)) -			return -ENOMEM; -	} else if (diff > 0) { -		if (data_len > 0xFFFF) -			return -EINVAL; -		if (diff > skb_tailroom(e->skb)) { -			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), -					       diff, GFP_ATOMIC); -			if (!nskb) { -				printk(KERN_WARNING "nf_queue: OOM " -				      "in mangle, dropping packet\n"); -				return -ENOMEM; -			} -			kfree_skb(e->skb); -			e->skb = nskb; -		} -		skb_put(e->skb, diff); -	} -	if (!skb_make_writable(e->skb, data_len)) -		return -ENOMEM; -	skb_copy_to_linear_data(e->skb, data, data_len); -	e->skb->ip_summed = CHECKSUM_NONE; -	return 0; -} - -static int -nfqnl_set_mode(struct nfqnl_instance *queue, -	       unsigned char mode, unsigned int range) -{ -	int status = 0; - -	spin_lock_bh(&queue->lock); -	switch (mode) { -	
case NFQNL_COPY_NONE: -	case NFQNL_COPY_META: -		queue->copy_mode = mode; -		queue->copy_range = 0; -		break; - -	case NFQNL_COPY_PACKET: -		queue->copy_mode = mode; -		/* we're using struct nlattr which has 16bit nla_len */ -		if (range > 0xffff) -			queue->copy_range = 0xffff; -		else -			queue->copy_range = range; -		break; - -	default: -		status = -EINVAL; - -	} -	spin_unlock_bh(&queue->lock); - -	return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ -	if (entry->indev) -		if (entry->indev->ifindex == ifindex) -			return 1; -	if (entry->outdev) -		if (entry->outdev->ifindex == ifindex) -			return 1; -#ifdef CONFIG_BRIDGE_NETFILTER -	if (entry->skb->nf_bridge) { -		if (entry->skb->nf_bridge->physindev && -		    entry->skb->nf_bridge->physindev->ifindex == ifindex) -			return 1; -		if (entry->skb->nf_bridge->physoutdev && -		    entry->skb->nf_bridge->physoutdev->ifindex == ifindex) -			return 1; -	} -#endif -	return 0; -} - -/* drop all packets with either indev or outdev == ifindex from all queue - * instances */ -static void -nfqnl_dev_drop(int ifindex) -{ -	int i; - -	rcu_read_lock(); - -	for (i = 0; i < INSTANCE_BUCKETS; i++) { -		struct hlist_node *tmp; -		struct nfqnl_instance *inst; -		struct hlist_head *head = &instance_table[i]; - -		hlist_for_each_entry_rcu(inst, tmp, head, hlist) -			nfqnl_flush(inst, dev_cmp, ifindex); -	} - -	rcu_read_unlock(); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static int -nfqnl_rcv_dev_event(struct notifier_block *this, -		    unsigned long event, void *ptr) -{ -	struct net_device *dev = ptr; - -	if (!net_eq(dev_net(dev), &init_net)) -		return NOTIFY_DONE; - -	/* Drop any packets associated with the downed device */ -	if (event == NETDEV_DOWN) -		nfqnl_dev_drop(dev->ifindex); -	return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_dev_notifier = { -	.notifier_call	= nfqnl_rcv_dev_event, -}; - -static int -nfqnl_rcv_nl_event(struct notifier_block *this, -		   unsigned long event, void *ptr) -{ -	struct netlink_notify *n = ptr; - -	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { -		int i; - -		/* destroy all instances for this pid */ -		spin_lock(&instances_lock); -		for (i = 0; i < INSTANCE_BUCKETS; i++) { -			struct hlist_node *tmp, *t2; -			struct nfqnl_instance *inst; -			struct hlist_head *head = &instance_table[i]; - -			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { -				if ((n->net == &init_net) && -				    (n->pid == inst->peer_pid)) -					__instance_destroy(inst); -			} -		} -		spin_unlock(&instances_lock); -	} -	return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_rtnl_notifier = { -	.notifier_call	= nfqnl_rcv_nl_event, -}; - -static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { -	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, -	[NFQA_MARK]		= { .type = NLA_U32 }, -	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC }, -}; - -static int -nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, -		   const struct nlmsghdr *nlh, -		   const struct nlattr * const nfqa[]) -{ -	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); -	u_int16_t queue_num = ntohs(nfmsg->res_id); - -	struct nfqnl_msg_verdict_hdr *vhdr; -	struct nfqnl_instance *queue; -	unsigned int verdict; -	struct nf_queue_entry *entry; -	int err; - -	rcu_read_lock(); -	queue = instance_lookup(queue_num); -	if (!queue) { -		err = -ENODEV; -		goto err_out_unlock; -	} - -	if (queue->peer_pid != NETLINK_CB(skb).pid) { -		err = -EPERM; 
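nfqnl_set_mode() above clamps copy_range to 0xffff because struct nlattr carries a 16-bit nla_len. The rewritten queue core later in this diff tightens the cap to 0xffff - NLA_HDRLEN, since nla_len also counts the attribute header itself. A sketch of the arithmetic, with demo_clamp_copy_range() as a hypothetical helper:

	#include <linux/kernel.h>
	#include <net/netlink.h>

	/* nla_len covers NLA_HDRLEN (4) bytes of header plus the payload,
	 * so the largest payload one attribute can describe is
	 * 65535 - 4 = 65531 bytes. */
	#define DEMO_MAX_COPY_RANGE	(0xffff - NLA_HDRLEN)

	static unsigned int demo_clamp_copy_range(unsigned int requested)
	{
		return min_t(unsigned int, requested, DEMO_MAX_COPY_RANGE);
	}

Packets longer than that are sent truncated; as the comment in the new file notes, userspace can detect truncation from the NFQA_CAP_LEN attribute.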
-		goto err_out_unlock; -	} - -	if (!nfqa[NFQA_VERDICT_HDR]) { -		err = -EINVAL; -		goto err_out_unlock; -	} - -	vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); -	verdict = ntohl(vhdr->verdict); - -	if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { -		err = -EINVAL; -		goto err_out_unlock; -	} - -	entry = find_dequeue_entry(queue, ntohl(vhdr->id)); -	if (entry == NULL) { -		err = -ENOENT; -		goto err_out_unlock; -	} -	rcu_read_unlock(); - -	if (nfqa[NFQA_PAYLOAD]) { -		if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), -				 nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0) -			verdict = NF_DROP; -	} - -	if (nfqa[NFQA_MARK]) -		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - -	nf_reinject(entry, verdict); -	return 0; - -err_out_unlock: -	rcu_read_unlock(); -	return err; -} - -static int -nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, -		  const struct nlmsghdr *nlh, -		  const struct nlattr * const nfqa[]) -{ -	return -ENOTSUPP; -} - -static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { -	[NFQA_CFG_CMD]		= { .len = sizeof(struct nfqnl_msg_config_cmd) }, -	[NFQA_CFG_PARAMS]	= { .len = sizeof(struct nfqnl_msg_config_params) }, -}; - -static const struct nf_queue_handler nfqh = { -	.name 	= "nf_queue", -	.outfn	= &nfqnl_enqueue_packet, -}; - -static int -nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, -		  const struct nlmsghdr *nlh, -		  const struct nlattr * const nfqa[]) -{ -	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); -	u_int16_t queue_num = ntohs(nfmsg->res_id); -	struct nfqnl_instance *queue; -	struct nfqnl_msg_config_cmd *cmd = NULL; -	int ret = 0; - -	if (nfqa[NFQA_CFG_CMD]) { -		cmd = nla_data(nfqa[NFQA_CFG_CMD]); - -		/* Commands without queue context - might sleep */ -		switch (cmd->command) { -		case NFQNL_CFG_CMD_PF_BIND: -			return nf_register_queue_handler(ntohs(cmd->pf), -							 &nfqh); -		case NFQNL_CFG_CMD_PF_UNBIND: -			return nf_unregister_queue_handler(ntohs(cmd->pf), -							   &nfqh); -		} -	} - -	rcu_read_lock(); -	queue = instance_lookup(queue_num); -	if (queue && queue->peer_pid != NETLINK_CB(skb).pid) { -		ret = -EPERM; -		goto err_out_unlock; -	} - -	if (cmd != NULL) { -		switch (cmd->command) { -		case NFQNL_CFG_CMD_BIND: -			if (queue) { -				ret = -EBUSY; -				goto err_out_unlock; -			} -			queue = instance_create(queue_num, NETLINK_CB(skb).pid); -			if (IS_ERR(queue)) { -				ret = PTR_ERR(queue); -				goto err_out_unlock; -			} -			break; -		case NFQNL_CFG_CMD_UNBIND: -			if (!queue) { -				ret = -ENODEV; -				goto err_out_unlock; -			} -			instance_destroy(queue); -			break; -		case NFQNL_CFG_CMD_PF_BIND: -		case NFQNL_CFG_CMD_PF_UNBIND: -			break; -		default: -			ret = -ENOTSUPP; -			break; -		} -	} - -	if (nfqa[NFQA_CFG_PARAMS]) { -		struct nfqnl_msg_config_params *params; - -		if (!queue) { -			ret = -ENODEV; -			goto err_out_unlock; -		} -		params = nla_data(nfqa[NFQA_CFG_PARAMS]); -		nfqnl_set_mode(queue, params->copy_mode, -				ntohl(params->copy_range)); -	} - -	if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { -		__be32 *queue_maxlen; - -		if (!queue) { -			ret = -ENODEV; -			goto err_out_unlock; -		} -		queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); -		spin_lock_bh(&queue->lock); -		queue->queue_maxlen = ntohl(*queue_maxlen); -		spin_unlock_bh(&queue->lock); -	} - -err_out_unlock: -	rcu_read_unlock(); -	return ret; -} - -static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { -	[NFQNL_MSG_PACKET]	= { .call = nfqnl_recv_unsupp, -				    .attr_count = NFQA_MAX, }, -	[NFQNL_MSG_VERDICT]	= { .call = nfqnl_recv_verdict, -				    
.attr_count = NFQA_MAX, -				    .policy = nfqa_verdict_policy }, -	[NFQNL_MSG_CONFIG]	= { .call = nfqnl_recv_config, -				    .attr_count = NFQA_CFG_MAX, -				    .policy = nfqa_cfg_policy }, -}; - -static const struct nfnetlink_subsystem nfqnl_subsys = { -	.name		= "nf_queue", -	.subsys_id	= NFNL_SUBSYS_QUEUE, -	.cb_count	= NFQNL_MSG_MAX, -	.cb		= nfqnl_cb, -}; - -#ifdef CONFIG_PROC_FS -struct iter_state { -	unsigned int bucket; -}; - -static struct hlist_node *get_first(struct seq_file *seq) -{ -	struct iter_state *st = seq->private; - -	if (!st) -		return NULL; - -	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { -		if (!hlist_empty(&instance_table[st->bucket])) -			return instance_table[st->bucket].first; -	} -	return NULL; -} - -static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) -{ -	struct iter_state *st = seq->private; - -	h = h->next; -	while (!h) { -		if (++st->bucket >= INSTANCE_BUCKETS) -			return NULL; - -		h = instance_table[st->bucket].first; -	} -	return h; -} - -static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) -{ -	struct hlist_node *head; -	head = get_first(seq); - -	if (head) -		while (pos && (head = get_next(seq, head))) -			pos--; -	return pos ? NULL : head; -} - -static void *seq_start(struct seq_file *seq, loff_t *pos) -	__acquires(instances_lock) -{ -	spin_lock(&instances_lock); -	return get_idx(seq, *pos); -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ -	(*pos)++; -	return get_next(s, v); -} - -static void seq_stop(struct seq_file *s, void *v) -	__releases(instances_lock) -{ -	spin_unlock(&instances_lock); -} - -static int seq_show(struct seq_file *s, void *v) -{ -	const struct nfqnl_instance *inst = v; - -	return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", -			  inst->queue_num, -			  inst->peer_pid, inst->queue_total, -			  inst->copy_mode, inst->copy_range, -			  inst->queue_dropped, inst->queue_user_dropped, -			  atomic_read(&inst->id_sequence), 1); -} - -static const struct seq_operations nfqnl_seq_ops = { -	.start	= seq_start, -	.next	= seq_next, -	.stop	= seq_stop, -	.show	= seq_show, -}; - -static int nfqnl_open(struct inode *inode, struct file *file) -{ -	return seq_open_private(file, &nfqnl_seq_ops, -			sizeof(struct iter_state)); -} - -static const struct file_operations nfqnl_file_ops = { -	.owner	 = THIS_MODULE, -	.open	 = nfqnl_open, -	.read	 = seq_read, -	.llseek	 = seq_lseek, -	.release = seq_release_private, -}; - -#endif /* PROC_FS */ - -static int __init nfnetlink_queue_init(void) -{ -	int i, status = -ENOMEM; - -	for (i = 0; i < INSTANCE_BUCKETS; i++) -		INIT_HLIST_HEAD(&instance_table[i]); - -	netlink_register_notifier(&nfqnl_rtnl_notifier); -	status = nfnetlink_subsys_register(&nfqnl_subsys); -	if (status < 0) { -		printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); -		goto cleanup_netlink_notifier; -	} - -#ifdef CONFIG_PROC_FS -	if (!proc_create("nfnetlink_queue", 0440, -			 proc_net_netfilter, &nfqnl_file_ops)) -		goto cleanup_subsys; -#endif - -	register_netdevice_notifier(&nfqnl_dev_notifier); -	return status; - -#ifdef CONFIG_PROC_FS -cleanup_subsys: -	nfnetlink_subsys_unregister(&nfqnl_subsys); -#endif -cleanup_netlink_notifier: -	netlink_unregister_notifier(&nfqnl_rtnl_notifier); -	return status; -} - -static void __exit nfnetlink_queue_fini(void) -{ -	nf_unregister_queue_handlers(&nfqh); -	unregister_netdevice_notifier(&nfqnl_dev_notifier); -#ifdef CONFIG_PROC_FS -	remove_proc_entry("nfnetlink_queue", proc_net_netfilter); 
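The fini path just below ends with rcu_barrier(), and that is not optional tidiness: __instance_destroy() frees instances via call_rcu(), and the callback, instance_destroy_rcu(), lives in this module's text, so the module must not be unloaded while callbacks are still queued. A minimal sketch of the shutdown ordering, with demo_unregister_all() standing in for the real unregistration calls:

	#include <linux/module.h>
	#include <linux/rcupdate.h>

	static void demo_unregister_all(void)
	{
		/* hypothetical: unregister notifiers, subsystems and proc
		 * entries - everything that could still queue call_rcu()
		 * callbacks on our objects */
	}

	static void __exit demo_exit(void)
	{
		/* 1. make sure nothing can queue further RCU callbacks */
		demo_unregister_all();
		/* 2. wait for every already-queued callback to finish, so
		 *    the functions they run (which live in this module's
		 *    text) stay mapped until the last one returns */
		rcu_barrier();
	}
	module_exit(demo_exit);

Note the ordering: synchronize_rcu() alone would only wait for readers, not for pending call_rcu() callbacks, which is exactly what rcu_barrier() exists for.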
-#endif -	nfnetlink_subsys_unregister(&nfqnl_subsys); -	netlink_unregister_notifier(&nfqnl_rtnl_notifier); - -	rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} - -MODULE_DESCRIPTION("netfilter packet queue handler"); -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); - -module_init(nfnetlink_queue_init); -module_exit(nfnetlink_queue_fini); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c new file mode 100644 index 00000000000..108120f216b --- /dev/null +++ b/net/netfilter/nfnetlink_queue_core.c @@ -0,0 +1,1352 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfnetlink. + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2007 by Patrick McHardy <kaber@trash.net> + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris <jmorris@intercode.com.au> + * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/proc_fs.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <linux/list.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <net/netfilter/nf_queue.h> +#include <net/netns/generic.h> +#include <net/netfilter/nfnetlink_queue.h> + +#include <linux/atomic.h> + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +/* We're using struct nlattr which has 16bit nla_len. Note that nla_len + * includes the header length. Thus, the maximum packet length that we + * support is 65531 bytes. We send truncated packets if the specified length + * is larger than that.  Userspace can check for presence of NFQA_CAP_LEN + * attribute to detect truncation. + */ +#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) + +struct nfqnl_instance { +	struct hlist_node hlist;		/* global list of queues */ +	struct rcu_head rcu; + +	int peer_portid; +	unsigned int queue_maxlen; +	unsigned int copy_range; +	unsigned int queue_dropped; +	unsigned int queue_user_dropped; + + +	u_int16_t queue_num;			/* number of this queue */ +	u_int8_t copy_mode; +	u_int32_t flags;			/* Set using NFQA_CFG_FLAGS */ +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. 
+ */ +	spinlock_t	lock; +	unsigned int	queue_total; +	unsigned int	id_sequence;		/* 'sequence' of pkt ids */ +	struct list_head queue_list;		/* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); + +static int nfnl_queue_net_id __read_mostly; + +#define INSTANCE_BUCKETS	16 +struct nfnl_queue_net { +	spinlock_t instances_lock; +	struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ +	return net_generic(net, nfnl_queue_net_id); +} + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ +	return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) +{ +	struct hlist_head *head; +	struct nfqnl_instance *inst; + +	head = &q->instance_table[instance_hashfn(queue_num)]; +	hlist_for_each_entry_rcu(inst, head, hlist) { +		if (inst->queue_num == queue_num) +			return inst; +	} +	return NULL; +} + +static struct nfqnl_instance * +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, +		int portid) +{ +	struct nfqnl_instance *inst; +	unsigned int h; +	int err; + +	spin_lock(&q->instances_lock); +	if (instance_lookup(q, queue_num)) { +		err = -EEXIST; +		goto out_unlock; +	} + +	inst = kzalloc(sizeof(*inst), GFP_ATOMIC); +	if (!inst) { +		err = -ENOMEM; +		goto out_unlock; +	} + +	inst->queue_num = queue_num; +	inst->peer_portid = portid; +	inst->queue_maxlen = NFQNL_QMAX_DEFAULT; +	inst->copy_range = NFQNL_MAX_COPY_RANGE; +	inst->copy_mode = NFQNL_COPY_NONE; +	spin_lock_init(&inst->lock); +	INIT_LIST_HEAD(&inst->queue_list); + +	if (!try_module_get(THIS_MODULE)) { +		err = -EAGAIN; +		goto out_free; +	} + +	h = instance_hashfn(queue_num); +	hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); + +	spin_unlock(&q->instances_lock); + +	return inst; + +out_free: +	kfree(inst); +out_unlock: +	spin_unlock(&q->instances_lock); +	return ERR_PTR(err); +} + +static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, +			unsigned long data); + +static void +instance_destroy_rcu(struct rcu_head *head) +{ +	struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, +						   rcu); + +	nfqnl_flush(inst, NULL, 0); +	kfree(inst); +	module_put(THIS_MODULE); +} + +static void +__instance_destroy(struct nfqnl_instance *inst) +{ +	hlist_del_rcu(&inst->hlist); +	call_rcu(&inst->rcu, instance_destroy_rcu); +} + +static void +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) +{ +	spin_lock(&q->instances_lock); +	__instance_destroy(inst); +	spin_unlock(&q->instances_lock); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ +       list_add_tail(&entry->list, &queue->queue_list); +       queue->queue_total++; +} + +static void +__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ +	list_del(&entry->list); +	queue->queue_total--; +} + +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +{ +	struct nf_queue_entry *entry = NULL, *i; + +	spin_lock_bh(&queue->lock); + +	list_for_each_entry(i, &queue->queue_list, list) { +		if (i->id == id) { +			entry = i; +			break; +		} +	} + +	if (entry) +		__dequeue_entry(queue, entry); + +	spin_unlock_bh(&queue->lock); + +	return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) +{ +	struct nf_queue_entry *entry, *next; + +	
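One small behavioural fix hidden in this rewrite: instance_hashfn() above now folds the two bytes of the queue number with XOR, where the deleted file used OR. OR can only set bits, so for random queue numbers it biases the bucket index toward values with many low bits set (each result bit of the OR is 1 with probability 3/4); XOR lets each byte perturb the other evenly. Side by side, as hypothetical helpers:

	#include <linux/types.h>

	#define DEMO_INSTANCE_BUCKETS	16

	/* old nfnetlink_queue.c: clusters queues into buckets whose
	 * indices have many bits set */
	static inline u_int8_t demo_hashfn_or(u_int16_t queue_num)
	{
		return ((queue_num >> 8) | queue_num) % DEMO_INSTANCE_BUCKETS;
	}

	/* new nfnetlink_queue_core.c: roughly uniform over all buckets */
	static inline u_int8_t demo_hashfn_xor(u_int16_t queue_num)
	{
		return ((queue_num >> 8) ^ queue_num) % DEMO_INSTANCE_BUCKETS;
	}

With only 16 buckets the skew is mild in practice, but the XOR form costs nothing and keeps chain lengths balanced when many queues are bound.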
spin_lock_bh(&queue->lock); +	list_for_each_entry_safe(entry, next, &queue->queue_list, list) { +		if (!cmpfn || cmpfn(entry, data)) { +			list_del(&entry->list); +			queue->queue_total--; +			nf_reinject(entry, NF_DROP); +		} +	} +	spin_unlock_bh(&queue->lock); +} + +static int +nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, +		      bool csum_verify) +{ +	__u32 flags = 0; + +	if (packet->ip_summed == CHECKSUM_PARTIAL) +		flags = NFQA_SKB_CSUMNOTREADY; +	else if (csum_verify) +		flags = NFQA_SKB_CSUM_NOTVERIFIED; + +	if (skb_is_gso(packet)) +		flags |= NFQA_SKB_GSO; + +	return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ +	const struct cred *cred; + +	if (sk->sk_state == TCP_TIME_WAIT) +		return 0; + +	read_lock_bh(&sk->sk_callback_lock); +	if (sk->sk_socket && sk->sk_socket->file) { +		cred = sk->sk_socket->file->f_cred; +		if (nla_put_be32(skb, NFQA_UID, +		    htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFQA_GID, +		    htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) +			goto nla_put_failure; +	} +	read_unlock_bh(&sk->sk_callback_lock); +	return 0; + +nla_put_failure: +	read_unlock_bh(&sk->sk_callback_lock); +	return -1; +} + +static struct sk_buff * +nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, +			   struct nf_queue_entry *entry, +			   __be32 **packet_id_ptr) +{ +	size_t size; +	size_t data_len = 0, cap_len = 0; +	unsigned int hlen = 0; +	struct sk_buff *skb; +	struct nlattr *nla; +	struct nfqnl_msg_packet_hdr *pmsg; +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	struct sk_buff *entskb = entry->skb; +	struct net_device *indev; +	struct net_device *outdev; +	struct nf_conn *ct = NULL; +	enum ip_conntrack_info uninitialized_var(ctinfo); +	bool csum_verify; + +	size =    nlmsg_total_size(sizeof(struct nfgenmsg)) +		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) +		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ +		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER +		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ +		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */ +#endif +		+ nla_total_size(sizeof(u_int32_t))	/* mark */ +		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) +		+ nla_total_size(sizeof(u_int32_t))	/* skbinfo */ +		+ nla_total_size(sizeof(u_int32_t));	/* cap_len */ + +	if (entskb->tstamp.tv64) +		size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + +	if (entry->hook <= NF_INET_FORWARD || +	   (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) +		csum_verify = !skb_csum_unnecessary(entskb); +	else +		csum_verify = false; + +	outdev = entry->outdev; + +	switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { +	case NFQNL_COPY_META: +	case NFQNL_COPY_NONE: +		break; + +	case NFQNL_COPY_PACKET: +		if (!(queue->flags & NFQA_CFG_F_GSO) && +		    entskb->ip_summed == CHECKSUM_PARTIAL && +		    skb_checksum_help(entskb)) +			return NULL; + +		data_len = ACCESS_ONCE(queue->copy_range); +		if (data_len > entskb->len) +			data_len = entskb->len; + +		hlen = skb_zerocopy_headlen(entskb); +		hlen = min_t(unsigned int, hlen, data_len); +		size += sizeof(struct nlattr) + hlen; +		cap_len = entskb->len; +		break; +	} + +	if (queue->flags & NFQA_CFG_F_CONNTRACK) +		ct = nfqnl_ct_get(entskb, &size, &ctinfo); + +	if (queue->flags & NFQA_CFG_F_UID_GID) { +		size +=  
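+		/* reserved up front: the message buffer is sized before we
+		 * know whether the socket still has credentials attached
+		 * (see nfqnl_put_sk_uidgid()) */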
(nla_total_size(sizeof(u_int32_t))	/* uid */ +			+ nla_total_size(sizeof(u_int32_t)));	/* gid */ +	} + +	skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, +				  GFP_ATOMIC); +	if (!skb) { +		skb_tx_error(entskb); +		return NULL; +	} + +	nlh = nlmsg_put(skb, 0, 0, +			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, +			sizeof(struct nfgenmsg), 0); +	if (!nlh) { +		skb_tx_error(entskb); +		kfree_skb(skb); +		return NULL; +	} +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = entry->pf; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = htons(queue->queue_num); + +	nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); +	pmsg = nla_data(nla); +	pmsg->hw_protocol	= entskb->protocol; +	pmsg->hook		= entry->hook; +	*packet_id_ptr		= &pmsg->packet_id; + +	indev = entry->indev; +	if (indev) { +#ifndef CONFIG_BRIDGE_NETFILTER +		if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) +			goto nla_put_failure; +#else +		if (entry->pf == PF_BRIDGE) { +			/* Case 1: indev is physical input device, we need to +			 * look for bridge group (when called from +			 * netfilter_bridge) */ +			if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, +					 htonl(indev->ifindex)) || +			/* this is the bridge group "brX" */ +			/* rcu_read_lock()ed by __nf_queue */ +			    nla_put_be32(skb, NFQA_IFINDEX_INDEV, +					 htonl(br_port_get_rcu(indev)->br->dev->ifindex))) +				goto nla_put_failure; +		} else { +			/* Case 2: indev is bridge group, we need to look for +			 * physical device (when called from ipv4) */ +			if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, +					 htonl(indev->ifindex))) +				goto nla_put_failure; +			if (entskb->nf_bridge && entskb->nf_bridge->physindev && +			    nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, +					 htonl(entskb->nf_bridge->physindev->ifindex))) +				goto nla_put_failure; +		} +#endif +	} + +	if (outdev) { +#ifndef CONFIG_BRIDGE_NETFILTER +		if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) +			goto nla_put_failure; +#else +		if (entry->pf == PF_BRIDGE) { +			/* Case 1: outdev is physical output device, we need to +			 * look for bridge group (when called from +			 * netfilter_bridge) */ +			if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, +					 htonl(outdev->ifindex)) || +			/* this is the bridge group "brX" */ +			/* rcu_read_lock()ed by __nf_queue */ +			    nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, +					 htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) +				goto nla_put_failure; +		} else { +			/* Case 2: outdev is bridge group, we need to look for +			 * physical output device (when called from ipv4) */ +			if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, +					 htonl(outdev->ifindex))) +				goto nla_put_failure; +			if (entskb->nf_bridge && entskb->nf_bridge->physoutdev && +			    nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, +					 htonl(entskb->nf_bridge->physoutdev->ifindex))) +				goto nla_put_failure; +		} +#endif +	} + +	if (entskb->mark && +	    nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) +		goto nla_put_failure; + +	if (indev && entskb->dev && +	    entskb->mac_header != entskb->network_header) { +		struct nfqnl_msg_packet_hw phw; +		int len; + +		memset(&phw, 0, sizeof(phw)); +		len = dev_parse_header(entskb, phw.hw_addr); +		if (len) { +			phw.hw_addrlen = htons(len); +			if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) +				goto nla_put_failure; +		} +	} + +	if (entskb->tstamp.tv64) { +		struct nfqnl_msg_packet_timestamp ts; +		struct timeval tv = ktime_to_timeval(entskb->tstamp); +		ts.sec = cpu_to_be64(tv.tv_sec); +		ts.usec = 
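+		/* both timestamp fields go out as big-endian 64-bit values */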
cpu_to_be64(tv.tv_usec); + +		if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) +			goto nla_put_failure; +	} + +	if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && +	    nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) +		goto nla_put_failure; + +	if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) +		goto nla_put_failure; + +	if (cap_len > data_len && +	    nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) +		goto nla_put_failure; + +	if (nfqnl_put_packet_info(skb, entskb, csum_verify)) +		goto nla_put_failure; + +	if (data_len) { +		struct nlattr *nla; + +		if (skb_tailroom(skb) < sizeof(*nla) + hlen) +			goto nla_put_failure; + +		nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); +		nla->nla_type = NFQA_PAYLOAD; +		nla->nla_len = nla_attr_size(data_len); + +		if (skb_zerocopy(skb, entskb, data_len, hlen)) +			goto nla_put_failure; +	} + +	nlh->nlmsg_len = skb->len; +	return skb; + +nla_put_failure: +	skb_tx_error(entskb); +	kfree_skb(skb); +	net_err_ratelimited("nf_queue: error creating packet message\n"); +	return NULL; +} + +static int +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, +			struct nf_queue_entry *entry) +{ +	struct sk_buff *nskb; +	int err = -ENOBUFS; +	__be32 *packet_id_ptr; +	int failopen = 0; + +	nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); +	if (nskb == NULL) { +		err = -ENOMEM; +		goto err_out; +	} +	spin_lock_bh(&queue->lock); + +	if (queue->queue_total >= queue->queue_maxlen) { +		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { +			failopen = 1; +			err = 0; +		} else { +			queue->queue_dropped++; +			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", +					     queue->queue_total); +		} +		goto err_out_free_nskb; +	} +	entry->id = ++queue->id_sequence; +	*packet_id_ptr = htonl(entry->id); + +	/* nfnetlink_unicast will either free the nskb or add it to a socket */ +	err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); +	if (err < 0) { +		queue->queue_user_dropped++; +		goto err_out_unlock; +	} + +	__enqueue_entry(queue, entry); + +	spin_unlock_bh(&queue->lock); +	return 0; + +err_out_free_nskb: +	kfree_skb(nskb); +err_out_unlock: +	spin_unlock_bh(&queue->lock); +	if (failopen) +		nf_reinject(entry, NF_ACCEPT); +err_out: +	return err; +} + +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ +	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); +	if (entry) { +		if (nf_queue_entry_get_refs(entry)) +			return entry; +		kfree(entry); +	} +	return NULL; +} + +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. 
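+ * nf_bridge_adjust_skb_data() below pushes skb->data back to the MAC
+ * header before segmenting; nf_bridge_adjust_segmented_data() undoes
+ * the adjustment on each resulting segment.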
+ */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ +	if (skb->nf_bridge) +		__skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ +	if (skb->nf_bridge) +		__skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ +	nf_queue_entry_release_refs(entry); +	kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, +			   struct sk_buff *skb, struct nf_queue_entry *entry) +{ +	int ret = -ENOMEM; +	struct nf_queue_entry *entry_seg; + +	nf_bridge_adjust_segmented_data(skb); + +	if (skb->next == NULL) { /* last packet, no need to copy entry */ +		struct sk_buff *gso_skb = entry->skb; +		entry->skb = skb; +		ret = __nfqnl_enqueue_packet(net, queue, entry); +		if (ret) +			entry->skb = gso_skb; +		return ret; +	} + +	skb->next = NULL; + +	entry_seg = nf_queue_entry_dup(entry); +	if (entry_seg) { +		entry_seg->skb = skb; +		ret = __nfqnl_enqueue_packet(net, queue, entry_seg); +		if (ret) +			free_entry(entry_seg); +	} +	return ret; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ +	unsigned int queued; +	struct nfqnl_instance *queue; +	struct sk_buff *skb, *segs; +	int err = -ENOBUFS; +	struct net *net = dev_net(entry->indev ? +				  entry->indev : entry->outdev); +	struct nfnl_queue_net *q = nfnl_queue_pernet(net); + +	/* rcu_read_lock()ed by nf_hook_slow() */ +	queue = instance_lookup(q, queuenum); +	if (!queue) +		return -ESRCH; + +	if (queue->copy_mode == NFQNL_COPY_NONE) +		return -EINVAL; + +	skb = entry->skb; + +	switch (entry->pf) { +	case NFPROTO_IPV4: +		skb->protocol = htons(ETH_P_IP); +		break; +	case NFPROTO_IPV6: +		skb->protocol = htons(ETH_P_IPV6); +		break; +	} + +	if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) +		return __nfqnl_enqueue_packet(net, queue, entry); + +	nf_bridge_adjust_skb_data(skb); +	segs = skb_gso_segment(skb, 0); +	/* Does not use PTR_ERR to limit the number of error codes that can be +	 * returned by nf_queue.  For instance, callers rely on -ECANCELED to +	 * mean 'ignore this hook'. 
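+	 * A segmentation failure is therefore reported as the generic
+	 * -ENOBUFS that 'err' was initialised with.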
+	 */ +	if (IS_ERR(segs)) +		goto out_err; +	queued = 0; +	err = 0; +	do { +		struct sk_buff *nskb = segs->next; +		if (err == 0) +			err = __nfqnl_enqueue_packet_gso(net, queue, +							segs, entry); +		if (err == 0) +			queued++; +		else +			kfree_skb(segs); +		segs = nskb; +	} while (segs); + +	if (queued) { +		if (err) /* some segments are already queued */ +			free_entry(entry); +		kfree_skb(skb); +		return 0; +	} + out_err: +	nf_bridge_adjust_segmented_data(skb); +	return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +{ +	struct sk_buff *nskb; + +	if (diff < 0) { +		if (pskb_trim(e->skb, data_len)) +			return -ENOMEM; +	} else if (diff > 0) { +		if (data_len > 0xFFFF) +			return -EINVAL; +		if (diff > skb_tailroom(e->skb)) { +			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), +					       diff, GFP_ATOMIC); +			if (!nskb) { +				printk(KERN_WARNING "nf_queue: OOM " +				      "in mangle, dropping packet\n"); +				return -ENOMEM; +			} +			kfree_skb(e->skb); +			e->skb = nskb; +		} +		skb_put(e->skb, diff); +	} +	if (!skb_make_writable(e->skb, data_len)) +		return -ENOMEM; +	skb_copy_to_linear_data(e->skb, data, data_len); +	e->skb->ip_summed = CHECKSUM_NONE; +	return 0; +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, +	       unsigned char mode, unsigned int range) +{ +	int status = 0; + +	spin_lock_bh(&queue->lock); +	switch (mode) { +	case NFQNL_COPY_NONE: +	case NFQNL_COPY_META: +		queue->copy_mode = mode; +		queue->copy_range = 0; +		break; + +	case NFQNL_COPY_PACKET: +		queue->copy_mode = mode; +		if (range == 0 || range > NFQNL_MAX_COPY_RANGE) +			queue->copy_range = NFQNL_MAX_COPY_RANGE; +		else +			queue->copy_range = range; +		break; + +	default: +		status = -EINVAL; + +	} +	spin_unlock_bh(&queue->lock); + +	return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ +	if (entry->indev) +		if (entry->indev->ifindex == ifindex) +			return 1; +	if (entry->outdev) +		if (entry->outdev->ifindex == ifindex) +			return 1; +#ifdef CONFIG_BRIDGE_NETFILTER +	if (entry->skb->nf_bridge) { +		if (entry->skb->nf_bridge->physindev && +		    entry->skb->nf_bridge->physindev->ifindex == ifindex) +			return 1; +		if (entry->skb->nf_bridge->physoutdev && +		    entry->skb->nf_bridge->physoutdev->ifindex == ifindex) +			return 1; +	} +#endif +	return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(struct net *net, int ifindex) +{ +	int i; +	struct nfnl_queue_net *q = nfnl_queue_pernet(net); + +	rcu_read_lock(); + +	for (i = 0; i < INSTANCE_BUCKETS; i++) { +		struct nfqnl_instance *inst; +		struct hlist_head *head = &q->instance_table[i]; + +		hlist_for_each_entry_rcu(inst, head, hlist) +			nfqnl_flush(inst, dev_cmp, ifindex); +	} + +	rcu_read_unlock(); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, +		    unsigned long event, void *ptr) +{ +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); + +	/* Drop any packets associated with the downed device */ +	if (event == NETDEV_DOWN) +		nfqnl_dev_drop(dev_net(dev), dev->ifindex); +	return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { +	.notifier_call	= nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, +		   unsigned long event, void *ptr) +{ +	struct netlink_notify *n = ptr; +	struct 
nfnl_queue_net *q = nfnl_queue_pernet(n->net); + +	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { +		int i; + +		/* destroy all instances for this portid */ +		spin_lock(&q->instances_lock); +		for (i = 0; i < INSTANCE_BUCKETS; i++) { +			struct hlist_node *t2; +			struct nfqnl_instance *inst; +			struct hlist_head *head = &q->instance_table[i]; + +			hlist_for_each_entry_safe(inst, t2, head, hlist) { +				if (n->portid == inst->peer_portid) +					__instance_destroy(inst); +			} +		} +		spin_unlock(&q->instances_lock); +	} +	return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { +	.notifier_call	= nfqnl_rcv_nl_event, +}; + +static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { +	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, +	[NFQA_MARK]		= { .type = NLA_U32 }, +	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC }, +	[NFQA_CT]		= { .type = NLA_UNSPEC }, +	[NFQA_EXP]		= { .type = NLA_UNSPEC }, +}; + +static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { +	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, +	[NFQA_MARK]		= { .type = NLA_U32 }, +}; + +static struct nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) +{ +	struct nfqnl_instance *queue; + +	queue = instance_lookup(q, queue_num); +	if (!queue) +		return ERR_PTR(-ENODEV); + +	if (queue->peer_portid != nlportid) +		return ERR_PTR(-EPERM); + +	return queue; +} + +static struct nfqnl_msg_verdict_hdr* +verdicthdr_get(const struct nlattr * const nfqa[]) +{ +	struct nfqnl_msg_verdict_hdr *vhdr; +	unsigned int verdict; + +	if (!nfqa[NFQA_VERDICT_HDR]) +		return NULL; + +	vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); +	verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; +	if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) +		return NULL; +	return vhdr; +} + +static int nfq_id_after(unsigned int id, unsigned int max) +{ +	return (int)(id - max) > 0; +} + +static int +nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, +		   const struct nlmsghdr *nlh, +		   const struct nlattr * const nfqa[]) +{ +	struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	struct nf_queue_entry *entry, *tmp; +	unsigned int verdict, maxid; +	struct nfqnl_msg_verdict_hdr *vhdr; +	struct nfqnl_instance *queue; +	LIST_HEAD(batch_list); +	u16 queue_num = ntohs(nfmsg->res_id); + +	struct net *net = sock_net(ctnl); +	struct nfnl_queue_net *q = nfnl_queue_pernet(net); + +	queue = verdict_instance_lookup(q, queue_num, +					NETLINK_CB(skb).portid); +	if (IS_ERR(queue)) +		return PTR_ERR(queue); + +	vhdr = verdicthdr_get(nfqa); +	if (!vhdr) +		return -EINVAL; + +	verdict = ntohl(vhdr->verdict); +	maxid = ntohl(vhdr->id); + +	spin_lock_bh(&queue->lock); + +	list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { +		if (nfq_id_after(entry->id, maxid)) +			break; +		__dequeue_entry(queue, entry); +		list_add_tail(&entry->list, &batch_list); +	} + +	spin_unlock_bh(&queue->lock); + +	if (list_empty(&batch_list)) +		return -ENOENT; + +	list_for_each_entry_safe(entry, tmp, &batch_list, list) { +		if (nfqa[NFQA_MARK]) +			entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); +		nf_reinject(entry, verdict); +	} +	return 0; +} + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, +		   const struct nlmsghdr *nlh, +		   const struct nlattr * const nfqa[]) +{ +	struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	u_int16_t queue_num = ntohs(nfmsg->res_id); + +	struct nfqnl_msg_verdict_hdr *vhdr; +	struct 
nfqnl_instance *queue; +	unsigned int verdict; +	struct nf_queue_entry *entry; +	enum ip_conntrack_info uninitialized_var(ctinfo); +	struct nf_conn *ct = NULL; + +	struct net *net = sock_net(ctnl); +	struct nfnl_queue_net *q = nfnl_queue_pernet(net); + +	queue = instance_lookup(q, queue_num); +	if (!queue) +		queue = verdict_instance_lookup(q, queue_num, +						NETLINK_CB(skb).portid); +	if (IS_ERR(queue)) +		return PTR_ERR(queue); + +	vhdr = verdicthdr_get(nfqa); +	if (!vhdr) +		return -EINVAL; + +	verdict = ntohl(vhdr->verdict); + +	entry = find_dequeue_entry(queue, ntohl(vhdr->id)); +	if (entry == NULL) +		return -ENOENT; + +	if (nfqa[NFQA_CT]) { +		ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); +		if (ct && nfqa[NFQA_EXP]) { +			nfqnl_attach_expect(ct, nfqa[NFQA_EXP], +					    NETLINK_CB(skb).portid, +					    nlmsg_report(nlh)); +		} +	} + +	if (nfqa[NFQA_PAYLOAD]) { +		u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); +		int diff = payload_len - entry->skb->len; + +		if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), +				 payload_len, entry, diff) < 0) +			verdict = NF_DROP; + +		if (ct) +			nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff); +	} + +	if (nfqa[NFQA_MARK]) +		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + +	nf_reinject(entry, verdict); +	return 0; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, +		  const struct nlmsghdr *nlh, +		  const struct nlattr * const nfqa[]) +{ +	return -ENOTSUPP; +} + +static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { +	[NFQA_CFG_CMD]		= { .len = sizeof(struct nfqnl_msg_config_cmd) }, +	[NFQA_CFG_PARAMS]	= { .len = sizeof(struct nfqnl_msg_config_params) }, +}; + +static const struct nf_queue_handler nfqh = { +	.outfn	= &nfqnl_enqueue_packet, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, +		  const struct nlmsghdr *nlh, +		  const struct nlattr * const nfqa[]) +{ +	struct nfgenmsg *nfmsg = nlmsg_data(nlh); +	u_int16_t queue_num = ntohs(nfmsg->res_id); +	struct nfqnl_instance *queue; +	struct nfqnl_msg_config_cmd *cmd = NULL; +	struct net *net = sock_net(ctnl); +	struct nfnl_queue_net *q = nfnl_queue_pernet(net); +	int ret = 0; + +	if (nfqa[NFQA_CFG_CMD]) { +		cmd = nla_data(nfqa[NFQA_CFG_CMD]); + +		/* Obsolete commands without queue context */ +		switch (cmd->command) { +		case NFQNL_CFG_CMD_PF_BIND: return 0; +		case NFQNL_CFG_CMD_PF_UNBIND: return 0; +		} +	} + +	rcu_read_lock(); +	queue = instance_lookup(q, queue_num); +	if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { +		ret = -EPERM; +		goto err_out_unlock; +	} + +	if (cmd != NULL) { +		switch (cmd->command) { +		case NFQNL_CFG_CMD_BIND: +			if (queue) { +				ret = -EBUSY; +				goto err_out_unlock; +			} +			queue = instance_create(q, queue_num, +						NETLINK_CB(skb).portid); +			if (IS_ERR(queue)) { +				ret = PTR_ERR(queue); +				goto err_out_unlock; +			} +			break; +		case NFQNL_CFG_CMD_UNBIND: +			if (!queue) { +				ret = -ENODEV; +				goto err_out_unlock; +			} +			instance_destroy(q, queue); +			break; +		case NFQNL_CFG_CMD_PF_BIND: +		case NFQNL_CFG_CMD_PF_UNBIND: +			break; +		default: +			ret = -ENOTSUPP; +			break; +		} +	} + +	if (nfqa[NFQA_CFG_PARAMS]) { +		struct nfqnl_msg_config_params *params; + +		if (!queue) { +			ret = -ENODEV; +			goto err_out_unlock; +		} +		params = nla_data(nfqa[NFQA_CFG_PARAMS]); +		nfqnl_set_mode(queue, params->copy_mode, +				ntohl(params->copy_range)); +	} + +	if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { +		__be32 *queue_maxlen; + +		if (!queue) { +			
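+			/* like NFQA_CFG_PARAMS above, the queue length is
+			 * per-instance state and needs a bound queue */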
ret = -ENODEV; +			goto err_out_unlock; +		} +		queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); +		spin_lock_bh(&queue->lock); +		queue->queue_maxlen = ntohl(*queue_maxlen); +		spin_unlock_bh(&queue->lock); +	} + +	if (nfqa[NFQA_CFG_FLAGS]) { +		__u32 flags, mask; + +		if (!queue) { +			ret = -ENODEV; +			goto err_out_unlock; +		} + +		if (!nfqa[NFQA_CFG_MASK]) { +			/* A mask is needed to specify which flags are being +			 * changed. +			 */ +			ret = -EINVAL; +			goto err_out_unlock; +		} + +		flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); +		mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); + +		if (flags >= NFQA_CFG_F_MAX) { +			ret = -EOPNOTSUPP; +			goto err_out_unlock; +		} + +		spin_lock_bh(&queue->lock); +		queue->flags &= ~mask; +		queue->flags |= flags & mask; +		spin_unlock_bh(&queue->lock); +	} + +err_out_unlock: +	rcu_read_unlock(); +	return ret; +} + +static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { +	[NFQNL_MSG_PACKET]	= { .call_rcu = nfqnl_recv_unsupp, +				    .attr_count = NFQA_MAX, }, +	[NFQNL_MSG_VERDICT]	= { .call_rcu = nfqnl_recv_verdict, +				    .attr_count = NFQA_MAX, +				    .policy = nfqa_verdict_policy }, +	[NFQNL_MSG_CONFIG]	= { .call = nfqnl_recv_config, +				    .attr_count = NFQA_CFG_MAX, +				    .policy = nfqa_cfg_policy }, +	[NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, +				    .attr_count = NFQA_MAX, +				    .policy = nfqa_verdict_batch_policy }, +}; + +static const struct nfnetlink_subsystem nfqnl_subsys = { +	.name		= "nf_queue", +	.subsys_id	= NFNL_SUBSYS_QUEUE, +	.cb_count	= NFQNL_MSG_MAX, +	.cb		= nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { +	struct seq_net_private p; +	unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ +	struct iter_state *st = seq->private; +	struct net *net; +	struct nfnl_queue_net *q; + +	if (!st) +		return NULL; + +	net = seq_file_net(seq); +	q = nfnl_queue_pernet(net); +	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { +		if (!hlist_empty(&q->instance_table[st->bucket])) +			return q->instance_table[st->bucket].first; +	} +	return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ +	struct iter_state *st = seq->private; +	struct net *net = seq_file_net(seq); + +	h = h->next; +	while (!h) { +		struct nfnl_queue_net *q; + +		if (++st->bucket >= INSTANCE_BUCKETS) +			return NULL; + +		q = nfnl_queue_pernet(net); +		h = q->instance_table[st->bucket].first; +	} +	return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ +	struct hlist_node *head; +	head = get_first(seq); + +	if (head) +		while (pos && (head = get_next(seq, head))) +			pos--; +	return pos ? 
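+		/* a non-zero remainder means *pos is past the last instance */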
NULL : head; +} + +static void *seq_start(struct seq_file *s, loff_t *pos) +	__acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ +	spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); +	return get_idx(s, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ +	(*pos)++; +	return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +	__releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ +	spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ +	const struct nfqnl_instance *inst = v; + +	return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", +			  inst->queue_num, +			  inst->peer_portid, inst->queue_total, +			  inst->copy_mode, inst->copy_range, +			  inst->queue_dropped, inst->queue_user_dropped, +			  inst->id_sequence, 1); +} + +static const struct seq_operations nfqnl_seq_ops = { +	.start	= seq_start, +	.next	= seq_next, +	.stop	= seq_stop, +	.show	= seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &nfqnl_seq_ops, +			sizeof(struct iter_state)); +} + +static const struct file_operations nfqnl_file_ops = { +	.owner	 = THIS_MODULE, +	.open	 = nfqnl_open, +	.read	 = seq_read, +	.llseek	 = seq_lseek, +	.release = seq_release_net, +}; + +#endif /* PROC_FS */ + +static int __net_init nfnl_queue_net_init(struct net *net) +{ +	unsigned int i; +	struct nfnl_queue_net *q = nfnl_queue_pernet(net); + +	for (i = 0; i < INSTANCE_BUCKETS; i++) +		INIT_HLIST_HEAD(&q->instance_table[i]); + +	spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS +	if (!proc_create("nfnetlink_queue", 0440, +			 net->nf.proc_netfilter, &nfqnl_file_ops)) +		return -ENOMEM; +#endif +	return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS +	remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nfnl_queue_net_ops = { +	.init	= nfnl_queue_net_init, +	.exit	= nfnl_queue_net_exit, +	.id	= &nfnl_queue_net_id, +	.size	= sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ +	int status = -ENOMEM; + +	netlink_register_notifier(&nfqnl_rtnl_notifier); +	status = nfnetlink_subsys_register(&nfqnl_subsys); +	if (status < 0) { +		pr_err("nf_queue: failed to create netlink socket\n"); +		goto cleanup_netlink_notifier; +	} + +	status = register_pernet_subsys(&nfnl_queue_net_ops); +	if (status < 0) { +		pr_err("nf_queue: failed to register pernet ops\n"); +		goto cleanup_subsys; +	} +	register_netdevice_notifier(&nfqnl_dev_notifier); +	nf_register_queue_handler(&nfqh); +	return status; + +cleanup_subsys: +	nfnetlink_subsys_unregister(&nfqnl_subsys); +cleanup_netlink_notifier: +	netlink_unregister_notifier(&nfqnl_rtnl_notifier); +	return status; +} + +static void __exit nfnetlink_queue_fini(void) +{ +	nf_unregister_queue_handler(); +	unregister_netdevice_notifier(&nfqnl_dev_notifier); +	unregister_pernet_subsys(&nfnl_queue_net_ops); +	nfnetlink_subsys_unregister(&nfqnl_subsys); +	netlink_unregister_notifier(&nfqnl_rtnl_notifier); + +	rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(nfnetlink_queue_init); +module_exit(nfnetlink_queue_fini); diff --git 
a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c new file mode 100644 index 00000000000..96cac50e0d1 --- /dev/null +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -0,0 +1,113 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nfnetlink_queue.h> + +struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size, +			     enum ip_conntrack_info *ctinfo) +{ +	struct nfq_ct_hook *nfq_ct; +	struct nf_conn *ct; + +	/* rcu_read_lock()ed by __nf_queue already. */ +	nfq_ct = rcu_dereference(nfq_ct_hook); +	if (nfq_ct == NULL) +		return NULL; + +	ct = nf_ct_get(entskb, ctinfo); +	if (ct) { +		if (!nf_ct_is_untracked(ct)) +			*size += nfq_ct->build_size(ct); +		else +			ct = NULL; +	} +	return ct; +} + +struct nf_conn * +nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr, +	       enum ip_conntrack_info *ctinfo) +{ +	struct nfq_ct_hook *nfq_ct; +	struct nf_conn *ct; + +	/* rcu_read_lock()ed by __nf_queue already. */ +	nfq_ct = rcu_dereference(nfq_ct_hook); +	if (nfq_ct == NULL) +		return NULL; + +	ct = nf_ct_get(skb, ctinfo); +	if (ct && !nf_ct_is_untracked(ct)) +		nfq_ct->parse(attr, ct); + +	return ct; +} + +int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, +		 enum ip_conntrack_info ctinfo) +{ +	struct nfq_ct_hook *nfq_ct; +	struct nlattr *nest_parms; +	u_int32_t tmp; + +	nfq_ct = rcu_dereference(nfq_ct_hook); +	if (nfq_ct == NULL) +		return 0; + +	nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED); +	if (!nest_parms) +		goto nla_put_failure; + +	if (nfq_ct->build(skb, ct) < 0) +		goto nla_put_failure; + +	nla_nest_end(skb, nest_parms); + +	tmp = ctinfo; +	if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp))) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, +			 enum ip_conntrack_info ctinfo, int diff) +{ +	struct nfq_ct_hook *nfq_ct; + +	nfq_ct = rcu_dereference(nfq_ct_hook); +	if (nfq_ct == NULL) +		return; + +	if ((ct->status & IPS_NAT_MASK) && diff) +		nfq_ct->seq_adjust(skb, ct, ctinfo, diff); +} + +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, +			u32 portid, u32 report) +{ +	struct nfq_ct_hook *nfq_ct; + +	if (nf_ct_is_untracked(ct)) +		return 0; + +	nfq_ct = rcu_dereference(nfq_ct_hook); +	if (nfq_ct == NULL) +		return -EOPNOTSUPP; + +	return nfq_ct->attach_expect(attr, ct, portid, report); +} diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c new file mode 100644 index 00000000000..4fb6ee2c110 --- /dev/null +++ b/net/netfilter/nft_bitwise.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_bitwise { +	enum nft_registers	sreg:8; +	enum nft_registers	dreg:8; +	u8			len; +	struct nft_data		mask; +	struct nft_data		xor; +}; + +static void nft_bitwise_eval(const struct nft_expr *expr, +			     struct nft_data data[NFT_REG_MAX + 1], +			     const struct nft_pktinfo *pkt) +{ +	const struct nft_bitwise *priv = nft_expr_priv(expr); +	const struct nft_data *src = &data[priv->sreg]; +	struct nft_data *dst = &data[priv->dreg]; +	unsigned int i; + +	for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) { +		dst->data[i] = (src->data[i] & priv->mask.data[i]) ^ +			       priv->xor.data[i]; +	} +} + +static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { +	[NFTA_BITWISE_SREG]	= { .type = NLA_U32 }, +	[NFTA_BITWISE_DREG]	= { .type = NLA_U32 }, +	[NFTA_BITWISE_LEN]	= { .type = NLA_U32 }, +	[NFTA_BITWISE_MASK]	= { .type = NLA_NESTED }, +	[NFTA_BITWISE_XOR]	= { .type = NLA_NESTED }, +}; + +static int nft_bitwise_init(const struct nft_ctx *ctx, +			    const struct nft_expr *expr, +			    const struct nlattr * const tb[]) +{ +	struct nft_bitwise *priv = nft_expr_priv(expr); +	struct nft_data_desc d1, d2; +	int err; + +	if (tb[NFTA_BITWISE_SREG] == NULL || +	    tb[NFTA_BITWISE_DREG] == NULL || +	    tb[NFTA_BITWISE_LEN] == NULL || +	    tb[NFTA_BITWISE_MASK] == NULL || +	    tb[NFTA_BITWISE_XOR] == NULL) +		return -EINVAL; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + +	err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]); +	if (err < 0) +		return err; +	if (d1.len != priv->len) +		return -EINVAL; + +	err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]); +	if (err < 0) +		return err; +	if (d2.len != priv->len) +		return -EINVAL; + +	return 0; +} + +static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_bitwise *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) +		goto nla_put_failure; + +	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, +			  NFT_DATA_VALUE, priv->len) < 0) +		goto nla_put_failure; + +	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, +			  NFT_DATA_VALUE, priv->len) < 0) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_bitwise_type; +static const struct nft_expr_ops nft_bitwise_ops = { +	.type		= &nft_bitwise_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), +	.eval		= nft_bitwise_eval, +	.init		= nft_bitwise_init, +	.dump		= nft_bitwise_dump, +}; + +static struct nft_expr_type nft_bitwise_type __read_mostly = { +	.name		= "bitwise", +	.ops		= 
&nft_bitwise_ops, +	.policy		= nft_bitwise_policy, +	.maxattr	= NFTA_BITWISE_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_bitwise_module_init(void) +{ +	return nft_register_expr(&nft_bitwise_type); +} + +void nft_bitwise_module_exit(void) +{ +	nft_unregister_expr(&nft_bitwise_type); +} diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c new file mode 100644 index 00000000000..c39ed8d29df --- /dev/null +++ b/net/netfilter/nft_byteorder.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_byteorder { +	enum nft_registers	sreg:8; +	enum nft_registers	dreg:8; +	enum nft_byteorder_ops	op:8; +	u8			len; +	u8			size; +}; + +static void nft_byteorder_eval(const struct nft_expr *expr, +			       struct nft_data data[NFT_REG_MAX + 1], +			       const struct nft_pktinfo *pkt) +{ +	const struct nft_byteorder *priv = nft_expr_priv(expr); +	struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg]; +	union { u32 u32; u16 u16; } *s, *d; +	unsigned int i; + +	s = (void *)src->data; +	d = (void *)dst->data; + +	switch (priv->size) { +	case 4: +		switch (priv->op) { +		case NFT_BYTEORDER_NTOH: +			for (i = 0; i < priv->len / 4; i++) +				d[i].u32 = ntohl((__force __be32)s[i].u32); +			break; +		case NFT_BYTEORDER_HTON: +			for (i = 0; i < priv->len / 4; i++) +				d[i].u32 = (__force __u32)htonl(s[i].u32); +			break; +		} +		break; +	case 2: +		switch (priv->op) { +		case NFT_BYTEORDER_NTOH: +			for (i = 0; i < priv->len / 2; i++) +				d[i].u16 = ntohs((__force __be16)s[i].u16); +			break; +		case NFT_BYTEORDER_HTON: +			for (i = 0; i < priv->len / 2; i++) +				d[i].u16 = (__force __u16)htons(s[i].u16); +			break; +		} +		break; +	} +} + +static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { +	[NFTA_BYTEORDER_SREG]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_DREG]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_OP]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_LEN]	= { .type = NLA_U32 }, +	[NFTA_BYTEORDER_SIZE]	= { .type = NLA_U32 }, +}; + +static int nft_byteorder_init(const struct nft_ctx *ctx, +			      const struct nft_expr *expr, +			      const struct nlattr * const tb[]) +{ +	struct nft_byteorder *priv = nft_expr_priv(expr); +	int err; + +	if (tb[NFTA_BYTEORDER_SREG] == NULL || +	    tb[NFTA_BYTEORDER_DREG] == NULL || +	    tb[NFTA_BYTEORDER_LEN] == NULL || +	    tb[NFTA_BYTEORDER_SIZE] == NULL || +	    tb[NFTA_BYTEORDER_OP] == NULL) +		return -EINVAL; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); +	switch (priv->op) { +	case NFT_BYTEORDER_NTOH: +	case NFT_BYTEORDER_HTON: +		break; +	
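+	/* only the two conversions implemented by nft_byteorder_eval()
+	 * are accepted */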
default: +		return -EINVAL; +	} + +	priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); +	if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data)) +		return -EINVAL; + +	priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); +	switch (priv->size) { +	case 2: +	case 4: +		break; +	default: +		return -EINVAL; +	} + +	return 0; +} + +static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_byteorder *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_byteorder_type; +static const struct nft_expr_ops nft_byteorder_ops = { +	.type		= &nft_byteorder_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), +	.eval		= nft_byteorder_eval, +	.init		= nft_byteorder_init, +	.dump		= nft_byteorder_dump, +}; + +static struct nft_expr_type nft_byteorder_type __read_mostly = { +	.name		= "byteorder", +	.ops		= &nft_byteorder_ops, +	.policy		= nft_byteorder_policy, +	.maxattr	= NFTA_BYTEORDER_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_byteorder_module_init(void) +{ +	return nft_register_expr(&nft_byteorder_type); +} + +void nft_byteorder_module_exit(void) +{ +	nft_unregister_expr(&nft_byteorder_type); +} diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c new file mode 100644 index 00000000000..e2b3f51c81f --- /dev/null +++ b/net/netfilter/nft_cmp.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_cmp_expr { +	struct nft_data		data; +	enum nft_registers	sreg:8; +	u8			len; +	enum nft_cmp_ops	op:8; +}; + +static void nft_cmp_eval(const struct nft_expr *expr, +			 struct nft_data data[NFT_REG_MAX + 1], +			 const struct nft_pktinfo *pkt) +{ +	const struct nft_cmp_expr *priv = nft_expr_priv(expr); +	int d; + +	d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len); +	switch (priv->op) { +	case NFT_CMP_EQ: +		if (d != 0) +			goto mismatch; +		break; +	case NFT_CMP_NEQ: +		if (d == 0) +			goto mismatch; +		break; +	case NFT_CMP_LT: +		if (d == 0) +			goto mismatch; +	case NFT_CMP_LTE: +		if (d > 0) +			goto mismatch; +		break; +	case NFT_CMP_GT: +		if (d == 0) +			goto mismatch; +	case NFT_CMP_GTE: +		if (d < 0) +			goto mismatch; +		break; +	} +	return; + +mismatch: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = { +	[NFTA_CMP_SREG]		= { .type = NLA_U32 }, +	[NFTA_CMP_OP]		= { .type = NLA_U32 }, +	[NFTA_CMP_DATA]		= { .type = NLA_NESTED }, +}; + +static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +			const struct nlattr * const tb[]) +{ +	struct nft_cmp_expr *priv = nft_expr_priv(expr); +	struct nft_data_desc desc; +	int err; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); +	priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); + +	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); +	BUG_ON(err < 0); + +	priv->len = desc.len; +	return 0; +} + +static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_cmp_expr *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op))) +		goto nla_put_failure; + +	if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, +			  NFT_DATA_VALUE, priv->len) < 0) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_cmp_type; +static const struct nft_expr_ops nft_cmp_ops = { +	.type		= &nft_cmp_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), +	.eval		= nft_cmp_eval, +	.init		= nft_cmp_init, +	.dump		= nft_cmp_dump, +}; + +static int nft_cmp_fast_init(const struct nft_ctx *ctx, +			     const struct nft_expr *expr, +			     const struct nlattr * const tb[]) +{ +	struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); +	struct nft_data_desc desc; +	struct nft_data data; +	u32 mask; +	int err; + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + +	err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); +	BUG_ON(err < 0); +	desc.len *= BITS_PER_BYTE; + +	mask = nft_cmp_fast_mask(desc.len); +	priv->data = data.data[0] & mask; +	priv->len  = desc.len; +	return 0; +} + +static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); +	struct nft_data data; + +	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ))) +		goto nla_put_failure; + +	data.data[0] = priv->data; +	if (nft_data_dump(skb, NFTA_CMP_DATA, &data, +			  
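+			  /* priv->len is kept in bits for the fast path
+			   * (see nft_cmp_fast_init), so convert back to
+			   * bytes for the dump */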
NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +const struct nft_expr_ops nft_cmp_fast_ops = { +	.type		= &nft_cmp_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)), +	.eval		= NULL,	/* inlined */ +	.init		= nft_cmp_fast_init, +	.dump		= nft_cmp_fast_dump, +}; + +static const struct nft_expr_ops * +nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) +{ +	struct nft_data_desc desc; +	struct nft_data data; +	enum nft_registers sreg; +	enum nft_cmp_ops op; +	int err; + +	if (tb[NFTA_CMP_SREG] == NULL || +	    tb[NFTA_CMP_OP] == NULL || +	    tb[NFTA_CMP_DATA] == NULL) +		return ERR_PTR(-EINVAL); + +	sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); +	err = nft_validate_input_register(sreg); +	if (err < 0) +		return ERR_PTR(err); + +	op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); +	switch (op) { +	case NFT_CMP_EQ: +	case NFT_CMP_NEQ: +	case NFT_CMP_LT: +	case NFT_CMP_LTE: +	case NFT_CMP_GT: +	case NFT_CMP_GTE: +		break; +	default: +		return ERR_PTR(-EINVAL); +	} + +	err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); +	if (err < 0) +		return ERR_PTR(err); + +	if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) +		return &nft_cmp_fast_ops; +	else +		return &nft_cmp_ops; +} + +static struct nft_expr_type nft_cmp_type __read_mostly = { +	.name		= "cmp", +	.select_ops	= nft_cmp_select_ops, +	.policy		= nft_cmp_policy, +	.maxattr	= NFTA_CMP_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_cmp_module_init(void) +{ +	return nft_register_expr(&nft_cmp_type); +} + +void nft_cmp_module_exit(void) +{ +	nft_unregister_expr(&nft_cmp_type); +} diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c new file mode 100644 index 00000000000..1840989092e --- /dev/null +++ b/net/netfilter/nft_compat.c @@ -0,0 +1,793 @@ +/* + * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This software has been sponsored by Sophos Astaro <http://www.sophos.com> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nf_tables_compat.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <asm/uaccess.h> /* for set_fs */ +#include <net/netfilter/nf_tables.h> + +union nft_entry { +	struct ipt_entry e4; +	struct ip6t_entry e6; +}; + +static inline void +nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +{ +	par->target	= xt; +	par->targinfo	= xt_info; +	par->hotdrop	= false; +} + +static void nft_target_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	void *info = nft_expr_priv(expr); +	struct xt_target *target = expr->ops->data; +	struct sk_buff *skb = pkt->skb; +	int ret; + +	nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + +	ret = target->target(skb, &pkt->xt); + +	if (pkt->xt.hotdrop) +		ret = NF_DROP; + +	switch(ret) { +	case XT_CONTINUE: +		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; +		break; +	default: +		data[NFT_REG_VERDICT].verdict = ret; +		break; +	} +	return; +} + +static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { +	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING }, +	[NFTA_TARGET_REV]	= { .type = NLA_U32 }, +	[NFTA_TARGET_INFO]	= { .type = NLA_BINARY }, +}; + +static void +nft_target_set_tgchk_param(struct xt_tgchk_param *par, +			   const struct nft_ctx *ctx, +			   struct xt_target *target, void *info, +			   union nft_entry *entry, u8 proto, bool inv) +{ +	par->net	= &init_net; +	par->table	= ctx->table->name; +	switch (ctx->afi->family) { +	case AF_INET: +		entry->e4.ip.proto = proto; +		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; +		break; +	case AF_INET6: +		entry->e6.ipv6.proto = proto; +		entry->e6.ipv6.invflags = inv ? 
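+		/* the dummy ipt/ip6t entry gives the xt checkentry hooks the
+		 * same l4 protocol context an iptables rule would carry */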
IP6T_INV_PROTO : 0; +		break; +	} +	par->entryinfo	= entry; +	par->target	= target; +	par->targinfo	= info; +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		par->hook_mask = 1 << ops->hooknum; +	} +	par->family	= ctx->afi->family; +} + +static void target_compat_from_user(struct xt_target *t, void *in, void *out) +{ +#ifdef CONFIG_COMPAT +	if (t->compat_from_user) { +		int pad; + +		t->compat_from_user(out, in); +		pad = XT_ALIGN(t->targetsize) - t->targetsize; +		if (pad > 0) +			memset(out + t->targetsize, 0, pad); +	} else +#endif +		memcpy(out, in, XT_ALIGN(t->targetsize)); +} + +static inline int nft_compat_target_offset(struct xt_target *target) +{ +#ifdef CONFIG_COMPAT +	return xt_compat_target_offset(target); +#else +	return 0; +#endif +} + +static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { +	[NFTA_RULE_COMPAT_PROTO]	= { .type = NLA_U32 }, +	[NFTA_RULE_COMPAT_FLAGS]	= { .type = NLA_U32 }, +}; + +static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +{ +	struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; +	u32 flags; +	int err; + +	err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr, +			       nft_rule_compat_policy); +	if (err < 0) +		return err; + +	if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS]) +		return -EINVAL; + +	flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); +	if (flags & ~NFT_RULE_COMPAT_F_MASK) +		return -EINVAL; +	if (flags & NFT_RULE_COMPAT_F_INV) +		*inv = true; + +	*proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); +	return 0; +} + +static int +nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +		const struct nlattr * const tb[]) +{ +	void *info = nft_expr_priv(expr); +	struct xt_target *target = expr->ops->data; +	struct xt_tgchk_param par; +	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); +	u8 proto = 0; +	bool inv = false; +	union nft_entry e = {}; +	int ret; + +	target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); + +	if (ctx->nla[NFTA_RULE_COMPAT]) { +		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); +		if (ret < 0) +			goto err; +	} + +	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + +	ret = xt_check_target(&par, size, proto, inv); +	if (ret < 0) +		goto err; + +	/* The standard target cannot be used */ +	if (target->target == NULL) { +		ret = -EINVAL; +		goto err; +	} + +	return 0; +err: +	module_put(target->me); +	return ret; +} + +static void +nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ +	struct xt_target *target = expr->ops->data; +	void *info = nft_expr_priv(expr); +	struct xt_tgdtor_param par; + +	par.net = ctx->net; +	par.target = target; +	par.targinfo = info; +	par.family = ctx->afi->family; +	if (par.target->destroy != NULL) +		par.target->destroy(&par); + +	module_put(target->me); +} + +static int +target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in) +{ +	int ret; + +#ifdef CONFIG_COMPAT +	if (t->compat_to_user) { +		mm_segment_t old_fs; +		void *out; + +		out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC); +		if (out == NULL) +			return -ENOMEM; + +		/* We want to reuse existing compat_to_user */ +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		t->compat_to_user(out, in); +		set_fs(old_fs); +		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out); +		kfree(out); +	} else 
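+		/* native userspace already uses the kernel struct layout,
+		 * so a plain aligned copy is enough */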
+#endif +		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in); + +	return ret; +} + +static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct xt_target *target = expr->ops->data; +	void *info = nft_expr_priv(expr); + +	if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || +	    nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || +	    target_dump_info(skb, target, info)) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_target_validate(const struct nft_ctx *ctx, +			       const struct nft_expr *expr, +			       const struct nft_data **data) +{ +	struct xt_target *target = expr->ops->data; +	unsigned int hook_mask = 0; + +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		hook_mask = 1 << ops->hooknum; +		if (hook_mask & target->hooks) +			return 0; + +		/* This target is being called from an invalid chain */ +		return -EINVAL; +	} +	return 0; +} + +static void nft_match_eval(const struct nft_expr *expr, +			   struct nft_data data[NFT_REG_MAX + 1], +			   const struct nft_pktinfo *pkt) +{ +	void *info = nft_expr_priv(expr); +	struct xt_match *match = expr->ops->data; +	struct sk_buff *skb = pkt->skb; +	bool ret; + +	nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + +	ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + +	if (pkt->xt.hotdrop) { +		data[NFT_REG_VERDICT].verdict = NF_DROP; +		return; +	} + +	switch(ret) { +	case true: +		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; +		break; +	case false: +		data[NFT_REG_VERDICT].verdict = NFT_BREAK; +		break; +	} +} + +static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { +	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING }, +	[NFTA_MATCH_REV]	= { .type = NLA_U32 }, +	[NFTA_MATCH_INFO]	= { .type = NLA_BINARY }, +}; + +/* struct xt_mtchk_param and xt_tgchk_param look very similar */ +static void +nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, +			  struct xt_match *match, void *info, +			  union nft_entry *entry, u8 proto, bool inv) +{ +	par->net	= &init_net; +	par->table	= ctx->table->name; +	switch (ctx->afi->family) { +	case AF_INET: +		entry->e4.ip.proto = proto; +		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; +		break; +	case AF_INET6: +		entry->e6.ipv6.proto = proto; +		entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; +		break; +	} +	par->entryinfo	= entry; +	par->match	= match; +	par->matchinfo	= info; +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		par->hook_mask = 1 << ops->hooknum; +	} +	par->family	= ctx->afi->family; +} + +static void match_compat_from_user(struct xt_match *m, void *in, void *out) +{ +#ifdef CONFIG_COMPAT +	if (m->compat_from_user) { +		int pad; + +		m->compat_from_user(out, in); +		pad = XT_ALIGN(m->matchsize) - m->matchsize; +		if (pad > 0) +			memset(out + m->matchsize, 0, pad); +	} else +#endif +		memcpy(out, in, XT_ALIGN(m->matchsize)); +} + +static int +nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +		const struct nlattr * const tb[]) +{ +	void *info = nft_expr_priv(expr); +	struct xt_match *match = expr->ops->data; +	struct xt_mtchk_param par; +	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); +	u8 proto = 0; +	bool inv = false; +	union nft_entry e = {}; +	int ret; + +	match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); + +	if (ctx->nla[NFTA_RULE_COMPAT]) { +		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); +		if (ret < 0) +			goto err; +	} + +	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + +	ret = xt_check_match(&par, size, proto, inv); +	if (ret < 0) +		goto err; + +	return 0; +err: +	module_put(match->me); +	return ret; +} + +static void +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ +	struct xt_match *match = expr->ops->data; +	void *info = nft_expr_priv(expr); +	struct xt_mtdtor_param par; + +	par.net = ctx->net; +	par.match = match; +	par.matchinfo = info; +	par.family = ctx->afi->family; +	if (par.match->destroy != NULL) +		par.match->destroy(&par); + +	module_put(match->me); +} + +static int +match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in) +{ +	int ret; + +#ifdef CONFIG_COMPAT +	if (m->compat_to_user) { +		mm_segment_t old_fs; +		void *out; + +		out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC); +		if (out == NULL) +			return -ENOMEM; + +		/* We want to reuse existing compat_to_user */ +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		m->compat_to_user(out, in); +		set_fs(old_fs); +		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out); +		kfree(out); +	} else +#endif +		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in); + +	return ret; +} + +static inline int nft_compat_match_offset(struct xt_match *match) +{ +#ifdef CONFIG_COMPAT +	return xt_compat_match_offset(match); +#else +	return 0; +#endif +} + +static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	void *info = nft_expr_priv(expr); +	struct xt_match *match = expr->ops->data; + +	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || +	    nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || +	    match_dump_info(skb, match, info)) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_match_validate(const struct nft_ctx *ctx, +			      const struct nft_expr *expr, +			      const struct nft_data **data) +{ +	struct xt_match *match = expr->ops->data; +	unsigned int hook_mask = 0; + +	if (ctx->chain->flags & NFT_BASE_CHAIN) { +		const struct nft_base_chain *basechain = +						nft_base_chain(ctx->chain); +		const struct nf_hook_ops *ops = &basechain->ops[0]; + +		hook_mask = 1 << ops->hooknum; +		if (hook_mask & 
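+		/* the base chain's hook must be one the extension
+		 * declares itself valid for */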
match->hooks) +			return 0; + +		/* This match is being called from an invalid chain */ +		return -EINVAL; +	} +	return 0; +} + +static int +nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, +		      int event, u16 family, const char *name, +		      int rev, int target) +{ +	struct nlmsghdr *nlh; +	struct nfgenmsg *nfmsg; +	unsigned int flags = portid ? NLM_F_MULTI : 0; + +	event |= NFNL_SUBSYS_NFT_COMPAT << 8; +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); +	if (nlh == NULL) +		goto nlmsg_failure; + +	nfmsg = nlmsg_data(nlh); +	nfmsg->nfgen_family = family; +	nfmsg->version = NFNETLINK_V0; +	nfmsg->res_id = 0; + +	if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || +	    nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || +	    nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) +		goto nla_put_failure; + +	nlmsg_end(skb, nlh); +	return skb->len; + +nlmsg_failure: +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -1; +} + +static int +nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, +		const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ +	int ret = 0, target; +	struct nfgenmsg *nfmsg; +	const char *fmt; +	const char *name; +	u32 rev; +	struct sk_buff *skb2; + +	if (tb[NFTA_COMPAT_NAME] == NULL || +	    tb[NFTA_COMPAT_REV] == NULL || +	    tb[NFTA_COMPAT_TYPE] == NULL) +		return -EINVAL; + +	name = nla_data(tb[NFTA_COMPAT_NAME]); +	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); +	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); + +	nfmsg = nlmsg_data(nlh); + +	switch(nfmsg->nfgen_family) { +	case AF_INET: +		fmt = "ipt_%s"; +		break; +	case AF_INET6: +		fmt = "ip6t_%s"; +		break; +	default: +		pr_err("nft_compat: unsupported protocol %d\n", +			nfmsg->nfgen_family); +		return -EINVAL; +	} + +	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, +						 rev, target, &ret), +						 fmt, name); + +	if (ret < 0) +		return ret; + +	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (skb2 == NULL) +		return -ENOMEM; + +	/* include the best revision for this extension in the message */ +	if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, +				  nlh->nlmsg_seq, +				  NFNL_MSG_TYPE(nlh->nlmsg_type), +				  NFNL_MSG_COMPAT_GET, +				  nfmsg->nfgen_family, +				  name, ret, target) <= 0) { +		kfree_skb(skb2); +		return -ENOSPC; +	} + +	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, +				MSG_DONTWAIT); +	if (ret > 0) +		ret = 0; + +	return ret == -EAGAIN ? 
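+	       /* -EAGAIN from netlink_unicast() means the receive queue is full */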
-ENOBUFS : ret; +} + +static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { +	[NFTA_COMPAT_NAME]	= { .type = NLA_NUL_STRING, +				    .len = NFT_COMPAT_NAME_MAX-1 }, +	[NFTA_COMPAT_REV]	= { .type = NLA_U32 }, +	[NFTA_COMPAT_TYPE]	= { .type = NLA_U32 }, +}; + +static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { +	[NFNL_MSG_COMPAT_GET]		= { .call = nfnl_compat_get, +					    .attr_count = NFTA_COMPAT_MAX, +					    .policy = nfnl_compat_policy_get }, +}; + +static const struct nfnetlink_subsystem nfnl_compat_subsys = { +	.name		= "nft-compat", +	.subsys_id	= NFNL_SUBSYS_NFT_COMPAT, +	.cb_count	= NFNL_MSG_COMPAT_MAX, +	.cb		= nfnl_nft_compat_cb, +}; + +static LIST_HEAD(nft_match_list); + +struct nft_xt { +	struct list_head	head; +	struct nft_expr_ops	ops; +}; + +static struct nft_expr_type nft_match_type; + +static const struct nft_expr_ops * +nft_match_select_ops(const struct nft_ctx *ctx, +		     const struct nlattr * const tb[]) +{ +	struct nft_xt *nft_match; +	struct xt_match *match; +	char *mt_name; +	__u32 rev, family; + +	if (tb[NFTA_MATCH_NAME] == NULL || +	    tb[NFTA_MATCH_REV] == NULL || +	    tb[NFTA_MATCH_INFO] == NULL) +		return ERR_PTR(-EINVAL); + +	mt_name = nla_data(tb[NFTA_MATCH_NAME]); +	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); +	family = ctx->afi->family; + +	/* Re-use the existing match if it's already loaded. */ +	list_for_each_entry(nft_match, &nft_match_list, head) { +		struct xt_match *match = nft_match->ops.data; + +		if (strcmp(match->name, mt_name) == 0 && +		    match->revision == rev && match->family == family) +			return &nft_match->ops; +	} + +	match = xt_request_find_match(family, mt_name, rev); +	if (IS_ERR(match)) +		return ERR_PTR(-ENOENT); + +	/* This is the first time we use this match, allocate operations */ +	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); +	if (nft_match == NULL) +		return ERR_PTR(-ENOMEM); + +	nft_match->ops.type = &nft_match_type; +	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) + +					    nft_compat_match_offset(match)); +	nft_match->ops.eval = nft_match_eval; +	nft_match->ops.init = nft_match_init; +	nft_match->ops.destroy = nft_match_destroy; +	nft_match->ops.dump = nft_match_dump; +	nft_match->ops.validate = nft_match_validate; +	nft_match->ops.data = match; + +	list_add(&nft_match->head, &nft_match_list); + +	return &nft_match->ops; +} + +static void nft_match_release(void) +{ +	struct nft_xt *nft_match, *tmp; + +	list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head) +		kfree(nft_match); +} + +static struct nft_expr_type nft_match_type __read_mostly = { +	.name		= "match", +	.select_ops	= nft_match_select_ops, +	.policy		= nft_match_policy, +	.maxattr	= NFTA_MATCH_MAX, +	.owner		= THIS_MODULE, +}; + +static LIST_HEAD(nft_target_list); + +static struct nft_expr_type nft_target_type; + +static const struct nft_expr_ops * +nft_target_select_ops(const struct nft_ctx *ctx, +		      const struct nlattr * const tb[]) +{ +	struct nft_xt *nft_target; +	struct xt_target *target; +	char *tg_name; +	__u32 rev, family; + +	if (tb[NFTA_TARGET_NAME] == NULL || +	    tb[NFTA_TARGET_REV] == NULL || +	    tb[NFTA_TARGET_INFO] == NULL) +		return ERR_PTR(-EINVAL); + +	tg_name = nla_data(tb[NFTA_TARGET_NAME]); +	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); +	family = ctx->afi->family; + +	/* Re-use the existing target if it's already loaded. 
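+	 * Targets are keyed by name, revision and family, the same tuple
+	 * that xt_request_find_target() resolves below.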
*/
+	list_for_each_entry(nft_target, &nft_target_list, head) {
+		struct xt_target *target = nft_target->ops.data;
+
+		if (strcmp(target->name, tg_name) == 0 &&
+		    target->revision == rev && target->family == family)
+			return &nft_target->ops;
+	}
+
+	target = xt_request_find_target(family, tg_name, rev);
+	if (IS_ERR(target))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this target, allocate operations */
+	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_target == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nft_target->ops.type = &nft_target_type;
+	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+					     nft_compat_target_offset(target));
+	nft_target->ops.eval = nft_target_eval;
+	nft_target->ops.init = nft_target_init;
+	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.dump = nft_target_dump;
+	nft_target->ops.validate = nft_target_validate;
+	nft_target->ops.data = target;
+
+	list_add(&nft_target->head, &nft_target_list);
+
+	return &nft_target->ops;
+}
+
+static void nft_target_release(void)
+{
+	struct nft_xt *nft_target, *tmp;
+
+	list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
+		kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = {
+	.name		= "target",
+	.select_ops	= nft_target_select_ops,
+	.policy		= nft_target_policy,
+	.maxattr	= NFTA_TARGET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_compat_module_init(void)
+{
+	int ret;
+
+	ret = nft_register_expr(&nft_match_type);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_register_expr(&nft_target_type);
+	if (ret < 0)
+		goto err_match;
+
+	ret = nfnetlink_subsys_register(&nfnl_compat_subsys);
+	if (ret < 0) {
+		pr_err("nft_compat: cannot register with nfnetlink.\n");
+		goto err_target;
+	}
+
+	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+	return ret;
+
+err_target:
+	nft_unregister_expr(&nft_target_type);
+err_match:
+	nft_unregister_expr(&nft_match_type);
+	return ret;
+}
+
+static void __exit nft_compat_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+	nft_unregister_expr(&nft_target_type);
+	nft_unregister_expr(&nft_match_type);
+	nft_match_release();
+	nft_target_release();
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 00000000000..c89ee486ce5
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter {
+	seqlock_t	lock;
+	u64		bytes;
+	u64		packets;
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	write_seqlock_bh(&priv->lock);
+	priv->bytes += pkt->skb->len;
+	priv->packets++;
+	write_sequnlock_bh(&priv->lock);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+	unsigned int seq;
+	u64 bytes;
+	u64 packets;
+
+	do {
+		seq = read_seqbegin(&priv->lock);
+		bytes	= priv->bytes;
+		packets	= priv->packets;
+	} while (read_seqretry(&priv->lock, seq));
+
+	if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+static int nft_counter_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_COUNTER_PACKETS])
+		priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	if (tb[NFTA_COUNTER_BYTES])
+		priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+
+	seqlock_init(&priv->lock);
+	return 0;
+}
+
+static struct nft_expr_type nft_counter_type;
+static const struct nft_expr_ops nft_counter_ops = {
+	.type		= &nft_counter_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+	.eval		= nft_counter_eval,
+	.init		= nft_counter_init,
+	.dump		= nft_counter_dump,
+};
+
+static struct nft_expr_type nft_counter_type __read_mostly = {
+	.name		= "counter",
+	.ops		= &nft_counter_ops,
+	.policy		= nft_counter_policy,
+	.maxattr	= NFTA_COUNTER_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_counter_module_init(void)
+{
+	return nft_register_expr(&nft_counter_type);
+}
+
+static void __exit nft_counter_module_exit(void)
+{
+	nft_unregister_expr(&nft_counter_type);
+}
+
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 00000000000..cc560301624
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +struct nft_ct { +	enum nft_ct_keys	key:8; +	enum ip_conntrack_dir	dir:8; +	union { +		enum nft_registers	dreg:8; +		enum nft_registers	sreg:8; +	}; +}; + +static void nft_ct_get_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); +	struct nft_data *dest = &data[priv->dreg]; +	enum ip_conntrack_info ctinfo; +	const struct nf_conn *ct; +	const struct nf_conn_help *help; +	const struct nf_conntrack_tuple *tuple; +	const struct nf_conntrack_helper *helper; +	long diff; +	unsigned int state; + +	ct = nf_ct_get(pkt->skb, &ctinfo); + +	switch (priv->key) { +	case NFT_CT_STATE: +		if (ct == NULL) +			state = NF_CT_STATE_INVALID_BIT; +		else if (nf_ct_is_untracked(ct)) +			state = NF_CT_STATE_UNTRACKED_BIT; +		else +			state = NF_CT_STATE_BIT(ctinfo); +		dest->data[0] = state; +		return; +	} + +	if (ct == NULL) +		goto err; + +	switch (priv->key) { +	case NFT_CT_DIRECTION: +		dest->data[0] = CTINFO2DIR(ctinfo); +		return; +	case NFT_CT_STATUS: +		dest->data[0] = ct->status; +		return; +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +		dest->data[0] = ct->mark; +		return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK +	case NFT_CT_SECMARK: +		dest->data[0] = ct->secmark; +		return; +#endif +	case NFT_CT_EXPIRATION: +		diff = (long)jiffies - (long)ct->timeout.expires; +		if (diff < 0) +			diff = 0; +		dest->data[0] = jiffies_to_msecs(diff); +		return; +	case NFT_CT_HELPER: +		if (ct->master == NULL) +			goto err; +		help = nfct_help(ct->master); +		if (help == NULL) +			goto err; +		helper = rcu_dereference(help->helper); +		if (helper == NULL) +			goto err; +		if (strlen(helper->name) >= sizeof(dest->data)) +			goto err; +		strncpy((char *)dest->data, helper->name, sizeof(dest->data)); +		return; +#ifdef CONFIG_NF_CONNTRACK_LABELS +	case NFT_CT_LABELS: { +		struct nf_conn_labels *labels = nf_ct_labels_find(ct); +		unsigned int size; + +		if (!labels) { +			memset(dest->data, 0, sizeof(dest->data)); +			return; +		} + +		BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); +		size = labels->words * sizeof(long); + +		memcpy(dest->data, labels->bits, size); +		if (size < sizeof(dest->data)) +			memset(((char *) dest->data) + size, 0, +			       sizeof(dest->data) - size); +		return; +	} +#endif +	} + +	tuple = &ct->tuplehash[priv->dir].tuple; +	switch (priv->key) { +	case NFT_CT_L3PROTOCOL: +		dest->data[0] = nf_ct_l3num(ct); +		return; +	case NFT_CT_SRC: +		memcpy(dest->data, tuple->src.u3.all, +		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); +		return; +	case NFT_CT_DST: +		memcpy(dest->data, tuple->dst.u3.all, +		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
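+		       /* an IPv4 address is 4 bytes, an IPv6 address 16 */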
4 : 16); +		return; +	case NFT_CT_PROTOCOL: +		dest->data[0] = nf_ct_protonum(ct); +		return; +	case NFT_CT_PROTO_SRC: +		dest->data[0] = (__force __u16)tuple->src.u.all; +		return; +	case NFT_CT_PROTO_DST: +		dest->data[0] = (__force __u16)tuple->dst.u.all; +		return; +	} +	return; +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static void nft_ct_set_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); +	struct sk_buff *skb = pkt->skb; +#ifdef CONFIG_NF_CONNTRACK_MARK +	u32 value = data[priv->sreg].data[0]; +#endif +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	if (ct == NULL) +		return; + +	switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +		if (ct->mark != value) { +			ct->mark = value; +			nf_conntrack_event_cache(IPCT_MARK, ct); +		} +		break; +#endif +	} +} + +static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { +	[NFTA_CT_DREG]		= { .type = NLA_U32 }, +	[NFTA_CT_KEY]		= { .type = NLA_U32 }, +	[NFTA_CT_DIRECTION]	= { .type = NLA_U8 }, +	[NFTA_CT_SREG]		= { .type = NLA_U32 }, +}; + +static int nft_ct_l3proto_try_module_get(uint8_t family) +{ +	int err; + +	if (family == NFPROTO_INET) { +		err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); +		if (err < 0) +			goto err1; +		err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); +		if (err < 0) +			goto err2; +	} else { +		err = nf_ct_l3proto_try_module_get(family); +		if (err < 0) +			goto err1; +	} +	return 0; + +err2: +	nf_ct_l3proto_module_put(NFPROTO_IPV4); +err1: +	return err; +} + +static void nft_ct_l3proto_module_put(uint8_t family) +{ +	if (family == NFPROTO_INET) { +		nf_ct_l3proto_module_put(NFPROTO_IPV4); +		nf_ct_l3proto_module_put(NFPROTO_IPV6); +	} else +		nf_ct_l3proto_module_put(family); +} + +static int nft_ct_get_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_ct *priv = nft_expr_priv(expr); +	int err; + +	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); +	switch (priv->key) { +	case NFT_CT_STATE: +	case NFT_CT_DIRECTION: +	case NFT_CT_STATUS: +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK +	case NFT_CT_SECMARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS +	case NFT_CT_LABELS: +#endif +	case NFT_CT_EXPIRATION: +	case NFT_CT_HELPER: +		if (tb[NFTA_CT_DIRECTION] != NULL) +			return -EINVAL; +		break; +	case NFT_CT_L3PROTOCOL: +	case NFT_CT_PROTOCOL: +	case NFT_CT_SRC: +	case NFT_CT_DST: +	case NFT_CT_PROTO_SRC: +	case NFT_CT_PROTO_DST: +		if (tb[NFTA_CT_DIRECTION] == NULL) +			return -EINVAL; +		break; +	default: +		return -EOPNOTSUPP; +	} + +	if (tb[NFTA_CT_DIRECTION] != NULL) { +		priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); +		switch (priv->dir) { +		case IP_CT_DIR_ORIGINAL: +		case IP_CT_DIR_REPLY: +			break; +		default: +			return -EINVAL; +		} +	} + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; + +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	err = nft_ct_l3proto_try_module_get(ctx->afi->family); +	if (err < 0) +		return err; + +	return 0; +} + +static int nft_ct_set_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_ct *priv = nft_expr_priv(expr); +	int err; + +	
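+	/* only NFT_CT_MARK is currently settable from the packet path */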
priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); +	switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK +	case NFT_CT_MARK: +		break; +#endif +	default: +		return -EOPNOTSUPP; +	} + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	err = nft_ct_l3proto_try_module_get(ctx->afi->family); +	if (err < 0) +		return err; + +	return 0; +} + +static void nft_ct_destroy(const struct nft_ctx *ctx, +			   const struct nft_expr *expr) +{ +	nft_ct_l3proto_module_put(ctx->afi->family); +} + +static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) +		goto nla_put_failure; + +	switch (priv->key) { +	case NFT_CT_PROTOCOL: +	case NFT_CT_SRC: +	case NFT_CT_DST: +	case NFT_CT_PROTO_SRC: +	case NFT_CT_PROTO_DST: +		if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) +			goto nla_put_failure; +	default: +		break; +	} + +	return 0; + +nla_put_failure: +	return -1; +} + +static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_ct *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_ct_type; +static const struct nft_expr_ops nft_ct_get_ops = { +	.type		= &nft_ct_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)), +	.eval		= nft_ct_get_eval, +	.init		= nft_ct_get_init, +	.destroy	= nft_ct_destroy, +	.dump		= nft_ct_get_dump, +}; + +static const struct nft_expr_ops nft_ct_set_ops = { +	.type		= &nft_ct_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)), +	.eval		= nft_ct_set_eval, +	.init		= nft_ct_set_init, +	.destroy	= nft_ct_destroy, +	.dump		= nft_ct_set_dump, +}; + +static const struct nft_expr_ops * +nft_ct_select_ops(const struct nft_ctx *ctx, +		    const struct nlattr * const tb[]) +{ +	if (tb[NFTA_CT_KEY] == NULL) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_CT_DREG]) +		return &nft_ct_get_ops; + +	if (tb[NFTA_CT_SREG]) +		return &nft_ct_set_ops; + +	return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_ct_type __read_mostly = { +	.name		= "ct", +	.select_ops	= &nft_ct_select_ops, +	.policy		= nft_ct_policy, +	.maxattr	= NFTA_CT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_ct_module_init(void) +{ +	return nft_register_expr(&nft_ct_type); +} + +static void __exit nft_ct_module_exit(void) +{ +	nft_unregister_expr(&nft_ct_type); +} + +module_init(nft_ct_module_init); +module_exit(nft_ct_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("ct"); diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c new file mode 100644 index 00000000000..b6eed4d5a09 --- /dev/null +++ b/net/netfilter/nft_expr_template.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_template {
+
+};
+
+static void nft_template_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
+	[NFTA_TEMPLATE_ATTR]		= { .type = NLA_U32 },
+};
+
+static int nft_template_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+	return 0;
+}
+
+static void nft_template_destroy(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_template *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_TEMPLATE_ATTR, htonl(priv->field)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_template_type;
+static const struct nft_expr_ops nft_template_ops = {
+	.type		= &nft_template_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_template)),
+	.eval		= nft_template_eval,
+	.init		= nft_template_init,
+	.destroy	= nft_template_destroy,
+	.dump		= nft_template_dump,
+};
+
+static struct nft_expr_type nft_template_type __read_mostly = {
+	.name		= "template",
+	.ops		= &nft_template_ops,
+	.policy		= nft_template_policy,
+	.maxattr	= NFTA_TEMPLATE_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_template_module_init(void)
+{
+	return nft_register_expr(&nft_template_type);
+}
+
+static void __exit nft_template_module_exit(void)
+{
+	nft_unregister_expr(&nft_template_type);
+}
+
+module_init(nft_template_module_init);
+module_exit(nft_template_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("template");
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644
index 00000000000..55c939f5371
--- /dev/null
+++ b/net/netfilter/nft_exthdr.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +// FIXME: +#include <net/ipv6.h> + +struct nft_exthdr { +	u8			type; +	u8			offset; +	u8			len; +	enum nft_registers	dreg:8; +}; + +static void nft_exthdr_eval(const struct nft_expr *expr, +			    struct nft_data data[NFT_REG_MAX + 1], +			    const struct nft_pktinfo *pkt) +{ +	struct nft_exthdr *priv = nft_expr_priv(expr); +	struct nft_data *dest = &data[priv->dreg]; +	unsigned int offset = 0; +	int err; + +	err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); +	if (err < 0) +		goto err; +	offset += priv->offset; + +	if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0) +		goto err; +	return; +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { +	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 }, +	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 }, +	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 }, +	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 }, +}; + +static int nft_exthdr_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_exthdr *priv = nft_expr_priv(expr); +	int err; + +	if (tb[NFTA_EXTHDR_DREG] == NULL || +	    tb[NFTA_EXTHDR_TYPE] == NULL || +	    tb[NFTA_EXTHDR_OFFSET] == NULL || +	    tb[NFTA_EXTHDR_LEN] == NULL) +		return -EINVAL; + +	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); +	priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); +	priv->len    = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); +	if (priv->len == 0 || +	    priv->len > FIELD_SIZEOF(struct nft_data, data)) +		return -EINVAL; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_exthdr *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_exthdr_type; +static const struct nft_expr_ops nft_exthdr_ops = { +	.type		= &nft_exthdr_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), +	.eval		= nft_exthdr_eval, +	.init		= nft_exthdr_init, +	.dump		= nft_exthdr_dump, +}; + +static struct nft_expr_type nft_exthdr_type __read_mostly = { +	.name		= "exthdr", +	.ops		= &nft_exthdr_ops, +	.policy		= nft_exthdr_policy, +	.maxattr	= NFTA_EXTHDR_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_exthdr_module_init(void) +{ +	return nft_register_expr(&nft_exthdr_type); +} + +static void __exit nft_exthdr_module_exit(void) +{ +	nft_unregister_expr(&nft_exthdr_type); +} + +module_init(nft_exthdr_module_init); +module_exit(nft_exthdr_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("exthdr"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c 
new file mode 100644 index 00000000000..4080ed6a072 --- /dev/null +++ b/net/netfilter/nft_hash.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/log2.h> +#include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/vmalloc.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +#define NFT_HASH_MIN_SIZE	4UL + +struct nft_hash { +	struct nft_hash_table __rcu	*tbl; +}; + +struct nft_hash_table { +	unsigned int			size; +	struct nft_hash_elem __rcu	*buckets[]; +}; + +struct nft_hash_elem { +	struct nft_hash_elem __rcu	*next; +	struct nft_data			key; +	struct nft_data			data[]; +}; + +#define nft_hash_for_each_entry(i, head) \ +	for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) +#define nft_hash_for_each_entry_rcu(i, head) \ +	for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) + +static u32 nft_hash_rnd __read_mostly; +static bool nft_hash_rnd_initted __read_mostly; + +static unsigned int nft_hash_data(const struct nft_data *data, +				  unsigned int hsize, unsigned int len) +{ +	unsigned int h; + +	h = jhash(data->data, len, nft_hash_rnd); +	return h & (hsize - 1); +} + +static bool nft_hash_lookup(const struct nft_set *set, +			    const struct nft_data *key, +			    struct nft_data *data) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); +	const struct nft_hash_elem *he; +	unsigned int h; + +	h = nft_hash_data(key, tbl->size, set->klen); +	nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { +		if (nft_data_cmp(&he->key, key, set->klen)) +			continue; +		if (set->flags & NFT_SET_MAP) +			nft_data_copy(data, he->data); +		return true; +	} +	return false; +} + +static void nft_hash_tbl_free(const struct nft_hash_table *tbl) +{ +	kvfree(tbl); +} + +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ +	return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + +static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) +{ +	struct nft_hash_table *tbl; +	size_t size; + +	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); +	tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); +	if (tbl == NULL) +		tbl = vzalloc(size); +	if (tbl == NULL) +		return NULL; +	tbl->size = nbuckets; + +	return tbl; +} + +static void nft_hash_chain_unzip(const struct nft_set *set, +				 const struct nft_hash_table *ntbl, +				 struct nft_hash_table *tbl, unsigned int n) +{ +	struct nft_hash_elem *he, *last, *next; +	unsigned int h; + +	he = nft_dereference(tbl->buckets[n]); +	if (he == NULL) +		return; +	h = nft_hash_data(&he->key, ntbl->size, set->klen); + +	/* Find last element of first chain hashing to bucket h */ +	last = he; +	nft_hash_for_each_entry(he, he->next) { +		if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) +			break; +		last = he; +	} + +	/* Unlink first chain from the old table */ +	RCU_INIT_POINTER(tbl->buckets[n], last->next); + +	/* If end of chain reached, done */ +	if (he == NULL) +		return; + +	/* Find first element of second chain 
hashing to bucket h */ +	next = NULL; +	nft_hash_for_each_entry(he, he->next) { +		if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) +			continue; +		next = he; +		break; +	} + +	/* Link the two chains */ +	RCU_INIT_POINTER(last->next, next); +} + +static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) +{ +	struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; +	struct nft_hash_elem *he; +	unsigned int i, h; +	bool complete; + +	ntbl = nft_hash_tbl_alloc(tbl->size * 2); +	if (ntbl == NULL) +		return -ENOMEM; + +	/* Link new table's buckets to first element in the old table +	 * hashing to the new bucket. +	 */ +	for (i = 0; i < ntbl->size; i++) { +		h = i < tbl->size ? i : i - tbl->size; +		nft_hash_for_each_entry(he, tbl->buckets[h]) { +			if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) +				continue; +			RCU_INIT_POINTER(ntbl->buckets[i], he); +			break; +		} +	} + +	/* Publish new table */ +	rcu_assign_pointer(priv->tbl, ntbl); + +	/* Unzip interleaved hash chains */ +	do { +		/* Wait for readers to use new table/unzipped chains */ +		synchronize_rcu(); + +		complete = true; +		for (i = 0; i < tbl->size; i++) { +			nft_hash_chain_unzip(set, ntbl, tbl, i); +			if (tbl->buckets[i] != NULL) +				complete = false; +		} +	} while (!complete); + +	nft_hash_tbl_free(tbl); +	return 0; +} + +static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) +{ +	struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; +	struct nft_hash_elem __rcu **pprev; +	unsigned int i; + +	ntbl = nft_hash_tbl_alloc(tbl->size / 2); +	if (ntbl == NULL) +		return -ENOMEM; + +	for (i = 0; i < ntbl->size; i++) { +		ntbl->buckets[i] = tbl->buckets[i]; + +		for (pprev = &ntbl->buckets[i]; *pprev != NULL; +		     pprev = &nft_dereference(*pprev)->next) +			; +		RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); +	} + +	/* Publish new table */ +	rcu_assign_pointer(priv->tbl, ntbl); +	synchronize_rcu(); + +	nft_hash_tbl_free(tbl); +	return 0; +} + +static int nft_hash_insert(const struct nft_set *set, +			   const struct nft_set_elem *elem) +{ +	struct nft_hash *priv = nft_set_priv(set); +	struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem *he; +	unsigned int size, h; + +	if (elem->flags != 0) +		return -EINVAL; + +	size = sizeof(*he); +	if (set->flags & NFT_SET_MAP) +		size += sizeof(he->data[0]); + +	he = kzalloc(size, GFP_KERNEL); +	if (he == NULL) +		return -ENOMEM; + +	nft_data_copy(&he->key, &elem->key); +	if (set->flags & NFT_SET_MAP) +		nft_data_copy(he->data, &elem->data); + +	h = nft_hash_data(&he->key, tbl->size, set->klen); +	RCU_INIT_POINTER(he->next, tbl->buckets[h]); +	rcu_assign_pointer(tbl->buckets[h], he); + +	/* Expand table when exceeding 75% load */ +	if (set->nelems + 1 > tbl->size / 4 * 3) +		nft_hash_tbl_expand(set, priv); + +	return 0; +} + +static void nft_hash_elem_destroy(const struct nft_set *set, +				  struct nft_hash_elem *he) +{ +	nft_data_uninit(&he->key, NFT_DATA_VALUE); +	if (set->flags & NFT_SET_MAP) +		nft_data_uninit(he->data, set->dtype); +	kfree(he); +} + +static void nft_hash_remove(const struct nft_set *set, +			    const struct nft_set_elem *elem) +{ +	struct nft_hash *priv = nft_set_priv(set); +	struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem *he, __rcu **pprev; + +	pprev = elem->cookie; +	he = nft_dereference((*pprev)); + +	RCU_INIT_POINTER(*pprev, he->next); +	synchronize_rcu(); +	kfree(he); + +	/* Shrink table beneath 30% load */ +	if 
(set->nelems - 1 < tbl->size * 3 / 10 && +	    tbl->size > NFT_HASH_MIN_SIZE) +		nft_hash_tbl_shrink(set, priv); +} + +static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem __rcu * const *pprev; +	struct nft_hash_elem *he; +	unsigned int h; + +	h = nft_hash_data(&elem->key, tbl->size, set->klen); +	pprev = &tbl->buckets[h]; +	nft_hash_for_each_entry(he, tbl->buckets[h]) { +		if (nft_data_cmp(&he->key, &elem->key, set->klen)) { +			pprev = &he->next; +			continue; +		} + +		elem->cookie = (void *)pprev; +		elem->flags = 0; +		if (set->flags & NFT_SET_MAP) +			nft_data_copy(&elem->data, he->data); +		return 0; +	} +	return -ENOENT; +} + +static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, +			  struct nft_set_iter *iter) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	const struct nft_hash_elem *he; +	struct nft_set_elem elem; +	unsigned int i; + +	for (i = 0; i < tbl->size; i++) { +		nft_hash_for_each_entry(he, tbl->buckets[i]) { +			if (iter->count < iter->skip) +				goto cont; + +			memcpy(&elem.key, &he->key, sizeof(elem.key)); +			if (set->flags & NFT_SET_MAP) +				memcpy(&elem.data, he->data, sizeof(elem.data)); +			elem.flags = 0; + +			iter->err = iter->fn(ctx, set, iter, &elem); +			if (iter->err < 0) +				return; +cont: +			iter->count++; +		} +	} +} + +static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +{ +	return sizeof(struct nft_hash); +} + +static int nft_hash_init(const struct nft_set *set, +			 const struct nft_set_desc *desc, +			 const struct nlattr * const tb[]) +{ +	struct nft_hash *priv = nft_set_priv(set); +	struct nft_hash_table *tbl; +	unsigned int size; + +	if (unlikely(!nft_hash_rnd_initted)) { +		get_random_bytes(&nft_hash_rnd, 4); +		nft_hash_rnd_initted = true; +	} + +	size = NFT_HASH_MIN_SIZE; +	if (desc->size) +		size = nft_hash_tbl_size(desc->size); + +	tbl = nft_hash_tbl_alloc(size); +	if (tbl == NULL) +		return -ENOMEM; +	RCU_INIT_POINTER(priv->tbl, tbl); +	return 0; +} + +static void nft_hash_destroy(const struct nft_set *set) +{ +	const struct nft_hash *priv = nft_set_priv(set); +	const struct nft_hash_table *tbl = nft_dereference(priv->tbl); +	struct nft_hash_elem *he, *next; +	unsigned int i; + +	for (i = 0; i < tbl->size; i++) { +		for (he = nft_dereference(tbl->buckets[i]); he != NULL; +		     he = next) { +			next = nft_dereference(he->next); +			nft_hash_elem_destroy(set, he); +		} +	} +	kfree(tbl); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, +			      struct nft_set_estimate *est) +{ +	unsigned int esize; + +	esize = sizeof(struct nft_hash_elem); +	if (features & NFT_SET_MAP) +		esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + +	if (desc->size) { +		est->size = sizeof(struct nft_hash) + +			    nft_hash_tbl_size(desc->size) * +			    sizeof(struct nft_hash_elem *) + +			    desc->size * esize; +	} else { +		/* Resizing happens when the load drops below 30% or goes +		 * above 75%. The average of 52.5% load (approximated by 50%) +		 * is used for the size estimation of the hash buckets, +		 * meaning we calculate two buckets per element. 
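+		 * The estimate below is therefore a per-element cost,
+		 * not a total table size.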
+		 */ +		est->size = esize + 2 * sizeof(struct nft_hash_elem *); +	} + +	est->class = NFT_SET_CLASS_O_1; + +	return true; +} + +static struct nft_set_ops nft_hash_ops __read_mostly = { +	.privsize       = nft_hash_privsize, +	.estimate	= nft_hash_estimate, +	.init		= nft_hash_init, +	.destroy	= nft_hash_destroy, +	.get		= nft_hash_get, +	.insert		= nft_hash_insert, +	.remove		= nft_hash_remove, +	.lookup		= nft_hash_lookup, +	.walk		= nft_hash_walk, +	.features	= NFT_SET_MAP, +	.owner		= THIS_MODULE, +}; + +static int __init nft_hash_module_init(void) +{ +	return nft_register_set(&nft_hash_ops); +} + +static void __exit nft_hash_module_exit(void) +{ +	nft_unregister_set(&nft_hash_ops); +} + +module_init(nft_hash_module_init); +module_exit(nft_hash_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c new file mode 100644 index 00000000000..810385eb724 --- /dev/null +++ b/net/netfilter/nft_immediate.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_immediate_expr { +	struct nft_data		data; +	enum nft_registers	dreg:8; +	u8			dlen; +}; + +static void nft_immediate_eval(const struct nft_expr *expr, +			       struct nft_data data[NFT_REG_MAX + 1], +			       const struct nft_pktinfo *pkt) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); + +	nft_data_copy(&data[priv->dreg], &priv->data); +} + +static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { +	[NFTA_IMMEDIATE_DREG]	= { .type = NLA_U32 }, +	[NFTA_IMMEDIATE_DATA]	= { .type = NLA_NESTED }, +}; + +static int nft_immediate_init(const struct nft_ctx *ctx, +			      const struct nft_expr *expr, +			      const struct nlattr * const tb[]) +{ +	struct nft_immediate_expr *priv = nft_expr_priv(expr); +	struct nft_data_desc desc; +	int err; + +	if (tb[NFTA_IMMEDIATE_DREG] == NULL || +	    tb[NFTA_IMMEDIATE_DATA] == NULL) +		return -EINVAL; + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; + +	err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); +	if (err < 0) +		return err; +	priv->dlen = desc.len; + +	err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type); +	if (err < 0) +		goto err1; + +	return 0; + +err1: +	nft_data_uninit(&priv->data, desc.type); +	return err; +} + +static void nft_immediate_destroy(const struct nft_ctx *ctx, +				  const struct nft_expr *expr) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); +	return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); +} + +static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg))) +		goto nla_put_failure; + +	return nft_data_dump(skb, 
NFTA_IMMEDIATE_DATA, &priv->data, +			     nft_dreg_to_type(priv->dreg), priv->dlen); + +nla_put_failure: +	return -1; +} + +static int nft_immediate_validate(const struct nft_ctx *ctx, +				  const struct nft_expr *expr, +				  const struct nft_data **data) +{ +	const struct nft_immediate_expr *priv = nft_expr_priv(expr); + +	if (priv->dreg == NFT_REG_VERDICT) +		*data = &priv->data; + +	return 0; +} + +static struct nft_expr_type nft_imm_type; +static const struct nft_expr_ops nft_imm_ops = { +	.type		= &nft_imm_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), +	.eval		= nft_immediate_eval, +	.init		= nft_immediate_init, +	.destroy	= nft_immediate_destroy, +	.dump		= nft_immediate_dump, +	.validate	= nft_immediate_validate, +}; + +static struct nft_expr_type nft_imm_type __read_mostly = { +	.name		= "immediate", +	.ops		= &nft_imm_ops, +	.policy		= nft_immediate_policy, +	.maxattr	= NFTA_IMMEDIATE_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_immediate_module_init(void) +{ +	return nft_register_expr(&nft_imm_type); +} + +void nft_immediate_module_exit(void) +{ +	nft_unregister_expr(&nft_imm_type); +} diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c new file mode 100644 index 00000000000..85da5bd02f6 --- /dev/null +++ b/net/netfilter/nft_limit.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(limit_lock); + +struct nft_limit { +	u64		tokens; +	u64		rate; +	u64		unit; +	unsigned long	stamp; +}; + +static void nft_limit_eval(const struct nft_expr *expr, +			   struct nft_data data[NFT_REG_MAX + 1], +			   const struct nft_pktinfo *pkt) +{ +	struct nft_limit *priv = nft_expr_priv(expr); + +	spin_lock_bh(&limit_lock); +	if (time_after_eq(jiffies, priv->stamp)) { +		priv->tokens = priv->rate; +		priv->stamp = jiffies + priv->unit * HZ; +	} + +	if (priv->tokens >= 1) { +		priv->tokens--; +		spin_unlock_bh(&limit_lock); +		return; +	} +	spin_unlock_bh(&limit_lock); + +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { +	[NFTA_LIMIT_RATE]	= { .type = NLA_U64 }, +	[NFTA_LIMIT_UNIT]	= { .type = NLA_U64 }, +}; + +static int nft_limit_init(const struct nft_ctx *ctx, +			  const struct nft_expr *expr, +			  const struct nlattr * const tb[]) +{ +	struct nft_limit *priv = nft_expr_priv(expr); + +	if (tb[NFTA_LIMIT_RATE] == NULL || +	    tb[NFTA_LIMIT_UNIT] == NULL) +		return -EINVAL; + +	priv->rate   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); +	priv->unit   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); +	priv->stamp  = jiffies + priv->unit * HZ; +	priv->tokens = priv->rate; +	return 0; +} + +static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_limit *priv = nft_expr_priv(expr); + +	if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) +		goto nla_put_failure; +	if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) +		goto nla_put_failure; +	
return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_limit_type; +static const struct nft_expr_ops nft_limit_ops = { +	.type		= &nft_limit_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_limit)), +	.eval		= nft_limit_eval, +	.init		= nft_limit_init, +	.dump		= nft_limit_dump, +}; + +static struct nft_expr_type nft_limit_type __read_mostly = { +	.name		= "limit", +	.ops		= &nft_limit_ops, +	.policy		= nft_limit_policy, +	.maxattr	= NFTA_LIMIT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_limit_module_init(void) +{ +	return nft_register_expr(&nft_limit_type); +} + +static void __exit nft_limit_module_exit(void) +{ +	nft_unregister_expr(&nft_limit_type); +} + +module_init(nft_limit_module_init); +module_exit(nft_limit_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("limit"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c new file mode 100644 index 00000000000..10cfb156cdf --- /dev/null +++ b/net/netfilter/nft_log.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netdevice.h> + +static const char *nft_log_null_prefix = ""; + +struct nft_log { +	struct nf_loginfo	loginfo; +	char			*prefix; +}; + +static void nft_log_eval(const struct nft_expr *expr, +			 struct nft_data data[NFT_REG_MAX + 1], +			 const struct nft_pktinfo *pkt) +{ +	const struct nft_log *priv = nft_expr_priv(expr); +	struct net *net = dev_net(pkt->in ? 
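+					   /* one of the two devices is always set, depending on the hook */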
pkt->in : pkt->out); + +	nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, +		      pkt->out, &priv->loginfo, "%s", priv->prefix); +} + +static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { +	[NFTA_LOG_GROUP]	= { .type = NLA_U16 }, +	[NFTA_LOG_PREFIX]	= { .type = NLA_STRING }, +	[NFTA_LOG_SNAPLEN]	= { .type = NLA_U32 }, +	[NFTA_LOG_QTHRESHOLD]	= { .type = NLA_U16 }, +}; + +static int nft_log_init(const struct nft_ctx *ctx, +			const struct nft_expr *expr, +			const struct nlattr * const tb[]) +{ +	struct nft_log *priv = nft_expr_priv(expr); +	struct nf_loginfo *li = &priv->loginfo; +	const struct nlattr *nla; + +	nla = tb[NFTA_LOG_PREFIX]; +	if (nla != NULL) { +		priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); +		if (priv->prefix == NULL) +			return -ENOMEM; +		nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); +	} else +		priv->prefix = (char *)nft_log_null_prefix; + +	li->type = NF_LOG_TYPE_ULOG; +	if (tb[NFTA_LOG_GROUP] != NULL) +		li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); + +	if (tb[NFTA_LOG_SNAPLEN] != NULL) +		li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); +	if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { +		li->u.ulog.qthreshold = +			ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); +	} + +	return 0; +} + +static void nft_log_destroy(const struct nft_ctx *ctx, +			    const struct nft_expr *expr) +{ +	struct nft_log *priv = nft_expr_priv(expr); + +	if (priv->prefix != nft_log_null_prefix) +		kfree(priv->prefix); +} + +static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_log *priv = nft_expr_priv(expr); +	const struct nf_loginfo *li = &priv->loginfo; + +	if (priv->prefix != nft_log_null_prefix) +		if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) +			goto nla_put_failure; +	if (li->u.ulog.group) +		if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) +			goto nla_put_failure; +	if (li->u.ulog.copy_len) +		if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, +				 htonl(li->u.ulog.copy_len))) +			goto nla_put_failure; +	if (li->u.ulog.qthreshold) +		if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, +				 htons(li->u.ulog.qthreshold))) +			goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_log_type; +static const struct nft_expr_ops nft_log_ops = { +	.type		= &nft_log_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_log)), +	.eval		= nft_log_eval, +	.init		= nft_log_init, +	.destroy	= nft_log_destroy, +	.dump		= nft_log_dump, +}; + +static struct nft_expr_type nft_log_type __read_mostly = { +	.name		= "log", +	.ops		= &nft_log_ops, +	.policy		= nft_log_policy, +	.maxattr	= NFTA_LOG_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_log_module_init(void) +{ +	return nft_register_expr(&nft_log_type); +} + +static void __exit nft_log_module_exit(void) +{ +	nft_unregister_expr(&nft_log_type); +} + +module_init(nft_log_module_init); +module_exit(nft_log_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("log"); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c new file mode 100644 index 00000000000..6404a726d17 --- /dev/null +++ b/net/netfilter/nft_lookup.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+struct nft_lookup {
+	struct nft_set			*set;
+	enum nft_registers		sreg:8;
+	enum nft_registers		dreg:8;
+	struct nft_set_binding		binding;
+};
+
+static void nft_lookup_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+	const struct nft_set *set = priv->set;
+
+	if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg]))
+		return;
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
+	[NFTA_LOOKUP_SET]	= { .type = NLA_STRING },
+	[NFTA_LOOKUP_SET_ID]	= { .type = NLA_U32 },
+	[NFTA_LOOKUP_SREG]	= { .type = NLA_U32 },
+	[NFTA_LOOKUP_DREG]	= { .type = NLA_U32 },
+};
+
+static int nft_lookup_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
+	struct nft_set *set;
+	int err;
+
+	if (tb[NFTA_LOOKUP_SET] == NULL ||
+	    tb[NFTA_LOOKUP_SREG] == NULL)
+		return -EINVAL;
+
+	set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]);
+	if (IS_ERR(set)) {
+		if (tb[NFTA_LOOKUP_SET_ID]) {
+			set = nf_tables_set_lookup_byid(ctx->net,
+							tb[NFTA_LOOKUP_SET_ID]);
+		}
+		if (IS_ERR(set))
+			return PTR_ERR(set);
+	}
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_LOOKUP_DREG] != NULL) {
+		if (!(set->flags & NFT_SET_MAP))
+			return -EINVAL;
+
+		priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG]));
+		err = nft_validate_output_register(priv->dreg);
+		if (err < 0)
+			return err;
+
+		if (priv->dreg == NFT_REG_VERDICT) {
+			if (set->dtype != NFT_DATA_VERDICT)
+				return -EINVAL;
+		} else if (set->dtype == NFT_DATA_VERDICT)
+			return -EINVAL;
+	} else if (set->flags & NFT_SET_MAP)
+		return -EINVAL;
+
+	err = nf_tables_bind_set(ctx, set, &priv->binding);
+	if (err < 0)
+		return err;
+
+	priv->set = set;
+	return 0;
+}
+
+static void nft_lookup_destroy(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr)
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (priv->set->flags & NFT_SET_MAP)
+		if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg)))
+			goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_lookup_type;
+static const struct nft_expr_ops nft_lookup_ops = {
+	.type		= &nft_lookup_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
+	.eval		= nft_lookup_eval,
+	.init		= nft_lookup_init,
+	.destroy	= nft_lookup_destroy,
+	.dump		= nft_lookup_dump,
+};
+
+static struct nft_expr_type nft_lookup_type __read_mostly = {
+	.name		= "lookup",
+	.ops		= &nft_lookup_ops,
+	.policy		= nft_lookup_policy,
.maxattr	= NFTA_LOOKUP_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_lookup_module_init(void) +{ +	return nft_register_expr(&nft_lookup_type); +} + +void nft_lookup_module_exit(void) +{ +	nft_unregister_expr(&nft_lookup_type); +} diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c new file mode 100644 index 00000000000..852b178c6ae --- /dev/null +++ b/net/netfilter/nft_meta.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/tcp_states.h> /* for TCP_TIME_WAIT */ +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> + +void nft_meta_get_eval(const struct nft_expr *expr, +		       struct nft_data data[NFT_REG_MAX + 1], +		       const struct nft_pktinfo *pkt) +{ +	const struct nft_meta *priv = nft_expr_priv(expr); +	const struct sk_buff *skb = pkt->skb; +	const struct net_device *in = pkt->in, *out = pkt->out; +	struct nft_data *dest = &data[priv->dreg]; + +	switch (priv->key) { +	case NFT_META_LEN: +		dest->data[0] = skb->len; +		break; +	case NFT_META_PROTOCOL: +		*(__be16 *)dest->data = skb->protocol; +		break; +	case NFT_META_NFPROTO: +		dest->data[0] = pkt->ops->pf; +		break; +	case NFT_META_L4PROTO: +		dest->data[0] = pkt->tprot; +		break; +	case NFT_META_PRIORITY: +		dest->data[0] = skb->priority; +		break; +	case NFT_META_MARK: +		dest->data[0] = skb->mark; +		break; +	case NFT_META_IIF: +		if (in == NULL) +			goto err; +		dest->data[0] = in->ifindex; +		break; +	case NFT_META_OIF: +		if (out == NULL) +			goto err; +		dest->data[0] = out->ifindex; +		break; +	case NFT_META_IIFNAME: +		if (in == NULL) +			goto err; +		strncpy((char *)dest->data, in->name, sizeof(dest->data)); +		break; +	case NFT_META_OIFNAME: +		if (out == NULL) +			goto err; +		strncpy((char *)dest->data, out->name, sizeof(dest->data)); +		break; +	case NFT_META_IIFTYPE: +		if (in == NULL) +			goto err; +		*(u16 *)dest->data = in->type; +		break; +	case NFT_META_OIFTYPE: +		if (out == NULL) +			goto err; +		*(u16 *)dest->data = out->type; +		break; +	case NFT_META_SKUID: +		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) +			goto err; + +		read_lock_bh(&skb->sk->sk_callback_lock); +		if (skb->sk->sk_socket == NULL || +		    skb->sk->sk_socket->file == NULL) { +			read_unlock_bh(&skb->sk->sk_callback_lock); +			goto err; +		} + +		dest->data[0] = +			from_kuid_munged(&init_user_ns, +				skb->sk->sk_socket->file->f_cred->fsuid); +		read_unlock_bh(&skb->sk->sk_callback_lock); +		break; +	case NFT_META_SKGID: +		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) +			goto err; + +		read_lock_bh(&skb->sk->sk_callback_lock); +		if (skb->sk->sk_socket == NULL || +		    skb->sk->sk_socket->file == NULL) { +			read_unlock_bh(&skb->sk->sk_callback_lock); +			goto err; +		} +		dest->data[0] = +			from_kgid_munged(&init_user_ns, +				 skb->sk->sk_socket->file->f_cred->fsgid); +		read_unlock_bh(&skb->sk->sk_callback_lock); +		break; +#ifdef CONFIG_IP_ROUTE_CLASSID +	case NFT_META_RTCLASSID: { +		const struct dst_entry *dst = 
skb_dst(skb); + +		if (dst == NULL) +			goto err; +		dest->data[0] = dst->tclassid; +		break; +	} +#endif +#ifdef CONFIG_NETWORK_SECMARK +	case NFT_META_SECMARK: +		dest->data[0] = skb->secmark; +		break; +#endif +	default: +		WARN_ON(1); +		goto err; +	} +	return; + +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_get_eval); + +void nft_meta_set_eval(const struct nft_expr *expr, +		       struct nft_data data[NFT_REG_MAX + 1], +		       const struct nft_pktinfo *pkt) +{ +	const struct nft_meta *meta = nft_expr_priv(expr); +	struct sk_buff *skb = pkt->skb; +	u32 value = data[meta->sreg].data[0]; + +	switch (meta->key) { +	case NFT_META_MARK: +		skb->mark = value; +		break; +	case NFT_META_PRIORITY: +		skb->priority = value; +		break; +	case NFT_META_NFTRACE: +		skb->nf_trace = 1; +		break; +	default: +		WARN_ON(1); +	} +} +EXPORT_SYMBOL_GPL(nft_meta_set_eval); + +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +	[NFTA_META_DREG]	= { .type = NLA_U32 }, +	[NFTA_META_KEY]		= { .type = NLA_U32 }, +	[NFTA_META_SREG]	= { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_meta_policy); + +int nft_meta_get_init(const struct nft_ctx *ctx, +		      const struct nft_expr *expr, +		      const struct nlattr * const tb[]) +{ +	struct nft_meta *priv = nft_expr_priv(expr); +	int err; + +	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); +	switch (priv->key) { +	case NFT_META_LEN: +	case NFT_META_PROTOCOL: +	case NFT_META_NFPROTO: +	case NFT_META_L4PROTO: +	case NFT_META_PRIORITY: +	case NFT_META_MARK: +	case NFT_META_IIF: +	case NFT_META_OIF: +	case NFT_META_IIFNAME: +	case NFT_META_OIFNAME: +	case NFT_META_IIFTYPE: +	case NFT_META_OIFTYPE: +	case NFT_META_SKUID: +	case NFT_META_SKGID: +#ifdef CONFIG_IP_ROUTE_CLASSID +	case NFT_META_RTCLASSID: +#endif +#ifdef CONFIG_NETWORK_SECMARK +	case NFT_META_SECMARK: +#endif +		break; +	default: +		return -EOPNOTSUPP; +	} + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; + +	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +	if (err < 0) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_get_init); + +int nft_meta_set_init(const struct nft_ctx *ctx, +		      const struct nft_expr *expr, +		      const struct nlattr * const tb[]) +{ +	struct nft_meta *priv = nft_expr_priv(expr); +	int err; + +	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); +	switch (priv->key) { +	case NFT_META_MARK: +	case NFT_META_PRIORITY: +	case NFT_META_NFTRACE: +		break; +	default: +		return -EOPNOTSUPP; +	} + +	priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); +	err = nft_validate_input_register(priv->sreg); +	if (err < 0) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_set_init); + +int nft_meta_get_dump(struct sk_buff *skb, +		      const struct nft_expr *expr) +{ +	const struct nft_meta *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_get_dump); + +int nft_meta_set_dump(struct sk_buff *skb, +		      const struct nft_expr *expr) +{ +	const struct nft_meta *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) +		goto nla_put_failure; +	if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg))) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	
return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_set_dump); + +static struct nft_expr_type nft_meta_type; +static const struct nft_expr_ops nft_meta_get_ops = { +	.type		= &nft_meta_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)), +	.eval		= nft_meta_get_eval, +	.init		= nft_meta_get_init, +	.dump		= nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_set_ops = { +	.type		= &nft_meta_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)), +	.eval		= nft_meta_set_eval, +	.init		= nft_meta_set_init, +	.dump		= nft_meta_set_dump, +}; + +static const struct nft_expr_ops * +nft_meta_select_ops(const struct nft_ctx *ctx, +		    const struct nlattr * const tb[]) +{ +	if (tb[NFTA_META_KEY] == NULL) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) +		return ERR_PTR(-EINVAL); + +	if (tb[NFTA_META_DREG]) +		return &nft_meta_get_ops; + +	if (tb[NFTA_META_SREG]) +		return &nft_meta_set_ops; + +	return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_type __read_mostly = { +	.name		= "meta", +	.select_ops	= &nft_meta_select_ops, +	.policy		= nft_meta_policy, +	.maxattr	= NFTA_META_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_meta_module_init(void) +{ +	return nft_register_expr(&nft_meta_type); +} + +static void __exit nft_meta_module_exit(void) +{ +	nft_unregister_expr(&nft_meta_type); +} + +module_init(nft_meta_module_init); +module_exit(nft_meta_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c new file mode 100644 index 00000000000..79ff58cd36d --- /dev/null +++ b/net/netfilter/nft_nat.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
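+ *
+ * The expression reads the NAT address/port bounds out of source
+ * registers at run time, folds them into an nf_nat_range and hands the
+ * result to nf_nat_setup_info(). As a rough sketch, a rule along the
+ * lines of "snat 192.168.0.1" would load the address into one register
+ * and point NFTA_NAT_REG_ADDR_MIN at it (userspace syntax shown for
+ * illustration only).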
+ * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/string.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +struct nft_nat { +	enum nft_registers      sreg_addr_min:8; +	enum nft_registers      sreg_addr_max:8; +	enum nft_registers      sreg_proto_min:8; +	enum nft_registers      sreg_proto_max:8; +	enum nf_nat_manip_type  type:8; +	u8			family; +}; + +static void nft_nat_eval(const struct nft_expr *expr, +			 struct nft_data data[NFT_REG_MAX + 1], +			 const struct nft_pktinfo *pkt) +{ +	const struct nft_nat *priv = nft_expr_priv(expr); +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); +	struct nf_nat_range range; + +	memset(&range, 0, sizeof(range)); +	if (priv->sreg_addr_min) { +		if (priv->family == AF_INET) { +			range.min_addr.ip = (__force __be32) +					data[priv->sreg_addr_min].data[0]; +			range.max_addr.ip = (__force __be32) +					data[priv->sreg_addr_max].data[0]; + +		} else { +			memcpy(range.min_addr.ip6, +			       data[priv->sreg_addr_min].data, +			       sizeof(struct nft_data)); +			memcpy(range.max_addr.ip6, +			       data[priv->sreg_addr_max].data, +			       sizeof(struct nft_data)); +		} +		range.flags |= NF_NAT_RANGE_MAP_IPS; +	} + +	if (priv->sreg_proto_min) { +		range.min_proto.all = (__force __be16) +					data[priv->sreg_proto_min].data[0]; +		range.max_proto.all = (__force __be16) +					data[priv->sreg_proto_max].data[0]; +		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +	} + +	data[NFT_REG_VERDICT].verdict = +		nf_nat_setup_info(ct, &range, priv->type); +} + +static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { +	[NFTA_NAT_TYPE]		 = { .type = NLA_U32 }, +	[NFTA_NAT_FAMILY]	 = { .type = NLA_U32 }, +	[NFTA_NAT_REG_ADDR_MIN]	 = { .type = NLA_U32 }, +	[NFTA_NAT_REG_ADDR_MAX]	 = { .type = NLA_U32 }, +	[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, +	[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, +}; + +static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, +			const struct nlattr * const tb[]) +{ +	struct nft_nat *priv = nft_expr_priv(expr); +	u32 family; +	int err; + +	if (tb[NFTA_NAT_TYPE] == NULL) +		return -EINVAL; + +	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { +	case NFT_NAT_SNAT: +		priv->type = NF_NAT_MANIP_SRC; +		break; +	case NFT_NAT_DNAT: +		priv->type = NF_NAT_MANIP_DST; +		break; +	default: +		return -EINVAL; +	} + +	if (tb[NFTA_NAT_FAMILY] == NULL) +		return -EINVAL; + +	family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); +	if (family != AF_INET && family != AF_INET6) +		return -EAFNOSUPPORT; +	if (family != ctx->afi->family) +		return -EOPNOTSUPP; +	priv->family = family; + +	if (tb[NFTA_NAT_REG_ADDR_MIN]) { +		priv->sreg_addr_min = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_ADDR_MIN])); +		err = nft_validate_input_register(priv->sreg_addr_min); +		if (err < 0) +			return err; +	} + +	if (tb[NFTA_NAT_REG_ADDR_MAX]) { +		priv->sreg_addr_max = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_ADDR_MAX])); +		err = nft_validate_input_register(priv->sreg_addr_max); +		if (err < 0) +			return err; +	} else +		priv->sreg_addr_max = priv->sreg_addr_min; + +	if 
(tb[NFTA_NAT_REG_PROTO_MIN]) { +		priv->sreg_proto_min = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_PROTO_MIN])); +		err = nft_validate_input_register(priv->sreg_proto_min); +		if (err < 0) +			return err; +	} + +	if (tb[NFTA_NAT_REG_PROTO_MAX]) { +		priv->sreg_proto_max = ntohl(nla_get_be32( +						tb[NFTA_NAT_REG_PROTO_MAX])); +		err = nft_validate_input_register(priv->sreg_proto_max); +		if (err < 0) +			return err; +	} else +		priv->sreg_proto_max = priv->sreg_proto_min; + +	return 0; +} + +static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_nat *priv = nft_expr_priv(expr); + +	switch (priv->type) { +	case NF_NAT_MANIP_SRC: +		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) +			goto nla_put_failure; +		break; +	case NF_NAT_MANIP_DST: +		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) +			goto nla_put_failure; +		break; +	} + +	if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) +		goto nla_put_failure; +	if (nla_put_be32(skb, +			 NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min))) +		goto nla_put_failure; +	if (nla_put_be32(skb, +			 NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) +		goto nla_put_failure; +	if (priv->sreg_proto_min) { +		if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, +				 htonl(priv->sreg_proto_min))) +			goto nla_put_failure; +		if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, +				 htonl(priv->sreg_proto_max))) +			goto nla_put_failure; +	} +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_nat_type; +static const struct nft_expr_ops nft_nat_ops = { +	.type           = &nft_nat_type, +	.size           = NFT_EXPR_SIZE(sizeof(struct nft_nat)), +	.eval           = nft_nat_eval, +	.init           = nft_nat_init, +	.dump           = nft_nat_dump, +}; + +static struct nft_expr_type nft_nat_type __read_mostly = { +	.name           = "nat", +	.ops            = &nft_nat_ops, +	.policy         = nft_nat_policy, +	.maxattr        = NFTA_NAT_MAX, +	.owner          = THIS_MODULE, +}; + +static int __init nft_nat_module_init(void) +{ +	return nft_register_expr(&nft_nat_type); +} + +static void __exit nft_nat_module_exit(void) +{ +	nft_unregister_expr(&nft_nat_type); +} + +module_init(nft_nat_module_init); +module_exit(nft_nat_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_EXPR("nat"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c new file mode 100644 index 00000000000..85daa84bfdf --- /dev/null +++ b/net/netfilter/nft_payload.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
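+ *
+ * payload copies priv->len bytes from the link, network or transport
+ * header at priv->offset into a destination register for a later cmp
+ * or lookup to consume. Roughly, "tcp dport 22" becomes a two-byte
+ * load from the transport header at offset 2 followed by a compare
+ * against htons(22) (register assignment here is illustrative).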
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +static void nft_payload_eval(const struct nft_expr *expr, +			     struct nft_data data[NFT_REG_MAX + 1], +			     const struct nft_pktinfo *pkt) +{ +	const struct nft_payload *priv = nft_expr_priv(expr); +	const struct sk_buff *skb = pkt->skb; +	struct nft_data *dest = &data[priv->dreg]; +	int offset; + +	switch (priv->base) { +	case NFT_PAYLOAD_LL_HEADER: +		if (!skb_mac_header_was_set(skb)) +			goto err; +		offset = skb_mac_header(skb) - skb->data; +		break; +	case NFT_PAYLOAD_NETWORK_HEADER: +		offset = skb_network_offset(skb); +		break; +	case NFT_PAYLOAD_TRANSPORT_HEADER: +		offset = pkt->xt.thoff; +		break; +	default: +		BUG(); +	} +	offset += priv->offset; + +	if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0) +		goto err; +	return; +err: +	data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { +	[NFTA_PAYLOAD_DREG]	= { .type = NLA_U32 }, +	[NFTA_PAYLOAD_BASE]	= { .type = NLA_U32 }, +	[NFTA_PAYLOAD_OFFSET]	= { .type = NLA_U32 }, +	[NFTA_PAYLOAD_LEN]	= { .type = NLA_U32 }, +}; + +static int nft_payload_init(const struct nft_ctx *ctx, +			    const struct nft_expr *expr, +			    const struct nlattr * const tb[]) +{ +	struct nft_payload *priv = nft_expr_priv(expr); +	int err; + +	priv->base   = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); +	priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); +	priv->len    = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + +	priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG])); +	err = nft_validate_output_register(priv->dreg); +	if (err < 0) +		return err; +	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_payload *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) || +	    nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || +	    nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || +	    nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_payload_type; +static const struct nft_expr_ops nft_payload_ops = { +	.type		= &nft_payload_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)), +	.eval		= nft_payload_eval, +	.init		= nft_payload_init, +	.dump		= nft_payload_dump, +}; + +const struct nft_expr_ops nft_payload_fast_ops = { +	.type		= &nft_payload_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)), +	.eval		= nft_payload_eval, +	.init		= nft_payload_init, +	.dump		= nft_payload_dump, +}; + +static const struct nft_expr_ops * +nft_payload_select_ops(const struct nft_ctx *ctx, +		       const struct nlattr * const tb[]) +{ +	enum nft_payload_bases base; +	unsigned int offset, len; + +	if (tb[NFTA_PAYLOAD_DREG] == NULL || +	    tb[NFTA_PAYLOAD_BASE] == NULL || +	    tb[NFTA_PAYLOAD_OFFSET] == NULL || +	    tb[NFTA_PAYLOAD_LEN] == NULL) +		return ERR_PTR(-EINVAL); + +	base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); +	switch (base) { +	case NFT_PAYLOAD_LL_HEADER: +	case NFT_PAYLOAD_NETWORK_HEADER: +	case 
NFT_PAYLOAD_TRANSPORT_HEADER: +		break; +	default: +		return ERR_PTR(-EOPNOTSUPP); +	} + +	offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); +	len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); +	if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) +		return ERR_PTR(-EINVAL); + +	if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && +	    base != NFT_PAYLOAD_LL_HEADER) +		return &nft_payload_fast_ops; +	else +		return &nft_payload_ops; +} + +static struct nft_expr_type nft_payload_type __read_mostly = { +	.name		= "payload", +	.select_ops	= nft_payload_select_ops, +	.policy		= nft_payload_policy, +	.maxattr	= NFTA_PAYLOAD_MAX, +	.owner		= THIS_MODULE, +}; + +int __init nft_payload_module_init(void) +{ +	return nft_register_expr(&nft_payload_type); +} + +void nft_payload_module_exit(void) +{ +	nft_unregister_expr(&nft_payload_type); +} diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c new file mode 100644 index 00000000000..e8ae2f6bf23 --- /dev/null +++ b/net/netfilter/nft_queue.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code partly funded by OISF + * (http://www.openinfosecfoundation.org/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/jhash.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_queue.h> + +static u32 jhash_initval __read_mostly; + +struct nft_queue { +	u16	queuenum; +	u16	queues_total; +	u16	flags; +}; + +static void nft_queue_eval(const struct nft_expr *expr, +			   struct nft_data data[NFT_REG_MAX + 1], +			   const struct nft_pktinfo *pkt) +{ +	struct nft_queue *priv = nft_expr_priv(expr); +	u32 queue = priv->queuenum; +	u32 ret; + +	if (priv->queues_total > 1) { +		if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { +			int cpu = smp_processor_id(); + +			queue = priv->queuenum + cpu % priv->queues_total; +		} else { +			queue = nfqueue_hash(pkt->skb, queue, +					     priv->queues_total, pkt->ops->pf, +					     jhash_initval); +		} +	} + +	ret = NF_QUEUE_NR(queue); +	if (priv->flags & NFT_QUEUE_FLAG_BYPASS) +		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + +	data[NFT_REG_VERDICT].verdict = ret; +} + +static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { +	[NFTA_QUEUE_NUM]	= { .type = NLA_U16 }, +	[NFTA_QUEUE_TOTAL]	= { .type = NLA_U16 }, +	[NFTA_QUEUE_FLAGS]	= { .type = NLA_U16 }, +}; + +static int nft_queue_init(const struct nft_ctx *ctx, +			   const struct nft_expr *expr, +			   const struct nlattr * const tb[]) +{ +	struct nft_queue *priv = nft_expr_priv(expr); + +	if (tb[NFTA_QUEUE_NUM] == NULL) +		return -EINVAL; + +	init_hashrandom(&jhash_initval); +	priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); + +	if (tb[NFTA_QUEUE_TOTAL] != NULL) +		priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); +	if (tb[NFTA_QUEUE_FLAGS] != NULL) { +		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); +		if (priv->flags & ~NFT_QUEUE_FLAG_MASK) +			return -EINVAL; +	} +	return 0; +} + +static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_queue *priv = nft_expr_priv(expr); + +	if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || +	    
nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || +	    nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) +		goto nla_put_failure; + +	return 0; + +nla_put_failure: +	return -1; +} + +static struct nft_expr_type nft_queue_type; +static const struct nft_expr_ops nft_queue_ops = { +	.type		= &nft_queue_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_queue)), +	.eval		= nft_queue_eval, +	.init		= nft_queue_init, +	.dump		= nft_queue_dump, +}; + +static struct nft_expr_type nft_queue_type __read_mostly = { +	.name		= "queue", +	.ops		= &nft_queue_ops, +	.policy		= nft_queue_policy, +	.maxattr	= NFTA_QUEUE_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_queue_module_init(void) +{ +	return nft_register_expr(&nft_queue_type); +} + +static void __exit nft_queue_module_exit(void) +{ +	nft_unregister_expr(&nft_queue_type); +} + +module_init(nft_queue_module_init); +module_exit(nft_queue_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); +MODULE_ALIAS_NFT_EXPR("queue"); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c new file mode 100644 index 00000000000..e1836ff8819 --- /dev/null +++ b/net/netfilter/nft_rbtree.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(nft_rbtree_lock); + +struct nft_rbtree { +	struct rb_root		root; +}; + +struct nft_rbtree_elem { +	struct rb_node		node; +	u16			flags; +	struct nft_data		key; +	struct nft_data		data[]; +}; + +static bool nft_rbtree_lookup(const struct nft_set *set, +			      const struct nft_data *key, +			      struct nft_data *data) +{ +	const struct nft_rbtree *priv = nft_set_priv(set); +	const struct nft_rbtree_elem *rbe, *interval = NULL; +	const struct rb_node *parent = priv->root.rb_node; +	int d; + +	spin_lock_bh(&nft_rbtree_lock); +	while (parent != NULL) { +		rbe = rb_entry(parent, struct nft_rbtree_elem, node); + +		d = nft_data_cmp(&rbe->key, key, set->klen); +		if (d < 0) { +			parent = parent->rb_left; +			interval = rbe; +		} else if (d > 0) +			parent = parent->rb_right; +		else { +found: +			if (rbe->flags & NFT_SET_ELEM_INTERVAL_END) +				goto out; +			if (set->flags & NFT_SET_MAP) +				nft_data_copy(data, rbe->data); + +			spin_unlock_bh(&nft_rbtree_lock); +			return true; +		} +	} + +	if (set->flags & NFT_SET_INTERVAL && interval != NULL) { +		rbe = interval; +		goto found; +	} +out: +	spin_unlock_bh(&nft_rbtree_lock); +	return false; +} + +static void nft_rbtree_elem_destroy(const struct nft_set *set, +				    struct nft_rbtree_elem *rbe) +{ +	nft_data_uninit(&rbe->key, NFT_DATA_VALUE); +	if (set->flags & NFT_SET_MAP && +	    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +		nft_data_uninit(rbe->data, set->dtype); + +	kfree(rbe); +} + +static int __nft_rbtree_insert(const struct nft_set *set, +			       struct nft_rbtree_elem *new) +{ +	struct nft_rbtree *priv = nft_set_priv(set); +	struct nft_rbtree_elem *rbe; +	struct rb_node *parent, **p; +	int d; + 
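+	/*
+	 * Ordinary binary-tree descent keyed by nft_data_cmp(): remember
+	 * the parent link so rb_link_node() can splice the new node in,
+	 * then let rb_insert_color() rebalance. A duplicate key is
+	 * rejected with -EEXIST rather than silently replaced.
+	 */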
+	parent = NULL; +	p = &priv->root.rb_node; +	while (*p != NULL) { +		parent = *p; +		rbe = rb_entry(parent, struct nft_rbtree_elem, node); +		d = nft_data_cmp(&rbe->key, &new->key, set->klen); +		if (d < 0) +			p = &parent->rb_left; +		else if (d > 0) +			p = &parent->rb_right; +		else +			return -EEXIST; +	} +	rb_link_node(&new->node, parent, p); +	rb_insert_color(&new->node, &priv->root); +	return 0; +} + +static int nft_rbtree_insert(const struct nft_set *set, +			     const struct nft_set_elem *elem) +{ +	struct nft_rbtree_elem *rbe; +	unsigned int size; +	int err; + +	size = sizeof(*rbe); +	if (set->flags & NFT_SET_MAP && +	    !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) +		size += sizeof(rbe->data[0]); + +	rbe = kzalloc(size, GFP_KERNEL); +	if (rbe == NULL) +		return -ENOMEM; + +	rbe->flags = elem->flags; +	nft_data_copy(&rbe->key, &elem->key); +	if (set->flags & NFT_SET_MAP && +	    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +		nft_data_copy(rbe->data, &elem->data); + +	spin_lock_bh(&nft_rbtree_lock); +	err = __nft_rbtree_insert(set, rbe); +	if (err < 0) +		kfree(rbe); + +	spin_unlock_bh(&nft_rbtree_lock); +	return err; +} + +static void nft_rbtree_remove(const struct nft_set *set, +			      const struct nft_set_elem *elem) +{ +	struct nft_rbtree *priv = nft_set_priv(set); +	struct nft_rbtree_elem *rbe = elem->cookie; + +	spin_lock_bh(&nft_rbtree_lock); +	rb_erase(&rbe->node, &priv->root); +	spin_unlock_bh(&nft_rbtree_lock); +	kfree(rbe); +} + +static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) +{ +	const struct nft_rbtree *priv = nft_set_priv(set); +	const struct rb_node *parent = priv->root.rb_node; +	struct nft_rbtree_elem *rbe; +	int d; + +	spin_lock_bh(&nft_rbtree_lock); +	while (parent != NULL) { +		rbe = rb_entry(parent, struct nft_rbtree_elem, node); + +		d = nft_data_cmp(&rbe->key, &elem->key, set->klen); +		if (d < 0) +			parent = parent->rb_left; +		else if (d > 0) +			parent = parent->rb_right; +		else { +			elem->cookie = rbe; +			if (set->flags & NFT_SET_MAP && +			    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +				nft_data_copy(&elem->data, rbe->data); +			elem->flags = rbe->flags; +			spin_unlock_bh(&nft_rbtree_lock); +			return 0; +		} +	} +	spin_unlock_bh(&nft_rbtree_lock); +	return -ENOENT; +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, +			    const struct nft_set *set, +			    struct nft_set_iter *iter) +{ +	const struct nft_rbtree *priv = nft_set_priv(set); +	const struct nft_rbtree_elem *rbe; +	struct nft_set_elem elem; +	struct rb_node *node; + +	spin_lock_bh(&nft_rbtree_lock); +	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { +		if (iter->count < iter->skip) +			goto cont; + +		rbe = rb_entry(node, struct nft_rbtree_elem, node); +		nft_data_copy(&elem.key, &rbe->key); +		if (set->flags & NFT_SET_MAP && +		    !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) +			nft_data_copy(&elem.data, rbe->data); +		elem.flags = rbe->flags; + +		iter->err = iter->fn(ctx, set, iter, &elem); +		if (iter->err < 0) { +			spin_unlock_bh(&nft_rbtree_lock); +			return; +		} +cont: +		iter->count++; +	} +	spin_unlock_bh(&nft_rbtree_lock); +} + +static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) +{ +	return sizeof(struct nft_rbtree); +} + +static int nft_rbtree_init(const struct nft_set *set, +			   const struct nft_set_desc *desc, +			   const struct nlattr * const nla[]) +{ +	struct nft_rbtree *priv = nft_set_priv(set); + +	priv->root = RB_ROOT; +	return 0; +} + +static void 
nft_rbtree_destroy(const struct nft_set *set) +{ +	struct nft_rbtree *priv = nft_set_priv(set); +	struct nft_rbtree_elem *rbe; +	struct rb_node *node; + +	spin_lock_bh(&nft_rbtree_lock); +	while ((node = priv->root.rb_node) != NULL) { +		rb_erase(node, &priv->root); +		rbe = rb_entry(node, struct nft_rbtree_elem, node); +		nft_rbtree_elem_destroy(set, rbe); +	} +	spin_unlock_bh(&nft_rbtree_lock); +} + +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, +				struct nft_set_estimate *est) +{ +	unsigned int nsize; + +	nsize = sizeof(struct nft_rbtree_elem); +	if (features & NFT_SET_MAP) +		nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + +	if (desc->size) +		est->size = sizeof(struct nft_rbtree) + desc->size * nsize; +	else +		est->size = nsize; + +	est->class = NFT_SET_CLASS_O_LOG_N; + +	return true; +} + +static struct nft_set_ops nft_rbtree_ops __read_mostly = { +	.privsize	= nft_rbtree_privsize, +	.estimate	= nft_rbtree_estimate, +	.init		= nft_rbtree_init, +	.destroy	= nft_rbtree_destroy, +	.insert		= nft_rbtree_insert, +	.remove		= nft_rbtree_remove, +	.get		= nft_rbtree_get, +	.lookup		= nft_rbtree_lookup, +	.walk		= nft_rbtree_walk, +	.features	= NFT_SET_INTERVAL | NFT_SET_MAP, +	.owner		= THIS_MODULE, +}; + +static int __init nft_rbtree_module_init(void) +{ +	return nft_register_set(&nft_rbtree_ops); +} + +static void __exit nft_rbtree_module_exit(void) +{ +	nft_unregister_set(&nft_rbtree_ops); +} + +module_init(nft_rbtree_module_init); +module_exit(nft_rbtree_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c new file mode 100644 index 00000000000..f3448c29644 --- /dev/null +++ b/net/netfilter/nft_reject.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
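+ *
+ * Shared attribute parsing and dumping for the reject expression; the
+ * per-family modules supply the eval step that actually emits the ICMP
+ * unreachable or TCP RST. For example, "reject with icmp type
+ * host-unreachable" maps to NFT_REJECT_ICMP_UNREACH plus an ICMP code
+ * (rule syntax shown as a rough illustration).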
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { +	[NFTA_REJECT_TYPE]		= { .type = NLA_U32 }, +	[NFTA_REJECT_ICMP_CODE]		= { .type = NLA_U8 }, +}; +EXPORT_SYMBOL_GPL(nft_reject_policy); + +int nft_reject_init(const struct nft_ctx *ctx, +		    const struct nft_expr *expr, +		    const struct nlattr * const tb[]) +{ +	struct nft_reject *priv = nft_expr_priv(expr); + +	if (tb[NFTA_REJECT_TYPE] == NULL) +		return -EINVAL; + +	priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); +	switch (priv->type) { +	case NFT_REJECT_ICMP_UNREACH: +		if (tb[NFTA_REJECT_ICMP_CODE] == NULL) +			return -EINVAL; +		priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); +	case NFT_REJECT_TCP_RST: +		break; +	default: +		return -EINVAL; +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(nft_reject_init); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ +	const struct nft_reject *priv = nft_expr_priv(expr); + +	if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) +		goto nla_put_failure; + +	switch (priv->type) { +	case NFT_REJECT_ICMP_UNREACH: +		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) +			goto nla_put_failure; +		break; +	} + +	return 0; + +nla_put_failure: +	return -1; +} +EXPORT_SYMBOL_GPL(nft_reject_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 00000000000..b718a52a465 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
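+ *
+ * Inet-family wrapper: eval dispatches on the packet's actual protocol
+ * family, so a single reject rule in an inet table covers both IPv4
+ * and IPv6 traffic.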
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, +				 struct nft_data data[NFT_REG_MAX + 1], +				 const struct nft_pktinfo *pkt) +{ +	switch (pkt->ops->pf) { +	case NFPROTO_IPV4: +		return nft_reject_ipv4_eval(expr, data, pkt); +	case NFPROTO_IPV6: +		return nft_reject_ipv6_eval(expr, data, pkt); +	} +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { +	.type		= &nft_reject_inet_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)), +	.eval		= nft_reject_inet_eval, +	.init		= nft_reject_init, +	.dump		= nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { +	.family		= NFPROTO_INET, +	.name		= "reject", +	.ops		= &nft_reject_inet_ops, +	.policy		= nft_reject_policy, +	.maxattr	= NFTA_REJECT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ +	return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ +	nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 80463507420..227aa11e840 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -2,6 +2,7 @@   * x_tables core - Backend for {ip,ip6,arp}_tables   *   * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * Based on existing ip_tables code which is   *   Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling @@ -14,6 +15,7 @@   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/kernel.h> +#include <linux/module.h>  #include <linux/socket.h>  #include <linux/net.h>  #include <linux/proc_fs.h> @@ -23,6 +25,7 @@  #include <linux/mutex.h>  #include <linux/mm.h>  #include <linux/slab.h> +#include <linux/audit.h>  #include <net/net_namespace.h>  #include <linux/netfilter/x_tables.h> @@ -38,9 +41,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");  #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))  struct compat_delta { -	struct compat_delta *next; -	unsigned int offset; -	int delta; +	unsigned int offset; /* offset in kernel */ +	int delta; /* delta in 32bit user land */  };  struct xt_af { @@ -49,7 +51,9 @@ struct xt_af {  	struct list_head target;  #ifdef CONFIG_COMPAT  	struct mutex compat_mutex; -	struct compat_delta *compat_offsets; +	struct compat_delta *compat_tab; +	unsigned int number; /* number of slots in compat_tab[] */ +	unsigned int cur; /* number of used slots in compat_tab[] */  #endif  }; @@ -181,14 +185,14 @@ EXPORT_SYMBOL(xt_unregister_matches);  /*   * These are weird, but module loading must not be done with mutex   * held (since they will register), and we have to have a single - * function to use try_then_request_module(). + * function to use.   */  /* Find match, grabs ref.  Returns ERR_PTR() on error. 
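   Callers drop the reference again with module_put(match->me).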
*/  struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)  {  	struct xt_match *m; -	int err = 0; +	int err = -ENOENT;  	if (mutex_lock_interruptible(&xt[af].mutex) != 0)  		return ERR_PTR(-EINTR); @@ -219,9 +223,13 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision)  {  	struct xt_match *match; -	match = try_then_request_module(xt_find_match(nfproto, name, revision), -					"%st_%s", xt_prefix[nfproto], name); -	return (match != NULL) ? match : ERR_PTR(-ENOENT); +	match = xt_find_match(nfproto, name, revision); +	if (IS_ERR(match)) { +		request_module("%st_%s", xt_prefix[nfproto], name); +		match = xt_find_match(nfproto, name, revision); +	} + +	return match;  }  EXPORT_SYMBOL_GPL(xt_request_find_match); @@ -229,7 +237,7 @@ EXPORT_SYMBOL_GPL(xt_request_find_match);  struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)  {  	struct xt_target *t; -	int err = 0; +	int err = -ENOENT;  	if (mutex_lock_interruptible(&xt[af].mutex) != 0)  		return ERR_PTR(-EINTR); @@ -259,9 +267,13 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)  {  	struct xt_target *target; -	target = try_then_request_module(xt_find_target(af, name, revision), -					 "%st_%s", xt_prefix[af], name); -	return (target != NULL) ? target : ERR_PTR(-ENOENT); +	target = xt_find_target(af, name, revision); +	if (IS_ERR(target)) { +		request_module("%st_%s", xt_prefix[af], name); +		target = xt_find_target(af, name, revision); +	} + +	return target;  }  EXPORT_SYMBOL_GPL(xt_request_find_target); @@ -334,19 +346,27 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,  }  EXPORT_SYMBOL_GPL(xt_find_revision); -static char *textify_hooks(char *buf, size_t size, unsigned int mask) +static char * +textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto)  { -	static const char *const names[] = { +	static const char *const inetbr_names[] = {  		"PREROUTING", "INPUT", "FORWARD",  		"OUTPUT", "POSTROUTING", "BROUTING",  	}; -	unsigned int i; +	static const char *const arp_names[] = { +		"INPUT", "FORWARD", "OUTPUT", +	}; +	const char *const *names; +	unsigned int i, max;  	char *p = buf;  	bool np = false;  	int res; +	names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names; +	max   = (nfproto == NFPROTO_ARP) ? ARRAY_SIZE(arp_names) : +	                                   ARRAY_SIZE(inetbr_names);  	*p = '\0'; -	for (i = 0; i < ARRAY_SIZE(names); ++i) { +	for (i = 0; i < max; ++i) {  		if (!(mask & (1 << i)))  			continue;  		res = snprintf(p, size, "%s%s", np ? 
"/" : "", names[i]); @@ -391,8 +411,10 @@ int xt_check_match(struct xt_mtchk_param *par,  		pr_err("%s_tables: %s match: used from hooks %s, but only "  		       "valid from %s\n",  		       xt_prefix[par->family], par->match->name, -		       textify_hooks(used, sizeof(used), par->hook_mask), -		       textify_hooks(allow, sizeof(allow), par->match->hooks)); +		       textify_hooks(used, sizeof(used), par->hook_mask, +		                     par->family), +		       textify_hooks(allow, sizeof(allow), par->match->hooks, +		                     par->family));  		return -EINVAL;  	}  	if (par->match->proto && (par->match->proto != proto || inv_proto)) { @@ -414,54 +436,67 @@ int xt_check_match(struct xt_mtchk_param *par,  EXPORT_SYMBOL_GPL(xt_check_match);  #ifdef CONFIG_COMPAT -int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta) +int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)  { -	struct compat_delta *tmp; +	struct xt_af *xp = &xt[af]; -	tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); -	if (!tmp) -		return -ENOMEM; +	if (!xp->compat_tab) { +		if (!xp->number) +			return -EINVAL; +		xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); +		if (!xp->compat_tab) +			return -ENOMEM; +		xp->cur = 0; +	} -	tmp->offset = offset; -	tmp->delta = delta; +	if (xp->cur >= xp->number) +		return -EINVAL; -	if (xt[af].compat_offsets) { -		tmp->next = xt[af].compat_offsets->next; -		xt[af].compat_offsets->next = tmp; -	} else { -		xt[af].compat_offsets = tmp; -		tmp->next = NULL; -	} +	if (xp->cur) +		delta += xp->compat_tab[xp->cur - 1].delta; +	xp->compat_tab[xp->cur].offset = offset; +	xp->compat_tab[xp->cur].delta = delta; +	xp->cur++;  	return 0;  }  EXPORT_SYMBOL_GPL(xt_compat_add_offset);  void xt_compat_flush_offsets(u_int8_t af)  { -	struct compat_delta *tmp, *next; - -	if (xt[af].compat_offsets) { -		for (tmp = xt[af].compat_offsets; tmp; tmp = next) { -			next = tmp->next; -			kfree(tmp); -		} -		xt[af].compat_offsets = NULL; +	if (xt[af].compat_tab) { +		vfree(xt[af].compat_tab); +		xt[af].compat_tab = NULL; +		xt[af].number = 0; +		xt[af].cur = 0;  	}  }  EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);  int xt_compat_calc_jump(u_int8_t af, unsigned int offset)  { -	struct compat_delta *tmp; -	int delta; - -	for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next) -		if (tmp->offset < offset) -			delta += tmp->delta; -	return delta; +	struct compat_delta *tmp = xt[af].compat_tab; +	int mid, left = 0, right = xt[af].cur - 1; + +	while (left <= right) { +		mid = (left + right) >> 1; +		if (offset > tmp[mid].offset) +			left = mid + 1; +		else if (offset < tmp[mid].offset) +			right = mid - 1; +		else +			return mid ? tmp[mid - 1].delta : 0; +	} +	return left ? tmp[left - 1].delta : 0;  }  EXPORT_SYMBOL_GPL(xt_compat_calc_jump); +void xt_compat_init_offsets(u_int8_t af, unsigned int number) +{ +	xt[af].number = number; +	xt[af].cur = 0; +} +EXPORT_SYMBOL(xt_compat_init_offsets); +  int xt_compat_match_offset(const struct xt_match *match)  {  	u_int16_t csize = match->compatsize ? 
: match->matchsize; @@ -551,8 +586,10 @@ int xt_check_target(struct xt_tgchk_param *par,  		pr_err("%s_tables: %s target: used from hooks %s, but only "  		       "usable from %s\n",  		       xt_prefix[par->family], par->target->name, -		       textify_hooks(used, sizeof(used), par->hook_mask), -		       textify_hooks(allow, sizeof(allow), par->target->hooks)); +		       textify_hooks(used, sizeof(used), par->hook_mask, +		                     par->family), +		       textify_hooks(allow, sizeof(allow), par->target->hooks, +		                     par->family));  		return -EINVAL;  	}  	if (par->target->proto && (par->target->proto != proto || inv_proto)) { @@ -739,8 +776,8 @@ void xt_compat_unlock(u_int8_t af)  EXPORT_SYMBOL_GPL(xt_compat_unlock);  #endif -DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); -EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);  static int xt_jumpstack_alloc(struct xt_table_info *i)  { @@ -753,12 +790,11 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)  	size = sizeof(void **) * nr_cpu_ids;  	if (size > PAGE_SIZE) -		i->jumpstack = vmalloc(size); +		i->jumpstack = vzalloc(size);  	else -		i->jumpstack = kmalloc(size, GFP_KERNEL); +		i->jumpstack = kzalloc(size, GFP_KERNEL);  	if (i->jumpstack == NULL)  		return -ENOMEM; -	memset(i->jumpstack, 0, size);  	i->stacksize *= xt_jumpstack_multiplier;  	size = sizeof(void *) * i->stacksize; @@ -809,8 +845,13 @@ xt_replace_table(struct xt_table *table,  		return NULL;  	} -	table->private = newinfo;  	newinfo->initial_entries = private->initial_entries; +	/* +	 * Ensure contents of newinfo are visible before assigning to +	 * private. +	 */ +	smp_wmb(); +	table->private = newinfo;  	/*  	 * Even though table entries have now been swapped, other CPU's @@ -820,6 +861,21 @@ xt_replace_table(struct xt_table *table,  	 */  	local_bh_enable(); +#ifdef CONFIG_AUDIT +	if (audit_enabled) { +		struct audit_buffer *ab; + +		ab = audit_log_start(current->audit_context, GFP_KERNEL, +				     AUDIT_NETFILTER_CFG); +		if (ab) { +			audit_log_format(ab, "table=%s family=%u entries=%u", +					 table->name, table->af, +					 private->number); +			audit_log_end(ab); +		} +	} +#endif +  	return private;  }  EXPORT_SYMBOL_GPL(xt_replace_table); @@ -949,7 +1005,7 @@ static int xt_table_open(struct inode *inode, struct file *file)  			   sizeof(struct xt_names_priv));  	if (!ret) {  		priv = ((struct seq_file *)file->private_data)->private; -		priv->af = (unsigned long)PDE(inode)->data; +		priv->af = (unsigned long)PDE_DATA(inode);  	}  	return ret;  } @@ -1097,7 +1153,7 @@ static int xt_match_open(struct inode *inode, struct file *file)  	seq = file->private_data;  	seq->private = trav; -	trav->nfproto = (unsigned long)PDE(inode)->data; +	trav->nfproto = (unsigned long)PDE_DATA(inode);  	return 0;  } @@ -1161,7 +1217,7 @@ static int xt_target_open(struct inode *inode, struct file *file)  	seq = file->private_data;  	seq->private = trav; -	trav->nfproto = (unsigned long)PDE(inode)->data; +	trav->nfproto = (unsigned long)PDE_DATA(inode);  	return 0;  } @@ -1273,12 +1329,12 @@ int xt_proto_init(struct net *net, u_int8_t af)  out_remove_matches:  	strlcpy(buf, xt_prefix[af], sizeof(buf));  	strlcat(buf, FORMAT_MATCHES, sizeof(buf)); -	proc_net_remove(net, buf); +	remove_proc_entry(buf, net->proc_net);  out_remove_tables:  	strlcpy(buf, xt_prefix[af], sizeof(buf));  	strlcat(buf, FORMAT_TABLES, sizeof(buf)); -	proc_net_remove(net, buf); +	remove_proc_entry(buf, 
net->proc_net);  out:  	return -1;  #endif @@ -1292,15 +1348,15 @@ void xt_proto_fini(struct net *net, u_int8_t af)  	strlcpy(buf, xt_prefix[af], sizeof(buf));  	strlcat(buf, FORMAT_TABLES, sizeof(buf)); -	proc_net_remove(net, buf); +	remove_proc_entry(buf, net->proc_net);  	strlcpy(buf, xt_prefix[af], sizeof(buf));  	strlcat(buf, FORMAT_TARGETS, sizeof(buf)); -	proc_net_remove(net, buf); +	remove_proc_entry(buf, net->proc_net);  	strlcpy(buf, xt_prefix[af], sizeof(buf));  	strlcat(buf, FORMAT_MATCHES, sizeof(buf)); -	proc_net_remove(net, buf); +	remove_proc_entry(buf, net->proc_net);  #endif /*CONFIG_PROC_FS*/  }  EXPORT_SYMBOL_GPL(xt_proto_fini); @@ -1324,9 +1380,7 @@ static int __init xt_init(void)  	int rv;  	for_each_possible_cpu(i) { -		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); -		spin_lock_init(&lock->lock); -		lock->readers = 0; +		seqcount_init(&per_cpu(xt_recseq, i));  	}  	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); @@ -1337,7 +1391,7 @@ static int __init xt_init(void)  		mutex_init(&xt[i].mutex);  #ifdef CONFIG_COMPAT  		mutex_init(&xt[i].compat_mutex); -		xt[i].compat_offsets = NULL; +		xt[i].compat_tab = NULL;  #endif  		INIT_LIST_HEAD(&xt[i].target);  		INIT_LIST_HEAD(&xt[i].match); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c new file mode 100644 index 00000000000..4973cbddc44 --- /dev/null +++ b/net/netfilter/xt_AUDIT.c @@ -0,0 +1,231 @@ +/* + * Creates audit record for dropped/accepted packets + * + * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/audit.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_AUDIT.h> +#include <linux/netfilter_bridge/ebtables.h> +#include <net/ipv6.h> +#include <net/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>"); +MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); +MODULE_ALIAS("ipt_AUDIT"); +MODULE_ALIAS("ip6t_AUDIT"); +MODULE_ALIAS("ebt_AUDIT"); +MODULE_ALIAS("arpt_AUDIT"); + +static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, +			unsigned int proto, unsigned int offset) +{ +	switch (proto) { +	case IPPROTO_TCP: +	case IPPROTO_UDP: +	case IPPROTO_UDPLITE: { +		const __be16 *pptr; +		__be16 _ports[2]; + +		pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); +		if (pptr == NULL) { +			audit_log_format(ab, " truncated=1"); +			return; +		} + +		audit_log_format(ab, " sport=%hu dport=%hu", +				 ntohs(pptr[0]), ntohs(pptr[1])); +		} +		break; + +	case IPPROTO_ICMP: +	case IPPROTO_ICMPV6: { +		const u8 *iptr; +		u8 _ih[2]; + +		iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); +		if (iptr == NULL) { +			audit_log_format(ab, " truncated=1"); +			return; +		} + +		audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", +				 iptr[0], iptr[1]); + +		} +		break; +	} +} + +static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ +	struct iphdr _iph; +	const struct iphdr *ih; + +	ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); +	if (!ih) { +		audit_log_format(ab, " truncated=1"); +		return; +	} + +	audit_log_format(ab, " saddr=%pI4 
daddr=%pI4 ipid=%hu proto=%hhu", +		&ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + +	if (ntohs(ih->frag_off) & IP_OFFSET) { +		audit_log_format(ab, " frag=1"); +		return; +	} + +	audit_proto(ab, skb, ih->protocol, ih->ihl * 4); +} + +static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ +	struct ipv6hdr _ip6h; +	const struct ipv6hdr *ih; +	u8 nexthdr; +	__be16 frag_off; +	int offset; + +	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); +	if (!ih) { +		audit_log_format(ab, " truncated=1"); +		return; +	} + +	nexthdr = ih->nexthdr; +	offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), +				  &nexthdr, &frag_off); + +	audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", +			 &ih->saddr, &ih->daddr, nexthdr); + +	if (offset) +		audit_proto(ab, skb, nexthdr, offset); +} + +static unsigned int +audit_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_audit_info *info = par->targinfo; +	struct audit_buffer *ab; + +	if (audit_enabled == 0) +		goto errout; + +	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); +	if (ab == NULL) +		goto errout; + +	audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", +			 info->type, par->hooknum, skb->len, +			 par->in ? par->in->name : "?", +			 par->out ? par->out->name : "?"); + +	if (skb->mark) +		audit_log_format(ab, " mark=%#x", skb->mark); + +	if (skb->dev && skb->dev->type == ARPHRD_ETHER) { +		audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", +				 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, +				 ntohs(eth_hdr(skb)->h_proto)); + +		if (par->family == NFPROTO_BRIDGE) { +			switch (eth_hdr(skb)->h_proto) { +			case htons(ETH_P_IP): +				audit_ip4(ab, skb); +				break; + +			case htons(ETH_P_IPV6): +				audit_ip6(ab, skb); +				break; +			} +		} +	} + +	switch (par->family) { +	case NFPROTO_IPV4: +		audit_ip4(ab, skb); +		break; + +	case NFPROTO_IPV6: +		audit_ip6(ab, skb); +		break; +	} + +#ifdef CONFIG_NETWORK_SECMARK +	if (skb->secmark) +		audit_log_secctx(ab, skb->secmark); +#endif + +	audit_log_end(ab); + +errout: +	return XT_CONTINUE; +} + +static unsigned int +audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par) +{ +	audit_tg(skb, par); +	return EBT_CONTINUE; +} + +static int audit_tg_check(const struct xt_tgchk_param *par) +{ +	const struct xt_audit_info *info = par->targinfo; + +	if (info->type > XT_AUDIT_TYPE_MAX) { +		pr_info("Audit type out of range (valid range: 0..%hhu)\n", +			XT_AUDIT_TYPE_MAX); +		return -ERANGE; +	} + +	return 0; +} + +static struct xt_target audit_tg_reg[] __read_mostly = { +	{ +		.name		= "AUDIT", +		.family		= NFPROTO_UNSPEC, +		.target		= audit_tg, +		.targetsize	= sizeof(struct xt_audit_info), +		.checkentry	= audit_tg_check, +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "AUDIT", +		.family		= NFPROTO_BRIDGE, +		.target		= audit_tg_ebt, +		.targetsize	= sizeof(struct xt_audit_info), +		.checkentry	= audit_tg_check, +		.me		= THIS_MODULE, +	}, +}; + +static int __init audit_tg_init(void) +{ +	return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +static void __exit audit_tg_exit(void) +{ +	xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +module_init(audit_tg_init); +module_exit(audit_tg_exit); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index c2c0e4abeb9..af9c4dadf81 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -19,12 +19,14 @@  #include <linux/netfilter_ipv6.h>  
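 /* the hunks below add a second registration for NFPROTO_ARP (hooks
  * NF_ARP_OUT and NF_ARP_FORWARD) plus an arpt_CLASSIFY alias, so skb
  * priorities can also be set from arptables */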
#include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_CLASSIFY.h> +#include <linux/netfilter_arp.h>  MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");  MODULE_LICENSE("GPL");  MODULE_DESCRIPTION("Xtables: Qdisc classification");  MODULE_ALIAS("ipt_CLASSIFY");  MODULE_ALIAS("ip6t_CLASSIFY"); +MODULE_ALIAS("arpt_CLASSIFY");  static unsigned int  classify_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par)  	return XT_CONTINUE;  } -static struct xt_target classify_tg_reg __read_mostly = { -	.name       = "CLASSIFY", -	.revision   = 0, -	.family     = NFPROTO_UNSPEC, -	.table      = "mangle", -	.hooks      = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | -		      (1 << NF_INET_POST_ROUTING), -	.target     = classify_tg, -	.targetsize = sizeof(struct xt_classify_target_info), -	.me         = THIS_MODULE, +static struct xt_target classify_tg_reg[] __read_mostly = { +	{ +		.name       = "CLASSIFY", +		.revision   = 0, +		.family     = NFPROTO_UNSPEC, +		.hooks      = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | +		              (1 << NF_INET_POST_ROUTING), +		.target     = classify_tg, +		.targetsize = sizeof(struct xt_classify_target_info), +		.me         = THIS_MODULE, +	}, +	{ +		.name       = "CLASSIFY", +		.revision   = 0, +		.family     = NFPROTO_ARP, +		.hooks      = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), +		.target     = classify_tg, +		.targetsize = sizeof(struct xt_classify_target_info), +		.me         = THIS_MODULE, +	},  };  static int __init classify_tg_init(void)  { -	return xt_register_target(&classify_tg_reg); +	return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));  }  static void __exit classify_tg_exit(void)  { -	xt_unregister_target(&classify_tg_reg); +	xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));  }  module_init(classify_tg_init); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 782e51986a6..75747aecdeb 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -5,7 +5,7 @@   * it under the terms of the GNU General Public License version 2 as   * published by the Free Software Foundation.   */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/module.h>  #include <linux/gfp.h>  #include <linux/skbuff.h> @@ -14,20 +14,21 @@  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_CT.h>  #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h>  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_timeout.h>  #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int xt_ct_target(struct sk_buff *skb, -				 const struct xt_action_param *par) +static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)  { -	const struct xt_ct_target_info *info = par->targinfo; -	struct nf_conn *ct = info->ct; -  	/* Previously seen (loopback)? Ignore. 
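	   Re-attaching a template here would clobber, and thereby leak,
	   the conntrack reference the skb already holds.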
*/  	if (skb->nfct != NULL)  		return XT_CONTINUE; +	/* special case the untracked ct : we want the percpu object */ +	if (!ct) +		ct = nf_ct_untracked_get();  	atomic_inc(&ct->ct_general.use);  	skb->nfct = &ct->ct_general;  	skb->nfctinfo = IP_CT_NEW; @@ -35,6 +36,24 @@ static unsigned int xt_ct_target(struct sk_buff *skb,  	return XT_CONTINUE;  } +static unsigned int xt_ct_target_v0(struct sk_buff *skb, +				    const struct xt_action_param *par) +{ +	const struct xt_ct_target_info *info = par->targinfo; +	struct nf_conn *ct = info->ct; + +	return xt_ct_target(skb, ct); +} + +static unsigned int xt_ct_target_v1(struct sk_buff *skb, +				    const struct xt_action_param *par) +{ +	const struct xt_ct_target_info_v1 *info = par->targinfo; +	struct nf_conn *ct = info->ct; + +	return xt_ct_target(skb, ct); +} +  static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)  {  	if (par->family == NFPROTO_IPV4) { @@ -53,21 +72,124 @@ static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)  		return 0;  } -static int xt_ct_tg_check(const struct xt_tgchk_param *par) +static int +xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, +		 const struct xt_tgchk_param *par)  { -	struct xt_ct_target_info *info = par->targinfo; -	struct nf_conntrack_tuple t; +	struct nf_conntrack_helper *helper;  	struct nf_conn_help *help; -	struct nf_conn *ct; +	u8 proto; + +	proto = xt_ct_find_proto(par); +	if (!proto) { +		pr_info("You must specify a L4 protocol, and not use " +			"inversions on it.\n"); +		return -ENOENT; +	} + +	helper = nf_conntrack_helper_try_module_get(helper_name, par->family, +						    proto); +	if (helper == NULL) { +		pr_info("No such helper \"%s\"\n", helper_name); +		return -ENOENT; +	} + +	help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); +	if (help == NULL) { +		module_put(helper->me); +		return -ENOMEM; +	} + +	help->helper = helper; +	return 0; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout) +{ +	typeof(nf_ct_timeout_put_hook) timeout_put; + +	timeout_put = rcu_dereference(nf_ct_timeout_put_hook); +	if (timeout_put) +		timeout_put(timeout); +} +#endif + +static int +xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, +		  const char *timeout_name) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +	typeof(nf_ct_timeout_find_get_hook) timeout_find_get; +	struct ctnl_timeout *timeout; +	struct nf_conn_timeout *timeout_ext; +	struct nf_conntrack_l4proto *l4proto;  	int ret = 0;  	u8 proto; -	if (info->flags & ~XT_CT_NOTRACK) -		return -EINVAL; +	rcu_read_lock(); +	timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); +	if (timeout_find_get == NULL) { +		ret = -ENOENT; +		pr_info("Timeout policy base is empty\n"); +		goto out; +	} + +	proto = xt_ct_find_proto(par); +	if (!proto) { +		ret = -EINVAL; +		pr_info("You must specify a L4 protocol, and not use " +			"inversions on it.\n"); +		goto out; +	} + +	timeout = timeout_find_get(timeout_name); +	if (timeout == NULL) { +		ret = -ENOENT; +		pr_info("No such timeout policy \"%s\"\n", timeout_name); +		goto out; +	} + +	if (timeout->l3num != par->family) { +		ret = -EINVAL; +		pr_info("Timeout policy `%s' can only be used by L3 protocol " +			"number %d\n", timeout_name, timeout->l3num); +		goto err_put_timeout; +	} +	/* Make sure the timeout policy matches any existing protocol tracker, +	 * otherwise default to generic. 
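+	 * __nf_ct_l4proto_find() returns the generic tracker when nothing
+	 * protocol-specific is registered, so the comparison below also
+	 * rejects a per-protocol policy whose tracker module is not loaded.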
+	 */ +	l4proto = __nf_ct_l4proto_find(par->family, proto); +	if (timeout->l4proto->l4proto != l4proto->l4proto) { +		ret = -EINVAL; +		pr_info("Timeout policy `%s' can only be used by L4 protocol " +			"number %d\n", +			timeout_name, timeout->l4proto->l4proto); +		goto err_put_timeout; +	} +	timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); +	if (timeout_ext == NULL) +		ret = -ENOMEM; + +err_put_timeout: +	__xt_ct_tg_timeout_put(timeout); +out: +	rcu_read_unlock(); +	return ret; +#else +	return -EOPNOTSUPP; +#endif +} + +static int xt_ct_tg_check(const struct xt_tgchk_param *par, +			  struct xt_ct_target_info_v1 *info) +{ +	struct nf_conntrack_tuple t; +	struct nf_conn *ct; +	int ret = -EOPNOTSUPP;  	if (info->flags & XT_CT_NOTRACK) { -		ct = nf_ct_untracked_get(); -		atomic_inc(&ct->ct_general.use); +		ct = NULL;  		goto out;  	} @@ -89,30 +211,24 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par)  	ret = 0;  	if ((info->ct_events || info->exp_events) &&  	    !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, -				  GFP_KERNEL)) +				  GFP_KERNEL)) { +		ret = -EINVAL;  		goto err3; +	}  	if (info->helper[0]) { -		ret = -ENOENT; -		proto = xt_ct_find_proto(par); -		if (!proto) -			goto err3; - -		ret = -ENOMEM; -		help = nf_ct_helper_ext_add(ct, GFP_KERNEL); -		if (help == NULL) +		ret = xt_ct_set_helper(ct, info->helper, par); +		if (ret < 0)  			goto err3; +	} -		ret = -ENOENT; -		help->helper = nf_conntrack_helper_try_module_get(info->helper, -								  par->family, -								  proto); -		if (help->helper == NULL) +	if (info->timeout[0]) { +		ret = xt_ct_set_timeout(ct, par, info->timeout); +		if (ret < 0)  			goto err3;  	} -	__set_bit(IPS_TEMPLATE_BIT, &ct->status); -	__set_bit(IPS_CONFIRMED_BIT, &ct->status); +	nf_conntrack_tmpl_insert(par->net, ct);  out:  	info->ct = ct;  	return 0; @@ -125,41 +241,196 @@ err1:  	return ret;  } -static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) +static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)  {  	struct xt_ct_target_info *info = par->targinfo; +	struct xt_ct_target_info_v1 info_v1 = { +		.flags 		= info->flags, +		.zone		= info->zone, +		.ct_events	= info->ct_events, +		.exp_events	= info->exp_events, +	}; +	int ret; + +	if (info->flags & ~XT_CT_NOTRACK) +		return -EINVAL; + +	memcpy(info_v1.helper, info->helper, sizeof(info->helper)); + +	ret = xt_ct_tg_check(par, &info_v1); +	if (ret < 0) +		return ret; + +	info->ct = info_v1.ct; + +	return ret; +} + +static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) +{ +	struct xt_ct_target_info_v1 *info = par->targinfo; + +	if (info->flags & ~XT_CT_NOTRACK) +		return -EINVAL; + +	return xt_ct_tg_check(par, par->targinfo); +} + +static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par) +{ +	struct xt_ct_target_info_v1 *info = par->targinfo; + +	if (info->flags & ~XT_CT_MASK) +		return -EINVAL; + +	return xt_ct_tg_check(par, par->targinfo); +} + +static void xt_ct_destroy_timeout(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +	struct nf_conn_timeout *timeout_ext; +	typeof(nf_ct_timeout_put_hook) timeout_put; + +	rcu_read_lock(); +	timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + +	if (timeout_put) { +		timeout_ext = nf_ct_timeout_find(ct); +		if (timeout_ext) +			timeout_put(timeout_ext->timeout); +	} +	rcu_read_unlock(); +#endif +} + +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, +			     struct xt_ct_target_info_v1 *info) +{  	struct nf_conn *ct = info->ct;  	struct nf_conn_help *help; -	
if (!nf_ct_is_untracked(ct)) { +	if (ct && !nf_ct_is_untracked(ct)) {  		help = nfct_help(ct);  		if (help)  			module_put(help->helper->me);  		nf_ct_l3proto_module_put(par->family); + +		xt_ct_destroy_timeout(ct); +		nf_ct_put(info->ct);  	} -	nf_ct_put(info->ct);  } -static struct xt_target xt_ct_tg __read_mostly = { -	.name		= "CT", +static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) +{ +	struct xt_ct_target_info *info = par->targinfo; +	struct xt_ct_target_info_v1 info_v1 = { +		.flags 		= info->flags, +		.zone		= info->zone, +		.ct_events	= info->ct_events, +		.exp_events	= info->exp_events, +		.ct		= info->ct, +	}; +	memcpy(info_v1.helper, info->helper, sizeof(info->helper)); + +	xt_ct_tg_destroy(par, &info_v1); +} + +static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ +	xt_ct_tg_destroy(par, par->targinfo); +} + +static struct xt_target xt_ct_tg_reg[] __read_mostly = { +	{ +		.name		= "CT", +		.family		= NFPROTO_UNSPEC, +		.targetsize	= sizeof(struct xt_ct_target_info), +		.checkentry	= xt_ct_tg_check_v0, +		.destroy	= xt_ct_tg_destroy_v0, +		.target		= xt_ct_target_v0, +		.table		= "raw", +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "CT", +		.family		= NFPROTO_UNSPEC, +		.revision	= 1, +		.targetsize	= sizeof(struct xt_ct_target_info_v1), +		.checkentry	= xt_ct_tg_check_v1, +		.destroy	= xt_ct_tg_destroy_v1, +		.target		= xt_ct_target_v1, +		.table		= "raw", +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "CT", +		.family		= NFPROTO_UNSPEC, +		.revision	= 2, +		.targetsize	= sizeof(struct xt_ct_target_info_v1), +		.checkentry	= xt_ct_tg_check_v2, +		.destroy	= xt_ct_tg_destroy_v1, +		.target		= xt_ct_target_v1, +		.table		= "raw", +		.me		= THIS_MODULE, +	}, +}; + +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ +	/* Previously seen (loopback)? Ignore. 
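+	 * (Same check as at the top of xt_ct_target(): a non-NULL skb->nfct
+	 * means conntrack state, real or untracked, is already attached.)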
*/ +	if (skb->nfct != NULL) +		return XT_CONTINUE; + +	skb->nfct = &nf_ct_untracked_get()->ct_general; +	skb->nfctinfo = IP_CT_NEW; +	nf_conntrack_get(skb->nfct); + +	return XT_CONTINUE; +} + +static int notrack_chk(const struct xt_tgchk_param *par) +{ +	if (!par->net->xt.notrack_deprecated_warning) { +		pr_info("netfilter: NOTRACK target is deprecated, " +			"use CT instead or upgrade iptables\n"); +		par->net->xt.notrack_deprecated_warning = true; +	} +	return 0; +} + +static struct xt_target notrack_tg_reg __read_mostly = { +	.name		= "NOTRACK", +	.revision	= 0,  .family		= NFPROTO_UNSPEC, -	.targetsize	= sizeof(struct xt_ct_target_info), -	.checkentry	= xt_ct_tg_check, -	.destroy	= xt_ct_tg_destroy, -	.target		= xt_ct_target, +	.checkentry	= notrack_chk, +	.target		= notrack_tg,  	.table		= "raw",  	.me		= THIS_MODULE,  };  static int __init xt_ct_tg_init(void)  { -	return xt_register_target(&xt_ct_tg); +	int ret; + +	ret = xt_register_target(&notrack_tg_reg); +	if (ret < 0) +		return ret; + +	ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); +	if (ret < 0) { +		xt_unregister_target(&notrack_tg_reg); +		return ret; +	} +	return 0;  }  static void __exit xt_ct_tg_exit(void)  { -	xt_unregister_target(&xt_ct_tg); +	xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); +	xt_unregister_target(&notrack_tg_reg);  }  module_init(xt_ct_tg_init); @@ -169,3 +440,5 @@ MODULE_LICENSE("GPL");  MODULE_DESCRIPTION("Xtables: connection tracking target");  MODULE_ALIAS("ipt_CT");  MODULE_ALIAS("ip6t_CT"); +MODULE_ALIAS("ipt_NOTRACK"); +MODULE_ALIAS("ip6t_NOTRACK"); diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 0a229191e55..ae8271652ef 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)  	u_int8_t orig, nv;  	orig = ipv6_get_dsfield(iph); -	nv   = (orig & info->tos_mask) ^ info->tos_value; +	nv   = (orig & ~info->tos_mask) ^ info->tos_value;  	if (orig != nv) {  		if (!skb_make_writable(skb, sizeof(struct iphdr))) diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c index 95b084800fc..1535e87ed9b 100644 --- a/net/netfilter/xt_HL.c +++ b/net/netfilter/xt_HL.c @@ -38,22 +38,22 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par)  	iph = ip_hdr(skb);  	switch (info->mode) { -		case IPT_TTL_SET: -			new_ttl = info->ttl; -			break; -		case IPT_TTL_INC: -			new_ttl = iph->ttl + info->ttl; -			if (new_ttl > 255) -				new_ttl = 255; -			break; -		case IPT_TTL_DEC: -			new_ttl = iph->ttl - info->ttl; -			if (new_ttl < 0) -				new_ttl = 0; -			break; -		default: -			new_ttl = iph->ttl; -			break; +	case IPT_TTL_SET: +		new_ttl = info->ttl; +		break; +	case IPT_TTL_INC: +		new_ttl = iph->ttl + info->ttl; +		if (new_ttl > 255) +			new_ttl = 255; +		break; +	case IPT_TTL_DEC: +		new_ttl = iph->ttl - info->ttl; +		if (new_ttl < 0) +			new_ttl = 0; +		break; +	default: +		new_ttl = iph->ttl; +		break;  	}  	if (new_ttl != iph->ttl) { @@ -78,22 +78,22 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par)  	ip6h = ipv6_hdr(skb);  	switch (info->mode) { -		case IP6T_HL_SET: -			new_hl = info->hop_limit; -			break; -		case IP6T_HL_INC: -			new_hl = ip6h->hop_limit + info->hop_limit; -			if (new_hl > 255) -				new_hl = 255; -			break; -		case IP6T_HL_DEC: -			new_hl = ip6h->hop_limit - info->hop_limit; -			if (new_hl < 0) -				new_hl = 0; -			break; -		default: -			new_hl = ip6h->hop_limit; -			break; +	case IP6T_HL_SET: +		new_hl = info->hop_limit; +		
break; +	case IP6T_HL_INC: +		new_hl = ip6h->hop_limit + info->hop_limit; +		if (new_hl > 255) +			new_hl = 255; +		break; +	case IP6T_HL_DEC: +		new_hl = ip6h->hop_limit - info->hop_limit; +		if (new_hl < 0) +			new_hl = 0; +		break; +	default: +		new_hl = ip6h->hop_limit; +		break;  	}  	ip6h->hop_limit = new_hl; diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c new file mode 100644 index 00000000000..73b73f687c5 --- /dev/null +++ b/net/netfilter/xt_HMARK.c @@ -0,0 +1,372 @@ +/* + * xt_HMARK - Netfilter module to set mark by means of hashing + * + * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com> + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_HMARK.h> + +#include <net/ip.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <net/ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>"); +MODULE_DESCRIPTION("Xtables: packet marking using hash calculation"); +MODULE_ALIAS("ipt_HMARK"); +MODULE_ALIAS("ip6t_HMARK"); + +struct hmark_tuple { +	__be32			src; +	__be32			dst; +	union hmark_ports	uports; +	u8			proto; +}; + +static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask) +{ +	return (addr32[0] & mask[0]) ^ +	       (addr32[1] & mask[1]) ^ +	       (addr32[2] & mask[2]) ^ +	       (addr32[3] & mask[3]); +} + +static inline __be32 +hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask) +{ +	switch (l3num) { +	case AF_INET: +		return *addr32 & *mask; +	case AF_INET6: +		return hmark_addr6_mask(addr32, mask); +	} +	return 0; +} + +static inline void hmark_swap_ports(union hmark_ports *uports, +				    const struct xt_hmark_info *info) +{ +	union hmark_ports hp; +	u16 src, dst; + +	hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32; +	src = ntohs(hp.b16.src); +	dst = ntohs(hp.b16.dst); + +	if (dst > src) +		uports->v32 = (dst << 16) | src; +	else +		uports->v32 = (src << 16) | dst; +} + +static int +hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, +		    const struct xt_hmark_info *info) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	struct nf_conntrack_tuple *otuple; +	struct nf_conntrack_tuple *rtuple; + +	if (ct == NULL || nf_ct_is_untracked(ct)) +		return -1; + +	otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; +	rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + +	t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6, +				 info->src_mask.ip6); +	t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6, +				 info->dst_mask.ip6); + +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) +		return 0; + +	t->proto = nf_ct_protonum(ct); +	if (t->proto != IPPROTO_ICMP) { +		t->uports.b16.src = otuple->src.u.all; +		t->uports.b16.dst = rtuple->src.u.all; +		hmark_swap_ports(&t->uports, info); +	} + +	return 0; +#else +	return -1; +#endif +} + +/* This hash function is endian independent, to ensure consistent hashing if + * the cluster is composed of big and little endian 
systems. */ +static inline u32 +hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) +{ +	u32 hash; +	u32 src = ntohl(t->src); +	u32 dst = ntohl(t->dst); + +	if (dst < src) +		swap(src, dst); + +	hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); +	hash = hash ^ (t->proto & info->proto_mask); + +	return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; +} + +static void +hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff, +		      struct hmark_tuple *t, const struct xt_hmark_info *info) +{ +	int protoff; + +	protoff = proto_ports_offset(t->proto); +	if (protoff < 0) +		return; + +	nhoff += protoff; +	if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0) +		return; + +	hmark_swap_ports(&t->uports, info); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int get_inner6_hdr(const struct sk_buff *skb, int *offset) +{ +	struct icmp6hdr *icmp6h, _ih6; + +	icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6); +	if (icmp6h == NULL) +		return 0; + +	if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { +		*offset += sizeof(struct icmp6hdr); +		return 1; +	} +	return 0; +} + +static int +hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t, +			  const struct xt_hmark_info *info) +{ +	struct ipv6hdr *ip6, _ip6; +	int flag = IP6_FH_F_AUTH; +	unsigned int nhoff = 0; +	u16 fragoff = 0; +	int nexthdr; + +	ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb)); +	nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); +	if (nexthdr < 0) +		return 0; +	/* No need to check for icmp errors on fragments */ +	if ((flag & IP6_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6)) +		goto noicmp; +	/* Use inner header in case of ICMP errors */ +	if (get_inner6_hdr(skb, &nhoff)) { +		ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6); +		if (ip6 == NULL) +			return -1; +		/* If AH present, use SPI like in ESP. */ +		flag = IP6_FH_F_AUTH; +		nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); +		if (nexthdr < 0) +			return -1; +	} +noicmp: +	t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6); +	t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6); + +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) +		return 0; + +	t->proto = nexthdr; +	if (t->proto == IPPROTO_ICMPV6) +		return 0; + +	if (flag & IP6_FH_F_FRAG) +		return 0; + +	hmark_set_tuple_ports(skb, nhoff, t, info); +	return 0; +} + +static unsigned int +hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_hmark_info *info = par->targinfo; +	struct hmark_tuple t; + +	memset(&t, 0, sizeof(struct hmark_tuple)); + +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { +		if (hmark_ct_set_htuple(skb, &t, info) < 0) +			return XT_CONTINUE; +	} else { +		if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0) +			return XT_CONTINUE; +	} + +	skb->mark = hmark_hash(&t, info); +	return XT_CONTINUE; +} +#endif + +static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) +{ +	const struct icmphdr *icmph; +	struct icmphdr _ih; + +	/* Not enough header? */ +	icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih); +	if (icmph == NULL || icmph->type > NR_ICMP_TYPES) +		return 0; + +	/* Error message? 
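+	 * Only these five ICMP error types carry the offending packet's IP
+	 * header, so only they allow hashing on the embedded (inner) tuple.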
*/ +	if (icmph->type != ICMP_DEST_UNREACH && +	    icmph->type != ICMP_SOURCE_QUENCH && +	    icmph->type != ICMP_TIME_EXCEEDED && +	    icmph->type != ICMP_PARAMETERPROB && +	    icmph->type != ICMP_REDIRECT) +		return 0; + +	*nhoff += iphsz + sizeof(_ih); +	return 1; +} + +static int +hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, +			  const struct xt_hmark_info *info) +{ +	struct iphdr *ip, _ip; +	int nhoff = skb_network_offset(skb); + +	ip = (struct iphdr *) (skb->data + nhoff); +	if (ip->protocol == IPPROTO_ICMP) { +		/* Use inner header in case of ICMP errors */ +		if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) { +			ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip); +			if (ip == NULL) +				return -1; +		} +	} + +	t->src = ip->saddr & info->src_mask.ip; +	t->dst = ip->daddr & info->dst_mask.ip; + +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) +		return 0; + +	t->proto = ip->protocol; + +	/* ICMP has no ports, skip */ +	if (t->proto == IPPROTO_ICMP) +		return 0; + +	/* follow-up fragments don't contain ports, skip all fragments */ +	if (ip->frag_off & htons(IP_MF | IP_OFFSET)) +		return 0; + +	hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info); + +	return 0; +} + +static unsigned int +hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_hmark_info *info = par->targinfo; +	struct hmark_tuple t; + +	memset(&t, 0, sizeof(struct hmark_tuple)); + +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { +		if (hmark_ct_set_htuple(skb, &t, info) < 0) +			return XT_CONTINUE; +	} else { +		if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0) +			return XT_CONTINUE; +	} + +	skb->mark = hmark_hash(&t, info); +	return XT_CONTINUE; +} + +static int hmark_tg_check(const struct xt_tgchk_param *par) +{ +	const struct xt_hmark_info *info = par->targinfo; + +	if (!info->hmodulus) { +		pr_info("xt_HMARK: hash modulus can't be zero\n"); +		return -EINVAL; +	} +	if (info->proto_mask && +	    (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) { +		pr_info("xt_HMARK: proto mask must be zero with L3 mode\n"); +		return -EINVAL; +	} +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) && +	    (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) | +			     XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) { +		pr_info("xt_HMARK: spi-mask and port-mask can't be combined\n"); +		return -EINVAL; +	} +	if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) && +	    (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) | +			     XT_HMARK_FLAG(XT_HMARK_DPORT)))) { +		pr_info("xt_HMARK: spi-set and port-set can't be combined\n"); +		return -EINVAL; +	} +	return 0; +} + +static struct xt_target hmark_tg_reg[] __read_mostly = { +	{ +		.name		= "HMARK", +		.family		= NFPROTO_IPV4, +		.target		= hmark_tg_v4, +		.targetsize	= sizeof(struct xt_hmark_info), +		.checkentry	= hmark_tg_check, +		.me		= THIS_MODULE, +	}, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	{ +		.name		= "HMARK", +		.family		= NFPROTO_IPV6, +		.target		= hmark_tg_v6, +		.targetsize	= sizeof(struct xt_hmark_info), +		.checkentry	= hmark_tg_check, +		.me		= THIS_MODULE, +	}, +#endif +}; + +static int __init hmark_tg_init(void) +{ +	return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +static void __exit hmark_tg_exit(void) +{ +	xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +module_init(hmark_tg_init); +module_exit(hmark_tg_exit); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index be1f22e1354..f407ebc1348 100644 --- 
a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -122,14 +122,12 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)  	info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);  	if (!info->timer) { -		pr_debug("couldn't alloc timer\n");  		ret = -ENOMEM;  		goto out;  	}  	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);  	if (!info->timer->attr.attr.name) { -		pr_debug("couldn't alloc attribute name\n");  		ret = -ENOMEM;  		goto out_free_timer;  	} @@ -313,3 +311,5 @@ MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");  MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");  MODULE_DESCRIPTION("Xtables: idle time monitor");  MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ipt_IDLETIMER"); +MODULE_ALIAS("ip6t_IDLETIMER"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index a4140509eea..993de2ba89d 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -31,6 +31,8 @@  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");  MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); +MODULE_ALIAS("ipt_LED"); +MODULE_ALIAS("ip6t_LED");  static LIST_HEAD(xt_led_triggers);  static DEFINE_MUTEX(xt_led_mutex); diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c new file mode 100644 index 00000000000..5ab24843370 --- /dev/null +++ b/net/netfilter/xt_LOG.c @@ -0,0 +1,975 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_LOG.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/xt_log.h> + +static struct nf_loginfo default_loginfo = { +	.type	= NF_LOG_TYPE_LOG, +	.u = { +		.log = { +			.level    = 5, +			.logflags = NF_LOG_MASK, +		}, +	}, +}; + +static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb, +			   u8 proto, int fragment, unsigned int offset) +{ +	struct udphdr _udph; +	const struct udphdr *uh; + +	if (proto == IPPROTO_UDP) +		/* Max length: 10 "PROTO=UDP "     */ +		sb_add(m, "PROTO=UDP "); +	else	/* Max length: 14 "PROTO=UDPLITE " */ +		sb_add(m, "PROTO=UDPLITE "); + +	if (fragment) +		goto out; + +	/* Max length: 25 "INCOMPLETE [65535 bytes] " */ +	uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); +	if (uh == NULL) { +		sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + +		return 1; +	} + +	/* Max length: 20 "SPT=65535 DPT=65535 " */ +	sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), +		ntohs(uh->len)); + +out: +	return 0; +} + +static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, +			   u8 proto, int fragment, unsigned int offset, +			   unsigned int logflags) +{ +	struct tcphdr _tcph; +	const struct tcphdr *th; + +	/* Max length: 10 "PROTO=TCP " */ +	sb_add(m, "PROTO=TCP "); + +	if (fragment) +		return 0; + +	/* Max length: 25 "INCOMPLETE 
[65535 bytes] " */ +	th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); +	if (th == NULL) { +		sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); +		return 1; +	} + +	/* Max length: 20 "SPT=65535 DPT=65535 " */ +	sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); +	/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ +	if (logflags & XT_LOG_TCPSEQ) +		sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); + +	/* Max length: 13 "WINDOW=65535 " */ +	sb_add(m, "WINDOW=%u ", ntohs(th->window)); +	/* Max length: 9 "RES=0x3C " */ +	sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & +					    TCP_RESERVED_BITS) >> 22)); +	/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ +	if (th->cwr) +		sb_add(m, "CWR "); +	if (th->ece) +		sb_add(m, "ECE "); +	if (th->urg) +		sb_add(m, "URG "); +	if (th->ack) +		sb_add(m, "ACK "); +	if (th->psh) +		sb_add(m, "PSH "); +	if (th->rst) +		sb_add(m, "RST "); +	if (th->syn) +		sb_add(m, "SYN "); +	if (th->fin) +		sb_add(m, "FIN "); +	/* Max length: 11 "URGP=65535 " */ +	sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + +	if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { +		u_int8_t _opt[60 - sizeof(struct tcphdr)]; +		const u_int8_t *op; +		unsigned int i; +		unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); + +		op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), +					optsize, _opt); +		if (op == NULL) { +			sb_add(m, "OPT (TRUNCATED)"); +			return 1; +		} + +		/* Max length: 127 "OPT (" 15*4*2chars ") " */ +		sb_add(m, "OPT ("); +		for (i = 0; i < optsize; i++) +			sb_add(m, "%02X", op[i]); + +		sb_add(m, ") "); +	} + +	return 0; +} + +static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) +{ +	if (!sk || sk->sk_state == TCP_TIME_WAIT) +		return; + +	read_lock_bh(&sk->sk_callback_lock); +	if (sk->sk_socket && sk->sk_socket->file) { +		const struct cred *cred = sk->sk_socket->file->f_cred; +		sb_add(m, "UID=%u GID=%u ", +			from_kuid_munged(&init_user_ns, cred->fsuid), +			from_kgid_munged(&init_user_ns, cred->fsgid)); +	} +	read_unlock_bh(&sk->sk_callback_lock); +} + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct sbuff *m, +			const struct nf_loginfo *info, +			const struct sk_buff *skb, +			unsigned int iphoff) +{ +	struct iphdr _iph; +	const struct iphdr *ih; +	unsigned int logflags; + +	if (info->type == NF_LOG_TYPE_LOG) +		logflags = info->u.log.logflags; +	else +		logflags = NF_LOG_MASK; + +	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); +	if (ih == NULL) { +		sb_add(m, "TRUNCATED"); +		return; +	} + +	/* Important fields: +	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
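+	 * Each sb_add() below is annotated with its worst-case output length;
+	 * the per-protocol totals are summed in the comment at the end of
+	 * this function to bound the log buffer.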
*/ +	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ +	sb_add(m, "SRC=%pI4 DST=%pI4 ", +	       &ih->saddr, &ih->daddr); + +	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ +	sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", +	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, +	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + +	/* Max length: 6 "CE DF MF " */ +	if (ntohs(ih->frag_off) & IP_CE) +		sb_add(m, "CE "); +	if (ntohs(ih->frag_off) & IP_DF) +		sb_add(m, "DF "); +	if (ntohs(ih->frag_off) & IP_MF) +		sb_add(m, "MF "); + +	/* Max length: 11 "FRAG:65535 " */ +	if (ntohs(ih->frag_off) & IP_OFFSET) +		sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + +	if ((logflags & XT_LOG_IPOPT) && +	    ih->ihl * 4 > sizeof(struct iphdr)) { +		const unsigned char *op; +		unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; +		unsigned int i, optsize; + +		optsize = ih->ihl * 4 - sizeof(struct iphdr); +		op = skb_header_pointer(skb, iphoff+sizeof(_iph), +					optsize, _opt); +		if (op == NULL) { +			sb_add(m, "TRUNCATED"); +			return; +		} + +		/* Max length: 127 "OPT (" 15*4*2chars ") " */ +		sb_add(m, "OPT ("); +		for (i = 0; i < optsize; i++) +			sb_add(m, "%02X", op[i]); +		sb_add(m, ") "); +	} + +	switch (ih->protocol) { +	case IPPROTO_TCP: +		if (dump_tcp_header(m, skb, ih->protocol, +				    ntohs(ih->frag_off) & IP_OFFSET, +				    iphoff+ih->ihl*4, logflags)) +			return; +		break; +	case IPPROTO_UDP: +	case IPPROTO_UDPLITE: +		if (dump_udp_header(m, skb, ih->protocol, +				    ntohs(ih->frag_off) & IP_OFFSET, +				    iphoff+ih->ihl*4)) +			return; +		break; +	case IPPROTO_ICMP: { +		struct icmphdr _icmph; +		const struct icmphdr *ich; +		static const size_t required_len[NR_ICMP_TYPES+1] +			= { [ICMP_ECHOREPLY] = 4, +			    [ICMP_DEST_UNREACH] +			    = 8 + sizeof(struct iphdr), +			    [ICMP_SOURCE_QUENCH] +			    = 8 + sizeof(struct iphdr), +			    [ICMP_REDIRECT] +			    = 8 + sizeof(struct iphdr), +			    [ICMP_ECHO] = 4, +			    [ICMP_TIME_EXCEEDED] +			    = 8 + sizeof(struct iphdr), +			    [ICMP_PARAMETERPROB] +			    = 8 + sizeof(struct iphdr), +			    [ICMP_TIMESTAMP] = 20, +			    [ICMP_TIMESTAMPREPLY] = 20, +			    [ICMP_ADDRESS] = 12, +			    [ICMP_ADDRESSREPLY] = 12 }; + +		/* Max length: 11 "PROTO=ICMP " */ +		sb_add(m, "PROTO=ICMP "); + +		if (ntohs(ih->frag_off) & IP_OFFSET) +			break; + +		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ +		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, +					 sizeof(_icmph), &_icmph); +		if (ich == NULL) { +			sb_add(m, "INCOMPLETE [%u bytes] ", +			       skb->len - iphoff - ih->ihl*4); +			break; +		} + +		/* Max length: 18 "TYPE=255 CODE=255 " */ +		sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + +		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ +		if (ich->type <= NR_ICMP_TYPES && +		    required_len[ich->type] && +		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { +			sb_add(m, "INCOMPLETE [%u bytes] ", +			       skb->len - iphoff - ih->ihl*4); +			break; +		} + +		switch (ich->type) { +		case ICMP_ECHOREPLY: +		case ICMP_ECHO: +			/* Max length: 19 "ID=65535 SEQ=65535 " */ +			sb_add(m, "ID=%u SEQ=%u ", +			       ntohs(ich->un.echo.id), +			       ntohs(ich->un.echo.sequence)); +			break; + +		case ICMP_PARAMETERPROB: +			/* Max length: 14 "PARAMETER=255 " */ +			sb_add(m, "PARAMETER=%u ", +			       ntohl(ich->un.gateway) >> 24); +			break; +		case ICMP_REDIRECT: +			/* Max length: 24 "GATEWAY=255.255.255.255 " */ +			sb_add(m, "GATEWAY=%pI4 
", &ich->un.gateway); +			/* Fall through */ +		case ICMP_DEST_UNREACH: +		case ICMP_SOURCE_QUENCH: +		case ICMP_TIME_EXCEEDED: +			/* Max length: 3+maxlen */ +			if (!iphoff) { /* Only recurse once. */ +				sb_add(m, "["); +				dump_ipv4_packet(m, info, skb, +					    iphoff + ih->ihl*4+sizeof(_icmph)); +				sb_add(m, "] "); +			} + +			/* Max length: 10 "MTU=65535 " */ +			if (ich->type == ICMP_DEST_UNREACH && +			    ich->code == ICMP_FRAG_NEEDED) +				sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); +		} +		break; +	} +	/* Max Length */ +	case IPPROTO_AH: { +		struct ip_auth_hdr _ahdr; +		const struct ip_auth_hdr *ah; + +		if (ntohs(ih->frag_off) & IP_OFFSET) +			break; + +		/* Max length: 9 "PROTO=AH " */ +		sb_add(m, "PROTO=AH "); + +		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ +		ah = skb_header_pointer(skb, iphoff+ih->ihl*4, +					sizeof(_ahdr), &_ahdr); +		if (ah == NULL) { +			sb_add(m, "INCOMPLETE [%u bytes] ", +			       skb->len - iphoff - ih->ihl*4); +			break; +		} + +		/* Length: 15 "SPI=0xF1234567 " */ +		sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); +		break; +	} +	case IPPROTO_ESP: { +		struct ip_esp_hdr _esph; +		const struct ip_esp_hdr *eh; + +		/* Max length: 10 "PROTO=ESP " */ +		sb_add(m, "PROTO=ESP "); + +		if (ntohs(ih->frag_off) & IP_OFFSET) +			break; + +		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ +		eh = skb_header_pointer(skb, iphoff+ih->ihl*4, +					sizeof(_esph), &_esph); +		if (eh == NULL) { +			sb_add(m, "INCOMPLETE [%u bytes] ", +			       skb->len - iphoff - ih->ihl*4); +			break; +		} + +		/* Length: 15 "SPI=0xF1234567 " */ +		sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); +		break; +	} +	/* Max length: 10 "PROTO 255 " */ +	default: +		sb_add(m, "PROTO=%u ", ih->protocol); +	} + +	/* Max length: 15 "UID=4294967295 " */ +	if ((logflags & XT_LOG_UID) && !iphoff) +		dump_sk_uid_gid(m, skb->sk); + +	/* Max length: 16 "MARK=0xFFFFFFFF " */ +	if (!iphoff && skb->mark) +		sb_add(m, "MARK=0x%x ", skb->mark); + +	/* Proto    Max log string length */ +	/* IP:      40+46+6+11+127 = 230 */ +	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */ +	/* UDP:     10+max(25,20) = 35 */ +	/* UDPLITE: 14+max(25,20) = 39 */ +	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ +	/* ESP:     10+max(25)+15 = 50 */ +	/* AH:      9+max(25)+15 = 49 */ +	/* unknown: 10 */ + +	/* (ICMP allows recursion one level deep) */ +	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */ +	/* maxlen = 230+   91  + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct sbuff *m, +			    const struct nf_loginfo *info, +			    const struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; +	unsigned int logflags = 0; + +	if (info->type == NF_LOG_TYPE_LOG) +		logflags = info->u.log.logflags; + +	if (!(logflags & XT_LOG_MACDECODE)) +		goto fallback; + +	switch (dev->type) { +	case ARPHRD_ETHER: +		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", +		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, +		       ntohs(eth_hdr(skb)->h_proto)); +		return; +	default: +		break; +	} + +fallback: +	sb_add(m, "MAC="); +	if (dev->hard_header_len && +	    skb->mac_header != skb->network_header) { +		const unsigned char *p = skb_mac_header(skb); +		unsigned int i; + +		sb_add(m, "%02x", *p++); +		for (i = 1; i < dev->hard_header_len; i++, p++) +			sb_add(m, ":%02x", *p); +	} +	sb_add(m, " "); +} + +static void +log_packet_common(struct sbuff *m, +		  u_int8_t pf, +		  unsigned int hooknum, +		  const struct sk_buff *skb, +		  const struct net_device *in, +		  const struct 
net_device *out, +		  const struct nf_loginfo *loginfo, +		  const char *prefix) +{ +	sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", +	       '0' + loginfo->u.log.level, prefix, +	       in ? in->name : "", +	       out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER +	if (skb->nf_bridge) { +		const struct net_device *physindev; +		const struct net_device *physoutdev; + +		physindev = skb->nf_bridge->physindev; +		if (physindev && in != physindev) +			sb_add(m, "PHYSIN=%s ", physindev->name); +		physoutdev = skb->nf_bridge->physoutdev; +		if (physoutdev && out != physoutdev) +			sb_add(m, "PHYSOUT=%s ", physoutdev->name); +	} +#endif +} + + +static void +ipt_log_packet(struct net *net, +	       u_int8_t pf, +	       unsigned int hooknum, +	       const struct sk_buff *skb, +	       const struct net_device *in, +	       const struct net_device *out, +	       const struct nf_loginfo *loginfo, +	       const char *prefix) +{ +	struct sbuff *m; + +	/* FIXME: Disabled from containers until syslog ns is supported */ +	if (!net_eq(net, &init_net)) +		return; + +	m = sb_open(); + +	if (!loginfo) +		loginfo = &default_loginfo; + +	log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + +	if (in != NULL) +		dump_ipv4_mac_header(m, loginfo, skb); + +	dump_ipv4_packet(m, loginfo, skb, 0); + +	sb_close(m); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct sbuff *m, +			const struct nf_loginfo *info, +			const struct sk_buff *skb, unsigned int ip6hoff, +			int recurse) +{ +	u_int8_t currenthdr; +	int fragment; +	struct ipv6hdr _ip6h; +	const struct ipv6hdr *ih; +	unsigned int ptr; +	unsigned int hdrlen = 0; +	unsigned int logflags; + +	if (info->type == NF_LOG_TYPE_LOG) +		logflags = info->u.log.logflags; +	else +		logflags = NF_LOG_MASK; + +	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); +	if (ih == NULL) { +		sb_add(m, "TRUNCATED"); +		return; +	} + +	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ +	sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + +	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ +	sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", +	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr), +	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, +	       ih->hop_limit, +	       (ntohl(*(__be32 *)ih) & 0x000fffff)); + +	fragment = 0; +	ptr = ip6hoff + sizeof(struct ipv6hdr); +	currenthdr = ih->nexthdr; +	while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { +		struct ipv6_opt_hdr _hdr; +		const struct ipv6_opt_hdr *hp; + +		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); +		if (hp == NULL) { +			sb_add(m, "TRUNCATED"); +			return; +		} + +		/* Max length: 48 "OPT (...) 
" */ +		if (logflags & XT_LOG_IPOPT) +			sb_add(m, "OPT ( "); + +		switch (currenthdr) { +		case IPPROTO_FRAGMENT: { +			struct frag_hdr _fhdr; +			const struct frag_hdr *fh; + +			sb_add(m, "FRAG:"); +			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), +						&_fhdr); +			if (fh == NULL) { +				sb_add(m, "TRUNCATED "); +				return; +			} + +			/* Max length: 6 "65535 " */ +			sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + +			/* Max length: 11 "INCOMPLETE " */ +			if (fh->frag_off & htons(0x0001)) +				sb_add(m, "INCOMPLETE "); + +			sb_add(m, "ID:%08x ", ntohl(fh->identification)); + +			if (ntohs(fh->frag_off) & 0xFFF8) +				fragment = 1; + +			hdrlen = 8; + +			break; +		} +		case IPPROTO_DSTOPTS: +		case IPPROTO_ROUTING: +		case IPPROTO_HOPOPTS: +			if (fragment) { +				if (logflags & XT_LOG_IPOPT) +					sb_add(m, ")"); +				return; +			} +			hdrlen = ipv6_optlen(hp); +			break; +		/* Max Length */ +		case IPPROTO_AH: +			if (logflags & XT_LOG_IPOPT) { +				struct ip_auth_hdr _ahdr; +				const struct ip_auth_hdr *ah; + +				/* Max length: 3 "AH " */ +				sb_add(m, "AH "); + +				if (fragment) { +					sb_add(m, ")"); +					return; +				} + +				ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), +							&_ahdr); +				if (ah == NULL) { +					/* +					 * Max length: 26 "INCOMPLETE [65535 +					 *  bytes] )" +					 */ +					sb_add(m, "INCOMPLETE [%u bytes] )", +					       skb->len - ptr); +					return; +				} + +				/* Length: 15 "SPI=0xF1234567 */ +				sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + +			} + +			hdrlen = (hp->hdrlen+2)<<2; +			break; +		case IPPROTO_ESP: +			if (logflags & XT_LOG_IPOPT) { +				struct ip_esp_hdr _esph; +				const struct ip_esp_hdr *eh; + +				/* Max length: 4 "ESP " */ +				sb_add(m, "ESP "); + +				if (fragment) { +					sb_add(m, ")"); +					return; +				} + +				/* +				 * Max length: 26 "INCOMPLETE [65535 bytes] )" +				 */ +				eh = skb_header_pointer(skb, ptr, sizeof(_esph), +							&_esph); +				if (eh == NULL) { +					sb_add(m, "INCOMPLETE [%u bytes] )", +					       skb->len - ptr); +					return; +				} + +				/* Length: 16 "SPI=0xF1234567 )" */ +				sb_add(m, "SPI=0x%x )", ntohl(eh->spi)); + +			} +			return; +		default: +			/* Max length: 20 "Unknown Ext Hdr 255" */ +			sb_add(m, "Unknown Ext Hdr %u", currenthdr); +			return; +		} +		if (logflags & XT_LOG_IPOPT) +			sb_add(m, ") "); + +		currenthdr = hp->nexthdr; +		ptr += hdrlen; +	} + +	switch (currenthdr) { +	case IPPROTO_TCP: +		if (dump_tcp_header(m, skb, currenthdr, fragment, ptr, +		    logflags)) +			return; +		break; +	case IPPROTO_UDP: +	case IPPROTO_UDPLITE: +		if (dump_udp_header(m, skb, currenthdr, fragment, ptr)) +			return; +		break; +	case IPPROTO_ICMPV6: { +		struct icmp6hdr _icmp6h; +		const struct icmp6hdr *ic; + +		/* Max length: 13 "PROTO=ICMPv6 " */ +		sb_add(m, "PROTO=ICMPv6 "); + +		if (fragment) +			break; + +		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ +		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); +		if (ic == NULL) { +			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); +			return; +		} + +		/* Max length: 18 "TYPE=255 CODE=255 " */ +		sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + +		switch (ic->icmp6_type) { +		case ICMPV6_ECHO_REQUEST: +		case ICMPV6_ECHO_REPLY: +			/* Max length: 19 "ID=65535 SEQ=65535 " */ +			sb_add(m, "ID=%u SEQ=%u ", +				ntohs(ic->icmp6_identifier), +				ntohs(ic->icmp6_sequence)); +			break; +		case ICMPV6_MGM_QUERY: +		case ICMPV6_MGM_REPORT: +		case ICMPV6_MGM_REDUCTION: +			break; + +		case 
ICMPV6_PARAMPROB: +			/* Max length: 17 "POINTER=ffffffff " */ +			sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); +			/* Fall through */ +		case ICMPV6_DEST_UNREACH: +		case ICMPV6_PKT_TOOBIG: +		case ICMPV6_TIME_EXCEED: +			/* Max length: 3+maxlen */ +			if (recurse) { +				sb_add(m, "["); +				dump_ipv6_packet(m, info, skb, +					    ptr + sizeof(_icmp6h), 0); +				sb_add(m, "] "); +			} + +			/* Max length: 10 "MTU=65535 " */ +			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) +				sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); +		} +		break; +	} +	/* Max length: 10 "PROTO=255 " */ +	default: +		sb_add(m, "PROTO=%u ", currenthdr); +	} + +	/* Max length: 15 "UID=4294967295 " */ +	if ((logflags & XT_LOG_UID) && recurse) +		dump_sk_uid_gid(m, skb->sk); + +	/* Max length: 16 "MARK=0xFFFFFFFF " */ +	if (recurse && skb->mark) +		sb_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct sbuff *m, +			    const struct nf_loginfo *info, +			    const struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; +	unsigned int logflags = 0; + +	if (info->type == NF_LOG_TYPE_LOG) +		logflags = info->u.log.logflags; + +	if (!(logflags & XT_LOG_MACDECODE)) +		goto fallback; + +	switch (dev->type) { +	case ARPHRD_ETHER: +		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", +		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, +		       ntohs(eth_hdr(skb)->h_proto)); +		return; +	default: +		break; +	} + +fallback: +	sb_add(m, "MAC="); +	if (dev->hard_header_len && +	    skb->mac_header != skb->network_header) { +		const unsigned char *p = skb_mac_header(skb); +		unsigned int len = dev->hard_header_len; +		unsigned int i; + +		if (dev->type == ARPHRD_SIT) { +			p -= ETH_HLEN; + +			if (p < skb->head) +				p = NULL; +		} + +		if (p != NULL) { +			sb_add(m, "%02x", *p++); +			for (i = 1; i < len; i++) +				sb_add(m, ":%02x", *p++); +		} +		sb_add(m, " "); + +		if (dev->type == ARPHRD_SIT) { +			const struct iphdr *iph = +				(struct iphdr *)skb_mac_header(skb); +			sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, +			       &iph->daddr); +		} +	} else +		sb_add(m, " "); +} + +static void +ip6t_log_packet(struct net *net, +		u_int8_t pf, +		unsigned int hooknum, +		const struct sk_buff *skb, +		const struct net_device *in, +		const struct net_device *out, +		const struct nf_loginfo *loginfo, +		const char *prefix) +{ +	struct sbuff *m; + +	/* FIXME: Disabled from containers until syslog ns is supported */ +	if (!net_eq(net, &init_net)) +		return; + +	m = sb_open(); + +	if (!loginfo) +		loginfo = &default_loginfo; + +	log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + +	if (in != NULL) +		dump_ipv6_mac_header(m, loginfo, skb); + +	dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + +	sb_close(m); +} +#endif + +static unsigned int +log_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_log_info *loginfo = par->targinfo; +	struct nf_loginfo li; +	struct net *net = dev_net(par->in ? 
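+			/* par->in is NULL in the OUTPUT hook */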
par->in : par->out); + +	li.type = NF_LOG_TYPE_LOG; +	li.u.log.level = loginfo->level; +	li.u.log.logflags = loginfo->logflags; + +	if (par->family == NFPROTO_IPV4) +		ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, +			       par->out, &li, loginfo->prefix); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	else if (par->family == NFPROTO_IPV6) +		ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, +				par->out, &li, loginfo->prefix); +#endif +	else +		WARN_ON_ONCE(1); + +	return XT_CONTINUE; +} + +static int log_tg_check(const struct xt_tgchk_param *par) +{ +	const struct xt_log_info *loginfo = par->targinfo; + +	if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) +		return -EINVAL; + +	if (loginfo->level >= 8) { +		pr_debug("level %u >= 8\n", loginfo->level); +		return -EINVAL; +	} + +	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { +		pr_debug("prefix is not null-terminated\n"); +		return -EINVAL; +	} + +	return 0; +} + +static struct xt_target log_tg_regs[] __read_mostly = { +	{ +		.name		= "LOG", +		.family		= NFPROTO_IPV4, +		.target		= log_tg, +		.targetsize	= sizeof(struct xt_log_info), +		.checkentry	= log_tg_check, +		.me		= THIS_MODULE, +	}, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	{ +		.name		= "LOG", +		.family		= NFPROTO_IPV6, +		.target		= log_tg, +		.targetsize	= sizeof(struct xt_log_info), +		.checkentry	= log_tg_check, +		.me		= THIS_MODULE, +	}, +#endif +}; + +static struct nf_logger ipt_log_logger __read_mostly = { +	.name		= "ipt_LOG", +	.logfn		= &ipt_log_packet, +	.me		= THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static struct nf_logger ip6t_log_logger __read_mostly = { +	.name		= "ip6t_LOG", +	.logfn		= &ip6t_log_packet, +	.me		= THIS_MODULE, +}; +#endif + +static int __net_init log_net_init(struct net *net) +{ +	nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); +#endif +	return 0; +} + +static void __net_exit log_net_exit(struct net *net) +{ +	nf_log_unset(net, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	nf_log_unset(net, &ip6t_log_logger); +#endif +} + +static struct pernet_operations log_net_ops = { +	.init = log_net_init, +	.exit = log_net_exit, +}; + +static int __init log_tg_init(void) +{ +	int ret; + +	ret = register_pernet_subsys(&log_net_ops); +	if (ret < 0) +		goto err_pernet; + +	ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); +	if (ret < 0) +		goto err_target; + +	nf_log_register(NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); +#endif +	return 0; + +err_target: +	unregister_pernet_subsys(&log_net_ops); +err_pernet: +	return ret; +} + +static void __exit log_tg_exit(void) +{ +	unregister_pernet_subsys(&log_net_ops); +	nf_log_unregister(&ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	nf_log_unregister(&ip6t_log_logger); +#endif +	xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); +} + +module_init(log_tg_init); +module_exit(log_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); +MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); +MODULE_ALIAS("ipt_LOG"); +MODULE_ALIAS("ip6t_LOG"); diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c new file mode 100644 index 00000000000..b253e07cb1c --- /dev/null +++ b/net/netfilter/xt_NETMAP.c @@ -0,0 +1,165 @@ +/* + * (C) 
2000-2001 Svenning Soerensen <svenning@post5.tele.dk> + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> + +static unsigned int +netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct nf_nat_range *range = par->targinfo; +	struct nf_nat_range newrange; +	struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	union nf_inet_addr new_addr, netmask; +	unsigned int i; + +	ct = nf_ct_get(skb, &ctinfo); +	for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++) +		netmask.ip6[i] = ~(range->min_addr.ip6[i] ^ +				   range->max_addr.ip6[i]); + +	if (par->hooknum == NF_INET_PRE_ROUTING || +	    par->hooknum == NF_INET_LOCAL_OUT) +		new_addr.in6 = ipv6_hdr(skb)->daddr; +	else +		new_addr.in6 = ipv6_hdr(skb)->saddr; + +	for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) { +		new_addr.ip6[i] &= ~netmask.ip6[i]; +		new_addr.ip6[i] |= range->min_addr.ip6[i] & +				   netmask.ip6[i]; +	} + +	newrange.flags	= range->flags | NF_NAT_RANGE_MAP_IPS; +	newrange.min_addr	= new_addr; +	newrange.max_addr	= new_addr; +	newrange.min_proto	= range->min_proto; +	newrange.max_proto	= range->max_proto; + +	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) +{ +	const struct nf_nat_range *range = par->targinfo; + +	if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) +		return -EINVAL; +	return 0; +} + +static unsigned int +netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ +	struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	__be32 new_ip, netmask; +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; +	struct nf_nat_range newrange; + +	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || +		     par->hooknum == NF_INET_POST_ROUTING || +		     par->hooknum == NF_INET_LOCAL_OUT || +		     par->hooknum == NF_INET_LOCAL_IN); +	ct = nf_ct_get(skb, &ctinfo); + +	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); + +	if (par->hooknum == NF_INET_PRE_ROUTING || +	    par->hooknum == NF_INET_LOCAL_OUT) +		new_ip = ip_hdr(skb)->daddr & ~netmask; +	else +		new_ip = ip_hdr(skb)->saddr & ~netmask; +	new_ip |= mr->range[0].min_ip & netmask; + +	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); +	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); +	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; +	newrange.min_addr.ip = new_ip; +	newrange.max_addr.ip = new_ip; +	newrange.min_proto   = mr->range[0].min; +	newrange.max_proto   = mr->range[0].max; + +	/* Hand modified range to generic setup. 
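+	 * (HOOK2MANIP() maps the hook to SRC vs. DST NAT, matching the
+	 * address rewritten above: daddr on PRE_ROUTING/LOCAL_OUT, saddr
+	 * otherwise.)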
*/ +	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static int netmap_tg4_check(const struct xt_tgchk_param *par) +{ +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + +	if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) { +		pr_debug("bad MAP_IPS.\n"); +		return -EINVAL; +	} +	if (mr->rangesize != 1) { +		pr_debug("bad rangesize %u.\n", mr->rangesize); +		return -EINVAL; +	} +	return 0; +} + +static struct xt_target netmap_tg_reg[] __read_mostly = { +	{ +		.name       = "NETMAP", +		.family     = NFPROTO_IPV6, +		.revision   = 0, +		.target     = netmap_tg6, +		.targetsize = sizeof(struct nf_nat_range), +		.table      = "nat", +		.hooks      = (1 << NF_INET_PRE_ROUTING) | +		              (1 << NF_INET_POST_ROUTING) | +		              (1 << NF_INET_LOCAL_OUT) | +		              (1 << NF_INET_LOCAL_IN), +		.checkentry = netmap_tg6_checkentry, +		.me         = THIS_MODULE, +	}, +	{ +		.name       = "NETMAP", +		.family     = NFPROTO_IPV4, +		.revision   = 0, +		.target     = netmap_tg4, +		.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), +		.table      = "nat", +		.hooks      = (1 << NF_INET_PRE_ROUTING) | +		              (1 << NF_INET_POST_ROUTING) | +		              (1 << NF_INET_LOCAL_OUT) | +		              (1 << NF_INET_LOCAL_IN), +		.checkentry = netmap_tg4_check, +		.me         = THIS_MODULE, +	}, +}; + +static int __init netmap_tg_init(void) +{ +	return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); +} + +static void netmap_tg_exit(void) +{ +	xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); +} + +module_init(netmap_tg_init); +module_exit(netmap_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ip6t_NETMAP"); +MODULE_ALIAS("ipt_NETMAP"); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a17dd0f589b..fb7497c928a 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)  {  	const struct xt_nflog_info *info = par->targinfo;  	struct nf_loginfo li; +	struct net *net = dev_net(par->in ? 
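+			/* as in xt_LOG: par->in is NULL on OUTPUT */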
par->in : par->out);  	li.type		     = NF_LOG_TYPE_ULOG;  	li.u.ulog.copy_len   = info->len;  	li.u.ulog.group	     = info->group;  	li.u.ulog.qthreshold = info->threshold; -	nfulnl_log_packet(par->family, par->hooknum, skb, par->in, +	nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,  			  par->out, &li, info->prefix);  	return XT_CONTINUE;  } diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 039cce1bde3..8f1779ff7e3 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -11,15 +11,13 @@  #include <linux/module.h>  #include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> -  #include <linux/netfilter.h>  #include <linux/netfilter_arp.h>  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_NFQUEUE.h> +#include <net/netfilter/nf_queue.h> +  MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");  MODULE_DESCRIPTION("Xtables: packet forwarding to netlink");  MODULE_LICENSE("GPL"); @@ -28,7 +26,6 @@ MODULE_ALIAS("ip6t_NFQUEUE");  MODULE_ALIAS("arpt_NFQUEUE");  static u32 jhash_initval __read_mostly; -static bool rnd_inited __read_mostly;  static unsigned int  nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -38,32 +35,6 @@ nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par)  	return NF_QUEUE_NR(tinfo->queuenum);  } -static u32 hash_v4(const struct sk_buff *skb) -{ -	const struct iphdr *iph = ip_hdr(skb); -	__be32 ipaddr; - -	/* packets in either direction go into same queue */ -	ipaddr = iph->saddr ^ iph->daddr; - -	return jhash_2words((__force u32)ipaddr, iph->protocol, jhash_initval); -} - -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -static u32 hash_v6(const struct sk_buff *skb) -{ -	const struct ipv6hdr *ip6h = ipv6_hdr(skb); -	__be32 addr[4]; - -	addr[0] = ip6h->saddr.s6_addr32[0] ^ ip6h->daddr.s6_addr32[0]; -	addr[1] = ip6h->saddr.s6_addr32[1] ^ ip6h->daddr.s6_addr32[1]; -	addr[2] = ip6h->saddr.s6_addr32[2] ^ ip6h->daddr.s6_addr32[2]; -	addr[3] = ip6h->saddr.s6_addr32[3] ^ ip6h->daddr.s6_addr32[3]; - -	return jhash2((__force u32 *)addr, ARRAY_SIZE(addr), jhash_initval); -} -#endif -  static unsigned int  nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)  { @@ -71,25 +42,30 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)  	u32 queue = info->queuenum;  	if (info->queues_total > 1) { -		if (par->family == NFPROTO_IPV4) -			queue = hash_v4(skb) % info->queues_total + queue; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -		else if (par->family == NFPROTO_IPV6) -			queue = hash_v6(skb) % info->queues_total + queue; -#endif +		queue = nfqueue_hash(skb, queue, info->queues_total, +				     par->family, jhash_initval);  	}  	return NF_QUEUE_NR(queue);  } -static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par) +static unsigned int +nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)  { -	const struct xt_NFQ_info_v1 *info = par->targinfo; +	const struct xt_NFQ_info_v2 *info = par->targinfo; +	unsigned int ret = nfqueue_tg_v1(skb, par); + +	if (info->bypass) +		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; +	return ret; +} + +static int nfqueue_tg_check(const struct xt_tgchk_param *par) +{ +	const struct xt_NFQ_info_v3 *info = par->targinfo;  	u32 maxid; -	if (unlikely(!rnd_inited)) { -		get_random_bytes(&jhash_initval, sizeof(jhash_initval)); -		rnd_inited = true; -	} +	init_hashrandom(&jhash_initval); +  	if 
(info->queues_total == 0) {  		pr_err("NFQUEUE: number of total queues is 0\n");  		return -EINVAL; @@ -100,9 +76,39 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)  		       info->queues_total, maxid);  		return -ERANGE;  	} +	if (par->target->revision == 2 && info->flags > 1) +		return -EINVAL; +	if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) +		return -EINVAL; +  	return 0;  } +static unsigned int +nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_NFQ_info_v3 *info = par->targinfo; +	u32 queue = info->queuenum; +	int ret; + +	if (info->queues_total > 1) { +		if (info->flags & NFQ_FLAG_CPU_FANOUT) { +			int cpu = smp_processor_id(); + +			queue = info->queuenum + cpu % info->queues_total; +		} else { +			queue = nfqueue_hash(skb, queue, info->queues_total, +					     par->family, jhash_initval); +		} +	} + +	ret = NF_QUEUE_NR(queue); +	if (info->flags & NFQ_FLAG_BYPASS) +		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + +	return ret; +} +  static struct xt_target nfqueue_tg_reg[] __read_mostly = {  	{  		.name		= "NFQUEUE", @@ -115,11 +121,29 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {  		.name		= "NFQUEUE",  		.revision	= 1,  		.family		= NFPROTO_UNSPEC, -		.checkentry	= nfqueue_tg_v1_check, +		.checkentry	= nfqueue_tg_check,  		.target		= nfqueue_tg_v1,  		.targetsize	= sizeof(struct xt_NFQ_info_v1),  		.me		= THIS_MODULE,  	}, +	{ +		.name		= "NFQUEUE", +		.revision	= 2, +		.family		= NFPROTO_UNSPEC, +		.checkentry	= nfqueue_tg_check, +		.target		= nfqueue_tg_v2, +		.targetsize	= sizeof(struct xt_NFQ_info_v2), +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "NFQUEUE", +		.revision	= 3, +		.family		= NFPROTO_UNSPEC, +		.checkentry	= nfqueue_tg_check, +		.target		= nfqueue_tg_v3, +		.targetsize	= sizeof(struct xt_NFQ_info_v3), +		.me		= THIS_MODULE, +	},  };  static int __init nfqueue_tg_init(void) diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c deleted file mode 100644 index 9d782181b6c..00000000000 --- a/net/netfilter/xt_NOTRACK.c +++ /dev/null @@ -1,53 +0,0 @@ -/* This is a module which is used for setting up fake conntracks - * on packets so that they are not seen by the conntrack/NAT code. - */ -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_conntrack.h> - -MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ipt_NOTRACK"); -MODULE_ALIAS("ip6t_NOTRACK"); - -static unsigned int -notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ -	/* Previously seen (loopback)? Ignore. */ -	if (skb->nfct != NULL) -		return XT_CONTINUE; - -	/* Attach fake conntrack entry. -	   If there is a real ct entry correspondig to this packet, -	   it'll hang aroun till timing out. We don't deal with it -	   for performance reasons. 
JK */ -	skb->nfct = &nf_ct_untracked_get()->ct_general; -	skb->nfctinfo = IP_CT_NEW; -	nf_conntrack_get(skb->nfct); - -	return XT_CONTINUE; -} - -static struct xt_target notrack_tg_reg __read_mostly = { -	.name     = "NOTRACK", -	.revision = 0, -	.family   = NFPROTO_UNSPEC, -	.target   = notrack_tg, -	.table    = "raw", -	.me       = THIS_MODULE, -}; - -static int __init notrack_tg_init(void) -{ -	return xt_register_target(&notrack_tg_reg); -} - -static void __exit notrack_tg_exit(void) -{ -	xt_unregister_target(&notrack_tg_reg); -} - -module_init(notrack_tg_init); -module_exit(notrack_tg_exit); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index de079abd5bc..370adf622ce 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -43,12 +43,11 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)  struct xt_rateest *xt_rateest_lookup(const char *name)  {  	struct xt_rateest *est; -	struct hlist_node *n;  	unsigned int h;  	h = xt_rateest_hash(name);  	mutex_lock(&xt_rateest_mutex); -	hlist_for_each_entry(est, n, &rateest_hash[h], list) { +	hlist_for_each_entry(est, &rateest_hash[h], list) {  		if (strcmp(est->name, name) == 0) {  			est->refcnt++;  			mutex_unlock(&xt_rateest_mutex); @@ -60,11 +59,6 @@ struct xt_rateest *xt_rateest_lookup(const char *name)  }  EXPORT_SYMBOL_GPL(xt_rateest_lookup); -static void xt_rateest_free_rcu(struct rcu_head *head) -{ -	kfree(container_of(head, struct xt_rateest, rcu)); -} -  void xt_rateest_put(struct xt_rateest *est)  {  	mutex_lock(&xt_rateest_mutex); @@ -75,7 +69,7 @@ void xt_rateest_put(struct xt_rateest *est)  		 * gen_estimator est_timer() might access est->lock or bstats,  		 * wait a RCU grace period before freeing 'est'  		 */ -		call_rcu(&est->rcu, xt_rateest_free_rcu); +		kfree_rcu(est, rcu);  	}  	mutex_unlock(&xt_rateest_mutex);  } @@ -188,7 +182,6 @@ static int __init xt_rateest_tg_init(void)  static void __exit xt_rateest_tg_fini(void)  {  	xt_unregister_target(&xt_rateest_tg_reg); -	rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */  } diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c new file mode 100644 index 00000000000..22a10309297 --- /dev/null +++ b/net/netfilter/xt_REDIRECT.c @@ -0,0 +1,190 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 + * NAT funded by Astaro.
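+ * The IPv6 variant mirrors the IPv4 logic: rewrite the destination to the
+ * first configured address of the incoming interface, or to loopback for
+ * locally generated packets.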
+ */ + +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/types.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/addrconf.h> +#include <net/checksum.h> +#include <net/protocol.h> +#include <net/netfilter/nf_nat.h> + +static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; + +static unsigned int +redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct nf_nat_range *range = par->targinfo; +	struct nf_nat_range newrange; +	struct in6_addr newdst; +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	if (par->hooknum == NF_INET_LOCAL_OUT) +		newdst = loopback_addr; +	else { +		struct inet6_dev *idev; +		struct inet6_ifaddr *ifa; +		bool addr = false; + +		rcu_read_lock(); +		idev = __in6_dev_get(skb->dev); +		if (idev != NULL) { +			list_for_each_entry(ifa, &idev->addr_list, if_list) { +				newdst = ifa->addr; +				addr = true; +				break; +			} +		} +		rcu_read_unlock(); + +		if (!addr) +			return NF_DROP; +	} + +	newrange.flags		= range->flags | NF_NAT_RANGE_MAP_IPS; +	newrange.min_addr.in6	= newdst; +	newrange.max_addr.in6	= newdst; +	newrange.min_proto	= range->min_proto; +	newrange.max_proto	= range->max_proto; + +	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + +static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) +{ +	const struct nf_nat_range *range = par->targinfo; + +	if (range->flags & NF_NAT_RANGE_MAP_IPS) +		return -EINVAL; +	return 0; +} + +/* FIXME: Take multiple ranges --RR */ +static int redirect_tg4_check(const struct xt_tgchk_param *par) +{ +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + +	if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { +		pr_debug("bad MAP_IPS.\n"); +		return -EINVAL; +	} +	if (mr->rangesize != 1) { +		pr_debug("bad rangesize %u.\n", mr->rangesize); +		return -EINVAL; +	} +	return 0; +} + +static unsigned int +redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ +	struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	__be32 newdst; +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; +	struct nf_nat_range newrange; + +	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || +		     par->hooknum == NF_INET_LOCAL_OUT); + +	ct = nf_ct_get(skb, &ctinfo); +	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + +	/* Local packets: make them go to loopback */ +	if (par->hooknum == NF_INET_LOCAL_OUT) +		newdst = htonl(0x7F000001); +	else { +		struct in_device *indev; +		struct in_ifaddr *ifa; + +		newdst = 0; + +		rcu_read_lock(); +		indev = __in_dev_get_rcu(skb->dev); +		if (indev && (ifa = indev->ifa_list)) +			newdst = ifa->ifa_local; +		rcu_read_unlock(); + +		if (!newdst) +			return NF_DROP; +	} + +	/* Transfer from original range. */ +	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); +	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); +	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; +	newrange.min_addr.ip = newdst; +	newrange.max_addr.ip = newdst; +	newrange.min_proto   = mr->range[0].min; +	newrange.max_proto   = mr->range[0].max; + +	/* Hand modified range to generic setup. 
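+	 * (Only the address is forced here; the proto range, i.e. --to-ports,
+	 * is passed through from the rule unchanged.)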
*/ +	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + +static struct xt_target redirect_tg_reg[] __read_mostly = { +	{ +		.name       = "REDIRECT", +		.family     = NFPROTO_IPV6, +		.revision   = 0, +		.table      = "nat", +		.checkentry = redirect_tg6_checkentry, +		.target     = redirect_tg6, +		.targetsize = sizeof(struct nf_nat_range), +		.hooks      = (1 << NF_INET_PRE_ROUTING) | +		              (1 << NF_INET_LOCAL_OUT), +		.me         = THIS_MODULE, +	}, +	{ +		.name       = "REDIRECT", +		.family     = NFPROTO_IPV4, +		.revision   = 0, +		.table      = "nat", +		.target     = redirect_tg4, +		.checkentry = redirect_tg4_check, +		.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), +		.hooks      = (1 << NF_INET_PRE_ROUTING) | +		              (1 << NF_INET_LOCAL_OUT), +		.me         = THIS_MODULE, +	}, +}; + +static int __init redirect_tg_init(void) +{ +	return xt_register_targets(redirect_tg_reg, +				   ARRAY_SIZE(redirect_tg_reg)); +} + +static void __exit redirect_tg_exit(void) +{ +	xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); +} + +module_init(redirect_tg_init); +module_exit(redirect_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); +MODULE_ALIAS("ip6t_REDIRECT"); +MODULE_ALIAS("ipt_REDIRECT"); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index eb81c380da1..e762de5ee89 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -2,6 +2,7 @@   * This is a module which is used for setting the MSS option in TCP packets.   *   * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Copyright (C) 2007 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -42,40 +43,82 @@ optlen(const u_int8_t *opt, unsigned int offset)  		return opt[offset+1];  } +static u_int32_t tcpmss_reverse_mtu(struct net *net, +				    const struct sk_buff *skb, +				    unsigned int family) +{ +	struct flowi fl; +	const struct nf_afinfo *ai; +	struct rtable *rt = NULL; +	u_int32_t mtu     = ~0U; + +	if (family == PF_INET) { +		struct flowi4 *fl4 = &fl.u.ip4; +		memset(fl4, 0, sizeof(*fl4)); +		fl4->daddr = ip_hdr(skb)->saddr; +	} else { +		struct flowi6 *fl6 = &fl.u.ip6; + +		memset(fl6, 0, sizeof(*fl6)); +		fl6->daddr = ipv6_hdr(skb)->saddr; +	} +	rcu_read_lock(); +	ai = nf_get_afinfo(family); +	if (ai != NULL) +		ai->route(net, (struct dst_entry **)&rt, &fl, false); +	rcu_read_unlock(); + +	if (rt != NULL) { +		mtu = dst_mtu(&rt->dst); +		dst_release(&rt->dst); +	} +	return mtu; +} +  static int  tcpmss_mangle_packet(struct sk_buff *skb, -		     const struct xt_tcpmss_info *info, -		     unsigned int in_mtu, +		     const struct xt_action_param *par, +		     unsigned int family,  		     unsigned int tcphoff,  		     unsigned int minlen)  { +	const struct xt_tcpmss_info *info = par->targinfo;  	struct tcphdr *tcph; -	unsigned int tcplen, i; +	int len, tcp_hdrlen; +	unsigned int i;  	__be16 oldval;  	u16 newmss;  	u8 *opt; +	/* This is a fragment, no TCP header is available */ +	if (par->fragoff != 0) +		return 0; +  	if (!skb_make_writable(skb, skb->len))  		return -1; -	tcplen = skb->len - tcphoff; +	len = skb->len - tcphoff; +	if (len < (int)sizeof(struct tcphdr)) +		return -1; +  	tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); +	tcp_hdrlen = tcph->doff * 4; -	/* Header cannot be 
larger than the packet */ -	if (tcplen < tcph->doff*4) +	if (len < tcp_hdrlen)  		return -1;  	if (info->mss == XT_TCPMSS_CLAMP_PMTU) { +		struct net *net = dev_net(par->in ? par->in : par->out); +		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); +  		if (dst_mtu(skb_dst(skb)) <= minlen) { -			if (net_ratelimit()) -				pr_err("unknown or invalid path-MTU (%u)\n", -				       dst_mtu(skb_dst(skb))); +			net_err_ratelimited("unknown or invalid path-MTU (%u)\n", +					    dst_mtu(skb_dst(skb)));  			return -1;  		}  		if (in_mtu <= minlen) { -			if (net_ratelimit()) -				pr_err("unknown or invalid path-MTU (%u)\n", -				       in_mtu); +			net_err_ratelimited("unknown or invalid path-MTU (%u)\n", +					    in_mtu);  			return -1;  		}  		newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; @@ -83,9 +126,8 @@ tcpmss_mangle_packet(struct sk_buff *skb,  		newmss = info->mss;  	opt = (u_int8_t *)tcph; -	for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) { -		if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS && -		    opt[i+1] == TCPOLEN_MSS) { +	for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { +		if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) {  			u_int16_t oldmss;  			oldmss = (opt[i+2] << 8) | opt[i+3]; @@ -108,9 +150,10 @@ tcpmss_mangle_packet(struct sk_buff *skb,  	}  	/* There is data after the header so the option can't be added -	   without moving it, and doing so may make the SYN packet -	   itself too large. Accept the packet unmodified instead. */ -	if (tcplen > tcph->doff*4) +	 * without moving it, and doing so may make the SYN packet +	 * itself too large. Accept the packet unmodified instead. +	 */ +	if (len > tcp_hdrlen)  		return 0;  	/* @@ -126,11 +169,23 @@ tcpmss_mangle_packet(struct sk_buff *skb,  	skb_put(skb, TCPOLEN_MSS); +	/* +	 * IPv4: RFC 1122 states "If an MSS option is not received at +	 * connection setup, TCP MUST assume a default send MSS of 536". 
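+	 * (Editor's note: 536 is the 576-octet minimum reassembly buffer
+	 * size of RFC 791 minus 40 octets of IPv4 and TCP headers, as
+	 * spelled out in RFC 879.)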
+	 * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280; with 40
+	 * octets of IPv6 header and 20 octets of TCP header, that leaves a
+	 * default MSS value of 1220.
+	 * Since no MSS was provided, we must use the default values.
+	 */
+	if (par->family == NFPROTO_IPV4)
+		newmss = min(newmss, (u16)536);
+	else
+		newmss = min(newmss, (u16)1220);
+
 	opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
-	memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
+	memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr));
 	inet_proto_csum_replace2(&tcph->check, skb,
-				 htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1);
+				 htons(len), htons(len + TCPOLEN_MSS), 1);
 	opt[0] = TCPOPT_MSS;
 	opt[1] = TCPOLEN_MSS;
 	opt[2] = (newmss & 0xff00) >> 8;
@@ -145,32 +200,6 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 	return TCPOLEN_MSS;
 }
-static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
-				    unsigned int family)
-{
-	struct flowi fl = {};
-	const struct nf_afinfo *ai;
-	struct rtable *rt = NULL;
-	u_int32_t mtu     = ~0U;
-
-	if (family == PF_INET)
-		fl.fl4_dst = ip_hdr(skb)->saddr;
-	else
-		fl.fl6_dst = ipv6_hdr(skb)->saddr;
-
-	rcu_read_lock();
-	ai = nf_get_afinfo(family);
-	if (ai != NULL)
-		ai->route((struct dst_entry **)&rt, &fl);
-	rcu_read_unlock();
-
-	if (rt != NULL) {
-		mtu = dst_mtu(&rt->dst);
-		dst_release(&rt->dst);
-	}
-	return mtu;
-}
-
 static unsigned int
 tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -178,8 +207,8 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	__be16 newlen;
 	int ret;
-	ret = tcpmss_mangle_packet(skb, par->targinfo,
-				   tcpmss_reverse_mtu(skb, PF_INET),
+	ret = tcpmss_mangle_packet(skb, par,
+				   PF_INET,
 				   iph->ihl * 4,
 				   sizeof(*iph) + sizeof(struct tcphdr));
 	if (ret < 0)
@@ -193,21 +222,22 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 static unsigned int
 tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	u8 nexthdr;
+	__be16 frag_off;
 	int tcphoff;
 	int ret;
 
 	nexthdr = ipv6h->nexthdr;
-	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
+	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
 	if (tcphoff < 0)
 		return NF_DROP;
-	ret = tcpmss_mangle_packet(skb, par->targinfo,
-				   tcpmss_reverse_mtu(skb, PF_INET6),
+	ret = tcpmss_mangle_packet(skb, par,
+				   PF_INET6,
 				   tcphoff,
 				   sizeof(*ipv6h) + sizeof(struct tcphdr));
 	if (ret < 0)
@@ -254,7 +284,7 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
 	return -EINVAL;
 }
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
 {
 	const struct xt_tcpmss_info *info = par->targinfo;
@@ -287,7 +317,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
 		.proto		= IPPROTO_TCP,
 		.me		= THIS_MODULE,
 	},
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	{
 		.family		= NFPROTO_IPV6,
 		.name		= "TCPMSS",
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 9dc9ecfdd54..625fa1d636a 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -30,28 +30,43 @@ static inline unsigned int
optlen(const u_int8_t *opt, unsigned int offset)  static unsigned int  tcpoptstrip_mangle_packet(struct sk_buff *skb, -			  const struct xt_tcpoptstrip_target_info *info, +			  const struct xt_action_param *par,  			  unsigned int tcphoff, unsigned int minlen)  { +	const struct xt_tcpoptstrip_target_info *info = par->targinfo;  	unsigned int optl, i, j;  	struct tcphdr *tcph;  	u_int16_t n, o;  	u_int8_t *opt; +	int len, tcp_hdrlen; + +	/* This is a fragment, no TCP header is available */ +	if (par->fragoff != 0) +		return XT_CONTINUE;  	if (!skb_make_writable(skb, skb->len))  		return NF_DROP; +	len = skb->len - tcphoff; +	if (len < (int)sizeof(struct tcphdr)) +		return NF_DROP; +  	tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); +	tcp_hdrlen = tcph->doff * 4; + +	if (len < tcp_hdrlen) +		return NF_DROP; +  	opt  = (u_int8_t *)tcph;  	/*  	 * Walk through all TCP options - if we find some option to remove,  	 * set all octets to %TCPOPT_NOP and adjust checksum.  	 */ -	for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { +	for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) {  		optl = optlen(opt, i); -		if (i + optl > tcp_hdrlen(skb)) +		if (i + optl > tcp_hdrlen)  			break;  		if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) @@ -76,24 +91,25 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,  static unsigned int  tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)  { -	return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), +	return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),  	       sizeof(struct iphdr) + sizeof(struct tcphdr));  } -#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)  static unsigned int  tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)  {  	struct ipv6hdr *ipv6h = ipv6_hdr(skb);  	int tcphoff;  	u_int8_t nexthdr; +	__be16 frag_off;  	nexthdr = ipv6h->nexthdr; -	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); +	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);  	if (tcphoff < 0)  		return NF_DROP; -	return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, +	return tcpoptstrip_mangle_packet(skb, par, tcphoff,  	       sizeof(*ipv6h) + sizeof(struct tcphdr));  }  #endif @@ -108,7 +124,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {  		.targetsize = sizeof(struct xt_tcpoptstrip_target_info),  		.me         = THIS_MODULE,  	}, -#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)  	{  		.name       = "TCPOPTSTRIP",  		.family     = NFPROTO_IPV6, diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 5128a6c4cb2..292934d2348 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -25,13 +25,10 @@  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_TEE.h> -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  #	define WITH_CONNTRACK 1  #	include <net/netfilter/nf_conntrack.h>  #endif -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -#	define WITH_IPV6 1 -#endif  struct xt_tee_priv {  	struct notifier_block	notifier; @@ -62,18 +59,20 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)  	const struct iphdr *iph = ip_hdr(skb);  	struct net *net = pick_net(skb);  	struct rtable *rt; -	struct flowi fl; +	struct flowi4 fl4; -	memset(&fl, 0, sizeof(fl)); +	memset(&fl4, 0, 
sizeof(fl4));  	if (info->priv) {  		if (info->priv->oif == -1)  			return false; -		fl.oif = info->priv->oif; +		fl4.flowi4_oif = info->priv->oif;  	} -	fl.fl4_dst = info->gw.ip; -	fl.fl4_tos = RT_TOS(iph->tos); -	fl.fl4_scope = RT_SCOPE_UNIVERSE; -	if (ip_route_output_key(net, &rt, &fl) != 0) +	fl4.daddr = info->gw.ip; +	fl4.flowi4_tos = RT_TOS(iph->tos); +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE; +	fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; +	rt = ip_route_output_key(net, &fl4); +	if (IS_ERR(rt))  		return false;  	skb_dst_drop(skb); @@ -89,7 +88,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)  	const struct xt_tee_tginfo *info = par->targinfo;  	struct iphdr *iph; -	if (percpu_read(tee_active)) +	if (__this_cpu_read(tee_active))  		return XT_CONTINUE;  	/*  	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -126,37 +125,38 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)  	ip_send_check(iph);  	if (tee_tg_route4(skb, info)) { -		percpu_write(tee_active, true); +		__this_cpu_write(tee_active, true);  		ip_local_out(skb); -		percpu_write(tee_active, false); +		__this_cpu_write(tee_active, false);  	} else {  		kfree_skb(skb);  	}  	return XT_CONTINUE;  } -#ifdef WITH_IPV6 +#if IS_ENABLED(CONFIG_IPV6)  static bool  tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)  {  	const struct ipv6hdr *iph = ipv6_hdr(skb);  	struct net *net = pick_net(skb);  	struct dst_entry *dst; -	struct flowi fl; +	struct flowi6 fl6; -	memset(&fl, 0, sizeof(fl)); +	memset(&fl6, 0, sizeof(fl6));  	if (info->priv) {  		if (info->priv->oif == -1)  			return false; -		fl.oif = info->priv->oif; +		fl6.flowi6_oif = info->priv->oif;  	} -	fl.fl6_dst = info->gw.in6; -	fl.fl6_flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | +	fl6.daddr = info->gw.in6; +	fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |  			   (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; -	dst = ip6_route_output(net, NULL, &fl); -	if (dst == NULL) +	dst = ip6_route_output(net, NULL, &fl6); +	if (dst->error) { +		dst_release(dst);  		return false; - +	}  	skb_dst_drop(skb);  	skb_dst_set(skb, dst);  	skb->dev      = dst->dev; @@ -169,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)  {  	const struct xt_tee_tginfo *info = par->targinfo; -	if (percpu_read(tee_active)) +	if (__this_cpu_read(tee_active))  		return XT_CONTINUE;  	skb = pskb_copy(skb, GFP_ATOMIC);  	if (skb == NULL) @@ -187,20 +187,20 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)  		--iph->hop_limit;  	}  	if (tee_tg_route6(skb, info)) { -		percpu_write(tee_active, true); +		__this_cpu_write(tee_active, true);  		ip6_local_out(skb); -		percpu_write(tee_active, false); +		__this_cpu_write(tee_active, false);  	} else {  		kfree_skb(skb);  	}  	return XT_CONTINUE;  } -#endif /* WITH_IPV6 */ +#endif  static int tee_netdev_event(struct notifier_block *this, unsigned long event,  			    void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct xt_tee_priv *priv;  	priv = container_of(this, struct xt_tee_priv, notifier); @@ -275,7 +275,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {  		.destroy    = tee_tg_destroy,  		.me         = THIS_MODULE,  	}, -#ifdef WITH_IPV6 +#if IS_ENABLED(CONFIG_IPV6)  	{  		.name       = "TEE",  		.revision   = 1, diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 640678f47a2..ef8a926752a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ 
-15,24 +15,45 @@
 #include <linux/ip.h>
 #include <net/checksum.h>
 #include <net/udp.h>
+#include <net/tcp.h>
 #include <net/inet_sock.h>
+#include <net/inet_hashtables.h>
 #include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 #define XT_TPROXY_HAVE_IPV6 1
 #include <net/if_inet6.h>
 #include <net/addrconf.h>
+#include <net/inet6_hashtables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #endif
-#include <net/netfilter/nf_tproxy_core.h>
 #include <linux/netfilter/xt_TPROXY.h>
+enum nf_tproxy_lookup_t {
+	 NFT_LOOKUP_LISTENER,
+	 NFT_LOOKUP_ESTABLISHED,
+};
+
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+	if (sk->sk_state != TCP_TIME_WAIT) {
+		if (inet_sk(sk)->transparent)
+			return true;
+		sock_put(sk);
+	} else {
+		if (inet_twsk(sk)->tw_transparent)
+			return true;
+		inet_twsk_put(inet_twsk(sk));
+	}
+	return false;
+}
+
 static inline __be32
 tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 {
@@ -54,8 +75,159 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 	return laddr ? laddr : daddr;
 }
+/*
+ * This is used when the user wants to intercept a connection matching
+ * an explicit iptables rule. In this case the sockets are matched in
+ * the following preference order:
+ *
+ *   - match: if there's a fully established connection matching the
+ *     _packet_ tuple, it is returned, assuming the redirection
+ *     already took place and we process a packet belonging to an
+ *     established connection
+ *
+ *   - match: if there's a listening socket matching the redirection
+ *     (e.g. on-port & on-ip of the connection), it is returned,
+ *     regardless of whether it was bound to 0.0.0.0 or an explicit
+ *     address. The reasoning is that if there's an explicit rule, it
+ *     does not really matter if the listener is bound to an interface
+ *     or to 0. The user already stated that they want redirection
+ *     (since they added the rule).
+ *
+ * Please note that there's an overlap between what a TPROXY target
+ * and a socket match will match. Normally, if you have both rules,
+ * the "socket" match will be the first one, so effectively all
+ * packets belonging to established connections go through that one.
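+ *
+ * An illustrative ruleset (editor's sketch after the tproxy
+ * documentation; ports and marks are examples only):
+ *
+ *   iptables -t mangle -N DIVERT
+ *   iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
+ *   iptables -t mangle -A DIVERT -j MARK --set-mark 1
+ *   iptables -t mangle -A DIVERT -j ACCEPT
+ *   iptables -t mangle -A PREROUTING -p tcp --dport 80 \
+ *     -j TPROXY --tproxy-mark 0x1/0x1 --on-port 50080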
+ */ +static inline struct sock * +nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, +		      const __be32 saddr, const __be32 daddr, +		      const __be16 sport, const __be16 dport, +		      const struct net_device *in, +		      const enum nf_tproxy_lookup_t lookup_type) +{ +	struct sock *sk; + +	switch (protocol) { +	case IPPROTO_TCP: +		switch (lookup_type) { +		case NFT_LOOKUP_LISTENER: +			sk = inet_lookup_listener(net, &tcp_hashinfo, +						    saddr, sport, +						    daddr, dport, +						    in->ifindex); + +			/* NOTE: we return listeners even if bound to +			 * 0.0.0.0, those are filtered out in +			 * xt_socket, since xt_TPROXY needs 0 bound +			 * listeners too +			 */ +			break; +		case NFT_LOOKUP_ESTABLISHED: +			sk = inet_lookup_established(net, &tcp_hashinfo, +						    saddr, sport, daddr, dport, +						    in->ifindex); +			break; +		default: +			BUG(); +		} +		break; +	case IPPROTO_UDP: +		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, +				     in->ifindex); +		if (sk) { +			int connected = (sk->sk_state == TCP_ESTABLISHED); +			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + +			/* NOTE: we return listeners even if bound to +			 * 0.0.0.0, those are filtered out in +			 * xt_socket, since xt_TPROXY needs 0 bound +			 * listeners too +			 */ +			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || +			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) { +				sock_put(sk); +				sk = NULL; +			} +		} +		break; +	default: +		WARN_ON(1); +		sk = NULL; +	} + +	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", +		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + +	return sk; +} + +#ifdef XT_TPROXY_HAVE_IPV6 +static inline struct sock * +nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, +		      const struct in6_addr *saddr, const struct in6_addr *daddr, +		      const __be16 sport, const __be16 dport, +		      const struct net_device *in, +		      const enum nf_tproxy_lookup_t lookup_type) +{ +	struct sock *sk; + +	switch (protocol) { +	case IPPROTO_TCP: +		switch (lookup_type) { +		case NFT_LOOKUP_LISTENER: +			sk = inet6_lookup_listener(net, &tcp_hashinfo, +						   saddr, sport, +						   daddr, ntohs(dport), +						   in->ifindex); + +			/* NOTE: we return listeners even if bound to +			 * 0.0.0.0, those are filtered out in +			 * xt_socket, since xt_TPROXY needs 0 bound +			 * listeners too +			 */ +			break; +		case NFT_LOOKUP_ESTABLISHED: +			sk = __inet6_lookup_established(net, &tcp_hashinfo, +							saddr, sport, daddr, ntohs(dport), +							in->ifindex); +			break; +		default: +			BUG(); +		} +		break; +	case IPPROTO_UDP: +		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, +				     in->ifindex); +		if (sk) { +			int connected = (sk->sk_state == TCP_ESTABLISHED); +			int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr); + +			/* NOTE: we return listeners even if bound to +			 * 0.0.0.0, those are filtered out in +			 * xt_socket, since xt_TPROXY needs 0 bound +			 * listeners too +			 */ +			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || +			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) { +				sock_put(sk); +				sk = NULL; +			} +		} +		break; +	default: +		WARN_ON(1); +		sk = NULL; +	} + +	pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", +		 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + +	return sk; +} +#endif +  /** - * 
tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections   * @skb:	The skb being processed.   * @laddr:	IPv4 address to redirect to or zero.   * @lport:	TCP port to redirect to or zero. @@ -103,6 +275,15 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,  	return sk;  } +/* assign a socket to the skb -- consumes sk */ +static void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ +	skb_orphan(skb); +	skb->sk = sk; +	skb->destructor = sock_edemux; +} +  static unsigned int  tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,  	   u_int32_t mark_mask, u_int32_t mark_value) @@ -141,7 +322,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,  					   skb->dev, NFT_LOOKUP_LISTENER);  	/* NOTE: assign_sock consumes our sk reference */ -	if (sk && nf_tproxy_assign_sock(skb, sk)) { +	if (sk && tproxy_sk_is_transparent(sk)) {  		/* This should be in a separate target, but we don't do multiple  		   targets on the same rule yet */  		skb->mark = (skb->mark & ~mark_mask) ^ mark_value; @@ -149,6 +330,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,  		pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",  			 iph->protocol, &iph->daddr, ntohs(hp->dest),  			 &laddr, ntohs(lport), skb->mark); + +		nf_tproxy_assign_sock(skb, sk);  		return NF_ACCEPT;  	} @@ -204,7 +387,7 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,  }  /** - * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections   * @skb:	The skb being processed.   * @tproto:	Transport protocol.   * @thoff:	Transport protocol header offset. @@ -266,10 +449,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)  	struct sock *sk;  	const struct in6_addr *laddr;  	__be16 lport; -	int thoff; +	int thoff = 0;  	int tproto; -	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); +	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);  	if (tproto < 0) {  		pr_debug("unable to find transport header in IPv6 packet, dropping\n");  		return NF_DROP; @@ -306,7 +489,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)  					   par->in, NFT_LOOKUP_LISTENER);  	/* NOTE: assign_sock consumes our sk reference */ -	if (sk && nf_tproxy_assign_sock(skb, sk)) { +	if (sk && tproxy_sk_is_transparent(sk)) {  		/* This should be in a separate target, but we don't do multiple  		   targets on the same rule yet */  		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; @@ -314,6 +497,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)  		pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",  			 tproto, &iph->saddr, ntohs(hp->source),  			 laddr, ntohs(lport), skb->mark); + +		nf_tproxy_assign_sock(skb, sk);  		return NF_ACCEPT;  	} diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c new file mode 100644 index 00000000000..fab6eea1bf3 --- /dev/null +++ b/net/netfilter/xt_addrtype.c @@ -0,0 +1,248 @@ +/* + *  iptables module to match inet_addr_type() of an ip. + * + *  Copyright (c) 2004 Patrick McHardy <kaber@trash.net> + *  (C) 2007 Laszlo Attila Toth <panther@balabit.hu> + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 as + *  published by the Free Software Foundation. 
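+ *
+ *  Example (editor's illustration, not part of the patch):
+ *
+ *	iptables -A INPUT -m addrtype --src-type BROADCAST -j DROP
+ *
+ *  This drops packets whose source address the routing table
+ *  classifies as broadcast.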
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/ip.h> +#include <net/route.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/ip6_fib.h> +#endif + +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/xt_addrtype.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: address type match"); +MODULE_ALIAS("ipt_addrtype"); +MODULE_ALIAS("ip6t_addrtype"); + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, +			    const struct in6_addr *addr, u16 mask) +{ +	const struct nf_afinfo *afinfo; +	struct flowi6 flow; +	struct rt6_info *rt; +	u32 ret = 0; +	int route_err; + +	memset(&flow, 0, sizeof(flow)); +	flow.daddr = *addr; +	if (dev) +		flow.flowi6_oif = dev->ifindex; + +	rcu_read_lock(); + +	afinfo = nf_get_afinfo(NFPROTO_IPV6); +	if (afinfo != NULL) { +		const struct nf_ipv6_ops *v6ops; + +		if (dev && (mask & XT_ADDRTYPE_LOCAL)) { +			v6ops = nf_get_ipv6_ops(); +			if (v6ops && v6ops->chk_addr(net, addr, dev, true)) +				ret = XT_ADDRTYPE_LOCAL; +		} +		route_err = afinfo->route(net, (struct dst_entry **)&rt, +					  flowi6_to_flowi(&flow), false); +	} else { +		route_err = 1; +	} +	rcu_read_unlock(); + +	if (route_err) +		return XT_ADDRTYPE_UNREACHABLE; + +	if (rt->rt6i_flags & RTF_REJECT) +		ret = XT_ADDRTYPE_UNREACHABLE; + +	if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) +		ret |= XT_ADDRTYPE_LOCAL; +	if (rt->rt6i_flags & RTF_ANYCAST) +		ret |= XT_ADDRTYPE_ANYCAST; + +	dst_release(&rt->dst); +	return ret; +} + +static bool match_type6(struct net *net, const struct net_device *dev, +				const struct in6_addr *addr, u16 mask) +{ +	int addr_type = ipv6_addr_type(addr); + +	if ((mask & XT_ADDRTYPE_MULTICAST) && +	    !(addr_type & IPV6_ADDR_MULTICAST)) +		return false; +	if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) +		return false; +	if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) +		return false; + +	if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | +	     XT_ADDRTYPE_UNREACHABLE) & mask) +		return !!(mask & match_lookup_rt6(net, dev, addr, mask)); +	return true; +} + +static bool +addrtype_mt6(struct net *net, const struct net_device *dev, +	const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) +{ +	const struct ipv6hdr *iph = ipv6_hdr(skb); +	bool ret = true; + +	if (info->source) +		ret &= match_type6(net, dev, &iph->saddr, info->source) ^ +		       (info->flags & XT_ADDRTYPE_INVERT_SOURCE); +	if (ret && info->dest) +		ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ +		       !!(info->flags & XT_ADDRTYPE_INVERT_DEST); +	return ret; +} +#endif + +static inline bool match_type(struct net *net, const struct net_device *dev, +			      __be32 addr, u_int16_t mask) +{ +	return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); +} + +static bool +addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ +	struct net *net = dev_net(par->in ? 
par->in : par->out); +	const struct xt_addrtype_info *info = par->matchinfo; +	const struct iphdr *iph = ip_hdr(skb); +	bool ret = true; + +	if (info->source) +		ret &= match_type(net, NULL, iph->saddr, info->source) ^ +		       info->invert_source; +	if (info->dest) +		ret &= match_type(net, NULL, iph->daddr, info->dest) ^ +		       info->invert_dest; + +	return ret; +} + +static bool +addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ +	struct net *net = dev_net(par->in ? par->in : par->out); +	const struct xt_addrtype_info_v1 *info = par->matchinfo; +	const struct iphdr *iph; +	const struct net_device *dev = NULL; +	bool ret = true; + +	if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) +		dev = par->in; +	else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) +		dev = par->out; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	if (par->family == NFPROTO_IPV6) +		return addrtype_mt6(net, dev, skb, info); +#endif +	iph = ip_hdr(skb); +	if (info->source) +		ret &= match_type(net, dev, iph->saddr, info->source) ^ +		       (info->flags & XT_ADDRTYPE_INVERT_SOURCE); +	if (ret && info->dest) +		ret &= match_type(net, dev, iph->daddr, info->dest) ^ +		       !!(info->flags & XT_ADDRTYPE_INVERT_DEST); +	return ret; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ +	struct xt_addrtype_info_v1 *info = par->matchinfo; + +	if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && +	    info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { +		pr_info("both incoming and outgoing " +			"interface limitation cannot be selected\n"); +		return -EINVAL; +	} + +	if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | +	    (1 << NF_INET_LOCAL_IN)) && +	    info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { +		pr_info("output interface limitation " +			"not valid in PREROUTING and INPUT\n"); +		return -EINVAL; +	} + +	if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | +	    (1 << NF_INET_LOCAL_OUT)) && +	    info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { +		pr_info("input interface limitation " +			"not valid in POSTROUTING and OUTPUT\n"); +		return -EINVAL; +	} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	if (par->family == NFPROTO_IPV6) { +		if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { +			pr_err("ipv6 BLACKHOLE matching not supported\n"); +			return -EINVAL; +		} +		if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { +			pr_err("ipv6 PROHIBIT (THROW, NAT ..) 
matching not supported\n"); +			return -EINVAL; +		} +		if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { +			pr_err("ipv6 does not support BROADCAST matching\n"); +			return -EINVAL; +		} +	} +#endif +	return 0; +} + +static struct xt_match addrtype_mt_reg[] __read_mostly = { +	{ +		.name		= "addrtype", +		.family		= NFPROTO_IPV4, +		.match		= addrtype_mt_v0, +		.matchsize	= sizeof(struct xt_addrtype_info), +		.me		= THIS_MODULE +	}, +	{ +		.name		= "addrtype", +		.family		= NFPROTO_UNSPEC, +		.revision	= 1, +		.match		= addrtype_mt_v1, +		.checkentry	= addrtype_mt_checkentry_v1, +		.matchsize	= sizeof(struct xt_addrtype_info_v1), +		.me		= THIS_MODULE +	} +}; + +static int __init addrtype_mt_init(void) +{ +	return xt_register_matches(addrtype_mt_reg, +				   ARRAY_SIZE(addrtype_mt_reg)); +} + +static void __exit addrtype_mt_exit(void) +{ +	xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); +} + +module_init(addrtype_mt_init); +module_exit(addrtype_mt_exit); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c new file mode 100644 index 00000000000..bbffdbdaf60 --- /dev/null +++ b/net/netfilter/xt_bpf.c @@ -0,0 +1,74 @@ +/* Xtables module to match packets using a BPF filter. + * Copyright 2013 Google Inc. + * Written by Willem de Bruijn <willemb@google.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/filter.h> + +#include <linux/netfilter/xt_bpf.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Willem de Bruijn <willemb@google.com>"); +MODULE_DESCRIPTION("Xtables: BPF filter match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_bpf"); +MODULE_ALIAS("ip6t_bpf"); + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ +	struct xt_bpf_info *info = par->matchinfo; +	struct sock_fprog_kern program; + +	program.len = info->bpf_program_num_elem; +	program.filter = info->bpf_program; + +	if (sk_unattached_filter_create(&info->filter, &program)) { +		pr_info("bpf: check failed: parse error\n"); +		return -EINVAL; +	} + +	return 0; +} + +static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_bpf_info *info = par->matchinfo; + +	return SK_RUN_FILTER(info->filter, skb); +} + +static void bpf_mt_destroy(const struct xt_mtdtor_param *par) +{ +	const struct xt_bpf_info *info = par->matchinfo; +	sk_unattached_filter_destroy(info->filter); +} + +static struct xt_match bpf_mt_reg __read_mostly = { +	.name		= "bpf", +	.revision	= 0, +	.family		= NFPROTO_UNSPEC, +	.checkentry	= bpf_mt_check, +	.match		= bpf_mt, +	.destroy	= bpf_mt_destroy, +	.matchsize	= sizeof(struct xt_bpf_info), +	.me		= THIS_MODULE, +}; + +static int __init bpf_mt_init(void) +{ +	return xt_register_match(&bpf_mt_reg); +} + +static void __exit bpf_mt_exit(void) +{ +	xt_unregister_match(&bpf_mt_reg); +} + +module_init(bpf_mt_init); +module_exit(bpf_mt_exit); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c new file mode 100644 index 00000000000..f4e83300532 --- /dev/null +++ b/net/netfilter/xt_cgroup.c @@ -0,0 +1,72 @@ +/* + * Xtables module to match the process control group. + * + * Might be used to implement individual "per-application" firewall + * policies in contrast to global policies based on control groups. + * Matching is based upon processes tagged to net_cls' classid marker. 
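+ *
+ * Example (editor's illustration; the classid and group name are
+ * arbitrary):
+ *
+ *   mkdir /sys/fs/cgroup/net_cls/sockfilter
+ *   echo 0x100001 > /sys/fs/cgroup/net_cls/sockfilter/net_cls.classid
+ *   iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 \
+ *            -p tcp --dport 80 -j DROP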
+ * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_cgroup.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("Xtables: process control group matching"); +MODULE_ALIAS("ipt_cgroup"); +MODULE_ALIAS("ip6t_cgroup"); + +static int cgroup_mt_check(const struct xt_mtchk_param *par) +{ +	struct xt_cgroup_info *info = par->matchinfo; + +	if (info->invert & ~1) +		return -EINVAL; + +	return info->id ? 0 : -EINVAL; +} + +static bool +cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_cgroup_info *info = par->matchinfo; + +	if (skb->sk == NULL) +		return false; + +	return (info->id == skb->sk->sk_classid) ^ info->invert; +} + +static struct xt_match cgroup_mt_reg __read_mostly = { +	.name       = "cgroup", +	.revision   = 0, +	.family     = NFPROTO_UNSPEC, +	.checkentry = cgroup_mt_check, +	.match      = cgroup_mt, +	.matchsize  = sizeof(struct xt_cgroup_info), +	.me         = THIS_MODULE, +	.hooks      = (1 << NF_INET_LOCAL_OUT) | +		      (1 << NF_INET_POST_ROUTING) | +		      (1 << NF_INET_LOCAL_IN), +}; + +static int __init cgroup_mt_init(void) +{ +	return xt_register_match(&cgroup_mt_reg); +} + +static void __exit cgroup_mt_exit(void) +{ +	xt_unregister_match(&cgroup_mt_reg); +} + +module_init(cgroup_mt_init); +module_exit(cgroup_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 5b138506690..1e634615ab9 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -26,60 +26,62 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)  	u_int64_t what = 0;	/* initialize to make gcc happy */  	u_int64_t bytes = 0;  	u_int64_t pkts = 0; +	const struct nf_conn_acct *acct;  	const struct nf_conn_counter *counters;  	ct = nf_ct_get(skb, &ctinfo);  	if (!ct)  		return false; -	counters = nf_conn_acct_find(ct); -	if (!counters) +	acct = nf_conn_acct_find(ct); +	if (!acct)  		return false; +	counters = acct->counter;  	switch (sinfo->what) {  	case XT_CONNBYTES_PKTS:  		switch (sinfo->direction) {  		case XT_CONNBYTES_DIR_ORIGINAL: -			what = counters[IP_CT_DIR_ORIGINAL].packets; +			what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);  			break;  		case XT_CONNBYTES_DIR_REPLY: -			what = counters[IP_CT_DIR_REPLY].packets; +			what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets);  			break;  		case XT_CONNBYTES_DIR_BOTH: -			what = counters[IP_CT_DIR_ORIGINAL].packets; -			what += counters[IP_CT_DIR_REPLY].packets; +			what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); +			what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets);  			break;  		}  		break;  	case XT_CONNBYTES_BYTES:  		switch (sinfo->direction) {  		case XT_CONNBYTES_DIR_ORIGINAL: -			what = counters[IP_CT_DIR_ORIGINAL].bytes; +			what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);  			break;  		case XT_CONNBYTES_DIR_REPLY: -			what = counters[IP_CT_DIR_REPLY].bytes; +			what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);  			break;  		case XT_CONNBYTES_DIR_BOTH: -			what = counters[IP_CT_DIR_ORIGINAL].bytes; -			what += counters[IP_CT_DIR_REPLY].bytes; +			what = 
atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); +			what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);  			break;  		}  		break;  	case XT_CONNBYTES_AVGPKT:  		switch (sinfo->direction) {  		case XT_CONNBYTES_DIR_ORIGINAL: -			bytes = counters[IP_CT_DIR_ORIGINAL].bytes; -			pkts  = counters[IP_CT_DIR_ORIGINAL].packets; +			bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); +			pkts  = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);  			break;  		case XT_CONNBYTES_DIR_REPLY: -			bytes = counters[IP_CT_DIR_REPLY].bytes; -			pkts  = counters[IP_CT_DIR_REPLY].packets; +			bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); +			pkts  = atomic64_read(&counters[IP_CT_DIR_REPLY].packets);  			break;  		case XT_CONNBYTES_DIR_BOTH: -			bytes = counters[IP_CT_DIR_ORIGINAL].bytes + -				counters[IP_CT_DIR_REPLY].bytes; -			pkts  = counters[IP_CT_DIR_ORIGINAL].packets + -				counters[IP_CT_DIR_REPLY].packets; +			bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + +				atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); +			pkts  = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + +				atomic64_read(&counters[IP_CT_DIR_REPLY].packets);  			break;  		}  		if (pkts != 0) @@ -87,10 +89,10 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)  		break;  	} -	if (sinfo->count.to) +	if (sinfo->count.to >= sinfo->count.from)  		return what <= sinfo->count.to && what >= sinfo->count.from; -	else -		return what >= sinfo->count.from; +	else /* inverted */ +		return what < sinfo->count.to || what > sinfo->count.from;  }  static int connbytes_mt_check(const struct xt_mtchk_param *par) diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c new file mode 100644 index 00000000000..9f8719df200 --- /dev/null +++ b/net/netfilter/xt_connlabel.c @@ -0,0 +1,99 @@ +/* + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
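+ *
+ * Example (editor's illustration; label names are resolved to bit
+ * numbers via the userspace connlabel.conf file):
+ *
+ *   iptables -A INPUT -m conntrack --ctstate NEW \
+ *            -m connlabel --label "interesting" --set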
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("Xtables: add/match connection tracking labels");
+MODULE_ALIAS("ipt_connlabel");
+MODULE_ALIAS("ip6t_connlabel");
+
+static bool
+connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_connlabel_mtinfo *info = par->matchinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	bool invert = info->options & XT_CONNLABEL_OP_INVERT;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return invert;
+
+	if (info->options & XT_CONNLABEL_OP_SET)
+		return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
+
+	return nf_connlabel_match(ct, info->bit) ^ invert;
+}
+
+static int connlabel_mt_check(const struct xt_mtchk_param *par)
+{
+	const int options = XT_CONNLABEL_OP_INVERT |
+			    XT_CONNLABEL_OP_SET;
+	struct xt_connlabel_mtinfo *info = par->matchinfo;
+	int ret;
+	size_t words;
+
+	if (info->bit > XT_CONNLABEL_MAXBIT)
+		return -ERANGE;
+
+	if (info->options & ~options) {
+		pr_err("Unknown options in mask %x\n", info->options);
+		return -EINVAL;
+	}
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0) {
+		pr_info("cannot load conntrack support for proto=%u\n",
+							par->family);
+		return ret;
+	}
+
+	par->net->ct.labels_used++;
+	words = BITS_TO_LONGS(info->bit+1);
+	if (words > par->net->ct.label_words)
+		par->net->ct.label_words = words;
+
+	return ret;
+}
+
+static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	par->net->ct.labels_used--;
+	if (par->net->ct.labels_used == 0)
+		par->net->ct.label_words = 0;
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match connlabels_mt_reg __read_mostly = {
+	.name           = "connlabel",
+	.family         = NFPROTO_UNSPEC,
+	.checkentry     = connlabel_mt_check,
+	.match          = connlabel_mt,
+	.matchsize      = sizeof(struct xt_connlabel_mtinfo),
+	.destroy        = connlabel_mt_destroy,
+	.me             = THIS_MODULE,
+};
+
+static int __init connlabel_mt_init(void)
+{
+	return xt_register_match(&connlabels_mt_reg);
+}
+
+static void __exit connlabel_mt_exit(void)
+{
+	xt_unregister_match(&connlabels_mt_reg);
+}
+
+module_init(connlabel_mt_init);
+module_exit(connlabel_mt_exit);
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5c5b6b921b8..fbc66bb250d 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -19,6 +19,7 @@
 #include <linux/jhash.h>
 #include <linux/slab.h>
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/skbuff.h>
@@ -31,23 +32,44 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#define CONNLIMIT_SLOTS		256U
+
+#ifdef CONFIG_LOCKDEP
+#define CONNLIMIT_LOCK_SLOTS	8U
+#else
+#define CONNLIMIT_LOCK_SLOTS	256U
+#endif
+
+#define CONNLIMIT_GC_MAX_NODES	8
+
 /* we will save the tuples of all connections we care about */
 struct xt_connlimit_conn {
-	struct list_head list;
-	struct nf_conntrack_tuple tuple;
+	struct hlist_node		node;
+	struct nf_conntrack_tuple	tuple;
+	union nf_inet_addr		addr;
 };
+struct xt_connlimit_rb {
+	struct rb_node node;
+	struct hlist_head hhead; /* connections/hosts in same subnet */
+	union nf_inet_addr addr; /* search key
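+				 * (editor's note: the tree is ordered
+				 * by this address under the mask)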
*/ +}; + +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; +  struct xt_connlimit_data { -	struct list_head iphash[256]; -	spinlock_t lock; +	struct rb_root climit_root4[CONNLIMIT_SLOTS]; +	struct rb_root climit_root6[CONNLIMIT_SLOTS];  };  static u_int32_t connlimit_rnd __read_mostly; -static bool connlimit_rnd_inited __read_mostly; +static struct kmem_cache *connlimit_rb_cachep __read_mostly; +static struct kmem_cache *connlimit_conn_cachep __read_mostly;  static inline unsigned int connlimit_iphash(__be32 addr)  { -	return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; +	return jhash_1word((__force __u32)addr, +			    connlimit_rnd) % CONNLIMIT_SLOTS;  }  static inline unsigned int @@ -60,7 +82,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,  	for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)  		res.ip6[i] = addr->ip6[i] & mask->ip6[i]; -	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; +	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), +		       connlimit_rnd) % CONNLIMIT_SLOTS;  }  static inline bool already_closed(const struct nf_conn *conn) @@ -72,13 +95,14 @@ static inline bool already_closed(const struct nf_conn *conn)  		return 0;  } -static inline unsigned int +static int  same_source_net(const union nf_inet_addr *addr,  		const union nf_inet_addr *mask,  		const union nf_inet_addr *u3, u_int8_t family)  {  	if (family == NFPROTO_IPV4) { -		return (addr->ip & mask->ip) == (u3->ip & mask->ip); +		return ntohl(addr->ip & mask->ip) - +		       ntohl(u3->ip & mask->ip);  	} else {  		union nf_inet_addr lh, rh;  		unsigned int i; @@ -88,88 +112,205 @@ same_source_net(const union nf_inet_addr *addr,  			rh.ip6[i] = u3->ip6[i] & mask->ip6[i];  		} -		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; +		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6));  	}  } -static int count_them(struct net *net, -		      struct xt_connlimit_data *data, +static bool add_hlist(struct hlist_head *head,  		      const struct nf_conntrack_tuple *tuple, -		      const union nf_inet_addr *addr, -		      const union nf_inet_addr *mask, -		      u_int8_t family) +		      const union nf_inet_addr *addr) +{ +	struct xt_connlimit_conn *conn; + +	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); +	if (conn == NULL) +		return false; +	conn->tuple = *tuple; +	conn->addr = *addr; +	hlist_add_head(&conn->node, head); +	return true; +} + +static unsigned int check_hlist(struct net *net, +				struct hlist_head *head, +				const struct nf_conntrack_tuple *tuple, +				bool *addit)  {  	const struct nf_conntrack_tuple_hash *found;  	struct xt_connlimit_conn *conn; -	struct xt_connlimit_conn *tmp; +	struct hlist_node *n;  	struct nf_conn *found_ct; -	struct list_head *hash; -	bool addit = true; -	int matches = 0; - -	if (family == NFPROTO_IPV6) -		hash = &data->iphash[connlimit_iphash6(addr, mask)]; -	else -		hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; +	unsigned int length = 0; +	*addit = true;  	rcu_read_lock();  	/* check the saved connections */ -	list_for_each_entry_safe(conn, tmp, hash, list) { +	hlist_for_each_entry_safe(conn, n, head, node) {  		found    = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,  						 &conn->tuple); -		found_ct = NULL; +		if (found == NULL) { +			hlist_del(&conn->node); +			kmem_cache_free(connlimit_conn_cachep, conn); +			continue; +		} -		if (found != NULL) -			found_ct = nf_ct_tuplehash_to_ctrack(found); +		found_ct = nf_ct_tuplehash_to_ctrack(found); -		if (found_ct != NULL && -		   
 nf_ct_tuple_equal(&conn->tuple, tuple) && -		    !already_closed(found_ct)) +		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {  			/*  			 * Just to be sure we have it only once in the list.  			 * We should not see tuples twice unless someone hooks  			 * this into a table without "-p tcp --syn".  			 */ -			addit = false; - -		if (found == NULL) { -			/* this one is gone */ -			list_del(&conn->list); -			kfree(conn); -			continue; -		} - -		if (already_closed(found_ct)) { +			*addit = false; +		} else if (already_closed(found_ct)) {  			/*  			 * we do not care about connections which are  			 * closed already -> ditch it  			 */  			nf_ct_put(found_ct); -			list_del(&conn->list); -			kfree(conn); +			hlist_del(&conn->node); +			kmem_cache_free(connlimit_conn_cachep, conn);  			continue;  		} -		if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) -			/* same source network -> be counted! */ -			++matches;  		nf_ct_put(found_ct); +		length++;  	}  	rcu_read_unlock(); -	if (addit) { -		/* save the new connection in our list */ -		conn = kzalloc(sizeof(*conn), GFP_ATOMIC); -		if (conn == NULL) -			return -ENOMEM; -		conn->tuple = *tuple; -		list_add(&conn->list, hash); -		++matches; +	return length; +} + +static void tree_nodes_free(struct rb_root *root, +			    struct xt_connlimit_rb *gc_nodes[], +			    unsigned int gc_count) +{ +	struct xt_connlimit_rb *rbconn; + +	while (gc_count) { +		rbconn = gc_nodes[--gc_count]; +		rb_erase(&rbconn->node, root); +		kmem_cache_free(connlimit_rb_cachep, rbconn); +	} +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, +	   const struct nf_conntrack_tuple *tuple, +	   const union nf_inet_addr *addr, const union nf_inet_addr *mask, +	   u8 family) +{ +	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; +	struct rb_node **rbnode, *parent; +	struct xt_connlimit_rb *rbconn; +	struct xt_connlimit_conn *conn; +	unsigned int gc_count; +	bool no_gc = false; + + restart: +	gc_count = 0; +	parent = NULL; +	rbnode = &(root->rb_node); +	while (*rbnode) { +		int diff; +		bool addit; + +		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + +		parent = *rbnode; +		diff = same_source_net(addr, mask, &rbconn->addr, family); +		if (diff < 0) { +			rbnode = &((*rbnode)->rb_left); +		} else if (diff > 0) { +			rbnode = &((*rbnode)->rb_right); +		} else { +			/* same source network -> be counted! */ +			unsigned int count; +			count = check_hlist(net, &rbconn->hhead, tuple, &addit); + +			tree_nodes_free(root, gc_nodes, gc_count); +			if (!addit) +				return count; + +			if (!add_hlist(&rbconn->hhead, tuple, addr)) +				return 0; /* hotdrop */ + +			return count + 1; +		} + +		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) +			continue; + +		/* only used for GC on hhead, retval and 'addit' ignored */ +		check_hlist(net, &rbconn->hhead, tuple, &addit); +		if (hlist_empty(&rbconn->hhead)) +			gc_nodes[gc_count++] = rbconn; +	} + +	if (gc_count) { +		no_gc = true; +		tree_nodes_free(root, gc_nodes, gc_count); +		/* tree_node_free before new allocation permits +		 * allocator to re-use newly free'd object. +		 * +		 * This is a rare event; in most cases we will find +		 * existing node to re-use. (or gc_count is 0). 
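+		 * (Editor's note: the walk restarts from the root because
+		 * rb_erase() rebalances the tree, so any parent and link
+		 * pointers collected before the erase may be stale.)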
+		 */ +		goto restart;  	} -	return matches; +	/* no match, need to insert new node */ +	rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); +	if (rbconn == NULL) +		return 0; + +	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); +	if (conn == NULL) { +		kmem_cache_free(connlimit_rb_cachep, rbconn); +		return 0; +	} + +	conn->tuple = *tuple; +	conn->addr = *addr; +	rbconn->addr = *addr; + +	INIT_HLIST_HEAD(&rbconn->hhead); +	hlist_add_head(&conn->node, &rbconn->hhead); + +	rb_link_node(&rbconn->node, parent, rbnode); +	rb_insert_color(&rbconn->node, root); +	return 1; +} + +static int count_them(struct net *net, +		      struct xt_connlimit_data *data, +		      const struct nf_conntrack_tuple *tuple, +		      const union nf_inet_addr *addr, +		      const union nf_inet_addr *mask, +		      u_int8_t family) +{ +	struct rb_root *root; +	int count; +	u32 hash; + +	if (family == NFPROTO_IPV6) { +		hash = connlimit_iphash6(addr, mask); +		root = &data->climit_root6[hash]; +	} else { +		hash = connlimit_iphash(addr->ip & mask->ip); +		root = &data->climit_root4[hash]; +	} + +	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + +	count = count_tree(net, root, tuple, addr, mask, family); + +	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + +	return count;  }  static bool @@ -182,35 +323,33 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)  	const struct nf_conntrack_tuple *tuple_ptr = &tuple;  	enum ip_conntrack_info ctinfo;  	const struct nf_conn *ct; -	int connections; +	unsigned int connections;  	ct = nf_ct_get(skb, &ctinfo);  	if (ct != NULL) -		tuple_ptr = &ct->tuplehash[0].tuple; +		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;  	else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),  				    par->family, &tuple))  		goto hotdrop;  	if (par->family == NFPROTO_IPV6) {  		const struct ipv6hdr *iph = ipv6_hdr(skb); -		memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); +		memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? +		       &iph->daddr : &iph->saddr, sizeof(addr.ip6));  	} else {  		const struct iphdr *iph = ip_hdr(skb); -		addr.ip = iph->saddr; +		addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? 
+			  iph->daddr : iph->saddr;  	} -	spin_lock_bh(&info->data->lock);  	connections = count_them(net, info->data, tuple_ptr, &addr,  	                         &info->mask, par->family); -	spin_unlock_bh(&info->data->lock); - -	if (connections < 0) { +	if (connections == 0)  		/* kmalloc failed, drop it entirely */ -		par->hotdrop = true; -		return false; -	} +		goto hotdrop; -	return (connections > info->limit) ^ info->inverse; +	return (connections > info->limit) ^ +	       !!(info->flags & XT_CONNLIMIT_INVERT);   hotdrop:  	par->hotdrop = true; @@ -223,9 +362,13 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)  	unsigned int i;  	int ret; -	if (unlikely(!connlimit_rnd_inited)) { -		get_random_bytes(&connlimit_rnd, sizeof(connlimit_rnd)); -		connlimit_rnd_inited = true; +	if (unlikely(!connlimit_rnd)) { +		u_int32_t rand; + +		do { +			get_random_bytes(&rand, sizeof(rand)); +		} while (!rand); +		cmpxchg(&connlimit_rnd, 0, rand);  	}  	ret = nf_ct_l3proto_try_module_get(par->family);  	if (ret < 0) { @@ -241,36 +384,51 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)  		return -ENOMEM;  	} -	spin_lock_init(&info->data->lock); -	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) -		INIT_LIST_HEAD(&info->data->iphash[i]); +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) +		info->data->climit_root4[i] = RB_ROOT; +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) +		info->data->climit_root6[i] = RB_ROOT;  	return 0;  } +static void destroy_tree(struct rb_root *r) +{ +	struct xt_connlimit_conn *conn; +	struct xt_connlimit_rb *rbconn; +	struct hlist_node *n; +	struct rb_node *node; + +	while ((node = rb_first(r)) != NULL) { +		rbconn = container_of(node, struct xt_connlimit_rb, node); + +		rb_erase(node, r); + +		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) +			kmem_cache_free(connlimit_conn_cachep, conn); + +		kmem_cache_free(connlimit_rb_cachep, rbconn); +	} +} +  static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)  {  	const struct xt_connlimit_info *info = par->matchinfo; -	struct xt_connlimit_conn *conn; -	struct xt_connlimit_conn *tmp; -	struct list_head *hash = info->data->iphash;  	unsigned int i;  	nf_ct_l3proto_module_put(par->family); -	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { -		list_for_each_entry_safe(conn, tmp, &hash[i], list) { -			list_del(&conn->list); -			kfree(conn); -		} -	} +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) +		destroy_tree(&info->data->climit_root4[i]); +	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) +		destroy_tree(&info->data->climit_root6[i]);  	kfree(info->data);  }  static struct xt_match connlimit_mt_reg __read_mostly = {  	.name       = "connlimit", -	.revision   = 0, +	.revision   = 1,  	.family     = NFPROTO_UNSPEC,  	.checkentry = connlimit_mt_check,  	.match      = connlimit_mt, @@ -281,12 +439,40 @@ static struct xt_match connlimit_mt_reg __read_mostly = {  static int __init connlimit_mt_init(void)  { -	return xt_register_match(&connlimit_mt_reg); +	int ret, i; + +	BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); +	BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + +	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) +		spin_lock_init(&xt_connlimit_locks[i]); + +	connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", +					   sizeof(struct xt_connlimit_conn), +					   0, 0, NULL); +	if (!connlimit_conn_cachep) +		return -ENOMEM; + +	connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", +					   
sizeof(struct xt_connlimit_rb), +					   0, 0, NULL); +	if (!connlimit_rb_cachep) { +		kmem_cache_destroy(connlimit_conn_cachep); +		return -ENOMEM; +	} +	ret = xt_register_match(&connlimit_mt_reg); +	if (ret != 0) { +		kmem_cache_destroy(connlimit_conn_cachep); +		kmem_cache_destroy(connlimit_rb_cachep); +	} +	return ret;  }  static void __exit connlimit_mt_exit(void)  {  	xt_unregister_match(&connlimit_mt_reg); +	kmem_cache_destroy(connlimit_conn_cachep); +	kmem_cache_destroy(connlimit_rb_cachep);  }  module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 7278145e6a6..69f78e96fdb 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -17,8 +17,7 @@   * GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #include <linux/module.h> diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index e536710ad91..188404b9b00 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -3,6 +3,7 @@   *	information. (Superset of Rusty's minimalistic state match.)   *   *	(C) 2001  Marc Boucher (marc@mbsi.ca). + *	(C) 2006-2012 Patrick McHardy <kaber@trash.net>   *	Copyright © CC Computer Consultants GmbH, 2007 - 2008   *   *	This program is free software; you can redistribute it and/or modify @@ -112,6 +113,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,  	return true;  } +static inline bool +port_match(u16 min, u16 max, u16 port, bool invert) +{ +	return (port >= min && port <= max) ^ invert; +} + +static inline bool +ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, +		       const struct nf_conn *ct) +{ +	const struct nf_conntrack_tuple *tuple; + +	tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; +	if ((info->match_flags & XT_CONNTRACK_PROTO) && +	    (nf_ct_protonum(ct) == info->l4proto) ^ +	    !(info->invert_flags & XT_CONNTRACK_PROTO)) +		return false; + +	/* Shortcut to match all recognized protocols by using ->src.all. 
*/ +	if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && +	    !port_match(info->origsrc_port, info->origsrc_port_high, +			ntohs(tuple->src.u.all), +			info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) +		return false; + +	if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && +	    !port_match(info->origdst_port, info->origdst_port_high, +			ntohs(tuple->dst.u.all), +			info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) +		return false; + +	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + +	if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && +	    !port_match(info->replsrc_port, info->replsrc_port_high, +			ntohs(tuple->src.u.all), +			info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) +		return false; + +	if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && +	    !port_match(info->repldst_port, info->repldst_port_high, +			ntohs(tuple->dst.u.all), +			info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) +		return false; + +	return true; +} +  static bool  conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,               u16 state_mask, u16 status_mask) @@ -147,7 +196,7 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,  		return info->match_flags & XT_CONNTRACK_STATE;  	if ((info->match_flags & XT_CONNTRACK_DIRECTION) &&  	    (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ -	    !!(info->invert_flags & XT_CONNTRACK_DIRECTION)) +	    !(info->invert_flags & XT_CONNTRACK_DIRECTION))  		return false;  	if (info->match_flags & XT_CONNTRACK_ORIGSRC) @@ -170,8 +219,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,  		    !(info->invert_flags & XT_CONNTRACK_REPLDST))  			return false; -	if (!ct_proto_port_check(info, ct)) -		return false; +	if (par->match->revision != 3) { +		if (!ct_proto_port_check(info, ct)) +			return false; +	} else { +		if (!ct_proto_port_check_v3(par->matchinfo, ct)) +			return false; +	}  	if ((info->match_flags & XT_CONNTRACK_STATUS) &&  	    (!!(status_mask & ct->status) ^ @@ -207,6 +261,14 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)  	return conntrack_mt(skb, par, info->state_mask, info->status_mask);  } +static bool +conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_conntrack_mtinfo3 *info = par->matchinfo; + +	return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} +  static int conntrack_mt_check(const struct xt_mtchk_param *par)  {  	int ret; @@ -244,6 +306,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {  		.destroy    = conntrack_mt_destroy,  		.me         = THIS_MODULE,  	}, +	{ +		.name       = "conntrack", +		.revision   = 3, +		.family     = NFPROTO_UNSPEC, +		.matchsize  = sizeof(struct xt_conntrack_mtinfo3), +		.match      = conntrack_mt_v3, +		.checkentry = conntrack_mt_check, +		.destroy    = conntrack_mt_destroy, +		.me         = THIS_MODULE, +	},  };  static int __init conntrack_mt_init(void) diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c index b39db8a5cba..c7a2e5466bc 100644 --- a/net/netfilter/xt_cpu.c +++ b/net/netfilter/xt_cpu.c @@ -22,6 +22,8 @@  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");  MODULE_DESCRIPTION("Xtables: CPU match"); +MODULE_ALIAS("ipt_cpu"); +MODULE_ALIAS("ip6t_cpu");  static int cpu_mt_check(const struct xt_mtchk_param *par)  { diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c new file mode 100644 index 00000000000..d9202cdd25c --- /dev/null +++ b/net/netfilter/xt_devgroup.c @@ -0,0 +1,82 @@ +/* + 
* Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> + +#include <linux/netfilter/xt_devgroup.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Device group match"); +MODULE_ALIAS("ipt_devgroup"); +MODULE_ALIAS("ip6t_devgroup"); + +static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_devgroup_info *info = par->matchinfo; + +	if (info->flags & XT_DEVGROUP_MATCH_SRC && +	    (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^ +	     ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) +		return false; + +	if (info->flags & XT_DEVGROUP_MATCH_DST && +	    (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^ +	     ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0))) +		return false; + +	return true; +} + +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ +	const struct xt_devgroup_info *info = par->matchinfo; + +	if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | +			    XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) +		return -EINVAL; + +	if (info->flags & XT_DEVGROUP_MATCH_SRC && +	    par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | +			       (1 << NF_INET_LOCAL_IN) | +			       (1 << NF_INET_FORWARD))) +		return -EINVAL; + +	if (info->flags & XT_DEVGROUP_MATCH_DST && +	    par->hook_mask & ~((1 << NF_INET_FORWARD) | +			       (1 << NF_INET_LOCAL_OUT) | +			       (1 << NF_INET_POST_ROUTING))) +		return -EINVAL; + +	return 0; +} + +static struct xt_match devgroup_mt_reg __read_mostly = { +	.name		= "devgroup", +	.match		= devgroup_mt, +	.checkentry	= devgroup_mt_checkentry, +	.matchsize	= sizeof(struct xt_devgroup_info), +	.family		= NFPROTO_UNSPEC, +	.me		= THIS_MODULE +}; + +static int __init devgroup_mt_init(void) +{ +	return xt_register_match(&devgroup_mt_reg); +} + +static void __exit devgroup_mt_exit(void) +{ +	xt_unregister_match(&devgroup_mt_reg); +} + +module_init(devgroup_mt_init); +module_exit(devgroup_mt_exit); diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c new file mode 100644 index 00000000000..3c831a8efeb --- /dev/null +++ b/net/netfilter/xt_ecn.c @@ -0,0 +1,179 @@ +/* + * Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits + * + * (C) 2002 by Harald Welte <laforge@gnumonks.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
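devgroup_mt() above compares the ingress or egress device's group under a user-supplied mask; (a ^ b) & mask is non-zero exactly when the masked bits differ, and the ?: normalization keeps the XOR with the invert flag well defined. A standalone model of that test (assumed values, userspace C):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* True when rule group and device group agree on every bit under
 * mask; invert flips the sense, as XT_DEVGROUP_INVERT_* does. */
static bool devgroup_matches(uint32_t rule_group, uint32_t dev_group,
			     uint32_t mask, bool invert)
{
	bool differs = ((rule_group ^ dev_group) & mask) != 0;

	return differs == invert;
}

int main(void)
{
	/* mask 0xf0: only the high nibble of the group is compared */
	assert(devgroup_matches(0x10, 0x11, 0xf0, false));
	assert(!devgroup_matches(0x10, 0x21, 0xf0, false));
	assert(devgroup_matches(0x10, 0x21, 0xf0, true));
	return 0;
}

The checkentry hook restricts source-group rules to input-side hooks and destination-group rules to output-side hooks, since par->in respectively par->out would otherwise be NULL.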
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ecn.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ecn"); +MODULE_ALIAS("ip6t_ecn"); + +static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_ecn_info *einfo = par->matchinfo; +	struct tcphdr _tcph; +	const struct tcphdr *th; + +	/* In practice, TCP match does this, so can't fail.  But let's +	 * be good citizens. +	 */ +	th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); +	if (th == NULL) +		return false; + +	if (einfo->operation & XT_ECN_OP_MATCH_ECE) { +		if (einfo->invert & XT_ECN_OP_MATCH_ECE) { +			if (th->ece == 1) +				return false; +		} else { +			if (th->ece == 0) +				return false; +		} +	} + +	if (einfo->operation & XT_ECN_OP_MATCH_CWR) { +		if (einfo->invert & XT_ECN_OP_MATCH_CWR) { +			if (th->cwr == 1) +				return false; +		} else { +			if (th->cwr == 0) +				return false; +		} +	} + +	return true; +} + +static inline bool match_ip(const struct sk_buff *skb, +			    const struct xt_ecn_info *einfo) +{ +	return ((ip_hdr(skb)->tos & XT_ECN_IP_MASK) == einfo->ip_ect) ^ +	       !!(einfo->invert & XT_ECN_OP_MATCH_IP); +} + +static bool ecn_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_ecn_info *info = par->matchinfo; + +	if (info->operation & XT_ECN_OP_MATCH_IP && !match_ip(skb, info)) +		return false; + +	if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && +	    !match_tcp(skb, par)) +		return false; + +	return true; +} + +static int ecn_mt_check4(const struct xt_mtchk_param *par) +{ +	const struct xt_ecn_info *info = par->matchinfo; +	const struct ipt_ip *ip = par->entryinfo; + +	if (info->operation & XT_ECN_OP_MATCH_MASK) +		return -EINVAL; + +	if (info->invert & XT_ECN_OP_MATCH_MASK) +		return -EINVAL; + +	if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && +	    (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { +		pr_info("cannot match TCP bits in rule for non-tcp packets\n"); +		return -EINVAL; +	} + +	return 0; +} + +static inline bool match_ipv6(const struct sk_buff *skb, +			      const struct xt_ecn_info *einfo) +{ +	return (((ipv6_hdr(skb)->flow_lbl[0] >> 4) & XT_ECN_IP_MASK) == +	        einfo->ip_ect) ^ +	       !!(einfo->invert & XT_ECN_OP_MATCH_IP); +} + +static bool ecn_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_ecn_info *info = par->matchinfo; + +	if (info->operation & XT_ECN_OP_MATCH_IP && !match_ipv6(skb, info)) +		return false; + +	if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && +	    !match_tcp(skb, par)) +		return false; + +	return true; +} + +static int ecn_mt_check6(const struct xt_mtchk_param *par) +{ +	const struct xt_ecn_info *info = par->matchinfo; +	const struct ip6t_ip6 *ip = par->entryinfo; + +	if (info->operation & XT_ECN_OP_MATCH_MASK) +		return -EINVAL; + +	if (info->invert & XT_ECN_OP_MATCH_MASK) +		return -EINVAL; + +	if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && +	    (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) { +		
pr_info("cannot match TCP bits in rule for non-tcp packets\n"); +		return -EINVAL; +	} + +	return 0; +} + +static struct xt_match ecn_mt_reg[] __read_mostly = { +	{ +		.name		= "ecn", +		.family		= NFPROTO_IPV4, +		.match		= ecn_mt4, +		.matchsize	= sizeof(struct xt_ecn_info), +		.checkentry	= ecn_mt_check4, +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "ecn", +		.family		= NFPROTO_IPV6, +		.match		= ecn_mt6, +		.matchsize	= sizeof(struct xt_ecn_info), +		.checkentry	= ecn_mt_check6, +		.me		= THIS_MODULE, +	}, +}; + +static int __init ecn_mt_init(void) +{ +	return xt_register_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); +} + +static void __exit ecn_mt_exit(void) +{ +	xt_unregister_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); +} + +module_init(ecn_mt_init); +module_exit(ecn_mt_exit); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9228ee0dc11..a3910fc2122 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -3,6 +3,7 @@   *	separately for each hashbucket (sourceip/sourceport/dstip/dstport)   *   *	(C) 2003-2004 by Harald Welte <laforge@netfilter.org> + *	(C) 2006-2012 Patrick McHardy <kaber@trash.net>   *	Copyright © CC Computer Consultants GmbH, 2007 - 2008   *   * Development of this code was funded by Astaro AG, http://www.astaro.com/ @@ -21,7 +22,7 @@  #include <linux/mm.h>  #include <linux/in.h>  #include <linux/ip.h> -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  #include <linux/ipv6.h>  #include <net/ipv6.h>  #endif @@ -64,7 +65,7 @@ struct dsthash_dst {  			__be32 src;  			__be32 dst;  		} ip; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  		struct {  			__be32 src[4];  			__be32 dst[4]; @@ -107,6 +108,7 @@ struct xt_hashlimit_htable {  	/* seq_file stuff */  	struct proc_dir_entry *pde; +	const char *name;  	struct net *net;  	struct hlist_head hash[0];	/* hashtable itself */ @@ -141,11 +143,10 @@ dsthash_find(const struct xt_hashlimit_htable *ht,  	     const struct dsthash_dst *dst)  {  	struct dsthash_ent *ent; -	struct hlist_node *pos;  	u_int32_t hash = hash_dst(ht, dst);  	if (!hlist_empty(&ht->hash[hash])) { -		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node) +		hlist_for_each_entry_rcu(ent, &ht->hash[hash], node)  			if (dst_cmp(ent, dst)) {  				spin_lock(&ent->lock);  				return ent; @@ -157,11 +158,22 @@ dsthash_find(const struct xt_hashlimit_htable *ht,  /* allocate dsthash_ent, initialize dst, put in htable and lock it */  static struct dsthash_ent *  dsthash_alloc_init(struct xt_hashlimit_htable *ht, -		   const struct dsthash_dst *dst) +		   const struct dsthash_dst *dst, bool *race)  {  	struct dsthash_ent *ent;  	spin_lock(&ht->lock); + +	/* Two or more packets may race to create the same entry in the +	 * hashtable, double check if this packet lost the race. +	 */ +	ent = dsthash_find(ht, dst); +	if (ent != NULL) { +		spin_unlock(&ht->lock); +		*race = true; +		return ent; +	} +  	/* initialize hash with random val at the time we allocate  	 * the first hashtable entry */  	if (unlikely(!ht->rnd_initialized)) { @@ -171,15 +183,11 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,  	if (ht->cfg.max && ht->count >= ht->cfg.max) {  		/* FIXME: do something. question is what..
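The new race check in dsthash_alloc_init() above is a classic double-checked insert: the caller's lookup ran under RCU without the table lock, so by the time the lock is taken another CPU may already have created the entry. A compact userspace model of the pattern, using pthreads in place of spinlocks (all names hypothetical):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUCKETS 16

struct entry {
	char key[32];
	struct entry *next;
};

static struct entry *table[BUCKETS];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *lookup(unsigned int b, const char *key)
{
	struct entry *e;

	for (e = table[b]; e != NULL; e = e->next)
		if (strcmp(e->key, key) == 0)
			return e;
	return NULL;
}

/* Double-checked insert: re-run the lookup after taking the lock,
 * because another thread may have created the entry between our
 * unlocked miss and the lock acquisition. */
static struct entry *alloc_init(unsigned int b, const char *key, int *race)
{
	struct entry *e;

	pthread_mutex_lock(&table_lock);
	e = lookup(b, key);
	if (e != NULL) {
		*race = 1;	/* lost the race: reuse the winner's entry */
		pthread_mutex_unlock(&table_lock);
		return e;
	}
	e = calloc(1, sizeof(*e));
	if (e != NULL) {
		snprintf(e->key, sizeof(e->key), "%s", key);
		e->next = table[b];
		table[b] = e;
	}
	pthread_mutex_unlock(&table_lock);
	return e;
}

int main(void)
{
	int race = 0;

	alloc_init(3, "10.0.0.1", &race);
	alloc_init(3, "10.0.0.1", &race);	/* second call finds it */
	printf("race=%d\n", race);
	return 0;
}

The race out-parameter matters because the two cases are initialized differently by the caller: a freshly allocated entry gets rateinfo_init(), while a reused one only has its expiry refreshed.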
*/ -		if (net_ratelimit()) -			pr_err("max count of %u reached\n", ht->cfg.max); +		net_err_ratelimited("max count of %u reached\n", ht->cfg.max);  		ent = NULL;  	} else  		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); -	if (!ent) { -		if (net_ratelimit()) -			pr_err("cannot allocate dsthash_ent\n"); -	} else { +	if (ent) {  		memcpy(&ent->dst, dst, sizeof(ent->dst));  		spin_lock_init(&ent->lock); @@ -247,6 +255,11 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,  	hinfo->count = 0;  	hinfo->family = family;  	hinfo->rnd_initialized = false; +	hinfo->name = kstrdup(minfo->name, GFP_KERNEL); +	if (!hinfo->name) { +		vfree(hinfo); +		return -ENOMEM; +	}  	spin_lock_init(&hinfo->lock);  	hinfo->pde = proc_create_data(minfo->name, 0, @@ -254,6 +267,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,  		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,  		&dl_file_ops, hinfo);  	if (hinfo->pde == NULL) { +		kfree(hinfo->name);  		vfree(hinfo);  		return -ENOMEM;  	} @@ -290,8 +304,8 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,  	spin_lock_bh(&ht->lock);  	for (i = 0; i < ht->cfg.size; i++) {  		struct dsthash_ent *dh; -		struct hlist_node *pos, *n; -		hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { +		struct hlist_node *n; +		hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {  			if ((*select)(ht, dh))  				dsthash_free(ht, dh);  		} @@ -311,19 +325,26 @@ static void htable_gc(unsigned long htlong)  	add_timer(&ht->timer);  } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) +static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)  {  	struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net);  	struct proc_dir_entry *parent; -	del_timer_sync(&hinfo->timer); -  	if (hinfo->family == NFPROTO_IPV4)  		parent = hashlimit_net->ipt_hashlimit;  	else  		parent = hashlimit_net->ip6t_hashlimit; -	remove_proc_entry(hinfo->pde->name, parent); + +	if (parent != NULL) +		remove_proc_entry(hinfo->name, parent); +} + +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ +	del_timer_sync(&hinfo->timer); +	htable_remove_proc_entry(hinfo);  	htable_selective_cleanup(hinfo, select_all); +	kfree(hinfo->name);  	vfree(hinfo);  } @@ -333,10 +354,9 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,  {  	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);  	struct xt_hashlimit_htable *hinfo; -	struct hlist_node *pos; -	hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) { -		if (!strcmp(name, hinfo->pde->name) && +	hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { +		if (!strcmp(name, hinfo->name) &&  		    hinfo->family == family) {  			hinfo->use++;  			return hinfo; @@ -391,9 +411,20 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)  #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) +/* in byte mode, the lowest possible rate is one packet/second. + * credit_cap is used as a counter that tells us how many times we can + * refill the "credits available" counter when it becomes empty. + */ +#define MAX_CPJ_BYTES (0xFFFFFFFF / HZ) +#define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES) + +static u32 xt_hashlimit_len_to_chunks(u32 len) +{ +	return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1; +} +  /* Precision saver. */ -static inline u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user)  {  	/* If multiplying would overflow... 
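Byte-based limiting above counts credit in fixed-size chunks rather than bytes, so the 32-bit credit counter cannot overflow at high rates; xt_hashlimit_len_to_chunks() converts a packet length with a shift plus one. A standalone sketch, assuming the kernel's 16-byte chunk size (XT_HASHLIMIT_BYTE_SHIFT of 4):

#include <assert.h>
#include <stdint.h>

#define BYTE_SHIFT 4	/* assumed: 1 chunk = 16 bytes */

/* Packet length in chunks; the +1 guarantees that even a
 * zero-length packet costs at least one credit. */
static uint32_t len_to_chunks(uint32_t len)
{
	return (len >> BYTE_SHIFT) + 1;
}

int main(void)
{
	assert(len_to_chunks(0) == 1);
	assert(len_to_chunks(64) == 5);
	assert(len_to_chunks(1500) == 94);
	return 0;
}

In byte mode credit_cap is reused as a refill counter: when the bucket empties, hashlimit_byte_cost() spends one cap to refill it, which is what bounds the lowest expressible rate to roughly one packet per second, as the comment above CREDITS_PER_JIFFY_BYTES notes.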
*/  	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -403,12 +434,53 @@ user2credits(u_int32_t user)  	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;  } -static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +static u32 user2credits_byte(u32 user)  { -	dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; -	if (dh->rateinfo.credit > dh->rateinfo.credit_cap) -		dh->rateinfo.credit = dh->rateinfo.credit_cap; +	u64 us = user; +	us *= HZ * CREDITS_PER_JIFFY_BYTES; +	return (u32) (us >> 32); +} + +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) +{ +	unsigned long delta = now - dh->rateinfo.prev; +	u32 cap; + +	if (delta == 0) +		return; +  	dh->rateinfo.prev = now; + +	if (mode & XT_HASHLIMIT_BYTES) { +		u32 tmp = dh->rateinfo.credit; +		dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; +		cap = CREDITS_PER_JIFFY_BYTES * HZ; +		if (tmp >= dh->rateinfo.credit) {/* overflow */ +			dh->rateinfo.credit = cap; +			return; +		} +	} else { +		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY; +		cap = dh->rateinfo.credit_cap; +	} +	if (dh->rateinfo.credit > cap) +		dh->rateinfo.credit = cap; +} + +static void rateinfo_init(struct dsthash_ent *dh, +			  struct xt_hashlimit_htable *hinfo) +{ +	dh->rateinfo.prev = jiffies; +	if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { +		dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; +		dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg); +		dh->rateinfo.credit_cap = hinfo->cfg.burst; +	} else { +		dh->rateinfo.credit = user2credits(hinfo->cfg.avg * +						   hinfo->cfg.burst); +		dh->rateinfo.cost = user2credits(hinfo->cfg.avg); +		dh->rateinfo.credit_cap = dh->rateinfo.credit; +	}  }  static inline __be32 maskl(__be32 a, unsigned int l) @@ -416,7 +488,7 @@ static inline __be32 maskl(__be32 a, unsigned int l)  	return l ? 
htonl(ntohl(a) & ~0 << (32 - l)) : 0;  } -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)  {  	switch (p) { @@ -466,8 +538,11 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,  			return 0;  		nexthdr = ip_hdr(skb)->protocol;  		break; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  	case NFPROTO_IPV6: +	{ +		__be16 frag_off; +  		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) {  			memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr,  			       sizeof(dst->ip6.dst)); @@ -483,10 +558,11 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,  		      (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))  			return 0;  		nexthdr = ipv6_hdr(skb)->nexthdr; -		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); +		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);  		if ((int)protoff < 0)  			return -1;  		break; +	}  #endif  	default:  		BUG(); @@ -510,6 +586,21 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,  	return 0;  } +static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) +{ +	u64 tmp = xt_hashlimit_len_to_chunks(len); +	tmp = tmp * dh->rateinfo.cost; + +	if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ)) +		tmp = CREDITS_PER_JIFFY_BYTES * HZ; + +	if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) { +		dh->rateinfo.credit_cap--; +		dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; +	} +	return (u32) tmp; +} +  static bool  hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)  { @@ -518,6 +609,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)  	unsigned long now = jiffies;  	struct dsthash_ent *dh;  	struct dsthash_dst dst; +	bool race = false; +	u32 cost;  	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)  		goto hotdrop; @@ -525,27 +618,32 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)  	rcu_read_lock_bh();  	dh = dsthash_find(hinfo, &dst);  	if (dh == NULL) { -		dh = dsthash_alloc_init(hinfo, &dst); +		dh = dsthash_alloc_init(hinfo, &dst, &race);  		if (dh == NULL) {  			rcu_read_unlock_bh();  			goto hotdrop; +		} else if (race) { +			/* Already got an entry, update expiration timeout */ +			dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); +			rateinfo_recalc(dh, now, hinfo->cfg.mode); +		} else { +			dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); +			rateinfo_init(dh, hinfo);  		} -		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); -		dh->rateinfo.prev = jiffies; -		dh->rateinfo.credit = user2credits(hinfo->cfg.avg * -		                      hinfo->cfg.burst); -		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * -		                          hinfo->cfg.burst); -		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);  	} else {  		/* update expiration timeout */  		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); -		rateinfo_recalc(dh, now); +		rateinfo_recalc(dh, now, hinfo->cfg.mode);  	} -	if (dh->rateinfo.credit >= dh->rateinfo.cost) { +	if (info->cfg.mode & XT_HASHLIMIT_BYTES) +		cost = hashlimit_byte_cost(skb->len, dh); +	else +		cost = dh->rateinfo.cost; + +	if (dh->rateinfo.credit >= cost) {  		/* below the limit */ -		dh->rateinfo.credit -= dh->rateinfo.cost; +		dh->rateinfo.credit -= cost;  		spin_unlock(&dh->lock);  		rcu_read_unlock_bh();  		return !(info->cfg.mode & 
XT_HASHLIMIT_INVERT); @@ -567,14 +665,6 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)  	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;  	int ret; -	/* Check for overflow. */ -	if (info->cfg.burst == 0 || -	    user2credits(info->cfg.avg * info->cfg.burst) < -	    user2credits(info->cfg.avg)) { -		pr_info("overflow, try lower: %u/%u\n", -			info->cfg.avg, info->cfg.burst); -		return -ERANGE; -	}  	if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)  		return -EINVAL;  	if (info->name[sizeof(info->name)-1] != '\0') @@ -587,6 +677,26 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)  			return -EINVAL;  	} +	if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { +		pr_info("Unknown mode mask %X, kernel too old?\n", +						info->cfg.mode); +		return -EINVAL; +	} + +	/* Check for overflow. */ +	if (info->cfg.mode & XT_HASHLIMIT_BYTES) { +		if (user2credits_byte(info->cfg.avg) == 0) { +			pr_info("overflow, rate too high: %u\n", info->cfg.avg); +			return -EINVAL; +		} +	} else if (info->cfg.burst == 0 || +		    user2credits(info->cfg.avg * info->cfg.burst) < +		    user2credits(info->cfg.avg)) { +			pr_info("overflow, try lower: %u/%u\n", +				info->cfg.avg, info->cfg.burst); +			return -ERANGE; +	} +  	mutex_lock(&hashlimit_mutex);  	info->hinfo = htable_find_get(net, info->name, par->family);  	if (info->hinfo == NULL) { @@ -618,7 +728,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {  		.destroy        = hashlimit_mt_destroy,  		.me             = THIS_MODULE,  	}, -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  	{  		.name           = "hashlimit",  		.revision       = 1, @@ -679,10 +789,11 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,  				   struct seq_file *s)  {  	int res; +	const struct xt_hashlimit_htable *ht = s->private;  	spin_lock(&ent->lock);  	/* recalculate to show accurate numbers */ -	rateinfo_recalc(ent, jiffies); +	rateinfo_recalc(ent, jiffies, ht->cfg.mode);  	switch (family) {  	case NFPROTO_IPV4: @@ -695,7 +806,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,  				 ent->rateinfo.credit, ent->rateinfo.credit_cap,  				 ent->rateinfo.cost);  		break; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  	case NFPROTO_IPV6:  		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",  				 (long)(ent->expires - jiffies)/HZ, @@ -720,10 +831,9 @@ static int dl_seq_show(struct seq_file *s, void *v)  	struct xt_hashlimit_htable *htable = s->private;  	unsigned int *bucket = (unsigned int *)v;  	struct dsthash_ent *ent; -	struct hlist_node *pos;  	if (!hlist_empty(&htable->hash[*bucket])) { -		hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) +		hlist_for_each_entry(ent, &htable->hash[*bucket], node)  			if (dl_seq_real_show(ent, htable->family, s))  				return -1;  	} @@ -743,7 +853,7 @@ static int dl_proc_open(struct inode *inode, struct file *file)  	if (!ret) {  		struct seq_file *sf = file->private_data; -		sf->private = PDE(inode)->data; +		sf->private = PDE_DATA(inode);  	}  	return ret;  } @@ -763,10 +873,10 @@ static int __net_init hashlimit_proc_net_init(struct net *net)  	hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);  	if (!hashlimit_net->ipt_hashlimit)  		return -ENOMEM; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  	
hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);  	if (!hashlimit_net->ip6t_hashlimit) { -		proc_net_remove(net, "ipt_hashlimit"); +		remove_proc_entry("ipt_hashlimit", net->proc_net);  		return -ENOMEM;  	}  #endif @@ -775,9 +885,23 @@ static int __net_init hashlimit_proc_net_init(struct net *net)  static void __net_exit hashlimit_proc_net_exit(struct net *net)  { -	proc_net_remove(net, "ipt_hashlimit"); -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -	proc_net_remove(net, "ip6t_hashlimit"); +	struct xt_hashlimit_htable *hinfo; +	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + +	/* hashlimit_net_exit() is called before hashlimit_mt_destroy(). +	 * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc +	 * entries are empty before trying to remove them. +	 */ +	mutex_lock(&hashlimit_mutex); +	hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) +		htable_remove_proc_entry(hinfo); +	hashlimit_net->ipt_hashlimit = NULL; +	hashlimit_net->ip6t_hashlimit = NULL; +	mutex_unlock(&hashlimit_mutex); + +	remove_proc_entry("ipt_hashlimit", net->proc_net); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	remove_proc_entry("ip6t_hashlimit", net->proc_net);  #endif  } @@ -791,9 +915,6 @@ static int __net_init hashlimit_net_init(struct net *net)  static void __net_exit hashlimit_net_exit(struct net *net)  { -	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); - -	BUG_ON(!hlist_empty(&hashlimit_net->htables));  	hashlimit_proc_net_exit(net);  } diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c index 7d12221ead8..003951149c9 100644 --- a/net/netfilter/xt_hl.c +++ b/net/netfilter/xt_hl.c @@ -31,14 +31,14 @@ static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par)  	const u8 ttl = ip_hdr(skb)->ttl;  	switch (info->mode) { -		case IPT_TTL_EQ: -			return ttl == info->ttl; -		case IPT_TTL_NE: -			return ttl != info->ttl; -		case IPT_TTL_LT: -			return ttl < info->ttl; -		case IPT_TTL_GT: -			return ttl > info->ttl; +	case IPT_TTL_EQ: +		return ttl == info->ttl; +	case IPT_TTL_NE: +		return ttl != info->ttl; +	case IPT_TTL_LT: +		return ttl < info->ttl; +	case IPT_TTL_GT: +		return ttl > info->ttl;  	}  	return false; @@ -50,14 +50,14 @@ static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par)  	const struct ipv6hdr *ip6h = ipv6_hdr(skb);  	switch (info->mode) { -		case IP6T_HL_EQ: -			return ip6h->hop_limit == info->hop_limit; -		case IP6T_HL_NE: -			return ip6h->hop_limit != info->hop_limit; -		case IP6T_HL_LT: -			return ip6h->hop_limit < info->hop_limit; -		case IP6T_HL_GT: -			return ip6h->hop_limit > info->hop_limit; +	case IP6T_HL_EQ: +		return ip6h->hop_limit == info->hop_limit; +	case IP6T_HL_NE: +		return ip6h->hop_limit != info->hop_limit; +	case IP6T_HL_LT: +		return ip6h->hop_limit < info->hop_limit; +	case IP6T_HL_GT: +		return ip6h->hop_limit > info->hop_limit;  	}  	return false; diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c new file mode 100644 index 00000000000..89d53104c6b --- /dev/null +++ b/net/netfilter/xt_ipcomp.c @@ -0,0 +1,111 @@ +/*  Kernel module to match IPComp parameters for IPv4 and IPv6 + * + *  Copyright (C) 2013 WindRiver + * + *  Author: + *  Fan Du <fan.du@windriver.com> + * + *  Based on: + *  net/netfilter/xt_esp.c + * + *  This program is free software; you can redistribute it and/or + *  modify it under the terms of the GNU General Public License + *  as published by the Free Software Foundation; either version + *  2
of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter/xt_ipcomp.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fan Du <fan.du@windriver.com>"); +MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ +	bool r; +	pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", +		 invert ? '!' : ' ', min, spi, max); +	r = (spi >= min && spi <= max) ^ invert; +	pr_debug(" result %s\n", r ? "PASS" : "FAILED"); +	return r; +} + +static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	struct ip_comp_hdr _comphdr; +	const struct ip_comp_hdr *chdr; +	const struct xt_ipcomp *compinfo = par->matchinfo; + +	/* Must not be a fragment. */ +	if (par->fragoff != 0) +		return false; + +	chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr); +	if (chdr == NULL) { +		/* We've been asked to examine this packet, and we +		 * can't.  Hence, no choice but to drop. +		 */ +		pr_debug("Dropping evil IPComp tinygram.\n"); +		par->hotdrop = true; +		return 0; +	} + +	return spi_match(compinfo->spis[0], compinfo->spis[1], +			 ntohs(chdr->cpi), +			 !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); +} + +static int comp_mt_check(const struct xt_mtchk_param *par) +{ +	const struct xt_ipcomp *compinfo = par->matchinfo; + +	/* Must specify no unknown invflags */ +	if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { +		pr_err("unknown flags %X\n", compinfo->invflags); +		return -EINVAL; +	} +	return 0; +} + +static struct xt_match comp_mt_reg[] __read_mostly = { +	{ +		.name		= "ipcomp", +		.family		= NFPROTO_IPV4, +		.match		= comp_mt, +		.matchsize	= sizeof(struct xt_ipcomp), +		.proto		= IPPROTO_COMP, +		.checkentry	= comp_mt_check, +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "ipcomp", +		.family		= NFPROTO_IPV6, +		.match		= comp_mt, +		.matchsize	= sizeof(struct xt_ipcomp), +		.proto		= IPPROTO_COMP, +		.checkentry	= comp_mt_check, +		.me		= THIS_MODULE, +	}, +}; + +static int __init comp_mt_init(void) +{ +	return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +static void __exit comp_mt_exit(void) +{ +	xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +module_init(comp_mt_init); +module_exit(comp_mt_exit); diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index 88f7c3511c7..b46626cddd9 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -31,7 +31,7 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)  			pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",  			         &iph->saddr,  			         (info->flags & IPRANGE_SRC_INV) ? 
"(INV) " : "", -			         &info->src_max.ip, +			         &info->src_min.ip,  			         &info->src_max.ip);  			return false;  		} @@ -53,15 +53,13 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)  }  static inline int -iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b) +iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b)  {  	unsigned int i; -	int r;  	for (i = 0; i < 4; ++i) { -		r = ntohl(a->s6_addr32[i]) - ntohl(b->s6_addr32[i]); -		if (r != 0) -			return r; +		if (a->s6_addr32[i] != b->s6_addr32[i]) +			return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]);  	}  	return 0; @@ -75,18 +73,30 @@ iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)  	bool m;  	if (info->flags & IPRANGE_SRC) { -		m  = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0; -		m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0; +		m  = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6); +		m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr);  		m ^= !!(info->flags & IPRANGE_SRC_INV); -		if (m) +		if (m) { +			pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n", +				 &iph->saddr, +				 (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", +				 &info->src_min.in6, +				 &info->src_max.in6);  			return false; +		}  	}  	if (info->flags & IPRANGE_DST) { -		m  = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0; -		m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0; +		m  = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6); +		m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr);  		m ^= !!(info->flags & IPRANGE_DST_INV); -		if (m) +		if (m) { +			pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n", +				 &iph->daddr, +				 (info->flags & IPRANGE_DST_INV) ? "(INV) " : "", +				 &info->dst_min.in6, +				 &info->dst_max.in6);  			return false; +		}  	}  	return true;  } diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 9127a3d8aa3..8d47c3780fd 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -67,7 +67,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)  		goto out;  	} -	ip_vs_fill_iphdr(family, skb_network_header(skb), &iph); +	ip_vs_fill_iph_skb(family, skb, &iph);  	if (data->bitmask & XT_IPVS_PROTO)  		if ((iph.protocol == data->l4proto) ^ @@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)  	/*  	 * Check if the packet belongs to an existing entry  	 */ -	cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */); +	cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */);  	if (unlikely(cp == NULL)) {  		match = false;  		goto out; diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c new file mode 100644 index 00000000000..8aee572771f --- /dev/null +++ b/net/netfilter/xt_l2tp.c @@ -0,0 +1,354 @@ +/* Kernel module to match L2TP header parameters. */ + +/* (C) 2013      James Chapman <jchapman@katalix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <linux/l2tp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_l2tp.h> + +/* L2TP header masks */ +#define L2TP_HDR_T_BIT	0x8000 +#define L2TP_HDR_L_BIT	0x4000 +#define L2TP_HDR_VER	0x000f + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); +MODULE_DESCRIPTION("Xtables: L2TP header match"); +MODULE_ALIAS("ipt_l2tp"); +MODULE_ALIAS("ip6t_l2tp"); + +/* The L2TP fields that can be matched */ +struct l2tp_data { +	u32 tid; +	u32 sid; +	u8 type; +	u8 version; +}; + +union l2tp_val { +	__be16 val16[2]; +	__be32 val32; +}; + +static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) +{ +	if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) +		return false; + +	if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) +		return false; + +	/* Check tid only for L2TPv3 control or any L2TPv2 packets */ +	if ((info->flags & XT_L2TP_TID) && +	    ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && +	    (info->tid != data->tid)) +		return false; + +	/* Check sid only for L2TP data packets */ +	if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && +	    (info->sid != data->sid)) +		return false; + +	return true; +} + +/* Parse L2TP header fields when UDP encapsulation is used. Handles + * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a + * different format. See + * RFC2661, Section 3.1, L2TPv2 Header Format + * RFC3931, Section 3.2.1, L2TPv3 Control Message Header + * RFC3931, Section 3.2.2, L2TPv3 Data Message Header + * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP + */ +static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	int uhlen = sizeof(struct udphdr); +	int offs = thoff + uhlen; +	union l2tp_val *lh; +	union l2tp_val lhbuf; +	u16 flags; +	struct l2tp_data data = { 0, }; + +	if (par->fragoff != 0) +		return false; + +	/* Extract L2TP header fields. The flags in the first 16 bits +	 * tell us where the other fields are. +	 */ +	lh = skb_header_pointer(skb, offs, 2, &lhbuf); +	if (lh == NULL) +		return false; + +	flags = ntohs(lh->val16[0]); +	if (flags & L2TP_HDR_T_BIT) +		data.type = XT_L2TP_TYPE_CONTROL; +	else +		data.type = XT_L2TP_TYPE_DATA; +	data.version = (u8) flags & L2TP_HDR_VER; + +	/* Now extract the L2TP tid/sid. These are in different places +	 * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we +	 * must also check to see if the length field is present, +	 * since this affects the offsets into the packet of the +	 * tid/sid fields. 
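As the comment above says, the first 16 bits of a UDP-encapsulated L2TP header decide everything else: the T bit distinguishes control from data, the L bit tells whether a v2 length field shifts the tid/sid offsets, and the low nibble carries the version. A standalone decode of those flags (input assumed already in host order, as after ntohs()):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define HDR_T_BIT 0x8000	/* control message */
#define HDR_L_BIT 0x4000	/* v2 length field present */
#define HDR_VER   0x000f	/* protocol version */

int main(void)
{
	uint16_t flags = 0xc802;	/* example: T=1, L=1, version 2 */

	printf("type:    %s\n", (flags & HDR_T_BIT) ? "control" : "data");
	printf("length:  %s\n", (flags & HDR_L_BIT) ? "present" : "absent");
	printf("version: %u\n", flags & HDR_VER);
	assert((flags & HDR_VER) == 2);
	return 0;
}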
+	 */ +	if (data.version == 3) { +		lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); +		if (lh == NULL) +			return false; +		if (data.type == XT_L2TP_TYPE_CONTROL) +			data.tid = ntohl(lh->val32); +		else +			data.sid = ntohl(lh->val32); +	} else if (data.version == 2) { +		if (flags & L2TP_HDR_L_BIT) +			offs += 2; +		lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); +		if (lh == NULL) +			return false; +		data.tid = (u32) ntohs(lh->val16[0]); +		data.sid = (u32) ntohs(lh->val16[1]); +	} else +		return false; + +	return l2tp_match(info, &data); +} + +/* Parse L2TP header fields for IP encapsulation (no UDP header). + * L2TPv3 data packets have a different form with IP encap. See + * RFC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. + * RFC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. + */ +static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	union l2tp_val *lh; +	union l2tp_val lhbuf; +	struct l2tp_data data = { 0, }; + +	/* For IP encap, the L2TP sid is the first 32-bits. */ +	lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); +	if (lh == NULL) +		return false; +	if (lh->val32 == 0) { +		/* Must be a control packet. The L2TP tid is further +		 * into the packet. +		 */ +		data.type = XT_L2TP_TYPE_CONTROL; +		lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), +					&lhbuf); +		if (lh == NULL) +			return false; +		data.tid = ntohl(lh->val32); +	} else { +		data.sid = ntohl(lh->val32); +		data.type = XT_L2TP_TYPE_DATA; +	} + +	data.version = 3; + +	return l2tp_match(info, &data); +} + +static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ +	struct iphdr *iph = ip_hdr(skb); +	u8 ipproto = iph->protocol; + +	/* l2tp_mt_check4 already restricts the transport protocol */ +	switch (ipproto) { +	case IPPROTO_UDP: +		return l2tp_udp_mt(skb, par, par->thoff); +	case IPPROTO_L2TP: +		return l2tp_ip_mt(skb, par, par->thoff); +	} + +	return false; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ +	unsigned int thoff = 0; +	unsigned short fragoff = 0; +	int ipproto; + +	ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); +	if (fragoff != 0) +		return false; + +	/* l2tp_mt_check6 already restricts the transport protocol */ +	switch (ipproto) { +	case IPPROTO_UDP: +		return l2tp_udp_mt(skb, par, thoff); +	case IPPROTO_L2TP: +		return l2tp_ip_mt(skb, par, thoff); +	} + +	return false; +} +#endif + +static int l2tp_mt_check(const struct xt_mtchk_param *par) +{ +	const struct xt_l2tp_info *info = par->matchinfo; + +	/* Check for invalid flags */ +	if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | +			    XT_L2TP_TYPE)) { +		pr_info("unknown flags: %x\n", info->flags); +		return -EINVAL; +	} + +	/* At least one of tid, sid or type=control must be specified */ +	if ((!(info->flags & XT_L2TP_TID)) && +	    (!(info->flags & XT_L2TP_SID)) && +	    ((!(info->flags & XT_L2TP_TYPE)) || +	     (info->type != XT_L2TP_TYPE_CONTROL))) { +		pr_info("invalid flags combination: %x\n", info->flags); +		return -EINVAL; +	} + +	/* If version 2 is specified, check that incompatible params +	 * are not supplied +	 */ +	if (info->flags & XT_L2TP_VERSION) { +		if ((info->version < 2) || (info->version > 3)) { +			pr_info("wrong L2TP version: %u\n", info->version); +			return -EINVAL; +		} + +		if (info->version == 2) { +			if ((info->flags & XT_L2TP_TID) && +			    (info->tid
> 0xffff)) { +				pr_info("v2 tid > 0xffff: %u\n", info->tid); +				return -EINVAL; +			} +			if ((info->flags & XT_L2TP_SID) && +			    (info->sid > 0xffff)) { +				pr_info("v2 sid > 0xffff: %u\n", info->sid); +				return -EINVAL; +			} +		} +	} + +	return 0; +} + +static int l2tp_mt_check4(const struct xt_mtchk_param *par) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	const struct ipt_entry *e = par->entryinfo; +	const struct ipt_ip *ip = &e->ip; +	int ret; + +	ret = l2tp_mt_check(par); +	if (ret != 0) +		return ret; + +	if ((ip->proto != IPPROTO_UDP) && +	    (ip->proto != IPPROTO_L2TP)) { +		pr_info("missing protocol rule (udp|l2tpip)\n"); +		return -EINVAL; +	} + +	if ((ip->proto == IPPROTO_L2TP) && +	    (info->version == 2)) { +		pr_info("v2 doesn't support IP mode\n"); +		return -EINVAL; +	} + +	return 0; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int l2tp_mt_check6(const struct xt_mtchk_param *par) +{ +	const struct xt_l2tp_info *info = par->matchinfo; +	const struct ip6t_entry *e = par->entryinfo; +	const struct ip6t_ip6 *ip = &e->ipv6; +	int ret; + +	ret = l2tp_mt_check(par); +	if (ret != 0) +		return ret; + +	if ((ip->proto != IPPROTO_UDP) && +	    (ip->proto != IPPROTO_L2TP)) { +		pr_info("missing protocol rule (udp|l2tpip)\n"); +		return -EINVAL; +	} + +	if ((ip->proto == IPPROTO_L2TP) && +	    (info->version == 2)) { +		pr_info("v2 doesn't support IP mode\n"); +		return -EINVAL; +	} + +	return 0; +} +#endif + +static struct xt_match l2tp_mt_reg[] __read_mostly = { +	{ +		.name      = "l2tp", +		.revision  = 0, +		.family    = NFPROTO_IPV4, +		.match     = l2tp_mt4, +		.matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), +		.checkentry = l2tp_mt_check4, +		.hooks     = ((1 << NF_INET_PRE_ROUTING) | +			      (1 << NF_INET_LOCAL_IN) | +			      (1 << NF_INET_LOCAL_OUT) | +			      (1 << NF_INET_FORWARD)), +		.me        = THIS_MODULE, +	}, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +	{ +		.name      = "l2tp", +		.revision  = 0, +		.family    = NFPROTO_IPV6, +		.match     = l2tp_mt6, +		.matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), +		.checkentry = l2tp_mt_check6, +		.hooks     = ((1 << NF_INET_PRE_ROUTING) | +			      (1 << NF_INET_LOCAL_IN) | +			      (1 << NF_INET_LOCAL_OUT) | +			      (1 << NF_INET_FORWARD)), +		.me        = THIS_MODULE, +	}, +#endif +}; + +static int __init l2tp_mt_init(void) +{ +	return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +static void __exit l2tp_mt_exit(void) +{ +	xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +module_init(l2tp_mt_init); +module_exit(l2tp_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 32b7a579a03..bef85059655 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -1,5 +1,6 @@  /* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>   * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -88,8 +89,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par)  }  /* Precision saver. */ -static u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user)  {  	/* If multiplying would overflow... 
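The xt_limit cleanup above hoists the credit initialization out of the cost==0 branch; the underlying scheme is an ordinary token bucket where credit refills proportionally to elapsed time and each packet costs a fixed amount. A toy model in seconds rather than jiffies (all constants hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct bucket {
	uint32_t credit;	/* tokens currently available */
	uint32_t credit_cap;	/* bucket size: avg * burst */
	uint32_t cost;		/* tokens charged per packet */
	uint32_t per_sec;	/* refill rate */
	uint32_t prev;		/* last refill, in seconds */
};

static bool limit_allow(struct bucket *b, uint32_t now)
{
	uint64_t c = b->credit + (uint64_t)(now - b->prev) * b->per_sec;

	b->prev = now;
	b->credit = c > b->credit_cap ? b->credit_cap : (uint32_t)c;
	if (b->credit >= b->cost) {
		b->credit -= b->cost;	/* under the limit: match */
		return true;
	}
	return false;			/* rate exceeded */
}

int main(void)
{
	struct bucket b = { 4, 4, 1, 2, 0 };	/* 2 pkt/s, burst 4 */
	int i, passed = 0;

	for (i = 0; i < 10; i++)	/* burst of 10 packets at t=0 */
		passed += limit_allow(&b, 0);
	printf("t=0: passed %d of 10\n", passed);	/* 4 */
	printf("t=1: %s\n", limit_allow(&b, 1) ? "pass" : "drop");
	return 0;
}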
*/  	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -118,12 +118,12 @@ static int limit_mt_check(const struct xt_mtchk_param *par)  	/* For SMP, we only want to use one set of state. */  	r->master = priv; +	/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * +	   128. */ +	priv->prev = jiffies; +	priv->credit = user2credits(r->avg * r->burst); /* Credits full. */  	if (r->cost == 0) { -		/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * -		   128. */ -		priv->prev = jiffies; -		priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ -		r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ +		r->credit_cap = priv->credit; /* Credits full. */  		r->cost = user2credits(r->avg);  	}  	return 0; diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 8160f6b1435..d5b4fd4f91e 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -36,7 +36,7 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)  		return false;  	if (skb_mac_header(skb) + ETH_HLEN > skb->data)  		return false; -	ret  = compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr) == 0; +	ret  = ether_addr_equal(eth_hdr(skb)->h_source, info->srcaddr);  	ret ^= info->invert;  	return ret;  } diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c new file mode 100644 index 00000000000..bea7464cc43 --- /dev/null +++ b/net/netfilter/xt_nat.c @@ -0,0 +1,170 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
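The xt_mac change above swaps compare_ether_addr() == 0 for the boolean ether_addr_equal(). A standalone model of the XOR-fold equality it expresses (my own simplified byte loop; the kernel helper compares wider words):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ETH_ALEN 6

static bool mac_equal(const uint8_t a[ETH_ALEN], const uint8_t b[ETH_ALEN])
{
	uint8_t diff = 0;
	int i;

	/* OR together the XOR of every octet: zero iff all equal */
	for (i = 0; i < ETH_ALEN; i++)
		diff |= a[i] ^ b[i];
	return diff == 0;
}

int main(void)
{
	uint8_t m1[ETH_ALEN] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5e };
	uint8_t m2[ETH_ALEN] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5e };
	uint8_t m3[ETH_ALEN] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5f };

	assert(mac_equal(m1, m2) && !mac_equal(m1, m3));
	return 0;
}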
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat_core.h> + +static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) +{ +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + +	if (mr->rangesize != 1) { +		pr_info("%s: multiple ranges no longer supported\n", +			par->target->name); +		return -EINVAL; +	} +	return 0; +} + +static void xt_nat_convert_range(struct nf_nat_range *dst, +				 const struct nf_nat_ipv4_range *src) +{ +	memset(&dst->min_addr, 0, sizeof(dst->min_addr)); +	memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + +	dst->flags	 = src->flags; +	dst->min_addr.ip = src->min_ip; +	dst->max_addr.ip = src->max_ip; +	dst->min_proto	 = src->min; +	dst->max_proto	 = src->max; +} + +static unsigned int +xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; +	struct nf_nat_range range; +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	NF_CT_ASSERT(ct != NULL && +		     (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || +		      ctinfo == IP_CT_RELATED_REPLY)); + +	xt_nat_convert_range(&range, &mr->range[0]); +	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; +	struct nf_nat_range range; +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	NF_CT_ASSERT(ct != NULL && +		     (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + +	xt_nat_convert_range(&range, &mr->range[0]); +	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct nf_nat_range *range = par->targinfo; +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	NF_CT_ASSERT(ct != NULL && +		     (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || +		      ctinfo == IP_CT_RELATED_REPLY)); + +	return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct nf_nat_range *range = par->targinfo; +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; + +	ct = nf_ct_get(skb, &ctinfo); +	NF_CT_ASSERT(ct != NULL && +		     (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + +	return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); +} + +static struct xt_target xt_nat_target_reg[] __read_mostly = { +	{ +		.name		= "SNAT", +		.revision	= 0, +		.checkentry	= xt_nat_checkentry_v0, +		.target		= xt_snat_target_v0, +		.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat), +		.family		= NFPROTO_IPV4, +		.table		= "nat", +		.hooks		= (1 << NF_INET_POST_ROUTING) | +				  (1 << NF_INET_LOCAL_IN), +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "DNAT", +		.revision	= 0, +		.checkentry	= xt_nat_checkentry_v0, +		.target		= xt_dnat_target_v0, +		.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat), +		.family		= NFPROTO_IPV4, +		.table		= "nat", +		.hooks		= (1 << NF_INET_PRE_ROUTING) | +				  (1 << NF_INET_LOCAL_OUT), +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "SNAT", +		.revision	= 1, +		.target		= xt_snat_target_v1, +		.targetsize	= sizeof(struct nf_nat_range), +		.table		= "nat", +		.hooks		= (1 << 
NF_INET_POST_ROUTING) | +				  (1 << NF_INET_LOCAL_IN), +		.me		= THIS_MODULE, +	}, +	{ +		.name		= "DNAT", +		.revision	= 1, +		.target		= xt_dnat_target_v1, +		.targetsize	= sizeof(struct nf_nat_range), +		.table		= "nat", +		.hooks		= (1 << NF_INET_PRE_ROUTING) | +				  (1 << NF_INET_LOCAL_OUT), +		.me		= THIS_MODULE, +	}, +}; + +static int __init xt_nat_init(void) +{ +	return xt_register_targets(xt_nat_target_reg, +				   ARRAY_SIZE(xt_nat_target_reg)); +} + +static void __exit xt_nat_exit(void) +{ +	xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); +} + +module_init(xt_nat_init); +module_exit(xt_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ipt_SNAT"); +MODULE_ALIAS("ipt_DNAT"); +MODULE_ALIAS("ip6t_SNAT"); +MODULE_ALIAS("ip6t_DNAT"); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c new file mode 100644 index 00000000000..8c646ed9c92 --- /dev/null +++ b/net/netfilter/xt_nfacct.c @@ -0,0 +1,79 @@ +/* + * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2011 Intra2net AG <http://www.intra2net.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 (or any + * later at your option) as published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/nfnetlink_acct.h> +#include <linux/netfilter/xt_nfacct.h> + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: match for the extended accounting infrastructure"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_nfacct"); +MODULE_ALIAS("ip6t_nfacct"); + +static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	int overquota; +	const struct xt_nfacct_match_info *info = par->targinfo; + +	nfnl_acct_update(skb, info->nfacct); + +	overquota = nfnl_acct_overquota(skb, info->nfacct); + +	return overquota == NFACCT_UNDERQUOTA ? false : true; +} + +static int +nfacct_mt_checkentry(const struct xt_mtchk_param *par) +{ +	struct xt_nfacct_match_info *info = par->matchinfo; +	struct nf_acct *nfacct; + +	nfacct = nfnl_acct_find_get(info->name); +	if (nfacct == NULL) { +		pr_info("xt_nfacct: accounting object with name `%s' " +			"does not exist\n", info->name); +		return -ENOENT; +	} +	info->nfacct = nfacct; +	return 0; +} + +static void +nfacct_mt_destroy(const struct xt_mtdtor_param *par) +{ +	const struct xt_nfacct_match_info *info = par->matchinfo; + +	nfnl_acct_put(info->nfacct); +} + +static struct xt_match nfacct_mt_reg __read_mostly = { +	.name       = "nfacct", +	.family     = NFPROTO_UNSPEC, +	.checkentry = nfacct_mt_checkentry, +	.match      = nfacct_mt, +	.destroy    = nfacct_mt_destroy, +	.matchsize  = sizeof(struct xt_nfacct_match_info), +	.me         = THIS_MODULE, +}; + +static int __init nfacct_mt_init(void) +{ +	return xt_register_match(&nfacct_mt_reg); +} + +static void __exit nfacct_mt_exit(void) +{ +	xt_unregister_match(&nfacct_mt_reg); +} + +module_init(nfacct_mt_init); +module_exit(nfacct_mt_exit); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 4327e101c04..c529161cdbf 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -13,8 +13,7 @@   * GNU General Public License for more details.  
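xt_nfacct above charges every evaluated packet to a named nfnetlink_acct object and, when quota checking is in play, matches only once the object reports over-quota. A toy model of that split between updating and testing (structure and names are my own, not the nfnetlink_acct API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct acct_obj {
	uint64_t pkts;
	uint64_t bytes;
	uint64_t quota_bytes;	/* 0: plain counter, never over quota */
};

/* First account the packet, then report the quota state, mirroring
 * the nfnl_acct_update() / nfnl_acct_overquota() call pair. */
static bool update_and_check(struct acct_obj *a, uint32_t pktlen)
{
	a->pkts++;
	a->bytes += pktlen;
	return a->quota_bytes != 0 && a->bytes > a->quota_bytes;
}

int main(void)
{
	struct acct_obj web = { 0, 0, 3000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("pkt %d: overquota=%d\n", i,
		       update_and_check(&web, 1000));
	return 0;
}

In userspace the counterpart is the nfacct(8) tool, which creates the named objects that rules then reference by name.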
*   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/module.h> @@ -62,13 +61,6 @@ static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {  	[OSF_ATTR_FINGER]	= { .len = sizeof(struct xt_osf_user_finger) },  }; -static void xt_osf_finger_free_rcu(struct rcu_head *rcu_head) -{ -	struct xt_osf_finger *f = container_of(rcu_head, struct xt_osf_finger, rcu_head); - -	kfree(f); -} -  static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb,  			       const struct nlmsghdr *nlh,  			       const struct nlattr * const osf_attrs[]) @@ -133,7 +125,7 @@ static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb,  		 * We are protected by nfnl mutex.  		 */  		list_del_rcu(&sf->finger_entry); -		call_rcu(&sf->rcu_head, xt_osf_finger_free_rcu); +		kfree_rcu(sf, rcu_head);  		err = 0;  		break; @@ -208,6 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)  	unsigned char opts[MAX_IPOPTLEN];  	const struct xt_osf_finger *kf;  	const struct xt_osf_user_finger *f; +	struct net *net = dev_net(p->in ? p->in : p->out);  	if (!info)  		return false; @@ -276,7 +269,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)  						mss <<= 8;  						mss |= optp[2]; -						mss = ntohs(mss); +						mss = ntohs((__force __be16)mss);  						break;  					case OSFOPT_TS:  						loop_cont = 1; @@ -332,7 +325,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)  			fcount++;  			if (info->flags & XT_OSF_LOG) -				nf_log_packet(p->family, p->hooknum, skb, +				nf_log_packet(net, p->family, p->hooknum, skb,  					p->in, p->out, NULL,  					"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",  					f->genre, f->version, f->subtype, @@ -348,7 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)  	rcu_read_unlock();  	if (!fcount && (info->flags & XT_OSF_LOG)) -		nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL, +		nf_log_packet(net, p->family, p->hooknum, skb, p->in, +			      p->out, NULL,  			"Remote OS is not known: %pI4:%u -> %pI4:%u\n",  				&ip->saddr, ntohs(tcp->source),  				&ip->daddr, ntohs(tcp->dest)); @@ -414,7 +408,7 @@ static void __exit xt_osf_fini(void)  		list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {  			list_del_rcu(&f->finger_entry); -			call_rcu(&f->rcu_head, xt_osf_finger_free_rcu); +			kfree_rcu(f, rcu_head);  		}  	}  	rcu_read_unlock(); @@ -428,4 +422,6 @@ module_exit(xt_osf_fini);  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");  MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf");  MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 772d7389b33..ca2e577ed8a 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -17,6 +17,17 @@  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_owner.h> +static int owner_check(const struct xt_mtchk_param *par) +{ +	struct xt_owner_match_info *info = par->matchinfo; + +	/* For now only allow adding matches from the initial user namespace */ +	if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && +	    (current_user_ns() != 
&init_user_ns)) +		return -EINVAL; +	return 0; +} +  static bool  owner_mt(const struct sk_buff *skb, struct xt_action_param *par)  { @@ -37,17 +48,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)  		return ((info->match ^ info->invert) &  		       (XT_OWNER_UID | XT_OWNER_GID)) == 0; -	if (info->match & XT_OWNER_UID) -		if ((filp->f_cred->fsuid >= info->uid_min && -		    filp->f_cred->fsuid <= info->uid_max) ^ +	if (info->match & XT_OWNER_UID) { +		kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); +		kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); +		if ((uid_gte(filp->f_cred->fsuid, uid_min) && +		     uid_lte(filp->f_cred->fsuid, uid_max)) ^  		    !(info->invert & XT_OWNER_UID))  			return false; +	} -	if (info->match & XT_OWNER_GID) -		if ((filp->f_cred->fsgid >= info->gid_min && -		    filp->f_cred->fsgid <= info->gid_max) ^ +	if (info->match & XT_OWNER_GID) { +		kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); +		kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); +		if ((gid_gte(filp->f_cred->fsgid, gid_min) && +		     gid_lte(filp->f_cred->fsgid, gid_max)) ^  		    !(info->invert & XT_OWNER_GID))  			return false; +	}  	return true;  } @@ -56,6 +73,7 @@ static struct xt_match owner_mt_reg __read_mostly = {  	.name       = "owner",  	.revision   = 1,  	.family     = NFPROTO_UNSPEC, +	.checkentry = owner_check,  	.match      = owner_mt,  	.matchsize  = sizeof(struct xt_owner_match_info),  	.hooks      = (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 70eb2b4984d..44c8eb4c9d6 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -9,6 +9,7 @@  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter/xt_quota.h> +#include <linux/module.h>  struct xt_quota_priv {  	spinlock_t	lock; diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 76a083184d8..7720b036d76 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -18,7 +18,7 @@ static bool  xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)  {  	const struct xt_rateest_match_info *info = par->matchinfo; -	struct gnet_stats_rate_est *r; +	struct gnet_stats_rate_est64 *r;  	u_int32_t bps1, bps2, pps1, pps2;  	bool ret = true; @@ -78,7 +78,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)  {  	struct xt_rateest_match_info *info = par->matchinfo;  	struct xt_rateest *est1, *est2; -	int ret = false; +	int ret = -EINVAL;  	if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS |  				     XT_RATEEST_MATCH_REL)) != 1) @@ -101,13 +101,12 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)  	if (!est1)  		goto err1; +	est2 = NULL;  	if (info->flags & XT_RATEEST_MATCH_REL) {  		est2 = xt_rateest_lookup(info->name2);  		if (!est2)  			goto err2; -	} else -		est2 = NULL; - +	}  	info->est1 = est1;  	info->est2 = est2; @@ -116,7 +115,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)  err2:  	xt_rateest_put(est1);  err1: -	return -EINVAL; +	return ret;  }  static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index d2ff15a2412..a9faae89f95 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -29,6 +29,7 @@  #include <linux/skbuff.h>  #include <linux/inet.h>  #include <linux/slab.h> +#include <linux/vmalloc.h>  #include <net/net_namespace.h>  #include <net/netns/generic.h> @@ -75,6 +76,7 @@ struct 
recent_entry {  struct recent_table {  	struct list_head	list;  	char			name[XT_RECENT_NAME_LEN]; +	union nf_inet_addr	mask;  	unsigned int		refcnt;  	unsigned int		entries;  	struct list_head	lru_list; @@ -228,10 +230,10 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)  {  	struct net *net = dev_net(par->in ? par->in : par->out);  	struct recent_net *recent_net = recent_pernet(net); -	const struct xt_recent_mtinfo *info = par->matchinfo; +	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;  	struct recent_table *t;  	struct recent_entry *e; -	union nf_inet_addr addr = {}; +	union nf_inet_addr addr = {}, addr_mask;  	u_int8_t ttl;  	bool ret = info->invert; @@ -261,12 +263,15 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)  	spin_lock_bh(&recent_lock);  	t = recent_table_lookup(recent_net, info->name); -	e = recent_entry_lookup(t, &addr, par->family, + +	nf_inet_addr_mask(&addr, &addr_mask, &t->mask); + +	e = recent_entry_lookup(t, &addr_mask, par->family,  				(info->check_set & XT_RECENT_TTL) ? ttl : 0);  	if (e == NULL) {  		if (!(info->check_set & XT_RECENT_SET))  			goto out; -		e = recent_entry_init(t, &addr, par->family, ttl); +		e = recent_entry_init(t, &addr_mask, par->family, ttl);  		if (e == NULL)  			par->hotdrop = true;  		ret = !ret; @@ -306,16 +311,24 @@ out:  	return ret;  } -static int recent_mt_check(const struct xt_mtchk_param *par) +static void recent_table_free(void *addr) +{ +	kvfree(addr); +} + +static int recent_mt_check(const struct xt_mtchk_param *par, +			   const struct xt_recent_mtinfo_v1 *info)  {  	struct recent_net *recent_net = recent_pernet(par->net); -	const struct xt_recent_mtinfo *info = par->matchinfo;  	struct recent_table *t;  #ifdef CONFIG_PROC_FS  	struct proc_dir_entry *pde; +	kuid_t uid; +	kgid_t gid;  #endif -	unsigned i; +	unsigned int i;  	int ret = -EINVAL; +	size_t sz;  	if (unlikely(!hash_rnd_inited)) {  		get_random_bytes(&hash_rnd, sizeof(hash_rnd)); @@ -354,27 +367,38 @@ static int recent_mt_check(const struct xt_mtchk_param *par)  		goto out;  	} -	t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, -		    GFP_KERNEL); +	sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; +	if (sz <= PAGE_SIZE) +		t = kzalloc(sz, GFP_KERNEL); +	else +		t = vzalloc(sz);  	if (t == NULL) {  		ret = -ENOMEM;  		goto out;  	}  	t->refcnt = 1; + +	memcpy(&t->mask, &info->mask, sizeof(t->mask));  	strcpy(t->name, info->name);  	INIT_LIST_HEAD(&t->lru_list);  	for (i = 0; i < ip_list_hash_size; i++)  		INIT_LIST_HEAD(&t->iphash[i]);  #ifdef CONFIG_PROC_FS +	uid = make_kuid(&init_user_ns, ip_list_uid); +	gid = make_kgid(&init_user_ns, ip_list_gid); +	if (!uid_valid(uid) || !gid_valid(gid)) { +		recent_table_free(t); +		ret = -EINVAL; +		goto out; +	}  	pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,  		  &recent_mt_fops, t);  	if (pde == NULL) { -		kfree(t); +		recent_table_free(t);  		ret = -ENOMEM;  		goto out;  	} -	pde->uid = ip_list_uid; -	pde->gid = ip_list_gid; +	proc_set_user(pde, uid, gid);  #endif  	spin_lock_bh(&recent_lock);  	list_add_tail(&t->list, &recent_net->tables); @@ -385,10 +409,28 @@ out:  	return ret;  } +static int recent_mt_check_v0(const struct xt_mtchk_param *par) +{ +	const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo; +	struct xt_recent_mtinfo_v1 info_v1; + +	/* Copy revision 0 structure to revision 1 */ +	memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo)); +	/* Set default mask to ensure backward compatible behaviour */ +	
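+	/* An all-ones mask compares the full address, which is exactly +	 * what revision 0, having no mask field, always did. */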
memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all)); + +	return recent_mt_check(par, &info_v1); +} + +static int recent_mt_check_v1(const struct xt_mtchk_param *par) +{ +	return recent_mt_check(par, par->matchinfo); +} +  static void recent_mt_destroy(const struct xt_mtdtor_param *par)  {  	struct recent_net *recent_net = recent_pernet(par->net); -	const struct xt_recent_mtinfo *info = par->matchinfo; +	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;  	struct recent_table *t;  	mutex_lock(&recent_mutex); @@ -398,10 +440,11 @@ static void recent_mt_destroy(const struct xt_mtdtor_param *par)  		list_del(&t->list);  		spin_unlock_bh(&recent_lock);  #ifdef CONFIG_PROC_FS -		remove_proc_entry(t->name, recent_net->xt_recent); +		if (recent_net->xt_recent != NULL) +			remove_proc_entry(t->name, recent_net->xt_recent);  #endif  		recent_table_flush(t); -		kfree(t); +		recent_table_free(t);  	}  	mutex_unlock(&recent_mutex);  } @@ -478,14 +521,13 @@ static const struct seq_operations recent_seq_ops = {  static int recent_seq_open(struct inode *inode, struct file *file)  { -	struct proc_dir_entry *pde = PDE(inode);  	struct recent_iter_state *st;  	st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));  	if (st == NULL)  		return -ENOMEM; -	st->table    = pde->data; +	st->table    = PDE_DATA(inode);  	return 0;  } @@ -493,8 +535,7 @@ static ssize_t  recent_mt_proc_write(struct file *file, const char __user *input,  		     size_t size, loff_t *loff)  { -	const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); -	struct recent_table *t = pde->data; +	struct recent_table *t = PDE_DATA(file_inode(file));  	struct recent_entry *e;  	char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];  	const char *c = buf; @@ -582,7 +623,21 @@ static int __net_init recent_proc_net_init(struct net *net)  static void __net_exit recent_proc_net_exit(struct net *net)  { -	proc_net_remove(net, "xt_recent"); +	struct recent_net *recent_net = recent_pernet(net); +	struct recent_table *t; + +	/* recent_net_exit() is called before recent_mt_destroy(). Make sure +	 * that the parent xt_recent proc entry is empty before trying to +	 * remove it. 
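+	 * Removing the per-table entries and clearing recent_net->xt_recent +	 * here is what lets recent_mt_destroy() above skip its own +	 * remove_proc_entry() call for tables destroyed after the +	 * namespace has gone away.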
+	 */ +	spin_lock_bh(&recent_lock); +	list_for_each_entry(t, &recent_net->tables, list) +	        remove_proc_entry(t->name, recent_net->xt_recent); + +	recent_net->xt_recent = NULL; +	spin_unlock_bh(&recent_lock); + +	remove_proc_entry("xt_recent", net->proc_net);  }  #else  static inline int recent_proc_net_init(struct net *net) @@ -605,9 +660,6 @@ static int __net_init recent_net_init(struct net *net)  static void __net_exit recent_net_exit(struct net *net)  { -	struct recent_net *recent_net = recent_pernet(net); - -	BUG_ON(!list_empty(&recent_net->tables));  	recent_proc_net_exit(net);  } @@ -625,7 +677,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = {  		.family     = NFPROTO_IPV4,  		.match      = recent_mt,  		.matchsize  = sizeof(struct xt_recent_mtinfo), -		.checkentry = recent_mt_check, +		.checkentry = recent_mt_check_v0,  		.destroy    = recent_mt_destroy,  		.me         = THIS_MODULE,  	}, @@ -635,10 +687,30 @@ static struct xt_match recent_mt_reg[] __read_mostly = {  		.family     = NFPROTO_IPV6,  		.match      = recent_mt,  		.matchsize  = sizeof(struct xt_recent_mtinfo), -		.checkentry = recent_mt_check, +		.checkentry = recent_mt_check_v0, +		.destroy    = recent_mt_destroy, +		.me         = THIS_MODULE, +	}, +	{ +		.name       = "recent", +		.revision   = 1, +		.family     = NFPROTO_IPV4, +		.match      = recent_mt, +		.matchsize  = sizeof(struct xt_recent_mtinfo_v1), +		.checkentry = recent_mt_check_v1,  		.destroy    = recent_mt_destroy,  		.me         = THIS_MODULE,  	}, +	{ +		.name       = "recent", +		.revision   = 1, +		.family     = NFPROTO_IPV6, +		.match      = recent_mt, +		.matchsize  = sizeof(struct xt_recent_mtinfo_v1), +		.checkentry = recent_mt_check_v1, +		.destroy    = recent_mt_destroy, +		.me         = THIS_MODULE, +	}  };  static int __init recent_mt_init(void) diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 6efe4e5a81c..8fd324116e6 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -5,23 +5,35 @@   * they serve as the hanging-off data accessed through repl.data[].   
*/ +/* tbl is equivalent to the following structure, which cannot be + * declared directly in C99 (entries[nhooks] would be a variable-length + * array inside a struct): + * struct { + *	struct type##_replace repl; + *	struct type##_standard entries[nhooks]; + *	struct type##_error term; + * } *tbl; + */ +  #define xt_alloc_initial_table(type, typ2) ({ \  	unsigned int hook_mask = info->valid_hooks; \  	unsigned int nhooks = hweight32(hook_mask); \  	unsigned int bytes = 0, hooknum = 0, i = 0; \  	struct { \  		struct type##_replace repl; \ -		struct type##_standard entries[nhooks]; \ -		struct type##_error term; \ -	} *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ +		struct type##_standard entries[]; \ +	} *tbl; \ +	struct type##_error *term; \ +	size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ +		__alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ +	tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \  	if (tbl == NULL) \  		return NULL; \ +	term = (struct type##_error *)&(((char *)tbl)[term_offset]); \  	strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ -	tbl->term = (struct type##_error)typ2##_ERROR_INIT;  \ +	*term = (struct type##_error)typ2##_ERROR_INIT;  \  	tbl->repl.valid_hooks = hook_mask; \  	tbl->repl.num_entries = nhooks + 1; \  	tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ -	                 sizeof(struct type##_error); \ +			 sizeof(struct type##_error); \  	for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \  		if (!(hook_mask & 1)) \  			continue; \ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c new file mode 100644 index 00000000000..80c2e2d603e --- /dev/null +++ b/net/netfilter/xt_set.c @@ -0,0 +1,523 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + *                         Patrick Schaaf <bof@bof.de> + *                         Martin Josefsson <gandalf@wlug.westbo.se> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module which implements the set match and SET target + * for netfilter/iptables. 
*/ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("Xtables: IP set match and target module"); +MODULE_ALIAS("xt_SET"); +MODULE_ALIAS("ipt_set"); +MODULE_ALIAS("ip6t_set"); +MODULE_ALIAS("ipt_SET"); +MODULE_ALIAS("ip6t_SET"); + +static inline int +match_set(ip_set_id_t index, const struct sk_buff *skb, +	  const struct xt_action_param *par, +	  struct ip_set_adt_opt *opt, int inv) +{ +	if (ip_set_test(index, skb, par, opt)) +		inv = !inv; +	return inv; +} + +#define ADT_OPT(n, f, d, fs, cfs, t)	\ +struct ip_set_adt_opt n = {		\ +	.family	= f,			\ +	.dim = d,			\ +	.flags = fs,			\ +	.cmdflags = cfs,		\ +	.ext.timeout = t,		\ +} + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + +static bool +set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_set_info_match_v0 *info = par->matchinfo; +	ADT_OPT(opt, par->family, info->match_set.u.compat.dim, +		info->match_set.u.compat.flags, 0, UINT_MAX); + +	return match_set(info->match_set.index, skb, par, &opt, +			 info->match_set.u.compat.flags & IPSET_INV_MATCH); +} + +static void +compat_flags(struct xt_set_info_v0 *info) +{ +	u_int8_t i; + +	/* Fill out compatibility data according to enum ip_set_kopt */ +	info->u.compat.dim = IPSET_DIM_ZERO; +	if (info->u.flags[0] & IPSET_MATCH_INV) +		info->u.compat.flags |= IPSET_INV_MATCH; +	for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { +		info->u.compat.dim++; +		if (info->u.flags[i] & IPSET_SRC) +			info->u.compat.flags |= (1<<info->u.compat.dim); +	} +} + +static int +set_match_v0_checkentry(const struct xt_mtchk_param *par) +{ +	struct xt_set_info_match_v0 *info = par->matchinfo; +	ip_set_id_t index; + +	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); + +	if (index == IPSET_INVALID_ID) { +		pr_warning("Cannot find set identified by id %u to match\n", +			   info->match_set.index); +		return -ENOENT; +	} +	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { +		pr_warning("Protocol error: set match dimension " +			   "is over the limit!\n"); +		ip_set_nfnl_put(par->net, info->match_set.index); +		return -ERANGE; +	} + +	/* Fill out compatibility data */ +	compat_flags(&info->match_set); + +	return 0; +} + +static void +set_match_v0_destroy(const struct xt_mtdtor_param *par) +{ +	struct xt_set_info_match_v0 *info = par->matchinfo; + +	ip_set_nfnl_put(par->net, info->match_set.index); +} + +/* Revision 1 match */ + +static bool +set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_set_info_match_v1 *info = par->matchinfo; +	ADT_OPT(opt, par->family, info->match_set.dim, +		info->match_set.flags, 0, UINT_MAX); + +	if (opt.flags & IPSET_RETURN_NOMATCH) +		opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; + +	return match_set(info->match_set.index, skb, par, &opt, +			 info->match_set.flags & IPSET_INV_MATCH); +} + +static int +set_match_v1_checkentry(const struct xt_mtchk_param *par) +{ +	struct xt_set_info_match_v1 *info = par->matchinfo; +	ip_set_id_t index; + +	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); + +	if (index == IPSET_INVALID_ID) { +		pr_warning("Cannot find set identified by id %u to match\n", +			   info->match_set.index); +		return -ENOENT; +	} +	if (info->match_set.dim > IPSET_DIM_MAX) { +		
pr_warning("Protocol error: set match dimension " +			   "is over the limit!\n"); +		ip_set_nfnl_put(par->net, info->match_set.index); +		return -ERANGE; +	} + +	return 0; +} + +static void +set_match_v1_destroy(const struct xt_mtdtor_param *par) +{ +	struct xt_set_info_match_v1 *info = par->matchinfo; + +	ip_set_nfnl_put(par->net, info->match_set.index); +} + +/* Revision 3 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ +	switch (info->op) { +	case IPSET_COUNTER_NONE: +		return true; +	case IPSET_COUNTER_EQ: +		return counter == info->value; +	case IPSET_COUNTER_NE: +		return counter != info->value; +	case IPSET_COUNTER_LT: +		return counter < info->value; +	case IPSET_COUNTER_GT: +		return counter > info->value; +	} +	return false; +} + +static bool +set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_set_info_match_v3 *info = par->matchinfo; +	ADT_OPT(opt, par->family, info->match_set.dim, +		info->match_set.flags, info->flags, UINT_MAX); +	int ret; + +	if (info->packets.op != IPSET_COUNTER_NONE || +	    info->bytes.op != IPSET_COUNTER_NONE) +		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + +	ret = match_set(info->match_set.index, skb, par, &opt, +			info->match_set.flags & IPSET_INV_MATCH); + +	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) +		return ret; + +	if (!match_counter(opt.ext.packets, &info->packets)) +		return 0; +	return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v3_checkentry	set_match_v1_checkentry +#define set_match_v3_destroy	set_match_v1_destroy + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + +static unsigned int +set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_set_info_target_v0 *info = par->targinfo; +	ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, +		info->add_set.u.compat.flags, 0, UINT_MAX); +	ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, +		info->del_set.u.compat.flags, 0, UINT_MAX); + +	if (info->add_set.index != IPSET_INVALID_ID) +		ip_set_add(info->add_set.index, skb, par, &add_opt); +	if (info->del_set.index != IPSET_INVALID_ID) +		ip_set_del(info->del_set.index, skb, par, &del_opt); + +	return XT_CONTINUE; +} + +static int +set_target_v0_checkentry(const struct xt_tgchk_param *par) +{ +	struct xt_set_info_target_v0 *info = par->targinfo; +	ip_set_id_t index; + +	if (info->add_set.index != IPSET_INVALID_ID) { +		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); +		if (index == IPSET_INVALID_ID) { +			pr_warning("Cannot find add_set index %u as target\n", +				   info->add_set.index); +			return -ENOENT; +		} +	} + +	if (info->del_set.index != IPSET_INVALID_ID) { +		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); +		if (index == IPSET_INVALID_ID) { +			pr_warning("Cannot find del_set index %u as target\n", +				   info->del_set.index); +			if (info->add_set.index != IPSET_INVALID_ID) +				ip_set_nfnl_put(par->net, info->add_set.index); +			return -ENOENT; +		} +	} +	if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || +	    info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { +		pr_warning("Protocol error: SET target dimension " +			   "is over the limit!\n"); +		if (info->add_set.index != IPSET_INVALID_ID) +			ip_set_nfnl_put(par->net, info->add_set.index); +		if (info->del_set.index != IPSET_INVALID_ID) +			ip_set_nfnl_put(par->net, info->del_set.index); +		return -ERANGE; +	} + +	/* Fill out compatibility data */ +	
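+	/* compat_flags() recomputes the dim/flags pair in the v0 'compat' +	 * union from the per-dimension u.flags[] array, so rules loaded +	 * through the old revision 0 interface keep matching as before. */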
compat_flags(&info->add_set); +	compat_flags(&info->del_set); + +	return 0; +} + +static void +set_target_v0_destroy(const struct xt_tgdtor_param *par) +{ +	const struct xt_set_info_target_v0 *info = par->targinfo; + +	if (info->add_set.index != IPSET_INVALID_ID) +		ip_set_nfnl_put(par->net, info->add_set.index); +	if (info->del_set.index != IPSET_INVALID_ID) +		ip_set_nfnl_put(par->net, info->del_set.index); +} + +/* Revision 1 target */ + +static unsigned int +set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_set_info_target_v1 *info = par->targinfo; +	ADT_OPT(add_opt, par->family, info->add_set.dim, +		info->add_set.flags, 0, UINT_MAX); +	ADT_OPT(del_opt, par->family, info->del_set.dim, +		info->del_set.flags, 0, UINT_MAX); + +	if (info->add_set.index != IPSET_INVALID_ID) +		ip_set_add(info->add_set.index, skb, par, &add_opt); +	if (info->del_set.index != IPSET_INVALID_ID) +		ip_set_del(info->del_set.index, skb, par, &del_opt); + +	return XT_CONTINUE; +} + +static int +set_target_v1_checkentry(const struct xt_tgchk_param *par) +{ +	const struct xt_set_info_target_v1 *info = par->targinfo; +	ip_set_id_t index; + +	if (info->add_set.index != IPSET_INVALID_ID) { +		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); +		if (index == IPSET_INVALID_ID) { +			pr_warning("Cannot find add_set index %u as target\n", +				   info->add_set.index); +			return -ENOENT; +		} +	} + +	if (info->del_set.index != IPSET_INVALID_ID) { +		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); +		if (index == IPSET_INVALID_ID) { +			pr_warning("Cannot find del_set index %u as target\n", +				   info->del_set.index); +			if (info->add_set.index != IPSET_INVALID_ID) +				ip_set_nfnl_put(par->net, info->add_set.index); +			return -ENOENT; +		} +	} +	if (info->add_set.dim > IPSET_DIM_MAX || +	    info->del_set.dim > IPSET_DIM_MAX) { +		pr_warning("Protocol error: SET target dimension " +			   "is over the limit!\n"); +		if (info->add_set.index != IPSET_INVALID_ID) +			ip_set_nfnl_put(par->net, info->add_set.index); +		if (info->del_set.index != IPSET_INVALID_ID) +			ip_set_nfnl_put(par->net, info->del_set.index); +		return -ERANGE; +	} + +	return 0; +} + +static void +set_target_v1_destroy(const struct xt_tgdtor_param *par) +{ +	const struct xt_set_info_target_v1 *info = par->targinfo; + +	if (info->add_set.index != IPSET_INVALID_ID) +		ip_set_nfnl_put(par->net, info->add_set.index); +	if (info->del_set.index != IPSET_INVALID_ID) +		ip_set_nfnl_put(par->net, info->del_set.index); +} + +/* Revision 2 target */ + +static unsigned int +set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_set_info_target_v2 *info = par->targinfo; +	ADT_OPT(add_opt, par->family, info->add_set.dim, +		info->add_set.flags, info->flags, info->timeout); +	ADT_OPT(del_opt, par->family, info->del_set.dim, +		info->del_set.flags, 0, UINT_MAX); + +	/* Normalize to fit into jiffies */ +	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && +	    add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) +		add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; +	if (info->add_set.index != IPSET_INVALID_ID) +		ip_set_add(info->add_set.index, skb, par, &add_opt); +	if (info->del_set.index != IPSET_INVALID_ID) +		ip_set_del(info->del_set.index, skb, par, &del_opt); + +	return XT_CONTINUE; +} + +#define set_target_v2_checkentry	set_target_v1_checkentry +#define set_target_v2_destroy		set_target_v1_destroy + +static struct xt_match set_matches[] __read_mostly = { +	{ +		.name		= 
"set", +		.family		= NFPROTO_IPV4, +		.revision	= 0, +		.match		= set_match_v0, +		.matchsize	= sizeof(struct xt_set_info_match_v0), +		.checkentry	= set_match_v0_checkentry, +		.destroy	= set_match_v0_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "set", +		.family		= NFPROTO_IPV4, +		.revision	= 1, +		.match		= set_match_v1, +		.matchsize	= sizeof(struct xt_set_info_match_v1), +		.checkentry	= set_match_v1_checkentry, +		.destroy	= set_match_v1_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "set", +		.family		= NFPROTO_IPV6, +		.revision	= 1, +		.match		= set_match_v1, +		.matchsize	= sizeof(struct xt_set_info_match_v1), +		.checkentry	= set_match_v1_checkentry, +		.destroy	= set_match_v1_destroy, +		.me		= THIS_MODULE +	}, +	/* --return-nomatch flag support */ +	{ +		.name		= "set", +		.family		= NFPROTO_IPV4, +		.revision	= 2, +		.match		= set_match_v1, +		.matchsize	= sizeof(struct xt_set_info_match_v1), +		.checkentry	= set_match_v1_checkentry, +		.destroy	= set_match_v1_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "set", +		.family		= NFPROTO_IPV6, +		.revision	= 2, +		.match		= set_match_v1, +		.matchsize	= sizeof(struct xt_set_info_match_v1), +		.checkentry	= set_match_v1_checkentry, +		.destroy	= set_match_v1_destroy, +		.me		= THIS_MODULE +	}, +	/* counters support: update, match */ +	{ +		.name		= "set", +		.family		= NFPROTO_IPV4, +		.revision	= 3, +		.match		= set_match_v3, +		.matchsize	= sizeof(struct xt_set_info_match_v3), +		.checkentry	= set_match_v3_checkentry, +		.destroy	= set_match_v3_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "set", +		.family		= NFPROTO_IPV6, +		.revision	= 3, +		.match		= set_match_v3, +		.matchsize	= sizeof(struct xt_set_info_match_v3), +		.checkentry	= set_match_v3_checkentry, +		.destroy	= set_match_v3_destroy, +		.me		= THIS_MODULE +	}, +}; + +static struct xt_target set_targets[] __read_mostly = { +	{ +		.name		= "SET", +		.revision	= 0, +		.family		= NFPROTO_IPV4, +		.target		= set_target_v0, +		.targetsize	= sizeof(struct xt_set_info_target_v0), +		.checkentry	= set_target_v0_checkentry, +		.destroy	= set_target_v0_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "SET", +		.revision	= 1, +		.family		= NFPROTO_IPV4, +		.target		= set_target_v1, +		.targetsize	= sizeof(struct xt_set_info_target_v1), +		.checkentry	= set_target_v1_checkentry, +		.destroy	= set_target_v1_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "SET", +		.revision	= 1, +		.family		= NFPROTO_IPV6, +		.target		= set_target_v1, +		.targetsize	= sizeof(struct xt_set_info_target_v1), +		.checkentry	= set_target_v1_checkentry, +		.destroy	= set_target_v1_destroy, +		.me		= THIS_MODULE +	}, +	/* --timeout and --exist flags support */ +	{ +		.name		= "SET", +		.revision	= 2, +		.family		= NFPROTO_IPV4, +		.target		= set_target_v2, +		.targetsize	= sizeof(struct xt_set_info_target_v2), +		.checkentry	= set_target_v2_checkentry, +		.destroy	= set_target_v2_destroy, +		.me		= THIS_MODULE +	}, +	{ +		.name		= "SET", +		.revision	= 2, +		.family		= NFPROTO_IPV6, +		.target		= set_target_v2, +		.targetsize	= sizeof(struct xt_set_info_target_v2), +		.checkentry	= set_target_v2_checkentry, +		.destroy	= set_target_v2_destroy, +		.me		= THIS_MODULE +	}, +}; + +static int __init xt_set_init(void) +{ +	int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches)); + +	if (!ret) { +		ret = xt_register_targets(set_targets, +					  ARRAY_SIZE(set_targets)); +		if (ret) +			xt_unregister_matches(set_matches, +					      ARRAY_SIZE(set_matches)); +	} +	
return ret; +} + +static void __exit xt_set_fini(void) +{ +	xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches)); +	xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets)); +} + +module_init(xt_set_init); +module_exit(xt_set_fini); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 00d6ae83830..1ba67931eb1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -19,18 +19,18 @@  #include <net/icmp.h>  #include <net/sock.h>  #include <net/inet_sock.h> -#include <net/netfilter/nf_tproxy_core.h>  #include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  #define XT_SOCKET_HAVE_IPV6 1  #include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/inet6_hashtables.h>  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>  #endif  #include <linux/netfilter/xt_socket.h> -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  #define XT_SOCKET_HAVE_CONNTRACK 1  #include <net/netfilter/nf_conntrack.h>  #endif @@ -92,16 +92,53 @@ extract_icmp4_fields(const struct sk_buff *skb,  	return 0;  } +/* "socket" match-based redirection (no specific rule) + * ==================================================== + * + * There are connections with dynamic endpoints (e.g. the FTP data + * connection) that the user is unable to add explicit rules + * for. These are taken care of by a generic "socket" rule. It is + * assumed that the proxy application is trusted to open such + * connections without an explicit iptables rule (except of course the + * generic 'socket' rule). In this case the following sockets are + * matched in preference order: + * + *   - match: if there's a fully established connection matching the + *     _packet_ tuple + * + *   - match: if there's a non-zero bound listener (possibly with a + *     non-local address). We don't accept zero-bound listeners, since + *     then local services could intercept traffic going through the + *     box. 
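+ * + *   Example (hypothetical addresses): with a transparent proxy bound + *   to 192.0.2.1:3128, a forwarded packet for 198.51.100.7:80 first + *   matches if the proxy already holds an established socket for that + *   exact packet tuple; failing that, it matches the proxy's bound + *   listener, but never one bound to 0.0.0.0 (unless the v2 + *   XT_SOCKET_NOWILDCARD flag disables the wildcard check).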
+ */ +static struct sock * +xt_socket_get_sock_v4(struct net *net, const u8 protocol, +		      const __be32 saddr, const __be32 daddr, +		      const __be16 sport, const __be16 dport, +		      const struct net_device *in) +{ +	switch (protocol) { +	case IPPROTO_TCP: +		return __inet_lookup(net, &tcp_hashinfo, +				     saddr, sport, daddr, dport, +				     in->ifindex); +	case IPPROTO_UDP: +		return udp4_lib_lookup(net, saddr, sport, daddr, dport, +				       in->ifindex); +	} +	return NULL; +} +  static bool  socket_match(const struct sk_buff *skb, struct xt_action_param *par,  	     const struct xt_socket_mtinfo1 *info)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct udphdr _hdr, *hp = NULL; -	struct sock *sk; -	__be32 daddr, saddr; -	__be16 dport, sport; -	u8 protocol; +	struct sock *sk = skb->sk; +	__be32 uninitialized_var(daddr), uninitialized_var(saddr); +	__be16 uninitialized_var(dport), uninitialized_var(sport); +	u8 uninitialized_var(protocol);  #ifdef XT_SOCKET_HAVE_CONNTRACK  	struct nf_conn const *ct;  	enum ip_conntrack_info ctinfo; @@ -134,9 +171,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,  	ct = nf_ct_get(skb, &ctinfo);  	if (ct && !nf_ct_is_untracked(ct) &&  	    ((iph->protocol != IPPROTO_ICMP && -	      ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) || +	      ctinfo == IP_CT_ESTABLISHED_REPLY) ||  	     (iph->protocol == IPPROTO_ICMP && -	      ctinfo == IP_CT_IS_REPLY + IP_CT_RELATED)) && +	      ctinfo == IP_CT_RELATED_REPLY)) &&  	    (ct->status & IPS_SRC_NAT_DONE)) {  		daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; @@ -146,25 +183,31 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,  	}  #endif -	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, -				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); -	if (sk != NULL) { +	if (!sk) +		sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, +					   saddr, daddr, sport, dport, +					   par->in); +	if (sk) {  		bool wildcard;  		bool transparent = true; -		/* Ignore sockets listening on INADDR_ANY */ -		wildcard = (sk->sk_state != TCP_TIME_WAIT && +		/* Ignore sockets listening on INADDR_ANY, +		 * unless XT_SOCKET_NOWILDCARD is set +		 */ +		wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && +			    sk->sk_state != TCP_TIME_WAIT &&  			    inet_sk(sk)->inet_rcv_saddr == 0);  		/* Ignore non-transparent sockets,  		   if XT_SOCKET_TRANSPARENT is used */ -		if (info && info->flags & XT_SOCKET_TRANSPARENT) +		if (info->flags & XT_SOCKET_TRANSPARENT)  			transparent = ((sk->sk_state != TCP_TIME_WAIT &&  					inet_sk(sk)->transparent) ||  				       (sk->sk_state == TCP_TIME_WAIT &&  					inet_twsk(sk)->tw_transparent)); -		nf_tproxy_put_sock(sk); +		if (sk != skb->sk) +			sock_gen_put(sk);  		if (wildcard || !transparent)  			sk = NULL; @@ -181,11 +224,15 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,  static bool  socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)  { -	return socket_match(skb, par, NULL); +	static struct xt_socket_mtinfo1 xt_info_v0 = { +		.flags = 0, +	}; + +	return socket_match(skb, par, &xt_info_v0);  }  static bool -socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)  {  	return socket_match(skb, par, par->matchinfo);  } @@ -205,6 +252,7 @@ extract_icmp6_fields(const struct sk_buff *skb,  	struct icmp6hdr *icmph, _icmph;  	__be16 *ports, _ports[2];  	u8 inside_nexthdr; +	__be16 
inside_fragoff;  	int inside_hdrlen;  	icmph = skb_header_pointer(skb, outside_hdrlen, @@ -220,7 +268,8 @@ extract_icmp6_fields(const struct sk_buff *skb,  		return 1;  	inside_nexthdr = inside_iph->nexthdr; -	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr); +	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), +					 &inside_nexthdr, &inside_fragoff);  	if (inside_hdrlen < 0)  		return 1; /* hjm: Packet has no/incomplete transport layer headers. */ @@ -244,18 +293,37 @@ extract_icmp6_fields(const struct sk_buff *skb,  	return 0;  } +static struct sock * +xt_socket_get_sock_v6(struct net *net, const u8 protocol, +		      const struct in6_addr *saddr, const struct in6_addr *daddr, +		      const __be16 sport, const __be16 dport, +		      const struct net_device *in) +{ +	switch (protocol) { +	case IPPROTO_TCP: +		return inet6_lookup(net, &tcp_hashinfo, +				    saddr, sport, daddr, dport, +				    in->ifindex); +	case IPPROTO_UDP: +		return udp6_lib_lookup(net, saddr, sport, daddr, dport, +				       in->ifindex); +	} + +	return NULL; +} +  static bool -socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)  {  	struct ipv6hdr *iph = ipv6_hdr(skb);  	struct udphdr _hdr, *hp = NULL; -	struct sock *sk; -	struct in6_addr *daddr, *saddr; -	__be16 dport, sport; -	int thoff, tproto; +	struct sock *sk = skb->sk; +	struct in6_addr *daddr = NULL, *saddr = NULL; +	__be16 uninitialized_var(dport), uninitialized_var(sport); +	int thoff = 0, uninitialized_var(tproto);  	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; -	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); +	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);  	if (tproto < 0) {  		pr_debug("unable to find transport header in IPv6 packet, dropping\n");  		return NF_DROP; @@ -280,25 +348,31 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)  		return false;  	} -	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, -				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); -	if (sk != NULL) { +	if (!sk) +		sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, +					   saddr, daddr, sport, dport, +					   par->in); +	if (sk) {  		bool wildcard;  		bool transparent = true; -		/* Ignore sockets listening on INADDR_ANY */ -		wildcard = (sk->sk_state != TCP_TIME_WAIT && -			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); +		/* Ignore sockets listening on INADDR_ANY +		 * unless XT_SOCKET_NOWILDCARD is set +		 */ +		wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && +			    sk->sk_state != TCP_TIME_WAIT && +			    ipv6_addr_any(&sk->sk_v6_rcv_saddr));  		/* Ignore non-transparent sockets,  		   if XT_SOCKET_TRANSPARENT is used */ -		if (info && info->flags & XT_SOCKET_TRANSPARENT) +		if (info->flags & XT_SOCKET_TRANSPARENT)  			transparent = ((sk->sk_state != TCP_TIME_WAIT &&  					inet_sk(sk)->transparent) ||  				       (sk->sk_state == TCP_TIME_WAIT &&  					inet_twsk(sk)->tw_transparent)); -		nf_tproxy_put_sock(sk); +		if (sk != skb->sk) +			sock_gen_put(sk);  		if (wildcard || !transparent)  			sk = NULL; @@ -314,6 +388,28 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)  }  #endif +static int socket_mt_v1_check(const struct xt_mtchk_param *par) +{ +	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + +	if (info->flags & 
~XT_SOCKET_FLAGS_V1) { +		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); +		return -EINVAL; +	} +	return 0; +} + +static int socket_mt_v2_check(const struct xt_mtchk_param *par) +{ +	const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; + +	if (info->flags & ~XT_SOCKET_FLAGS_V2) { +		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); +		return -EINVAL; +	} +	return 0; +} +  static struct xt_match socket_mt_reg[] __read_mostly = {  	{  		.name		= "socket", @@ -328,7 +424,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = {  		.name		= "socket",  		.revision	= 1,  		.family		= NFPROTO_IPV4, -		.match		= socket_mt4_v1, +		.match		= socket_mt4_v1_v2, +		.checkentry	= socket_mt_v1_check,  		.matchsize	= sizeof(struct xt_socket_mtinfo1),  		.hooks		= (1 << NF_INET_PRE_ROUTING) |  				  (1 << NF_INET_LOCAL_IN), @@ -339,7 +436,32 @@ static struct xt_match socket_mt_reg[] __read_mostly = {  		.name		= "socket",  		.revision	= 1,  		.family		= NFPROTO_IPV6, -		.match		= socket_mt6_v1, +		.match		= socket_mt6_v1_v2, +		.checkentry	= socket_mt_v1_check, +		.matchsize	= sizeof(struct xt_socket_mtinfo1), +		.hooks		= (1 << NF_INET_PRE_ROUTING) | +				  (1 << NF_INET_LOCAL_IN), +		.me		= THIS_MODULE, +	}, +#endif +	{ +		.name		= "socket", +		.revision	= 2, +		.family		= NFPROTO_IPV4, +		.match		= socket_mt4_v1_v2, +		.checkentry	= socket_mt_v2_check, +		.matchsize	= sizeof(struct xt_socket_mtinfo1), +		.hooks		= (1 << NF_INET_PRE_ROUTING) | +				  (1 << NF_INET_LOCAL_IN), +		.me		= THIS_MODULE, +	}, +#ifdef XT_SOCKET_HAVE_IPV6 +	{ +		.name		= "socket", +		.revision	= 2, +		.family		= NFPROTO_IPV6, +		.match		= socket_mt6_v1_v2, +		.checkentry	= socket_mt_v2_check,  		.matchsize	= sizeof(struct xt_socket_mtinfo1),  		.hooks		= (1 << NF_INET_PRE_ROUTING) |  				  (1 << NF_INET_LOCAL_IN), diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 42ecb71d445..11de55e7a86 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -16,6 +16,7 @@  #include <linux/netfilter/xt_statistic.h>  #include <linux/netfilter/x_tables.h> +#include <linux/module.h>  struct xt_statistic_priv {  	atomic_t count; @@ -36,7 +37,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)  	switch (info->mode) {  	case XT_STATISTIC_MODE_RANDOM: -		if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) +		if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)  			ret = !ret;  		break;  	case XT_STATISTIC_MODE_NTH: diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index c48975ff8ea..0ae55a36f49 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -42,6 +42,7 @@ static const u_int16_t days_since_leapyear[] = {   */  enum {  	DSE_FIRST = 2039, +	SECONDS_PER_DAY = 86400,  };  static const u_int16_t days_since_epoch[] = {  	/* 2039 - 2030 */ @@ -78,7 +79,7 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time)  	unsigned int v, w;  	/* Each day has 86400s, so finding the hour/minute is actually easy. */ -	v         = time % 86400; +	v         = time % SECONDS_PER_DAY;  	r->second = v % 60;  	w         = v / 60;  	r->minute = w % 60; @@ -199,6 +200,18 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)  		if (packet_time < info->daytime_start &&  		    packet_time > info->daytime_stop)  			return false; + +		/** if user asked to ignore 'next day', then e.g. 
+		 *  '1 PM Wed, August 1st' should be treated +		 *  like '1 PM Tue, July 31st'. +		 * +		 * This also causes a Monday '23:00 to 01:00' rule +		 * to match for two hours, starting Monday 23:00 and +		 * ending Tuesday 01:00. +		 */ +		if ((info->flags & XT_TIME_CONTIGUOUS) && +		     packet_time <= info->daytime_stop) +			stamp -= SECONDS_PER_DAY; +	}  	localtime_2(&current_time, stamp); @@ -227,6 +240,15 @@ static int time_mt_check(const struct xt_mtchk_param *par)  		return -EDOM;  	} +	if (info->flags & ~XT_TIME_ALL_FLAGS) { +		pr_info("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS); +		return -EINVAL; +	} + +	if ((info->flags & XT_TIME_CONTIGUOUS) && +	     info->daytime_start < info->daytime_stop) +		return -EINVAL; +  	return 0;  }
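The XT_TIME_CONTIGUOUS handling above packs a subtle idea into two lines: for a range such as 23:00-01:00, a packet arriving in the early-morning tail is shifted back one day before the weekday/monthday checks run. A minimal standalone sketch of the same logic (a userspace re-implementation with invented names, not kernel code):

#include <stdbool.h>

#define SECONDS_PER_DAY 86400

/* daytime_start > daytime_stop models a range such as 23:00-01:00 that
 * wraps past midnight; packet_time is seconds since local midnight and
 * stamp is the full timestamp used for the later weekday checks. */
static bool match_contiguous(unsigned int packet_time,
			     unsigned int daytime_start,
			     unsigned int daytime_stop,
			     unsigned long *stamp)
{
	if (packet_time < daytime_start && packet_time > daytime_stop)
		return false;	/* outside the wrapped range */

	/* In the early-morning tail: attribute the packet to the previous
	 * day, so "Mon 23:00-01:00" sees Monday for a Tuesday 00:30 hit. */
	if (packet_time <= daytime_stop)
		*stamp -= SECONDS_PER_DAY;
	return true;
}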

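One more note, on the xt_repldata.h hunk further up: the term_offset computation is the classic round-up-to-alignment idiom, needed because 'term' is now placed by hand after the flexible 'entries' array. A self-contained illustration (hypothetical struct names; relies on the same GNU C extensions the kernel itself uses):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Stand-ins for type##_replace / type##_standard / type##_error. */
struct repl { char name[32]; };
struct standard { int verdict; };
struct error { long pad; };

int main(void)
{
	unsigned int nhooks = 3;	/* runtime value, as in the macro */
	struct {
		struct repl repl;
		struct standard entries[];	/* flexible array member */
	} *tbl;
	struct error *term;

	/* Round the end of entries[nhooks] up to the alignment of
	 * struct error, exactly as xt_alloc_initial_table() now does. */
	size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) +
			      __alignof__(*term) - 1) &
			     ~(__alignof__(*term) - 1);

	tbl = calloc(1, term_offset + sizeof(*term));
	if (tbl == NULL)
		return 1;
	term = (struct error *)((char *)tbl + term_offset);
	printf("term lives at %p (offset %zu)\n", (void *)term, term_offset);
	free(tbl);
	return 0;
}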