Diffstat (limited to 'net/netfilter')
214 files changed, 72105 insertions, 10181 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a567dae8e5f..e9410d17619 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -2,21 +2,28 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER config NETFILTER_NETLINK - tristate "Netfilter netlink interface" - help - If this option is enabled, the kernel will include support - for the new netfilter netlink interface. + tristate + +config NETFILTER_NETLINK_ACCT +tristate "Netfilter NFACCT over NFNETLINK interface" + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for extended accounting via NFNETLINK. config NETFILTER_NETLINK_QUEUE tristate "Netfilter NFQUEUE over NFNETLINK interface" - depends on NETFILTER_NETLINK + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK help If this option is enabled, the kernel will include support for queueing packets via NFNETLINK. config NETFILTER_NETLINK_LOG tristate "Netfilter LOG over NFNETLINK interface" - depends on NETFILTER_NETLINK + default m if NETFILTER_ADVANCED=n + select NETFILTER_NETLINK help If this option is enabled, the kernel will include support for logging packets via NFNETLINK. @@ -25,40 +32,25 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. -# Rename this to NF_CONNTRACK in a 2.6.25 -config NF_CONNTRACK_ENABLED +config NF_CONNTRACK tristate "Netfilter connection tracking support" + default m if NETFILTER_ADVANCED=n help Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related into connections. This is required to do Masquerading or other kinds of Network - Address Translation (except for Fast NAT). It can also be used to - enhance packet filtering (see `Connection state match support' - below). + Address Translation. It can also be used to enhance packet + filtering (see `Connection state match support' below). To compile it as a module, choose M here. If unsure, say N. -config NF_CONNTRACK - tristate - default NF_CONNTRACK_ENABLED - -config NF_CT_ACCT - bool "Connection tracking flow accounting" - depends on NF_CONNTRACK - help - If this option is enabled, the connection tracking code will - keep per-flow packet and byte counters. - - Those counters can be used for flow-based accounting or the - `connbytes' match. - - If unsure, say `N'. +if NF_CONNTRACK config NF_CONNTRACK_MARK bool 'Connection mark tracking support' - depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help This option enables support for connection marks, used by the `CONNMARK' target and `connmark' match. Similar to the mark value @@ -67,7 +59,8 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' - depends on NF_CONNTRACK && NETWORK_SECMARK + depends on NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from @@ -77,9 +70,32 @@ config NF_CONNTRACK_SECMARK If unsure, say 'N'. +config NF_CONNTRACK_ZONES + bool 'Connection tracking zones' + depends on NETFILTER_ADVANCED + depends on NETFILTER_XT_TARGET_CT + help + This option enables support for connection tracking zones. + Normally, each connection needs to have a unique system wide + identity. 
Connection tracking zones allow to have multiple + connections using the same identity, as long as they are + contained in different zones. + + If unsure, say `N'. + +config NF_CONNTRACK_PROCFS + bool "Supply CT list in procfs (OBSOLETE)" + default y + depends on PROC_FS + ---help--- + This option enables for the list of known conntrack entries + to be shown in procfs under net/netfilter/nf_conntrack. This + is considered obsolete in favor of using the conntrack(8) + tool which uses Netlink. + config NF_CONNTRACK_EVENTS - bool "Connection tracking events (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK + bool "Connection tracking events" + depends on NETFILTER_ADVANCED help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -87,14 +103,50 @@ config NF_CONNTRACK_EVENTS If unsure, say `N'. +config NF_CONNTRACK_TIMEOUT + bool 'Connection tracking timeout' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timeout + extension. This allows you to attach timeout policies to flow + via the CT target. + + If unsure, say `N'. + +config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timestamping. + This allows you to store the flow start-time and to obtain + the flow-stop time (once it has been destroyed) via Connection + tracking events. + + If unsure, say `N'. + +config NF_CONNTRACK_LABELS + bool + help + This option enables support for assigning user-defined flag bits + to connection tracking entries. It selected by the connlabel match. + +config NF_CT_PROTO_DCCP + tristate 'DCCP protocol connection tracking support' + depends on NETFILTER_ADVANCED + default IP_DCCP + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on DCCP connections. + + If unsure, say 'N'. + config NF_CT_PROTO_GRE tristate - depends on NF_CONNTRACK config NF_CT_PROTO_SCTP - tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK - default n + tristate 'SCTP protocol connection tracking support' + depends on NETFILTER_ADVANCED + default IP_SCTP help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -102,9 +154,19 @@ config NF_CT_PROTO_SCTP If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NF_CT_PROTO_UDPLITE + tristate 'UDP-Lite protocol connection tracking support' + depends on NETFILTER_ADVANCED + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on UDP-Lite + connections. + + To compile it as a module, choose M here. If unsure, say N. + config NF_CONNTRACK_AMANDA tristate "Amanda backup protocol support" - depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP help @@ -119,7 +181,7 @@ config NF_CONNTRACK_AMANDA config NF_CONNTRACK_FTP tristate "FTP protocol support" - depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help Tracking FTP connections is problematic: special helpers are required for tracking them, and doing masquerading and other forms @@ -132,8 +194,9 @@ config NF_CONNTRACK_FTP To compile it as a module, choose M here. If unsure, say N. 
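For orientation, the notifier chain that NF_CONNTRACK_EVENTS advertises is consumed roughly as sketched below. This is a hypothetical consumer, assuming the nf_ct_event_notifier interface of kernels in this series; the names my_ct_event/my_notifier are invented, and only one notifier can be registered per network namespace (ctnetlink normally claims it):

    /* Hypothetical sketch, not part of this patch; assumes
     * CONFIG_NF_CONNTRACK_EVENTS and the nf_ct_event_notifier API. */
    #include <linux/module.h>
    #include <net/net_namespace.h>
    #include <net/netfilter/nf_conntrack.h>
    #include <net/netfilter/nf_conntrack_ecache.h>

    static int my_ct_event(unsigned int events, struct nf_ct_event *item)
    {
            /* events is a bitmask of IPCT_* values for this conntrack */
            if (events & (1 << IPCT_NEW))
                    pr_info("new flow %p\n", item->ct);
            if (events & (1 << IPCT_DESTROY))
                    pr_info("flow %p destroyed\n", item->ct);
            return 0;
    }

    static struct nf_ct_event_notifier my_notifier = {
            .fcn    = my_ct_event,
    };

    static int __init my_init(void)
    {
            /* register in the initial netns only; one notifier per netns */
            return nf_conntrack_register_notifier(&init_net, &my_notifier);
    }

    static void __exit my_exit(void)
    {
            nf_conntrack_unregister_notifier(&init_net, &my_notifier);
    }

    module_init(my_init);
    module_exit(my_exit);
    MODULE_LICENSE("GPL");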
 config NF_CONNTRACK_H323
-	tristate "H.323 protocol support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && NF_CONNTRACK && (IPV6 || IPV6=n)
+	tristate "H.323 protocol support"
+	depends on (IPV6 || IPV6=n)
+	depends on NETFILTER_ADVANCED
 	help
 	  H.323 is a VoIP signalling protocol from ITU-T. As one of the most
 	  important VoIP protocols, it is widely used by voice hardware and
@@ -152,7 +215,7 @@ config NF_CONNTRACK_H323
 
 config NF_CONNTRACK_IRC
 	tristate "IRC protocol support"
-	depends on NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
 	help
 	  There is a commonly-used extension to IRC called
 	  Direct Client-to-Client Protocol (DCC). This enables users to send
@@ -165,9 +228,12 @@ config NF_CONNTRACK_IRC
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NF_CONNTRACK_BROADCAST
+	tristate
+
 config NF_CONNTRACK_NETBIOS_NS
-	tristate "NetBIOS name service protocol support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && NF_CONNTRACK
+	tristate "NetBIOS name service protocol support"
+	select NF_CONNTRACK_BROADCAST
 	help
 	  NetBIOS name service requests are sent as broadcast messages from an
 	  unprivileged port and responded to with unicast messages to the
@@ -184,9 +250,24 @@ config NF_CONNTRACK_NETBIOS_NS
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NF_CONNTRACK_SNMP
+	tristate "SNMP service protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_BROADCAST
+	help
+	  SNMP service requests are sent as broadcast messages from an
+	  unprivileged port and responded to with unicast messages to the
+	  same port. This makes them hard to firewall properly because connection
+	  tracking doesn't deal with broadcasts. This helper tracks locally
+	  originating SNMP service requests and the corresponding
+	  responses. It relies on correct IP address configuration, specifically
+	  netmask and broadcast address.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config NF_CONNTRACK_PPTP
 	tristate "PPtP protocol support"
-	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NF_CT_PROTO_GRE
 	help
 	  This module adds support for PPTP (Point to Point Tunnelling
@@ -204,8 +285,8 @@ config NF_CONNTRACK_PPTP
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
 config NF_CONNTRACK_SANE
-	tristate "SANE protocol support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && NF_CONNTRACK
+	tristate "SANE protocol support"
+	depends on NETFILTER_ADVANCED
 	help
 	  SANE is a protocol for remote access to scanners as implemented
 	  by the 'saned' daemon. Like FTP, it uses separate control and
@@ -217,8 +298,8 @@ config NF_CONNTRACK_SANE
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
 config NF_CONNTRACK_SIP
-	tristate "SIP protocol support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && NF_CONNTRACK
+	tristate "SIP protocol support"
+	default m if NETFILTER_ADVANCED=n
 	help
 	  SIP is an application-layer control protocol that can establish,
 	  modify, and terminate multimedia sessions (conferences) such as
@@ -230,7 +311,7 @@ config NF_CONNTRACK_SIP
 
 config NF_CONNTRACK_TFTP
 	tristate "TFTP protocol support"
-	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
 	help
 	  TFTP connection tracking helper; this is required depending on how
 	  restrictive your ruleset is.
@@ -240,24 +321,302 @@ config NF_CONNTRACK_TFTP
 
 	  To compile it as a module, choose M here. If unsure, say N.
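All of the helper options above (Amanda through TFTP) wrap modules built on the same registration call, nf_conntrack_helper_register(). A minimal sketch of that shape, loosely modelled on the TFTP helper; the "myproto" name, port and expectation policy values are invented for illustration:

    /* Hypothetical conntrack helper sketch; all names/values invented. */
    #include <linux/module.h>
    #include <linux/netfilter.h>
    #include <net/netfilter/nf_conntrack.h>
    #include <net/netfilter/nf_conntrack_helper.h>
    #include <net/netfilter/nf_conntrack_expect.h>

    static int myproto_help(struct sk_buff *skb, unsigned int protoff,
                            struct nf_conn *ct, enum ip_conntrack_info ctinfo)
    {
            /* A real helper parses the payload here and registers an
             * expectation for the related data connection. */
            return NF_ACCEPT;
    }

    static const struct nf_conntrack_expect_policy myproto_exp_policy = {
            .max_expected   = 1,
            .timeout        = 5 * 60,       /* seconds */
    };

    static struct nf_conntrack_helper myproto_helper __read_mostly = {
            .name                   = "myproto",
            .me                     = THIS_MODULE,
            .help                   = myproto_help,
            .tuple.src.l3num        = AF_INET,
            .tuple.dst.protonum     = IPPROTO_UDP,
            .tuple.src.u.udp.port   = cpu_to_be16(6969),    /* invented */
            .expect_policy          = &myproto_exp_policy,
    };

    static int __init myproto_init(void)
    {
            return nf_conntrack_helper_register(&myproto_helper);
    }

    static void __exit myproto_exit(void)
    {
            nf_conntrack_helper_unregister(&myproto_helper);
    }

    module_init(myproto_init);
    module_exit(myproto_exit);
    MODULE_LICENSE("GPL");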
config NF_CT_NETLINK - tristate 'Connection tracking netlink interface (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK && NETFILTER_NETLINK - depends on NF_CONNTRACK!=y || NETFILTER_NETLINK!=m - depends on NF_NAT=n || NF_NAT + tristate 'Connection tracking netlink interface' + select NETFILTER_NETLINK + default m if NETFILTER_ADVANCED=n help This option enables support for a netlink-based userspace interface +config NF_CT_NETLINK_TIMEOUT + tristate 'Connection tracking timeout tuning via Netlink' + select NETFILTER_NETLINK + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timeout + fine-grain tuning. This allows you to attach specific timeout + policies to flows, instead of using the global timeout policy. + + If unsure, say `N'. + +config NF_CT_NETLINK_HELPER + tristate 'Connection tracking helpers in user-space via Netlink' + select NETFILTER_NETLINK + depends on NF_CT_NETLINK + depends on NETFILTER_NETLINK_QUEUE + depends on NETFILTER_NETLINK_QUEUE_CT + depends on NETFILTER_ADVANCED + help + This option enables the user-space connection tracking helpers + infrastructure. + + If unsure, say `N'. + +config NETFILTER_NETLINK_QUEUE_CT + bool "NFQUEUE integration with Connection Tracking" + default n + depends on NETFILTER_NETLINK_QUEUE + help + If this option is enabled, NFQUEUE can include Connection Tracking + information together with the packet is the enqueued via NFNETLINK. + +config NF_NAT + tristate + +config NF_NAT_NEEDED + bool + depends on NF_NAT + default y + +config NF_NAT_PROTO_DCCP + tristate + depends on NF_NAT && NF_CT_PROTO_DCCP + default NF_NAT && NF_CT_PROTO_DCCP + +config NF_NAT_PROTO_UDPLITE + tristate + depends on NF_NAT && NF_CT_PROTO_UDPLITE + default NF_NAT && NF_CT_PROTO_UDPLITE + +config NF_NAT_PROTO_SCTP + tristate + default NF_NAT && NF_CT_PROTO_SCTP + depends on NF_NAT && NF_CT_PROTO_SCTP + select LIBCRC32C + +config NF_NAT_AMANDA + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_AMANDA + +config NF_NAT_FTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_FTP + +config NF_NAT_IRC + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_IRC + +config NF_NAT_SIP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_SIP + +config NF_NAT_TFTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_TFTP + +config NETFILTER_SYNPROXY + tristate + +endif # NF_CONNTRACK + +config NF_TABLES + select NETFILTER_NETLINK + tristate "Netfilter nf_tables support" + help + nftables is the new packet classification framework that intends to + replace the existing {ip,ip6,arp,eb}_tables infrastructure. It + provides a pseudo-state machine with an extensible instruction-set + (also known as expressions) that the userspace 'nft' utility + (http://www.netfilter.org/projects/nftables) uses to build the + rule-set. It also comes with the generic set infrastructure that + allows you to construct mappings between matchings and actions + for performance lookups. + + To compile it as a module, choose M here. + +config NF_TABLES_INET + depends on NF_TABLES && IPV6 + select NF_TABLES_IPV4 + select NF_TABLES_IPV6 + tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" + help + This option enables support for a mixed IPv4/IPv6 "inet" table. 
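The NFT_* options that follow all build small expression modules registered through nft_register_expr(). A sketch of that shape, assuming the early nf_tables API of this era (the eval signature changed in later releases); the "pktcount" expression is invented, and the unsynchronized counter is for brevity only:

    /* Hypothetical nf_tables expression sketch; names invented. */
    #include <linux/module.h>
    #include <linux/netlink.h>
    #include <linux/netfilter/nf_tables.h>
    #include <net/netfilter/nf_tables.h>

    struct nft_pktcount {
            u64 packets;
    };

    static void nft_pktcount_eval(const struct nft_expr *expr,
                                  struct nft_data data[NFT_REG_MAX + 1],
                                  const struct nft_pktinfo *pkt)
    {
            struct nft_pktcount *priv = nft_expr_priv(expr);

            priv->packets++;        /* count every packet the rule sees */
    }

    static struct nft_expr_type nft_pktcount_type;
    static const struct nft_expr_ops nft_pktcount_ops = {
            .type   = &nft_pktcount_type,
            .size   = NFT_EXPR_SIZE(sizeof(struct nft_pktcount)),
            .eval   = nft_pktcount_eval,
    };

    static struct nft_expr_type nft_pktcount_type __read_mostly = {
            .name   = "pktcount",
            .ops    = &nft_pktcount_ops,
            .owner  = THIS_MODULE,
    };

    static int __init nft_pktcount_init(void)
    {
            return nft_register_expr(&nft_pktcount_type);
    }

    static void __exit nft_pktcount_exit(void)
    {
            nft_unregister_expr(&nft_pktcount_type);
    }

    module_init(nft_pktcount_init);
    module_exit(nft_pktcount_exit);
    MODULE_LICENSE("GPL");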
+ +config NFT_EXTHDR + depends on NF_TABLES + tristate "Netfilter nf_tables IPv6 exthdr module" + help + This option adds the "exthdr" expression that you can use to match + IPv6 extension headers. + +config NFT_META + depends on NF_TABLES + tristate "Netfilter nf_tables meta module" + help + This option adds the "meta" expression that you can use to match and + to set packet metainformation such as the packet mark. + +config NFT_CT + depends on NF_TABLES + depends on NF_CONNTRACK + tristate "Netfilter nf_tables conntrack module" + help + This option adds the "meta" expression that you can use to match + connection tracking information such as the flow state. + +config NFT_RBTREE + depends on NF_TABLES + tristate "Netfilter nf_tables rbtree set module" + help + This option adds the "rbtree" set type (Red Black tree) that is used + to build interval-based sets. + +config NFT_HASH + depends on NF_TABLES + tristate "Netfilter nf_tables hash set module" + help + This option adds the "hash" set type that is used to build one-way + mappings between matchings and actions. + +config NFT_COUNTER + depends on NF_TABLES + tristate "Netfilter nf_tables counter module" + help + This option adds the "counter" expression that you can use to + include packet and byte counters in a rule. + +config NFT_LOG + depends on NF_TABLES + tristate "Netfilter nf_tables log module" + help + This option adds the "log" expression that you can use to log + packets matching some criteria. + +config NFT_LIMIT + depends on NF_TABLES + tristate "Netfilter nf_tables limit module" + help + This option adds the "limit" expression that you can use to + ratelimit rule matchings. + +config NFT_NAT + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables nat module" + help + This option adds the "nat" expression that you can use to perform + typical Network Address Translation (NAT) packet transformations. + +config NFT_QUEUE + depends on NF_TABLES + depends on NETFILTER_XTABLES + depends on NETFILTER_NETLINK_QUEUE + tristate "Netfilter nf_tables queue module" + help + This is required if you intend to use the userspace queueing + infrastructure (also known as NFQUEUE) from nftables. + +config NFT_REJECT + depends on NF_TABLES + default m if NETFILTER_ADVANCED=n + tristate "Netfilter nf_tables reject support" + help + This option adds the "reject" expression that you can use to + explicitly deny and notify via TCP reset/ICMP informational errors + unallowed traffic. + +config NFT_REJECT_INET + depends on NF_TABLES_INET + default NFT_REJECT + tristate + +config NFT_COMPAT + depends on NF_TABLES + depends on NETFILTER_XTABLES + tristate "Netfilter x_tables over nf_tables module" + help + This is required if you intend to use any of existing + x_tables match/target extensions over the nf_tables + framework. + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" + default m if NETFILTER_ADVANCED=n help This is required if you intend to use any of ip_tables, ip6_tables or arp_tables. +if NETFILTER_XTABLES + +comment "Xtables combined modules" + +config NETFILTER_XT_MARK + tristate 'nfmark target and match support' + default m if NETFILTER_ADVANCED=n + ---help--- + This option adds the "MARK" target and "mark" match. + + Netfilter mark matching allows you to match packets based on the + "nfmark" value in the packet. + The target allows you to create rules in the "mangle" table which alter + the netfilter mark (nfmark) field associated with the packet. 
+
+	  Prior to routing, the nfmark can influence the routing method (see
+	  "Use netfilter MARK value as routing key") and can also be used by
+	  other subsystems to change their behavior.
+
+config NETFILTER_XT_CONNMARK
+	tristate 'ctmark target and match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_MARK
+	---help---
+	  This option adds the "CONNMARK" target and "connmark" match.
+
+	  Netfilter allows you to store a mark value per connection (a.k.a.
+	  ctmark), similarly to the packet mark (nfmark). Using this
+	  target and match, you can set and match on this mark.
+
+config NETFILTER_XT_SET
+	tristate 'set target and match support'
+	depends on IP_SET
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds the "SET" target and "set" match.
+
+	  Using this target and match, you can add/delete and match
+	  elements in the sets created by ipset(8).
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 # alphabetically ordered list of targets
+comment "Xtables targets"
+
+config NETFILTER_XT_TARGET_AUDIT
+	tristate "AUDIT target support"
+	depends on AUDIT
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds an 'AUDIT' target, which can be used to create
+	  audit records for packets dropped/accepted.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the
+	  iptables mangle table.
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum. This is particularly useful
+	  if you need to work around old applications such as dhcp clients
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
-	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `CLASSIFY' target, which enables the user to set
 	  the priority of a packet. Some qdiscs can use this value for
@@ -269,47 +628,162 @@ config NETFILTER_XT_TARGET_CLASSIFY
 
 config NETFILTER_XT_TARGET_CONNMARK
 	tristate '"CONNMARK" target support'
-	depends on NETFILTER_XTABLES
-	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NF_CONNTRACK
-	select NF_CONNTRACK_MARK
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_CONNMARK
+	---help---
+	  This is a backwards-compat option for the user's convenience
+	  (e.g. when running oldconfig). It selects
+	  CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_TARGET_CONNSECMARK
+	tristate '"CONNSECMARK" target support'
+	depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK
+	default m if NETFILTER_ADVANCED=n
 	help
-	  This option adds a `CONNMARK' target, which allows one to manipulate
-	  the connection mark value. Similar to the MARK target, but
-	  affects the connection mark value rather than the packet mark value.
-
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/kbuild/modules.txt>. The module will be called
-	  ipt_CONNMARK.ko. If unsure, say `N'.
+	  The CONNSECMARK target copies security markings from packets
+	  to connections, and restores security markings from connections
+	  to packets (if the packets are not already marked). This would
+	  normally be used in conjunction with the SECMARK target.
+ + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_CT + tristate '"CT" target support' + depends on NF_CONNTRACK + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + help + This options adds a `CT' target, which allows to specify initial + connection tracking parameters like events to be delivered and + the helper to be used. + + To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_DSCP - tristate '"DSCP" target support' - depends on NETFILTER_XTABLES + tristate '"DSCP" and "TOS" target support' depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED help This option adds a `DSCP' target, which allows you to manipulate the IPv4/IPv6 header DSCP field (differentiated services codepoint). The DSCP field can have any value between 0x0 and 0x3f inclusive. + It also adds the "TOS" target, which allows you to create rules in + the "mangle" table which alter the Type Of Service field of an IPv4 + or the Priority field of an IPv6 packet, prior to routing. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_HL + tristate '"HL" hoplimit target support' + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HL" (for IPv6) and "TTL" (for IPv4) + targets, which enable the user to change the + hoplimit/time-to-live value of the IP header. + + While it is safe to decrement the hoplimit/TTL value, the + modules also allow to increment and set the hoplimit value of + the header to arbitrary values. This is EXTREMELY DANGEROUS + since you can easily create immortal packets that loop + forever on the network. + +config NETFILTER_XT_TARGET_HMARK + tristate '"HMARK" target support' + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HMARK" target. + + The target allows you to create rules in the "raw" and "mangle" tables + which set the skbuff mark by means of hash calculation within a given + range. The nfmark can influence the routing method (see "Use netfilter + MARK value as routing key") and can also be used by other subsystems to + change their behaviour. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_IDLETIMER + tristate "IDLETIMER target support" + depends on NETFILTER_ADVANCED + help + + This option adds the `IDLETIMER' target. Each matching packet + resets the timer associated with label specified when the rule is + added. When the timer expires, it triggers a sysfs notification. + The remaining time for expiration can be read via sysfs. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_LED + tristate '"LED" target support' + depends on LEDS_CLASS && LEDS_TRIGGERS + depends on NETFILTER_ADVANCED + help + This option adds a `LED' target, which allows you to blink LEDs in + response to particular packets passing through your machine. + + This can be used to turn a spare LED into a network activity LED, + which only flashes in response to FTP transfers, for example. Or + you could have an LED which lights up for a minute or two every time + somebody connects to your machine via SSH. + + You will need support for the "led" class to make this work. 
+ + To create an LED trigger for incoming SSH traffic: + iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000 + + Then attach the new trigger to an LED on your system: + echo netfilter-ssh > /sys/class/leds/<ledname>/trigger + + For more information on the LEDs available on your system, see + Documentation/leds/leds-class.txt + +config NETFILTER_XT_TARGET_LOG + tristate "LOG target support" + default m if NETFILTER_ADVANCED=n + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). + +config NETFILTER_XT_TARGET_NETMAP + tristate '"NETMAP" target support' + depends on NF_NAT + ---help--- + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_NFLOG + tristate '"NFLOG" target support' + default m if NETFILTER_ADVANCED=n + select NETFILTER_NETLINK_LOG help - This option adds a `MARK' target, which allows you to create rules - in the `mangle' table which alter the netfilter mark (nfmark) field - associated with the packet prior to routing. This can change - the routing method (see `Use netfilter MARK value as routing - key') and can also be used by other subsystems to change their - behavior. + This option enables the NFLOG target, which allows to LOG + messages through nfnetlink_log. To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_NFQUEUE tristate '"NFQUEUE" target Support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_QUEUE help This target replaced the old obsolete QUEUE target. @@ -318,54 +792,87 @@ config NETFILTER_XT_TARGET_NFQUEUE To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_NFLOG - tristate '"NFLOG" target support' - depends on NETFILTER_XTABLES +config NETFILTER_XT_TARGET_NOTRACK + tristate '"NOTRACK" target support (DEPRECATED)' + depends on NF_CONNTRACK + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_CT + +config NETFILTER_XT_TARGET_RATEEST + tristate '"RATEEST" target support' + depends on NETFILTER_ADVANCED help - This option enables the NFLOG target, which allows to LOG - messages through the netfilter logging API, which can use - either the old LOG target, the old ULOG target or nfnetlink_log - as backend. + This option adds a `RATEEST' target, which allows to measure + rates similar to TC estimators. The `rateest' match can be + used to match on the measured rates. To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_NOTRACK - tristate '"NOTRACK" target support' +config NETFILTER_XT_TARGET_REDIRECT + tristate "REDIRECT target support" + depends on NF_NAT + ---help--- + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. 
This is + useful for transparent proxies. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TEE + tristate '"TEE" - packet cloning to alternate destination' + depends on NETFILTER_ADVANCED + depends on (IPV6 || IPV6=n) + depends on !NF_CONNTRACK || NF_CONNTRACK + ---help--- + This option adds a "TEE" target with which a packet can be cloned and + this clone be rerouted to another nexthop. + +config NETFILTER_XT_TARGET_TPROXY + tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + help + This option adds a `TPROXY' target, which is somewhat similar to + REDIRECT. It can only be used in the mangle table and is useful + to redirect traffic to a transparent proxy. It does _not_ depend + on Netfilter connection tracking and NAT, unlike REDIRECT. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TRACE + tristate '"TRACE" target support' depends on IP_NF_RAW || IP6_NF_RAW - depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help - The NOTRACK target allows a select rule to specify - which packets *not* to enter the conntrack/NAT - subsystem with all the consequences (no ICMP error tracking, - no protocol helpers for the selected packets). - + The TRACE target allows you to mark packets so that the kernel + will log every rule which match the packets as those traverse + the tables, chains, rules. + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. config NETFILTER_XT_TARGET_SECMARK tristate '"SECMARK" target support' - depends on NETFILTER_XTABLES && NETWORK_SECMARK + depends on NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n help The SECMARK target allows security marking of network packets, for use with security subsystems. To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_CONNSECMARK - tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK - help - The CONNSECMARK target copies security markings from packets - to connections, and restores security markings from connections - to packets (if the packets are not already marked). This would - normally be used in conjunction with the SECMARK target. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on NETFILTER_XTABLES && (IPV6 || IPV6=n) + depends on (IPV6 || IPV6=n) + default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the MSS value of TCP SYN packets, to control the maximum size for that @@ -389,9 +896,66 @@ config NETFILTER_XT_TARGET_TCPMSS To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_TCPOPTSTRIP + tristate '"TCPOPTSTRIP" target support' + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option adds a "TCPOPTSTRIP" target, which allows you to strip + TCP options from TCP packets. 
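Each target option above corresponds to a module built around xt_register_target(). A minimal sketch of that pattern; the "MYTGT" name and the fixed-mark action are invented, while the signatures are those used by x_tables in this kernel generation:

    /* Hypothetical Xtables target sketch; names and action invented. */
    #include <linux/module.h>
    #include <linux/skbuff.h>
    #include <linux/netfilter/x_tables.h>

    static unsigned int mytgt_tg(struct sk_buff *skb,
                                 const struct xt_action_param *par)
    {
            skb->mark = 0x1;        /* trivial action: set a fixed nfmark */
            return XT_CONTINUE;     /* keep traversing the ruleset */
    }

    static struct xt_target mytgt_tg_reg __read_mostly = {
            .name           = "MYTGT",
            .revision       = 0,
            .family         = NFPROTO_UNSPEC,
            .target         = mytgt_tg,
            .me             = THIS_MODULE,
    };

    static int __init mytgt_tg_init(void)
    {
            return xt_register_target(&mytgt_tg_reg);
    }

    static void __exit mytgt_tg_exit(void)
    {
            xt_unregister_target(&mytgt_tg_reg);
    }

    module_init(mytgt_tg_init);
    module_exit(mytgt_tg_exit);
    MODULE_LICENSE("GPL");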
+ +# alphabetically ordered list of matches + +comment "Xtables matches" + +config NETFILTER_XT_MATCH_ADDRTYPE + tristate '"addrtype" address type match support' + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_BPF + tristate '"bpf" match support' + depends on NETFILTER_ADVANCED + help + BPF matching applies a linux socket filter to each packet and + accepts those for which the filter returns non-zero. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_CGROUP + tristate '"control group" match support' + depends on NETFILTER_ADVANCED + depends on CGROUPS + select CGROUP_NET_CLASSID + ---help--- + Socket/process control group matching allows you to match locally + generated packets based on which net_cls control group processes + belong to. + +config NETFILTER_XT_MATCH_CLUSTER + tristate '"cluster" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to build work-load-sharing clusters of + network servers/stateful firewalls without having a dedicated + load-balancing router/server/switch. Basically, this match returns + true when the packet must be handled by this cluster node. Thus, + all nodes see all packets and this match decides which node handles + what packets. The work-load sharing algorithm is based on source + address hashing. + + If you say Y or M here, try `iptables -m cluster --help` for + more information. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `comment' dummy-match, which allows you to put comments in your iptables ruleset. @@ -401,9 +965,8 @@ config NETFILTER_XT_MATCH_COMMENT config NETFILTER_XT_MATCH_CONNBYTES tristate '"connbytes" per-connection counter match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK - select NF_CT_ACCT + depends on NETFILTER_ADVANCED help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -411,23 +974,41 @@ config NETFILTER_XT_MATCH_CONNBYTES If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_CONNLABEL + tristate '"connlabel" match support' + select NF_CONNTRACK_LABELS + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This match allows you to test and assign userspace-defined labels names + to a connection. The kernel only stores bit values - mapping + names to bits is done by userspace. + + Unlike connmark, more than 32 flag bits may be assigned to a + connection simultaneously. + +config NETFILTER_XT_MATCH_CONNLIMIT + tristate '"connlimit" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This match allows you to match against the number of parallel + connections to a server per client IP address (or address block). 
+
 config NETFILTER_XT_MATCH_CONNMARK
 	tristate '"connmark" connection mark match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
-	select NF_CONNTRACK_MARK
-	help
-	  This option adds a `connmark' match, which allows you to match the
-	  connection mark value previously set for the session by `CONNMARK'.
-
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/kbuild/modules.txt>. The module will be called
-	  ipt_connmark.ko. If unsure, say `N'.
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_CONNMARK
+	---help---
+	  This is a backwards-compat option for the user's convenience
+	  (e.g. when running oldconfig). It selects
+	  CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
 
 config NETFILTER_XT_MATCH_CONNTRACK
 	tristate '"conntrack" connection tracking match support'
-	depends on NETFILTER_XTABLES
 	depends on NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
 	help
 	  This is a general conntrack match module, a superset of the state match.
@@ -437,9 +1018,19 @@ config NETFILTER_XT_MATCH_CONNTRACK
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NETFILTER_XT_MATCH_CPU
+	tristate '"cpu" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  CPU matching allows you to match packets based on the CPU
+	  currently handling the packet.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_MATCH_DCCP
-	tristate '"DCCP" protocol match support'
-	depends on NETFILTER_XTABLES
+	tristate '"dccp" protocol match support'
+	depends on NETFILTER_ADVANCED
+	default IP_DCCP
 	help
 	  With this option enabled, you will be able to use the iptables
 	  `dccp' match in order to match on DCCP source/destination ports
 	  and DCCP flags.
 
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_DEVGROUP
+	tristate '"devgroup" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `devgroup' match, which allows you to match on
+	  the device group a network device is assigned to.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_MATCH_DSCP
-	tristate '"DSCP" match support'
-	depends on NETFILTER_XTABLES
+	tristate '"dscp" and "tos" match support'
+	depends on NETFILTER_ADVANCED
 	help
 	  This option adds a `DSCP' match, which allows you to match against
 	  the IPv4/IPv6 header DSCP field (differentiated services codepoint).
 	  The DSCP field can have any value between 0x0 and 0x3f inclusive.
 
+	  It will also add a "tos" match, which allows you to match packets
+	  based on the Type Of Service fields of the IPv4 packet (which share
+	  the same bits as DSCP).
+
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NETFILTER_XT_MATCH_ECN
+	tristate '"ecn" match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds an "ECN" match, which allows you to match against
+	  the IPv4 and TCP header ECN fields.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_MATCH_ESP
-	tristate '"ESP" match support'
-	depends on NETFILTER_XTABLES
+	tristate '"esp" match support'
+	depends on NETFILTER_ADVANCED
 	help
 	  This match extension allows you to match a range of SPIs
 	  inside ESP header of IPSec packets.
 
 	  To compile it as a module, choose M here. If unsure, say N.
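The match options in this list follow the mirror-image pattern, xt_register_match(), where the callback returns a boolean verdict instead of performing an action. A sketch with an invented "pktlen" match and option struct:

    /* Hypothetical Xtables match sketch; "pktlen" is invented. */
    #include <linux/module.h>
    #include <linux/skbuff.h>
    #include <linux/types.h>
    #include <linux/netfilter/x_tables.h>

    struct xt_pktlen_info {
            __u32 min, max;         /* filled in by the userspace option parser */
    };

    static bool pktlen_mt(const struct sk_buff *skb, struct xt_action_param *par)
    {
            const struct xt_pktlen_info *info = par->matchinfo;

            return skb->len >= info->min && skb->len <= info->max;
    }

    static struct xt_match pktlen_mt_reg __read_mostly = {
            .name           = "pktlen",
            .revision       = 0,
            .family         = NFPROTO_UNSPEC,
            .match          = pktlen_mt,
            .matchsize      = sizeof(struct xt_pktlen_info),
            .me             = THIS_MODULE,
    };

    static int __init pktlen_mt_init(void)
    {
            return xt_register_match(&pktlen_mt_reg);
    }

    static void __exit pktlen_mt_exit(void)
    {
            xt_unregister_match(&pktlen_mt_reg);
    }

    module_init(pktlen_mt_init);
    module_exit(pktlen_mt_exit);
    MODULE_LICENSE("GPL");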
+config NETFILTER_XT_MATCH_HASHLIMIT + tristate '"hashlimit" match support' + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + help + This option adds a `hashlimit' match. + + As opposed to `limit', this match dynamically creates a hash table + of limit buckets, based on your selection of source/destination + addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination address' or `500pps from any given source address' + with a single rule. + config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp To compile it as a module, choose M here. If unsure, say Y. +config NETFILTER_XT_MATCH_HL + tristate '"hl" hoplimit/TTL match support' + depends on NETFILTER_ADVANCED + ---help--- + HL matching allows you to match packets based on the hoplimit + in the IPv6 header, or the time-to-live field in the IPv4 + header of the packet. + +config NETFILTER_XT_MATCH_IPCOMP + tristate '"ipcomp" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of CPIs(16 bits) + inside IPComp header of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_IPRANGE + tristate '"iprange" address range match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "iprange" match, which allows you to match based on + an IP address range. (Normal iptables only matches on single addresses + with an optional mask.) + + If unsure, say M. + +config NETFILTER_XT_MATCH_IPVS + tristate '"ipvs" match support' + depends on IP_VS + depends on NETFILTER_ADVANCED + depends on NF_CONNTRACK + help + This option allows you to match against IPVS properties of a packet. + + If unsure, say N. + +config NETFILTER_XT_MATCH_L2TP + tristate '"l2tp" match support' + depends on NETFILTER_ADVANCED + default L2TP + ---help--- + This option adds an "L2TP" match, which allows you to match against + L2TP protocol header fields. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option allows you to match the length of a packet against a specific value or range of values. @@ -489,7 +1164,7 @@ config NETFILTER_XT_MATCH_LENGTH config NETFILTER_XT_MATCH_LIMIT tristate '"limit" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help limit matching allows you to control the rate at which a rule can be matched: mainly useful in combination with the LOG target ("LOG @@ -499,7 +1174,7 @@ config NETFILTER_XT_MATCH_LIMIT config NETFILTER_XT_MATCH_MAC tristate '"mac" address match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help MAC matching allows you to match packets based on the source Ethernet address of the packet. @@ -508,17 +1183,58 @@ config NETFILTER_XT_MATCH_MAC config NETFILTER_XT_MATCH_MARK tristate '"mark" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). 
+ +config NETFILTER_XT_MATCH_MULTIPORT + tristate '"multiport" Multiple port match support' + depends on NETFILTER_ADVANCED help - Netfilter mark matching allows you to match packets based on the - `nfmark' value in the packet. This can be set by the MARK target - (see below). + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_NFACCT + tristate '"nfacct" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_ACCT + help + This option allows you to use the extended accounting through + nfnetlink_acct. To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_OSF + tristate '"osf" Passive OS fingerprint match' + depends on NETFILTER_ADVANCED && NETFILTER_NETLINK + help + This option selects the Passive OS Fingerprinting match module + that allows to passively match the remote operating system by + analyzing incoming TCP SYN packets. + + Rules and loading software can be downloaded from + http://www.ioremap.net/projects/osf + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_OWNER + tristate '"owner" match support' + depends on NETFILTER_ADVANCED + ---help--- + Socket owner matching allows you to match locally-generated packets + based on who created the socket: the user or group. It is also + possible to check whether a socket actually exists. + config NETFILTER_XT_MATCH_POLICY tristate 'IPsec "policy" match support' - depends on NETFILTER_XTABLES && XFRM + depends on XFRM + default m if NETFILTER_ADVANCED=n help Policy matching allows you to match packets based on the IPsec policy that was used during decapsulation/will @@ -526,19 +1242,10 @@ config NETFILTER_XT_MATCH_POLICY To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_MATCH_MULTIPORT - tristate "Multiple port match support" - depends on NETFILTER_XTABLES - help - Multiport matching allows you to match TCP or UDP packets based on - a series of source or destination ports: normally a rule can only - match a single range of ports. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' - depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER + depends on BRIDGE && BRIDGE_NETFILTER + depends on NETFILTER_ADVANCED help Physdev packet matching matches against the physical bridge ports the IP packet arrived on or will leave by. @@ -547,7 +1254,7 @@ config NETFILTER_XT_MATCH_PHYSDEV config NETFILTER_XT_MATCH_PKTTYPE tristate '"pkttype" packet type match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help Packet type matching allows you to match a packet by its "class", eg. BROADCAST, MULTICAST, ... @@ -559,7 +1266,7 @@ config NETFILTER_XT_MATCH_PKTTYPE config NETFILTER_XT_MATCH_QUOTA tristate '"quota" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `quota' match, which allows to match on a byte counter. @@ -567,23 +1274,44 @@ config NETFILTER_XT_MATCH_QUOTA If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. 
+config NETFILTER_XT_MATCH_RATEEST + tristate '"rateest" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_RATEEST + help + This option adds a `rateest' match, which allows to match on the + rate estimated by the RATEEST target. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' - depends on NETFILTER_XTABLES - select NET_CLS_ROUTE + depends on NETFILTER_ADVANCED + select IP_ROUTE_CLASSID help This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. - + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option in tc world. - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_RECENT + tristate '"recent" match support' + depends on NETFILTER_ADVANCED + ---help--- + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: <http://snowman.net/projects/ipt_recent/> + config NETFILTER_XT_MATCH_SCTP - tristate '"sctp" protocol match support (EXPERIMENTAL)' - depends on NETFILTER_XTABLES && EXPERIMENTAL + tristate '"sctp" protocol match support' + depends on NETFILTER_ADVANCED + default IP_SCTP help With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports @@ -592,10 +1320,26 @@ config NETFILTER_XT_MATCH_SCTP If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_SOCKET + tristate '"socket" match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + depends on !NF_CONNTRACK || NF_CONNTRACK + depends on (IPV6 || IPV6=n) + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + help + This option adds a `socket' match, which can be used to match + packets for which a TCP or UDP socket lookup finds a valid socket. + It can be used in combination with the MARK target and policy + routing to implement full featured non-locally bound sockets. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_STATE tristate '"state" match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This @@ -605,7 +1349,7 @@ config NETFILTER_XT_MATCH_STATE config NETFILTER_XT_MATCH_STATISTIC tristate '"statistic" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `statistic' match, which allows you to match on packets periodically or randomly with a given percentage. 
@@ -614,7 +1358,7 @@ config NETFILTER_XT_MATCH_STATISTIC config NETFILTER_XT_MATCH_STRING tristate '"string" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP select TEXTSEARCH_BM @@ -627,7 +1371,7 @@ config NETFILTER_XT_MATCH_STRING config NETFILTER_XT_MATCH_TCPMSS tristate '"tcpmss" match support' - depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `tcpmss' match, which allows you to examine the MSS value of TCP SYN packets, which control the maximum packet size @@ -635,19 +1379,37 @@ config NETFILTER_XT_MATCH_TCPMSS To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_MATCH_HASHLIMIT - tristate '"hashlimit" match support' - depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) - help - This option adds a `hashlimit' match. +config NETFILTER_XT_MATCH_TIME + tristate '"time" match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "time" match, which allows you to match based on + the packet arrival time (at the machine which netfilter is running) + on) or departure time/date (for locally generated packets). - As opposed to `limit', this match dynamically creates a hash table - of limit buckets, based on your selection of source/destination - addresses and/or ports. + If you say Y here, try `iptables -m time --help` for + more information. - It enables you to express policies like `10kpps for any given - destination address' or `500pps from any given source address' - with a single rule. + If you want to compile it as a module, say M here. + If unsure, say N. + +config NETFILTER_XT_MATCH_U32 + tristate '"u32" match support' + depends on NETFILTER_ADVANCED + ---help--- + u32 allows you to extract quantities of up to 4 bytes from a packet, + AND them with specified masks, shift them by specified amounts and + test whether the results are in any of a set of specified ranges. + The specification of what to extract is general enough to skip over + headers with lengths stored in the packet, as in IP or TCP header + lengths. + + Details and examples are in the kernel module source. 
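In plain C, the per-packet test that the u32 help text describes amounts to the following standalone sketch (illustrative only, not the xt_u32 module's code):

    /* Standalone sketch of the u32-style test: fetch 4 bytes at an
     * offset, AND with a mask, shift, then range-check the result. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    static bool u32_style_test(const uint8_t *pkt, size_t len,
                               uint32_t at, uint32_t mask, uint8_t rshift,
                               uint32_t lo, uint32_t hi)
    {
            uint32_t word;

            if (at + sizeof(word) > len)
                    return false;           /* offset beyond packet end */
            memcpy(&word, pkt + at, sizeof(word));
            word = ntohl(word);             /* packet data is big-endian */
            word = (word & mask) >> rshift;
            return word >= lo && word <= hi;
    }

For instance, at=0, mask=0x0f000000 and rshift=24 would extract the IPv4 IHL field for a range test.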
+ +endif # NETFILTER_XTABLES endmenu +source "net/netfilter/ipset/Kconfig" + +source "net/netfilter/ipvs/Kconfig" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index b2b5c7566b2..bffdad774da 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,12 +1,17 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o obj-$(CONFIG_NETFILTER) = netfilter.o -obj-$(CONFIG_SYSCTL) += nf_sysctl.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o +nfnetlink_queue-y := nfnetlink_queue_core.o +nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o @@ -14,11 +19,15 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o # SCTP protocol connection tracking +obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o +obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o # netlink interface for nf_conntrack obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o +obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o +obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o # connection tracking helpers nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o @@ -27,49 +36,136 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o +obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o +obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o +nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ + nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o + +obj-$(CONFIG_NF_NAT) += nf_nat.o + +# NAT protocols (nf_nat) +obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o +obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o +obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o + +# NAT helpers +obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o +obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o +obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o +obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o +obj-$(CONFIG_NF_NAT_TFTP) += 
nf_nat_tftp.o + +# SYNPROXY +obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o + +# nf_tables +nf_tables-objs += nf_tables_core.o nf_tables_api.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o + +obj-$(CONFIG_NF_TABLES) += nf_tables.o +obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o +obj-$(CONFIG_NFT_COMPAT) += nft_compat.o +obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o +obj-$(CONFIG_NFT_META) += nft_meta.o +obj-$(CONFIG_NFT_CT) += nft_ct.o +obj-$(CONFIG_NFT_LIMIT) += nft_limit.o +obj-$(CONFIG_NFT_NAT) += nft_nat.o +obj-$(CONFIG_NFT_QUEUE) += nft_queue.o +obj-$(CONFIG_NFT_REJECT) += nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o +obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o +obj-$(CONFIG_NFT_HASH) += nft_hash.o +obj-$(CONFIG_NFT_COUNTER) += nft_counter.o +obj-$(CONFIG_NFT_LOG) += nft_log.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o +# combos +obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o +obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o +obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o +obj-$(CONFIG_NF_NAT) += xt_nat.o + # targets +obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o -obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o -obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o -obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o -obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o +obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o -obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o # matches +obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o -obj-$(CONFIG_NETFILTER_XT_MATCH_CONNMARK) += xt_connmark.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_ECN) += xt_ecn.o obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) 
+= xt_esp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o -obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o -obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o +obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o -obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o -obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o +obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o + +# ipset +obj-$(CONFIG_IP_SET) += ipset/ + +# IPVS +obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a84478ee2de..1fbab0cdd30 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -5,6 +5,7 @@ * way. * * Rusty Russell (C)2000 -- This code is GPL. 
+ * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/netfilter.h> @@ -19,60 +20,68 @@ #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <net/net_namespace.h> #include <net/sock.h> #include "nf_internals.h" static DEFINE_MUTEX(afinfo_mutex); -struct nf_afinfo *nf_afinfo[NPROTO] __read_mostly; +const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); +const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_ipv6_ops); -int nf_register_afinfo(struct nf_afinfo *afinfo) +int nf_register_afinfo(const struct nf_afinfo *afinfo) { int err; err = mutex_lock_interruptible(&afinfo_mutex); if (err < 0) return err; - rcu_assign_pointer(nf_afinfo[afinfo->family], afinfo); + RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo); mutex_unlock(&afinfo_mutex); return 0; } EXPORT_SYMBOL_GPL(nf_register_afinfo); -void nf_unregister_afinfo(struct nf_afinfo *afinfo) +void nf_unregister_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); - rcu_assign_pointer(nf_afinfo[afinfo->family], NULL); + RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL); mutex_unlock(&afinfo_mutex); synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -/* In this code, we can be waiting indefinitely for userspace to - * service a packet if a hook returns NF_QUEUE. We could keep a count - * of skbuffs queued for userspace, and not deregister a hook unless - * this is zero, but that sucks. Now, we simply check when the - * packets come back: if the hook is gone, the packet is discarded. */ -struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS] __read_mostly; +struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); + +#if defined(CONFIG_JUMP_LABEL) +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +EXPORT_SYMBOL(nf_hooks_needed); +#endif + static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { - struct list_head *i; + struct nf_hook_ops *elem; int err; err = mutex_lock_interruptible(&nf_hook_mutex); if (err < 0) return err; - list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { - if (reg->priority < ((struct nf_hook_ops *)i)->priority) + list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + if (reg->priority < elem->priority) break; } - list_add_rcu(®->list, i->prev); + list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); +#if defined(CONFIG_JUMP_LABEL) + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif return 0; } EXPORT_SYMBOL(nf_register_hook); @@ -82,7 +91,9 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); - +#if defined(CONFIG_JUMP_LABEL) + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif synchronize_net(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -108,19 +119,17 @@ EXPORT_SYMBOL(nf_register_hooks); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { - unsigned int i; - - for (i = 0; i < n; i++) - nf_unregister_hook(®[i]); + while (n-- > 0) + nf_unregister_hook(®[n]); } EXPORT_SYMBOL(nf_unregister_hooks); unsigned int nf_iterate(struct list_head *head, - struct sk_buff **skb, - int hook, + struct sk_buff *skb, + unsigned int hook, const struct net_device *indev, const struct net_device *outdev, - struct list_head **i, + struct nf_hook_ops **elemp, int (*okfn)(struct sk_buff *), int hook_thresh) { @@ -130,27 
+139,26 @@ unsigned int nf_iterate(struct list_head *head, * The caller must not block between calls to this * function because of risk of continuing from deleted element. */ - list_for_each_continue_rcu(*i, head) { - struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; - - if (hook_thresh > elem->priority) + list_for_each_entry_continue_rcu((*elemp), head, list) { + if (hook_thresh > (*elemp)->priority) continue; /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ - verdict = elem->hook(hook, skb, indev, outdev, okfn); +repeat: + verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - elem->hook, hook); + (*elemp)->hook, hook); continue; } #endif if (verdict != NF_REPEAT) return verdict; - *i = (*i)->prev; + goto repeat; } } return NF_ACCEPT; @@ -159,97 +167,80 @@ unsigned int nf_iterate(struct list_head *head, /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */ -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, +int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int hook_thresh) { - struct list_head *elem; + struct nf_hook_ops *elem; unsigned int verdict; int ret = 0; /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = &nf_hooks[pf][hook]; + elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list); next_hook: - verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, + verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, outdev, &elem, okfn, hook_thresh); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; - goto unlock; - } else if (verdict == NF_DROP) { - kfree_skb(*pskb); - ret = -EPERM; - } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - NFDEBUG("nf_hook: Verdict = QUEUE.\n"); - if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn, - verdict >> NF_VERDICT_BITS)) - goto next_hook; + } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { + kfree_skb(skb); + ret = NF_DROP_GETERR(verdict); + if (ret == 0) + ret = -EPERM; + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { + int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_QBITS); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } } -unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_hook_slow); -int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) +int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) { - struct sk_buff *nskb; - - if (writable_len > (*pskb)->len) + if (writable_len > skb->len) return 0; /* Not exclusive use of packet? Must copy. 
*/ - if (skb_shared(*pskb) || skb_cloned(*pskb)) - goto copy_skb; - - return pskb_may_pull(*pskb, writable_len); - -copy_skb: - nskb = skb_copy(*pskb, GFP_ATOMIC); - if (!nskb) - return 0; - BUG_ON(skb_is_nonlinear(nskb)); - - /* Rest of kernel will get very unhappy if we pass it a - suddenly-orphaned skbuff */ - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - return 1; + if (!skb_cloned(skb)) { + if (writable_len <= skb_headlen(skb)) + return 1; + } else if (skb_clone_writable(skb, writable_len)) + return 1; + + if (writable_len <= skb_headlen(skb)) + writable_len = 0; + else + writable_len -= skb_headlen(skb); + + return !!__pskb_pull_tail(skb, writable_len); } EXPORT_SYMBOL(skb_make_writable); -void nf_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) -{ - __be32 diff[] = { ~from, to }; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial((char *)diff, sizeof(diff), - ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial((char *)diff, sizeof(diff), - ~skb->csum); - } else if (pseudohdr) - *sum = ~csum_fold(csum_partial((char *)diff, sizeof(diff), - csum_unfold(*sum))); -} -EXPORT_SYMBOL(nf_proto_csum_replace4); - -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) + __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { - void (*attach)(struct sk_buff *, struct sk_buff *); + void (*attach)(struct sk_buff *, const struct sk_buff *); if (skb->nfct) { rcu_read_lock(); @@ -261,7 +252,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) } EXPORT_SYMBOL(nf_ct_attach); -void (*nf_ct_destroy)(struct nf_conntrack *); +void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly; EXPORT_SYMBOL(nf_ct_destroy); void nf_conntrack_destroy(struct nf_conntrack *nfct) @@ -275,29 +266,65 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_unlock(); } EXPORT_SYMBOL(nf_conntrack_destroy); + +struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfq_ct_hook); + +struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); + #endif /* CONFIG_NF_CONNTRACK */ +#ifdef CONFIG_NF_NAT_NEEDED +void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); +EXPORT_SYMBOL(nf_nat_decode_session_hook); +#endif + +static int __net_init netfilter_net_init(struct net *net) +{ #ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_net_netfilter; -EXPORT_SYMBOL(proc_net_netfilter); + net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", + net->proc_net); + if (!net->nf.proc_netfilter) { + if (!net_eq(net, &init_net)) + pr_err("cannot create netfilter proc entry"); + + return -ENOMEM; + } #endif + return 0; +} -void __init netfilter_init(void) +static void __net_exit netfilter_net_exit(struct net *net) { - int i, h; - for (i = 0; i < NPROTO; i++) { + remove_proc_entry("netfilter", net->proc_net); +} + +static struct pernet_operations 
netfilter_net_ops = { + .init = netfilter_net_init, + .exit = netfilter_net_exit, +}; + +int __init netfilter_init(void) +{ + int i, h, ret; + + for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); } -#ifdef CONFIG_PROC_FS - proc_net_netfilter = proc_mkdir("netfilter", proc_net); - if (!proc_net_netfilter) - panic("cannot create netfilter proc entry"); -#endif + ret = register_pernet_subsys(&netfilter_net_ops); + if (ret < 0) + goto err; - if (netfilter_queue_init() < 0) - panic("cannot initialize nf_queue"); - if (netfilter_log_init() < 0) - panic("cannot initialize nf_log"); + ret = netfilter_log_init(); + if (ret < 0) + goto err_pernet; + + return 0; +err_pernet: + unregister_pernet_subsys(&netfilter_net_ops); +err: + return ret; } diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig new file mode 100644 index 00000000000..2f7f5c32c6f --- /dev/null +++ b/net/netfilter/ipset/Kconfig @@ -0,0 +1,159 @@ +menuconfig IP_SET + tristate "IP set support" + depends on INET && NETFILTER + select NETFILTER_NETLINK + help + This option adds IP set support to the kernel. + In order to define and use the sets, you need the userspace utility + ipset(8). You can use the sets in netfilter via the "set" match + and "SET" target. + + To compile it as a module, choose M here. If unsure, say N. + +if IP_SET + +config IP_SET_MAX + int "Maximum number of IP sets" + default 256 + range 2 65534 + depends on IP_SET + help + Here you can define the default value of the maximum number + of IP sets for the kernel. + + The value can be overridden by the 'max_sets' module + parameter of the 'ip_set' module. + +config IP_SET_BITMAP_IP + tristate "bitmap:ip set support" + depends on IP_SET + help + This option adds the bitmap:ip set type support, by which one + can store IPv4 addresses (or network addresses) from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_BITMAP_IPMAC + tristate "bitmap:ip,mac set support" + depends on IP_SET + help + This option adds the bitmap:ip,mac set type support, by which one + can store IPv4 address and (source) MAC address pairs from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_BITMAP_PORT + tristate "bitmap:port set support" + depends on IP_SET + help + This option adds the bitmap:port set type support, by which one + can store TCP/UDP port numbers from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IP + tristate "hash:ip set support" + depends on IP_SET + help + This option adds the hash:ip set type support, by which one + can store arbitrary IPv4 or IPv6 addresses (or network addresses) + in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPMARK + tristate "hash:ip,mark set support" + depends on IP_SET + help + This option adds the hash:ip,mark set type support, by which one + can store IPv4/IPv6 address and mark pairs. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORT + tristate "hash:ip,port set support" + depends on IP_SET + help + This option adds the hash:ip,port set type support, by which one + can store IPv4/IPv6 address and protocol/port pairs. + + To compile it as a module, choose M here. If unsure, say N.
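Before the remaining hash and list types, a short aside on how the three bitmap types above store their members: each set covers a fixed, contiguous range and keeps one bit per element, so a lookup is just a mask, a subtraction and a division. The userspace sketch below is illustrative only and not part of the patch; it mirrors the ip_to_id() helper that appears in ip_set_bitmap_ip.c further down, where the kernel derives the mask with ip_set_hostmask() and precomputes "hosts", the number of addresses covered by one bit, at set-creation time.

/* Illustration only -- a userspace analogue of the kernel's ip_to_id().
 * An address inside [first_ip, last_ip] maps to a bit index by masking
 * it to its netmask-sized block, subtracting the range start and
 * dividing by "hosts", the number of addresses each bit represents. */
#include <stdio.h>
#include <stdint.h>

static uint32_t hostmask(unsigned int netmask)
{
	/* 32 -> 0xffffffff, 24 -> 0xffffff00, 0 -> 0 */
	return netmask ? ~((1U << (32 - netmask)) - 1) : 0;
}

static uint32_t ip_to_id(uint32_t first_ip, uint32_t hosts,
			 unsigned int netmask, uint32_t ip)
{
	return ((ip & hostmask(netmask)) - first_ip) / hosts;
}

int main(void)
{
	uint32_t first = (192U << 24) | (168U << 16);	/* 192.168.0.0 */

	/* A bitmap:ip set over 192.168.0.0-192.168.3.255 with netmask 32
	 * dedicates one bit per address: 192.168.1.5 lands on bit 261. */
	printf("%u\n", ip_to_id(first, 1, 32, first + 0x0105));
	return 0;
}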
+ +config IP_SET_HASH_IPPORTIP + tristate "hash:ip,port,ip set support" + depends on IP_SET + help + This option adds the hash:ip,port,ip set type support, by which + one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 + address triples in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORTNET + tristate "hash:ip,port,net set support" + depends on IP_SET + help + This option adds the hash:ip,port,net set type support, by which + one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 + network address/prefix triples in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETPORTNET + tristate "hash:net,port,net set support" + depends on IP_SET + help + This option adds the hash:net,port,net set type support, by which + one can store two IPv4/IPv6 subnets and a protocol/port in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NET + tristate "hash:net set support" + depends on IP_SET + help + This option adds the hash:net set type support, by which + one can store IPv4/IPv6 network address/prefix elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETNET + tristate "hash:net,net set support" + depends on IP_SET + help + This option adds the hash:net,net set type support, by which + one can store IPv4/IPv6 network address/prefix pairs in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETPORT + tristate "hash:net,port set support" + depends on IP_SET + help + This option adds the hash:net,port set type support, by which + one can store IPv4/IPv6 network address/prefix and + protocol/port pairs as elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETIFACE + tristate "hash:net,iface set support" + depends on IP_SET + help + This option adds the hash:net,iface set type support, by which + one can store IPv4/IPv6 network address/prefix and + interface name pairs as elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_LIST_SET + tristate "list:set set support" + depends on IP_SET + help + This option adds the list:set set type support. In this + kind of set one can store the names of other sets, and it forms + an ordered union of the member sets. + + To compile it as a module, choose M here. If unsure, say N.
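One concrete note on the list:set semantics above: a lookup walks the member sets in their stored order, and the first member that matches decides the answer, which is why the help text calls it an ordered union. A toy userspace sketch of that behaviour (hypothetical names, not the kernel API):

/* Toy illustration of list:set lookup order -- not kernel code.
 * The first member set that matches wins, so the list behaves as
 * an ordered union of its members. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_set {
	const char *name;
	bool (*test)(uint32_t ip);
};

static bool in_lan(uint32_t ip) { return (ip >> 16) == 0xC0A8; } /* 192.168.0.0/16 */
static bool in_vpn(uint32_t ip) { return (ip >> 24) == 10;     } /* 10.0.0.0/8 */

static const char *list_set_test(const struct toy_set *members, size_t n,
				 uint32_t ip)
{
	for (size_t i = 0; i < n; i++)	/* walk members in order */
		if (members[i].test(ip))
			return members[i].name;
	return NULL;
}

int main(void)
{
	const struct toy_set list[] = {
		{ "lan", in_lan },
		{ "vpn", in_vpn },
	};
	uint32_t ip = (10U << 24) | 1;	/* 10.0.0.1 matches "vpn" */
	const char *hit = list_set_test(list, 2, ip);

	printf("matched set: %s\n", hit ? hit : "(none)");
	return 0;
}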
+ +endif # IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile new file mode 100644 index 00000000000..231f10196cb --- /dev/null +++ b/net/netfilter/ipset/Makefile @@ -0,0 +1,28 @@ +# +# Makefile for the ipset modules +# + +ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o + +# ipset core +obj-$(CONFIG_IP_SET) += ip_set.o + +# bitmap types +obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o +obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o +obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o + +# hash types +obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o +obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o +obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o +obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o +obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o +obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o +obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o +obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o + +# list types +obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h new file mode 100644 index 00000000000..f2c7d83dc23 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -0,0 +1,289 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __IP_SET_BITMAP_IP_GEN_H +#define __IP_SET_BITMAP_IP_GEN_H + +#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) +#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) +#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) +#define mtype_do_add IPSET_TOKEN(MTYPE, _do_add) +#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_do_del IPSET_TOKEN(MTYPE, _do_del) +#define mtype_do_list IPSET_TOKEN(MTYPE, _do_list) +#define mtype_do_head IPSET_TOKEN(MTYPE, _do_head) +#define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem) +#define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) +#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) +#define mtype_flush IPSET_TOKEN(MTYPE, _flush) +#define mtype_head IPSET_TOKEN(MTYPE, _head) +#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) +#define mtype_elem IPSET_TOKEN(MTYPE, _elem) +#define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_add IPSET_TOKEN(MTYPE, _add) +#define mtype_del IPSET_TOKEN(MTYPE, _del) +#define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype MTYPE + +#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id)) + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct mtype *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&map->gc); +} + +static void +mtype_ext_cleanup(struct ip_set *set) +{ + struct mtype *map = set->data; + u32 id; + + for (id = 0; id < map->elements; id++) + if (test_bit(id, map->members)) + ip_set_ext_destroy(set, get_ext(set, map, id)); 
+} + +static void +mtype_destroy(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + if (set->dsize) { + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + ip_set_free(map->extensions); + } + kfree(map); + + set->data = NULL; +} + +static void +mtype_flush(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + memset(map->members, 0, map->memsize); +} + +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct mtype *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (mtype_do_head(skb, map) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + + map->memsize + + set->dsize * map->elements))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(set, map, e->id); + int ret = mtype_do_test(e, map, set->dsize); + + if (ret <= 0) + return ret; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, set))) + return 0; + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(x, set), ext, mext, flags); + return 1; +} + +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(set, map, e->id); + int ret = mtype_do_add(e, map, flags, set->dsize); + + if (ret == IPSET_ADD_FAILED) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, set))) + ret = 0; + else if (!(flags & IPSET_FLAG_EXIST)) + return -IPSET_ERR_EXIST; + /* Element is re-added, cleanup extensions */ + ip_set_ext_destroy(set, x); + } + + if (SET_WITH_TIMEOUT(set)) +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); +#else + ip_set_timeout_set(ext_timeout(x, set), ext->timeout); +#endif + + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(x, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(x, set), ext); + return 0; +} + +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(set, map, e->id); + + if (mtype_do_del(e, map)) + return -IPSET_ERR_EXIST; + + ip_set_ext_destroy(set, x); + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, set))) + return -IPSET_ERR_EXIST; + + return 0; +} + +#ifndef IP_SET_BITMAP_STORED_TIMEOUT +static inline bool +mtype_is_filled(const struct mtype_elem *x) +{ + return true; +} +#endif + +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mtype *map = set->data; + struct nlattr *adt, *nested; + void *x; + u32 id, first = cb->args[IPSET_CB_ARG0]; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[IPSET_CB_ARG0] < 
map->elements; + cb->args[IPSET_CB_ARG0]++) { + id = cb->args[IPSET_CB_ARG0]; + x = get_ext(set, map, id); + if (!test_bit(id, map->members) || + (SET_WITH_TIMEOUT(set) && +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_is_filled((const struct mtype_elem *) x) && +#endif + ip_set_timeout_expired(ext_timeout(x, set)))) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_do_list(skb, map, id, set->dsize)) + goto nla_put_failure; + if (ip_set_put_extensions(skb, set, x, + mtype_is_filled((const struct mtype_elem *) x))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[IPSET_CB_ARG0] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + if (unlikely(id == first)) { + cb->args[IPSET_CB_ARG0] = 0; + return -EMSGSIZE; + } + ipset_nest_end(skb, adt); + return 0; +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct mtype *map = set->data; + void *x; + u32 id; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id < map->elements; id++) + if (mtype_gc_test(id, map, set->dsize)) { + x = get_ext(set, map, id); + if (ip_set_timeout_expired(ext_timeout(x, set))) { + clear_bit(id, map->members); + ip_set_ext_destroy(set, x); + } + } + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&map->gc); +} + +static const struct ip_set_type_variant mtype = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .same_set = mtype_same_set, +}; + +#endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c new file mode 100644 index 00000000000..6f1f9f49480 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -0,0 +1,377 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the bitmap:ip type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counter support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_bitmap:ip"); + +#define MTYPE bitmap_ip + +/* Type structure */ +struct bitmap_ip { + void *members; /* the set members */ + void *extensions; /* data extensions */ + u32 first_ip; /* host byte order, included in range */ + u32 last_ip; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ + u32 hosts; /* number of hosts in a subnet */ + size_t memsize; /* members size */ + u8 netmask; /* subnet netmask */ + struct timer_list gc; /* garbage collection */ +}; + +/* ADT structure for generic function args */ +struct bitmap_ip_adt_elem { + u16 id; +}; + +static inline u32 +ip_to_id(const struct bitmap_ip *m, u32 ip) +{ + return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; +} + +/* Common functions */ + +static inline int +bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, + struct bitmap_ip *map, size_t dsize) +{ + return !!test_bit(e->id, map->members); +} + +static inline int +bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) +{ + return !!test_bit(id, map->members); +} + +static inline int +bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, + u32 flags, size_t dsize) +{ + return !!test_and_set_bit(e->id, map->members); +} + +static inline int +bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) +{ + return !test_and_clear_bit(e->id, map->members); +} + +static inline int +bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, + size_t dsize) +{ + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); +} + +static inline int +bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) +{ + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || + (map->netmask != 32 && + nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)); +} + +static int +bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct bitmap_ip *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + u32 ip; + + ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + e.id = ip_to_id(map, ip); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct bitmap_ip *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 ip = 0, 
ip_to = 0; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + if (adt == IPSET_TEST) { + e.id = ip_to_id(map, ip); + return adtfn(set, &e, &ext, &ext, flags); + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) { + swap(ip, ip_to); + if (ip < map->first_ip) + return -IPSET_ERR_BITMAP_RANGE; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } else + ip_to = ip; + + if (ip_to > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + for (; !before(ip_to, ip); ip += map->hosts) { + e.id = ip_to_id(map, ip); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_ip *x = a->data; + const struct bitmap_ip *y = b->data; + + return x->first_ip == y->first_ip && + x->last_ip == y->last_ip && + x->netmask == y->netmask && + a->timeout == b->timeout && + a->extensions == b->extensions; +} + +/* Plain variant */ + +struct bitmap_ip_elem { +}; + +#include "ip_set_bitmap_gen.h" + +/* Create bitmap:ip type of sets */ + +static bool +init_map_ip(struct ip_set *set, struct bitmap_ip *map, + u32 first_ip, u32 last_ip, + u32 elements, u32 hosts, u8 netmask) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + if (set->dsize) { + map->extensions = ip_set_alloc(set->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } + map->first_ip = first_ip; + map->last_ip = last_ip; + map->elements = elements; + map->hosts = hosts; + map->netmask = netmask; + set->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = NFPROTO_IPV4; + + return true; +} + +static int +bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + struct bitmap_ip *map; + u32 first_ip = 0, last_ip = 0, hosts; + u64 elements; + u8 netmask = 32; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); + if (ret) + return ret; + if (first_ip > last_ip) { + u32 tmp = first_ip; + + first_ip = last_ip; + last_ip = tmp; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr >= 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(first_ip, last_ip, cidr); + } else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if (netmask > 32) + return 
-IPSET_ERR_INVALID_NETMASK; + + first_ip &= ip_set_hostmask(netmask); + last_ip |= ~ip_set_hostmask(netmask); + } + + if (netmask == 32) { + hosts = 1; + elements = (u64)last_ip - first_ip + 1; + } else { + u8 mask_bits; + u32 mask; + + mask = range_to_mask(first_ip, last_ip, &mask_bits); + + if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) || + netmask <= mask_bits) + return -IPSET_ERR_BITMAP_RANGE; + + pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); + hosts = 2 << (32 - netmask - 1); + elements = 2 << (netmask - mask_bits - 1); + } + if (elements > IPSET_BITMAP_MAX_RANGE + 1) + return -IPSET_ERR_BITMAP_RANGE_SIZE; + + pr_debug("hosts %u, elements %llu\n", + hosts, (unsigned long long)elements); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ip; + set->dsize = ip_set_elem_len(set, tb, 0); + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + if (tb[IPSET_ATTR_TIMEOUT]) { + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + bitmap_ip_gc_init(set, bitmap_ip_gc); + } + return 0; +} + +static struct ip_set_type bitmap_ip_type __read_mostly = { + .name = "bitmap:ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_IPV4, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = bitmap_ip_create, + .create_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_ip_init(void) +{ + return ip_set_type_register(&bitmap_ip_type); +} + +static void __exit +bitmap_ip_fini(void) +{ + ip_set_type_unregister(&bitmap_ip_type); +} + +module_init(bitmap_ip_init); +module_exit(bitmap_ip_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c new file mode 100644 index 00000000000..740eabededd --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -0,0 +1,414 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Martin Josefsson <gandalf@wlug.westbo.se> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the bitmap:ip,mac type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/if_ether.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> + +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counter support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_bitmap:ip,mac"); + +#define MTYPE bitmap_ipmac +#define IP_SET_BITMAP_STORED_TIMEOUT + +enum { + MAC_UNSET, /* element is set, without MAC */ + MAC_FILLED, /* element is set with MAC */ +}; + +/* Type structure */ +struct bitmap_ipmac { + void *members; /* the set members */ + void *extensions; /* MAC + data extensions */ + u32 first_ip; /* host byte order, included in range */ + u32 last_ip; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ + size_t memsize; /* members size */ + struct timer_list gc; /* garbage collector */ +}; + +/* ADT structure for generic function args */ +struct bitmap_ipmac_adt_elem { + u16 id; + unsigned char *ether; +}; + +struct bitmap_ipmac_elem { + unsigned char ether[ETH_ALEN]; + unsigned char filled; +} __attribute__ ((aligned)); + +static inline u32 +ip_to_id(const struct bitmap_ipmac *m, u32 ip) +{ + return ip - m->first_ip; +} + +static inline struct bitmap_ipmac_elem * +get_elem(void *extensions, u16 id, size_t dsize) +{ + return (struct bitmap_ipmac_elem *)(extensions + id * dsize); +} + +/* Common functions */ + +static inline int +bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, + const struct bitmap_ipmac *map, size_t dsize) +{ + const struct bitmap_ipmac_elem *elem; + + if (!test_bit(e->id, map->members)) + return 0; + elem = get_elem(map->extensions, e->id, dsize); + if (elem->filled == MAC_FILLED) + return e->ether == NULL || + ether_addr_equal(e->ether, elem->ether); + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; +} + +static inline int +bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) +{ + const struct bitmap_ipmac_elem *elem; + + if (!test_bit(id, map->members)) + return 0; + elem = get_elem(map->extensions, id, dsize); + /* Timer not started for the incomplete elements */ + return elem->filled == MAC_FILLED; +} + +static inline int +bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) +{ + return elem->filled == MAC_FILLED; +} + +static inline int +bitmap_ipmac_add_timeout(unsigned long *timeout, + const struct bitmap_ipmac_adt_elem *e, + const struct ip_set_ext *ext, struct ip_set *set, + struct bitmap_ipmac *map, int mode) +{ + u32 t = ext->timeout; + + if (mode == IPSET_ADD_START_STORED_TIMEOUT) { + if (t == set->timeout) + /* Timeout was not specified, get stored one */ + t = *timeout; + ip_set_timeout_set(timeout, t); + } else { + /* If MAC is unset yet, we store plain timeout value + * because the timer is not activated yet + * and we can reuse it later when MAC is filled out, + * possibly by the kernel */ + if (e->ether) + ip_set_timeout_set(timeout, t); + else + *timeout = t; + } + return 0; +} + +static inline int 
+bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map, u32 flags, size_t dsize) +{ + struct bitmap_ipmac_elem *elem; + + elem = get_elem(map->extensions, e->id, dsize); + if (test_and_set_bit(e->id, map->members)) { + if (elem->filled == MAC_FILLED) { + if (e->ether && (flags & IPSET_FLAG_EXIST)) + memcpy(elem->ether, e->ether, ETH_ALEN); + return IPSET_ADD_FAILED; + } else if (!e->ether) + /* Already added without ethernet address */ + return IPSET_ADD_FAILED; + /* Fill the MAC address and trigger the timer activation */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return IPSET_ADD_START_STORED_TIMEOUT; + } else if (e->ether) { + /* We can store MAC too */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return 0; + } else { + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; + } +} + +static inline int +bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map) +{ + return !test_and_clear_bit(e->id, map->members); +} + +static inline int +bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, + u32 id, size_t dsize) +{ + const struct bitmap_ipmac_elem *elem = + get_elem(map->extensions, id, dsize); + + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)) || + (elem->filled == MAC_FILLED && + nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); +} + +static inline int +bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) +{ + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); +} + +static int +bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct bitmap_ipmac *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + u32 ip; + + /* MAC can be src only */ + if (!(opt->flags & IPSET_DIM_TWO_SRC)) + return 0; + + ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + /* Backward compatibility: we don't check the second flag */ + if (skb_mac_header(skb) < skb->head || + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + + e.id = ip_to_id(map, ip); + e.ether = eth_hdr(skb)->h_source; + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct bitmap_ipmac *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + e.id = ip_to_id(map, ip); + if (tb[IPSET_ATTR_ETHER]) + e.ether = 
nla_data(tb[IPSET_ATTR_ETHER]); + else + e.ether = NULL; + + ret = adtfn(set, &e, &ext, &ext, flags); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +static bool +bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_ipmac *x = a->data; + const struct bitmap_ipmac *y = b->data; + + return x->first_ip == y->first_ip && + x->last_ip == y->last_ip && + a->timeout == b->timeout && + a->extensions == b->extensions; +} + +/* Plain variant */ + +#include "ip_set_bitmap_gen.h" + +/* Create bitmap:ip,mac type of sets */ + +static bool +init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, + u32 first_ip, u32 last_ip, u32 elements) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + if (set->dsize) { + map->extensions = ip_set_alloc(set->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } + map->first_ip = first_ip; + map->last_ip = last_ip; + map->elements = elements; + set->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = NFPROTO_IPV4; + + return true; +} + +static int +bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + u32 first_ip = 0, last_ip = 0; + u64 elements; + struct bitmap_ipmac *map; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); + if (ret) + return ret; + if (first_ip > last_ip) { + u32 tmp = first_ip; + + first_ip = last_ip; + last_ip = tmp; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr >= 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(first_ip, last_ip, cidr); + } else + return -IPSET_ERR_PROTOCOL; + + elements = (u64)last_ip - first_ip + 1; + + if (elements > IPSET_BITMAP_MAX_RANGE + 1) + return -IPSET_ERR_BITMAP_RANGE_SIZE; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ipmac; + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct bitmap_ipmac_elem)); + if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { + kfree(map); + return -ENOMEM; + } + if (tb[IPSET_ATTR_TIMEOUT]) { + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); + } + return 0; +} + +static struct ip_set_type bitmap_ipmac_type = { + .name = "bitmap:ip,mac", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_IPV4, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = bitmap_ipmac_create, + .create_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = 
NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_ipmac_init(void) +{ + return ip_set_type_register(&bitmap_ipmac_type); +} + +static void __exit +bitmap_ipmac_fini(void) +{ + ip_set_type_unregister(&bitmap_ipmac_type); +} + +module_init(bitmap_ipmac_init); +module_exit(bitmap_ipmac_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c new file mode 100644 index 00000000000..cf99676e69f --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -0,0 +1,311 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the bitmap:port type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> + +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> +#include <linux/netfilter/ipset/ip_set_getport.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counter support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_bitmap:port"); + +#define MTYPE bitmap_port + +/* Type structure */ +struct bitmap_port { + void *members; /* the set members */ + void *extensions; /* data extensions */ + u16 first_port; /* host byte order, included in range */ + u16 last_port; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ + size_t memsize; /* members size */ + struct timer_list gc; /* garbage collection */ +}; + +/* ADT structure for generic function args */ +struct bitmap_port_adt_elem { + u16 id; +}; + +static inline u16 +port_to_id(const struct bitmap_port *m, u16 port) +{ + return port - m->first_port; +} + +/* Common functions */ + +static inline int +bitmap_port_do_test(const struct bitmap_port_adt_elem *e, + const struct bitmap_port *map, size_t dsize) +{ + return !!test_bit(e->id, map->members); +} + +static inline int +bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) +{ + return !!test_bit(id, map->members); +} + +static inline int +bitmap_port_do_add(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map, u32 flags, size_t dsize) +{ + return !!test_and_set_bit(e->id, map->members); +} + +static inline int +bitmap_port_do_del(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map) +{ + return !test_and_clear_bit(e->id, map->members); +} + +static inline int +bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, + size_t dsize) +{ + return nla_put_net16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); +} + +static inline int +bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) +{ + return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || + nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); +} + +static int +bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct bitmap_port *map = set->data; + ipset_adtfn adtfn = 
set->variant->adt[adt]; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + __be16 __port; + u16 port = 0; + + if (!ip_set_get_ip_port(skb, opt->family, + opt->flags & IPSET_DIM_ONE_SRC, &__port)) + return -EINVAL; + + port = ntohs(__port); + + if (port < map->first_port || port > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + e.id = port_to_id(map, port); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct bitmap_port *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port; /* wraparound */ + u16 port_to; + int ret = 0; + + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); + if (port < map->first_port || port > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (adt == IPSET_TEST) { + e.id = port_to_id(map, port); + return adtfn(set, &e, &ext, &ext, flags); + } + + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) { + swap(port, port_to); + if (port < map->first_port) + return -IPSET_ERR_BITMAP_RANGE; + } + } else + port_to = port; + + if (port_to > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + for (; port <= port_to; port++) { + e.id = port_to_id(map, port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_port *x = a->data; + const struct bitmap_port *y = b->data; + + return x->first_port == y->first_port && + x->last_port == y->last_port && + a->timeout == b->timeout && + a->extensions == b->extensions; +} + +/* Plain variant */ + +struct bitmap_port_elem { +}; + +#include "ip_set_bitmap_gen.h" + +/* Create bitmap:port type of sets */ + +static bool +init_map_port(struct ip_set *set, struct bitmap_port *map, + u16 first_port, u16 last_port) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + if (set->dsize) { + map->extensions = ip_set_alloc(set->dsize * map->elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } + map->first_port = first_port; + map->last_port = last_port; + set->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = NFPROTO_UNSPEC; + + return true; +} + +static int +bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + struct bitmap_port *map; + u16 first_port, last_port; + + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); + last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (first_port
> last_port) { + u16 tmp = first_port; + + first_port = last_port; + last_port = tmp; + } + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + map->elements = last_port - first_port + 1; + map->memsize = bitmap_bytes(0, map->elements); + set->variant = &bitmap_port; + set->dsize = ip_set_elem_len(set, tb, 0); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + if (tb[IPSET_ATTR_TIMEOUT]) { + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + bitmap_port_gc_init(set, bitmap_port_gc); + } + return 0; +} + +static struct ip_set_type bitmap_port_type = { + .name = "bitmap:port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_PORT, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = bitmap_port_create, + .create_policy = { + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_port_init(void) +{ + return ip_set_type_register(&bitmap_port_type); +} + +static void __exit +bitmap_port_fini(void) +{ + ip_set_type_unregister(&bitmap_port_type); +} + +module_init(bitmap_port_init); +module_exit(bitmap_port_fini); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c new file mode 100644 index 00000000000..ec8114fae50 --- /dev/null +++ b/net/netfilter/ipset/ip_set_core.c @@ -0,0 +1,2011 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module for IP set management */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/rculist.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/ipset/ip_set.h> + +static LIST_HEAD(ip_set_type_list); /* all registered set types */ +static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ +static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ + +struct ip_set_net { + struct ip_set * __rcu *ip_set_list; /* all individual sets */ + ip_set_id_t ip_set_max; /* max number of sets */ + int is_deleted; /* deleted by ip_set_net_exit */ +}; +static int ip_set_net_id __read_mostly; + +static inline struct ip_set_net *ip_set_pernet(struct net *net) +{ + return net_generic(net, ip_set_net_id); +} + +#define IP_SET_INC 64 +#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) + +static unsigned int max_sets; + +module_param(max_sets, int, 0600); +MODULE_PARM_DESC(max_sets, "maximal number of sets"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("core IP set support"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); + +/* When the nfnl mutex is held: */ +#define ip_set_dereference(p) \ + rcu_dereference_protected(p, 1) +#define ip_set(inst, id) \ + ip_set_dereference((inst)->ip_set_list)[id] + +/* + * The set types are implemented in modules and registered set types + * can be found in ip_set_type_list. Adding/deleting types is + * serialized by ip_set_type_mutex. + */ + +static inline void +ip_set_type_lock(void) +{ + mutex_lock(&ip_set_type_mutex); +} + +static inline void +ip_set_type_unlock(void) +{ + mutex_unlock(&ip_set_type_mutex); +} + +/* Register and deregister settype */ + +static struct ip_set_type * +find_set_type(const char *name, u8 family, u8 revision) +{ + struct ip_set_type *type; + + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name) && + (type->family == family || + type->family == NFPROTO_UNSPEC) && + revision >= type->revision_min && + revision <= type->revision_max) + return type; + return NULL; +} + +/* Unlock, try to load a set type module and lock again */ +static bool +load_settype(const char *name) +{ + nfnl_unlock(NFNL_SUBSYS_IPSET); + pr_debug("try to load ip_set_%s\n", name); + if (request_module("ip_set_%s", name) < 0) { + pr_warning("Can't find ip_set type %s\n", name); + nfnl_lock(NFNL_SUBSYS_IPSET); + return false; + } + nfnl_lock(NFNL_SUBSYS_IPSET); + return true; +} + +/* Find a set type and reference it */ +#define find_set_type_get(name, family, revision, found) \ + __find_set_type_get(name, family, revision, found, false) + +static int +__find_set_type_get(const char *name, u8 family, u8 revision, + struct ip_set_type **found, bool retry) +{ + struct ip_set_type *type; + int err; + + if (retry && !load_settype(name)) + return -IPSET_ERR_FIND_TYPE; + + rcu_read_lock(); + *found = find_set_type(name, family, revision); + if (*found) { + err = !try_module_get((*found)->me) ? 
-EFAULT : 0; + goto unlock; + } + /* Make sure the type is already loaded + * but we don't support the revision */ + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name)) { + err = -IPSET_ERR_FIND_TYPE; + goto unlock; + } + rcu_read_unlock(); + + return retry ? -IPSET_ERR_FIND_TYPE : + __find_set_type_get(name, family, revision, found, true); + +unlock: + rcu_read_unlock(); + return err; +} + +/* Find a given set type by name and family. + * If we succeeded, the supported minimal and maximum revisions are + * filled out. + */ +#define find_set_type_minmax(name, family, min, max) \ + __find_set_type_minmax(name, family, min, max, false) + +static int +__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, + bool retry) +{ + struct ip_set_type *type; + bool found = false; + + if (retry && !load_settype(name)) + return -IPSET_ERR_FIND_TYPE; + + *min = 255; *max = 0; + rcu_read_lock(); + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name) && + (type->family == family || + type->family == NFPROTO_UNSPEC)) { + found = true; + if (type->revision_min < *min) + *min = type->revision_min; + if (type->revision_max > *max) + *max = type->revision_max; + } + rcu_read_unlock(); + if (found) + return 0; + + return retry ? -IPSET_ERR_FIND_TYPE : + __find_set_type_minmax(name, family, min, max, true); +} + +#define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \ + (f) == NFPROTO_IPV6 ? "inet6" : "any") + +/* Register a set type structure. The type is identified by + * the unique triple of name, family and revision. + */ +int +ip_set_type_register(struct ip_set_type *type) +{ + int ret = 0; + + if (type->protocol != IPSET_PROTOCOL) { + pr_warning("ip_set type %s, family %s, revision %u:%u uses " + "wrong protocol version %u (want %u)\n", + type->name, family_name(type->family), + type->revision_min, type->revision_max, + type->protocol, IPSET_PROTOCOL); + return -EINVAL; + } + + ip_set_type_lock(); + if (find_set_type(type->name, type->family, type->revision_min)) { + /* Duplicate! */ + pr_warning("ip_set type %s, family %s with revision min %u " + "already registered!\n", type->name, + family_name(type->family), type->revision_min); + ret = -EINVAL; + goto unlock; + } + list_add_rcu(&type->list, &ip_set_type_list); + pr_debug("type %s, family %s, revision %u:%u registered.\n", + type->name, family_name(type->family), + type->revision_min, type->revision_max); +unlock: + ip_set_type_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_type_register); + +/* Unregister a set type. 
There's a small race with ip_set_create */ +void +ip_set_type_unregister(struct ip_set_type *type) +{ + ip_set_type_lock(); + if (!find_set_type(type->name, type->family, type->revision_min)) { + pr_warning("ip_set type %s, family %s with revision min %u " + "not registered\n", type->name, + family_name(type->family), type->revision_min); + goto unlock; + } + list_del_rcu(&type->list); + pr_debug("type %s, family %s with revision min %u unregistered.\n", + type->name, family_name(type->family), type->revision_min); +unlock: + ip_set_type_unlock(); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ip_set_type_unregister); + +/* Utility functions */ +void * +ip_set_alloc(size_t size) +{ + void *members = NULL; + + if (size < KMALLOC_MAX_SIZE) + members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + + if (members) { + pr_debug("%p: allocated with kmalloc\n", members); + return members; + } + + members = vzalloc(size); + if (!members) + return NULL; + pr_debug("%p: allocated with vmalloc\n", members); + + return members; +} +EXPORT_SYMBOL_GPL(ip_set_alloc); + +void +ip_set_free(void *members) +{ + pr_debug("%p: free with %s\n", members, + is_vmalloc_addr(members) ? "vfree" : "kfree"); + kvfree(members); +} +EXPORT_SYMBOL_GPL(ip_set_free); + +static inline bool +flag_nested(const struct nlattr *nla) +{ + return nla->nla_type & NLA_F_NESTED; +} + +static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { + [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, + [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, +}; + +int +ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) +{ + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + + if (unlikely(!flag_nested(nla))) + return -IPSET_ERR_PROTOCOL; + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + return -IPSET_ERR_PROTOCOL; + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) + return -IPSET_ERR_PROTOCOL; + + *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); + +int +ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) +{ + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + + if (unlikely(!flag_nested(nla))) + return -IPSET_ERR_PROTOCOL; + + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + return -IPSET_ERR_PROTOCOL; + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) + return -IPSET_ERR_PROTOCOL; + + memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), + sizeof(struct in6_addr)); + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); + +typedef void (*destroyer)(void *); +/* ipset data extension types, in size order */ + +const struct ip_set_ext_type ip_set_extensions[] = { + [IPSET_EXT_ID_COUNTER] = { + .type = IPSET_EXT_COUNTER, + .flag = IPSET_FLAG_WITH_COUNTERS, + .len = sizeof(struct ip_set_counter), + .align = __alignof__(struct ip_set_counter), + }, + [IPSET_EXT_ID_TIMEOUT] = { + .type = IPSET_EXT_TIMEOUT, + .len = sizeof(unsigned long), + .align = __alignof__(unsigned long), + }, + [IPSET_EXT_ID_COMMENT] = { + .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, + .flag = IPSET_FLAG_WITH_COMMENT, + .len = sizeof(struct ip_set_comment), + .align = __alignof__(struct ip_set_comment), + .destroy = (destroyer) ip_set_comment_free, + }, +}; +EXPORT_SYMBOL_GPL(ip_set_extensions); + +static inline bool +add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) +{ + return ip_set_extensions[id].flag ? 
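/* [Illustrative note, not part of this patch: extension layout]
 * ip_set_elem_len() below stores, for each enabled extension, the byte
 * offset at which that extension lives behind every element blob.
 * Accessors then only add the stored offset to the element pointer; a
 * minimal sketch (the helper name is made up, the real macros live in
 * ip_set.h):
 *
 *	static inline unsigned long *
 *	elem_timeout(void *e, const struct ip_set *set)
 *	{
 *		return (unsigned long *)
 *			((char *)e + set->offset[IPSET_EXT_ID_TIMEOUT]);
 *	}
 */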
+ (flags & ip_set_extensions[id].flag) : + !!tb[IPSET_ATTR_TIMEOUT]; +} + +size_t +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +{ + enum ip_set_ext_id id; + size_t offset = 0; + u32 cadt_flags = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) + set->flags |= IPSET_CREATE_FLAG_FORCEADD; + for (id = 0; id < IPSET_EXT_ID_MAX; id++) { + if (!add_extension(id, cadt_flags, tb)) + continue; + offset += ALIGN(len + offset, ip_set_extensions[id].align); + set->offset[id] = offset; + set->extensions |= ip_set_extensions[id].type; + offset += ip_set_extensions[id].len; + } + return len + offset; +} +EXPORT_SYMBOL_GPL(ip_set_elem_len); + +int +ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], + struct ip_set_ext *ext) +{ + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!(set->extensions & IPSET_EXT_TIMEOUT)) + return -IPSET_ERR_TIMEOUT; + ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { + if (!(set->extensions & IPSET_EXT_COUNTER)) + return -IPSET_ERR_COUNTER; + if (tb[IPSET_ATTR_BYTES]) + ext->bytes = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_BYTES])); + if (tb[IPSET_ATTR_PACKETS]) + ext->packets = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_PACKETS])); + } + if (tb[IPSET_ATTR_COMMENT]) { + if (!(set->extensions & IPSET_EXT_COMMENT)) + return -IPSET_ERR_COMMENT; + ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); + } + + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_extensions); + +/* + * Creating/destroying/renaming/swapping affect the existence and + * the properties of a set. All of these can be executed from userspace + * only and serialized by the nfnl mutex indirectly from nfnetlink. + * + * Sets are identified by their index in ip_set_list and the index + * is used by the external references (set/SET netfilter modules). + * + * The set behind an index may change by swapping only, from userspace. + */ + +static inline void +__ip_set_get(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref++; + write_unlock_bh(&ip_set_ref_lock); +} + +static inline void +__ip_set_put(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + BUG_ON(set->ref == 0); + set->ref--; + write_unlock_bh(&ip_set_ref_lock); +} + +/* + * Add, del and test set entries from kernel. + * + * The set behind the index must exist and must be referenced + * so it can't be destroyed (or changed) under our feet. + */ + +static inline struct ip_set * +ip_set_rcu_get(struct net *net, ip_set_id_t index) +{ + struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(net); + + rcu_read_lock(); + /* ip_set_list itself needs to be protected */ + set = rcu_dereference(inst->ip_set_list)[index]; + rcu_read_unlock(); + + return set; +} + +int +ip_set_test(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, struct ip_set_adt_opt *opt) +{ + struct ip_set *set = ip_set_rcu_get( + dev_net(par->in ?
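/* [Illustrative note, not part of this patch: kernel-side ADT API]
 * ip_set_test/ip_set_add/ip_set_del are what the set match and SET
 * target call. The caller must already hold a reference on the index
 * and describes the lookup with an ip_set_adt_opt; roughly (values are
 * illustrative, API shape as of this era):
 *
 *	struct ip_set_adt_opt opt = {
 *		.family      = NFPROTO_IPV4,
 *		.dim         = IPSET_DIM_TWO,  // e.g. an ip,port set
 *		.flags       = 0,              // all dimensions match dst
 *		.ext.timeout = UINT_MAX,       // use the set's default
 *	};
 *	int matched = ip_set_test(index, skb, par, &opt);
 */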
par->in : par->out), index); + int ret = 0; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (opt->dim < set->type->dimension || + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return 0; + + read_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); + read_unlock_bh(&set->lock); + + if (ret == -EAGAIN) { + /* Type requests element to be completed */ + pr_debug("element must be completed, ADD is triggered\n"); + write_lock_bh(&set->lock); + set->variant->kadt(set, skb, par, IPSET_ADD, opt); + write_unlock_bh(&set->lock); + ret = 1; + } else { + /* --return-nomatch: invert matched element */ + if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && + (set->type->features & IPSET_TYPE_NOMATCH) && + (ret > 0 || ret == -ENOTEMPTY)) + ret = -ret; + } + + /* Convert error codes to nomatch */ + return (ret < 0 ? 0 : ret); +} +EXPORT_SYMBOL_GPL(ip_set_test); + +int +ip_set_add(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, struct ip_set_adt_opt *opt) +{ + struct ip_set *set = ip_set_rcu_get( + dev_net(par->in ? par->in : par->out), index); + int ret; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (opt->dim < set->type->dimension || + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return -IPSET_ERR_TYPE_MISMATCH; + + write_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); + write_unlock_bh(&set->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_add); + +int +ip_set_del(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, struct ip_set_adt_opt *opt) +{ + struct ip_set *set = ip_set_rcu_get( + dev_net(par->in ? par->in : par->out), index); + int ret = 0; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (opt->dim < set->type->dimension || + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return -IPSET_ERR_TYPE_MISMATCH; + + write_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); + write_unlock_bh(&set->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_del); + +/* + * Find set by name, reference it once. The reference makes sure the + * thing pointed to does not go away under our feet. + * + */ +ip_set_id_t +ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) +{ + ip_set_id_t i, index = IPSET_INVALID_ID; + struct ip_set *s; + struct ip_set_net *inst = ip_set_pernet(net); + + rcu_read_lock(); + for (i = 0; i < inst->ip_set_max; i++) { + s = rcu_dereference(inst->ip_set_list)[i]; + if (s != NULL && STREQ(s->name, name)) { + __ip_set_get(s); + index = i; + *set = s; + break; + } + } + rcu_read_unlock(); + + return index; +} +EXPORT_SYMBOL_GPL(ip_set_get_byname); + +/* + * If the given set pointer points to a valid set, decrement + * reference count by 1. The caller shall not assume the index + * to be valid after calling this function. + * + */ + +static inline void +__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) +{ + struct ip_set *set; + + rcu_read_lock(); + set = rcu_dereference(inst->ip_set_list)[index]; + if (set != NULL) + __ip_set_put(set); + rcu_read_unlock(); +} + +void +ip_set_put_byindex(struct net *net, ip_set_id_t index) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + __ip_set_put_byindex(inst, index); +} +EXPORT_SYMBOL_GPL(ip_set_put_byindex); + +/* + * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and + * can't be destroyed. The set cannot be renamed due to + * the referencing either. + * + */ +const char * +ip_set_name_byindex(struct net *net, ip_set_id_t index) +{ + const struct ip_set *set = ip_set_rcu_get(net, index); + + BUG_ON(set == NULL); + BUG_ON(set->ref == 0); + + /* Referenced, so it's safe */ + return set->name; +} +EXPORT_SYMBOL_GPL(ip_set_name_byindex); + +/* + * Routines to call by external subsystems, which do not + * call nfnl_lock for us. + */ + +/* + * Find set by index, reference it once. The reference makes sure the + * thing pointed to does not go away under our feet. + * + * The nfnl mutex is used in the function. + */ +ip_set_id_t +ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) +{ + struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(net); + + if (index >= inst->ip_set_max) + return IPSET_INVALID_ID; + + nfnl_lock(NFNL_SUBSYS_IPSET); + set = ip_set(inst, index); + if (set) + __ip_set_get(set); + else + index = IPSET_INVALID_ID; + nfnl_unlock(NFNL_SUBSYS_IPSET); + + return index; +} +EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); + +/* + * If the given set pointer points to a valid set, decrement + * reference count by 1. The caller shall not assume the index + * to be valid after calling this function. + * + * The nfnl mutex is used in the function. + */ +void +ip_set_nfnl_put(struct net *net, ip_set_id_t index) +{ + struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(net); + + nfnl_lock(NFNL_SUBSYS_IPSET); + if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ + set = ip_set(inst, index); + if (set != NULL) + __ip_set_put(set); + } + nfnl_unlock(NFNL_SUBSYS_IPSET); +} +EXPORT_SYMBOL_GPL(ip_set_nfnl_put); + +/* + * Communication protocol with userspace over netlink. + * + * The commands are serialized by the nfnl mutex. + */ + +static inline bool +protocol_failed(const struct nlattr * const tb[]) +{ + return !tb[IPSET_ATTR_PROTOCOL] || + nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL; +} + +static inline u32 +flag_exist(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST; +} + +static struct nlmsghdr * +start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, + enum ipset_cmd cmd) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), + sizeof(*nfmsg), flags); + if (nlh == NULL) + return NULL; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = NFPROTO_IPV4; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + return nlh; +} + +/* Create a set */ + +static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1}, + [IPSET_ATTR_REVISION] = { .type = NLA_U8 }, + [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, + [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, +}; + +static struct ip_set * +find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) +{ + struct ip_set *set = NULL; + ip_set_id_t i; + + *id = IPSET_INVALID_ID; + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set != NULL && STREQ(set->name, name)) { + *id = i; + break; + } + } + return (*id == IPSET_INVALID_ID ?
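/* [Illustrative note, not part of this patch: nfnetlink message type]
 * start_msg() above encodes the subsystem in the high byte of the
 * 16-bit nlmsg_type and the ipset command in the low byte. With the
 * uapi values of this era (NFNL_SUBSYS_IPSET == 6, IPSET_CMD_LIST == 7):
 *
 *	nlmsg_type = IPSET_CMD_LIST | (NFNL_SUBSYS_IPSET << 8);  // 0x0607
 */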
NULL : set); +} + +static inline struct ip_set * +find_set(struct ip_set_net *inst, const char *name) +{ + ip_set_id_t id; + + return find_set_and_id(inst, name, &id); +} + +static int +find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, + struct ip_set **set) +{ + struct ip_set *s; + ip_set_id_t i; + + *index = IPSET_INVALID_ID; + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s == NULL) { + if (*index == IPSET_INVALID_ID) + *index = i; + } else if (STREQ(name, s->name)) { + /* Name clash */ + *set = s; + return -EEXIST; + } + } + if (*index == IPSET_INVALID_ID) + /* No free slot remained */ + return -IPSET_ERR_MAX_SETS; + return 0; +} + +static int +ip_set_none(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + return -EOPNOTSUPP; +} + +static int +ip_set_create(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct net *net = sock_net(ctnl); + struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set *set, *clash = NULL; + ip_set_id_t index = IPSET_INVALID_ID; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + const char *name, *typename; + u8 family, revision; + u32 flags = flag_exist(nlh); + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_TYPENAME] == NULL || + attr[IPSET_ATTR_REVISION] == NULL || + attr[IPSET_ATTR_FAMILY] == NULL || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])))) + return -IPSET_ERR_PROTOCOL; + + name = nla_data(attr[IPSET_ATTR_SETNAME]); + typename = nla_data(attr[IPSET_ATTR_TYPENAME]); + family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); + revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); + pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", + name, typename, family_name(family), revision); + + /* + * First, and without any locks, allocate and initialize + * a normal base set structure. + */ + set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + if (!set) + return -ENOMEM; + rwlock_init(&set->lock); + strlcpy(set->name, name, IPSET_MAXNAMELEN); + set->family = family; + set->revision = revision; + + /* + * Next, check that we know the type, and take + * a reference on the type, to make sure it stays available + * while constructing our new set. + * + * After referencing the type, we try to create the type + * specific part of the set without holding any locks. + */ + ret = find_set_type_get(typename, family, revision, &(set->type)); + if (ret) + goto out; + + /* + * Without holding any locks, create private part. + */ + if (attr[IPSET_ATTR_DATA] && + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy)) { + ret = -IPSET_ERR_PROTOCOL; + goto put_out; + } + + ret = set->type->create(net, set, tb, flags); + if (ret != 0) + goto put_out; + + /* BTW, ret==0 here. */ + + /* + * Here, we have a valid, constructed set and we are protected + * by the nfnl mutex. Find the first free index in ip_set_list + * and check clashing. 
+ */ + ret = find_free_id(inst, set->name, &index, &clash); + if (ret == -EEXIST) { + /* If this is the same set and requested, ignore error */ + if ((flags & IPSET_FLAG_EXIST) && + STREQ(set->type->name, clash->type->name) && + set->type->family == clash->type->family && + set->type->revision_min == clash->type->revision_min && + set->type->revision_max == clash->type->revision_max && + set->variant->same_set(set, clash)) + ret = 0; + goto cleanup; + } else if (ret == -IPSET_ERR_MAX_SETS) { + struct ip_set **list, **tmp; + ip_set_id_t i = inst->ip_set_max + IP_SET_INC; + + if (i < inst->ip_set_max || i == IPSET_INVALID_ID) + /* Wraparound */ + goto cleanup; + + list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + if (!list) + goto cleanup; + /* nfnl mutex is held, both lists are valid */ + tmp = ip_set_dereference(inst->ip_set_list); + memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); + rcu_assign_pointer(inst->ip_set_list, list); + /* Make sure all current packets have passed through */ + synchronize_net(); + /* Use new list */ + index = inst->ip_set_max; + inst->ip_set_max = i; + kfree(tmp); + ret = 0; + } else if (ret) + goto cleanup; + + /* + * Finally! Add our shiny new set to the list, and be done. + */ + pr_debug("create: '%s' created with index %u!\n", set->name, index); + ip_set(inst, index) = set; + + return ret; + +cleanup: + set->variant->destroy(set); +put_out: + module_put(set->type->me); +out: + kfree(set); + return ret; +} + +/* Destroy sets */ + +static const struct nla_policy +ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, +}; + +static void +ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) +{ + struct ip_set *set = ip_set(inst, index); + + pr_debug("set: %s\n", set->name); + ip_set(inst, index) = NULL; + + /* Must call it without holding any lock */ + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); +} + +static int +ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *s; + ip_set_id_t i; + int ret = 0; + + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + /* Commands are serialized and references are + * protected by the ip_set_ref_lock. + * External systems (i.e. xt_set) must call + * ip_set_put|get_nfnl_* functions, that way we + * can safely check references here. + * + * list:set timer can only decrement the reference + * counter, so if it's already zero, we can proceed + * without holding the lock. 
+ */ + read_lock_bh(&ip_set_ref_lock); + if (!attr[IPSET_ATTR_SETNAME]) { + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL && s->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + } + read_unlock_bh(&ip_set_ref_lock); + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL) + ip_set_destroy_set(inst, i); + } + } else { + s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), + &i); + if (s == NULL) { + ret = -ENOENT; + goto out; + } else if (s->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + read_unlock_bh(&ip_set_ref_lock); + + ip_set_destroy_set(inst, i); + } + return 0; +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; +} + +/* Flush sets */ + +static void +ip_set_flush_set(struct ip_set *set) +{ + pr_debug("set: %s\n", set->name); + + write_lock_bh(&set->lock); + set->variant->flush(set); + write_unlock_bh(&set->lock); +} + +static int +ip_set_flush(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *s; + ip_set_id_t i; + + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + if (!attr[IPSET_ATTR_SETNAME]) { + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL) + ip_set_flush_set(s); + } + } else { + s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (s == NULL) + return -ENOENT; + + ip_set_flush_set(s); + } + + return 0; +} + +/* Rename a set */ + +static const struct nla_policy +ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, +}; + +static int +ip_set_rename(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *set, *s; + const char *name2; + ip_set_id_t i; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_SETNAME2] == NULL)) + return -IPSET_ERR_PROTOCOL; + + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + read_lock_bh(&ip_set_ref_lock); + if (set->ref != 0) { + ret = -IPSET_ERR_REFERENCED; + goto out; + } + + name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL && STREQ(s->name, name2)) { + ret = -IPSET_ERR_EXIST_SETNAME2; + goto out; + } + } + strncpy(set->name, name2, IPSET_MAXNAMELEN); + +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; +} + +/* Swap two sets so that name/index points to the other. + * References and set names are also swapped. + * + * The commands are serialized by the nfnl mutex and references are + * protected by the ip_set_ref_lock. The kernel interfaces + * do not hold the mutex but the pointer settings are atomic + * so the ip_set_list always contains valid pointers to the sets. 
+ */ + +static int +ip_set_swap(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *from, *to; + ip_set_id_t from_id, to_id; + char from_name[IPSET_MAXNAMELEN]; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_SETNAME2] == NULL)) + return -IPSET_ERR_PROTOCOL; + + from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), + &from_id); + if (from == NULL) + return -ENOENT; + + to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), + &to_id); + if (to == NULL) + return -IPSET_ERR_EXIST_SETNAME2; + + /* Features must not change. + * Not an artificial restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. */ + if (!(from->type->features == to->type->features && + from->family == to->family)) + return -IPSET_ERR_TYPE_MISMATCH; + + strncpy(from_name, from->name, IPSET_MAXNAMELEN); + strncpy(from->name, to->name, IPSET_MAXNAMELEN); + strncpy(to->name, from_name, IPSET_MAXNAMELEN); + + write_lock_bh(&ip_set_ref_lock); + swap(from->ref, to->ref); + ip_set(inst, from_id) = to; + ip_set(inst, to_id) = from; + write_unlock_bh(&ip_set_ref_lock); + + return 0; +} + +/* List/save set data */ + +#define DUMP_INIT 0 +#define DUMP_ALL 1 +#define DUMP_ONE 2 +#define DUMP_LAST 3 + +#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) +#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) + +static int +ip_set_dump_done(struct netlink_callback *cb) +{ + struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; + if (cb->args[IPSET_CB_ARG0]) { + pr_debug("release set %s\n", + ip_set(inst, cb->args[IPSET_CB_INDEX])->name); + __ip_set_put_byindex(inst, + (ip_set_id_t) cb->args[IPSET_CB_INDEX]); + } + return 0; +} + +static inline void +dump_attrs(struct nlmsghdr *nlh) +{ + const struct nlattr *attr; + int rem; + + pr_debug("dump nlmsg\n"); + nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { + pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); + } +} + +static int +dump_init(struct netlink_callback *cb, struct ip_set_net *inst) +{ + struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *attr = (void *)nlh + min_len; + u32 dump_type; + ip_set_id_t index; + + /* Second pass, so parser can't fail */ + nla_parse(cda, IPSET_ATTR_CMD_MAX, + attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); + + /* cb->args[IPSET_CB_NET]: net namespace + * [IPSET_CB_DUMP]: dump single set/all sets + * [IPSET_CB_INDEX]: set index + * [IPSET_CB_ARG0]: type specific + */ + + if (cda[IPSET_ATTR_SETNAME]) { + struct ip_set *set; + + set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), + &index); + if (set == NULL) + return -ENOENT; + + dump_type = DUMP_ONE; + cb->args[IPSET_CB_INDEX] = index; + } else + dump_type = DUMP_ALL; + + if (cda[IPSET_ATTR_FLAGS]) { + u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); + dump_type |= (f << 16); + } + cb->args[IPSET_CB_NET] = (unsigned long)inst; + cb->args[IPSET_CB_DUMP] = dump_type; + + return 0; +} + +static int +ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) +{ + ip_set_id_t index = IPSET_INVALID_ID, max; + struct ip_set *set = NULL; + struct nlmsghdr *nlh = NULL; + unsigned int flags = NETLINK_CB(cb->skb).portid ? 
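/* [Illustrative note, not part of this patch: dump state packing]
 * DUMP_TYPE/DUMP_FLAGS unpack the state that dump_init() stores in
 * cb->args[IPSET_CB_DUMP]: the dump kind sits in the low 16 bits and
 * the IPSET_ATTR_FLAGS value in the high 16. E.g. dumping one set with
 * IPSET_FLAG_LIST_SETNAME (bit 0 in this uapi):
 *
 *	cb->args[IPSET_CB_DUMP] = DUMP_ONE | (1 << 16);  // 0x00010002
 *	DUMP_TYPE(cb->args[IPSET_CB_DUMP])  == DUMP_ONE  // 2
 *	DUMP_FLAGS(cb->args[IPSET_CB_DUMP]) == 0x1
 */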
NLM_F_MULTI : 0; + struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); + u32 dump_type, dump_flags; + int ret = 0; + + if (!cb->args[IPSET_CB_DUMP]) { + ret = dump_init(cb, inst); + if (ret < 0) { + nlh = nlmsg_hdr(cb->skb); + /* We have to create and send the error message + * manually :-( */ + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(cb->skb, nlh, ret); + return ret; + } + } + + if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) + goto out; + + dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); + dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); + max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1 + : inst->ip_set_max; +dump_last: + pr_debug("dump type, flag: %u %u index: %ld\n", + dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); + for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { + index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + set = ip_set(inst, index); + if (set == NULL) { + if (dump_type == DUMP_ONE) { + ret = -ENOENT; + goto out; + } + continue; + } + /* When dumping all sets, we must dump "sorted" + * so that lists (unions of sets) are dumped last. + */ + if (dump_type != DUMP_ONE && + ((dump_type == DUMP_ALL) == + !!(set->type->features & IPSET_DUMP_LAST))) + continue; + pr_debug("List set: %s\n", set->name); + if (!cb->args[IPSET_CB_ARG0]) { + /* Start listing: make sure set won't be destroyed */ + pr_debug("reference set\n"); + __ip_set_get(set); + } + nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags, + IPSET_CMD_LIST); + if (!nlh) { + ret = -EMSGSIZE; + goto release_refcount; + } + if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) + goto nla_put_failure; + if (dump_flags & IPSET_FLAG_LIST_SETNAME) + goto next_set; + switch (cb->args[IPSET_CB_ARG0]) { + case 0: + /* Core header data */ + if (nla_put_string(skb, IPSET_ATTR_TYPENAME, + set->type->name) || + nla_put_u8(skb, IPSET_ATTR_FAMILY, + set->family) || + nla_put_u8(skb, IPSET_ATTR_REVISION, + set->revision)) + goto nla_put_failure; + ret = set->variant->head(set, skb); + if (ret < 0) + goto release_refcount; + if (dump_flags & IPSET_FLAG_LIST_HEADER) + goto next_set; + /* Fall through and add elements */ + default: + read_lock_bh(&set->lock); + ret = set->variant->list(set, skb, cb); + read_unlock_bh(&set->lock); + if (!cb->args[IPSET_CB_ARG0]) + /* Set is done, proceed with next one */ + goto next_set; + goto release_refcount; + } + } + /* If we dump all sets, continue with dumping last ones */ + if (dump_type == DUMP_ALL) { + dump_type = DUMP_LAST; + cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); + cb->args[IPSET_CB_INDEX] = 0; + goto dump_last; + } + goto out; + +nla_put_failure: + ret = -EFAULT; +next_set: + if (dump_type == DUMP_ONE) + cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID; + else + cb->args[IPSET_CB_INDEX]++; +release_refcount: + /* If there was an error or set is done, release set */ + if (ret || !cb->args[IPSET_CB_ARG0]) { + pr_debug("release set %s\n", ip_set(inst, index)->name); + __ip_set_put_byindex(inst, index); + cb->args[IPSET_CB_ARG0] = 0; + } +out: + if (nlh) { + nlmsg_end(skb, nlh); + pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); + dump_attrs(nlh); + } + + return ret < 0 ? 
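/* [Illustrative note, not part of this patch: netlink dump contract]
 * returning skb->len (> 0) tells the netlink core to call this
 * function again, resuming from the cb->args state, while returning 0
 * or a negative error terminates the dump and lets ip_set_dump_done()
 * drop the reference still held on a partially dumped set.
 */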
ret : skb->len; +} + +static int +ip_set_dump(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + { + struct netlink_dump_control c = { + .dump = ip_set_dump_start, + .done = ip_set_dump_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } +} + +/* Add, del and test */ + +static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, + [IPSET_ATTR_ADT] = { .type = NLA_NESTED }, +}; + +static int +call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, + struct nlattr *tb[], enum ipset_adt adt, + u32 flags, bool use_lineno) +{ + int ret; + u32 lineno = 0; + bool eexist = flags & IPSET_FLAG_EXIST, retried = false; + + do { + write_lock_bh(&set->lock); + ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); + write_unlock_bh(&set->lock); + retried = true; + } while (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried)) == 0); + + if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) + return 0; + if (lineno && use_lineno) { + /* Error in restore/batch mode: send back lineno */ + struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + struct nlmsgerr *errmsg; + size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cmdattr; + u32 *errline; + + skb2 = nlmsg_new(payload, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); + errmsg = nlmsg_data(rep); + errmsg->error = ret; + memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); + cmdattr = (void *)&errmsg->msg + min_len; + + nla_parse(cda, IPSET_ATTR_CMD_MAX, + cmdattr, nlh->nlmsg_len - min_len, + ip_set_adt_policy); + + errline = nla_data(cda[IPSET_ATTR_LINENO]); + + *errline = lineno; + + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + /* Signal netlink not to send its ACK/errmsg. 
*/ + return -EINTR; +} + +static int +ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *set; + struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + const struct nlattr *nla; + u32 flags = flag_exist(nlh); + bool use_lineno; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + !((attr[IPSET_ATTR_DATA] != NULL) ^ + (attr[IPSET_ATTR_ADT] != NULL)) || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])) || + (attr[IPSET_ATTR_ADT] != NULL && + (!flag_nested(attr[IPSET_ATTR_ADT]) || + attr[IPSET_ATTR_LINENO] == NULL)))) + return -IPSET_ERR_PROTOCOL; + + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + use_lineno = !!attr[IPSET_ATTR_LINENO]; + if (attr[IPSET_ATTR_DATA]) { + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, + use_lineno); + } else { + int nla_rem; + + nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { + memset(tb, 0, sizeof(tb)); + if (nla_type(nla) != IPSET_ATTR_DATA || + !flag_nested(nla) || + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, + flags, use_lineno); + if (ret < 0) + return ret; + } + } + return ret; +} + +static int +ip_set_udel(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *set; + struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + const struct nlattr *nla; + u32 flags = flag_exist(nlh); + bool use_lineno; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + !((attr[IPSET_ATTR_DATA] != NULL) ^ + (attr[IPSET_ATTR_ADT] != NULL)) || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])) || + (attr[IPSET_ATTR_ADT] != NULL && + (!flag_nested(attr[IPSET_ATTR_ADT]) || + attr[IPSET_ATTR_LINENO] == NULL)))) + return -IPSET_ERR_PROTOCOL; + + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + use_lineno = !!attr[IPSET_ATTR_LINENO]; + if (attr[IPSET_ATTR_DATA]) { + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, + use_lineno); + } else { + int nla_rem; + + nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { + memset(tb, 0, sizeof(tb)); + if (nla_type(nla) != IPSET_ATTR_DATA || + !flag_nested(nla) || + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, + flags, use_lineno); + if (ret < 0) + return ret; + } + } + return ret; +} + +static int +ip_set_utest(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *set; + struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_DATA] == NULL || + !flag_nested(attr[IPSET_ATTR_DATA]))) + return -IPSET_ERR_PROTOCOL; + + set
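/* [Illustrative note, not part of this patch: batch (restore) format]
 * a top-level IPSET_ATTR_LINENO switches uadd/udel into batch mode:
 * userspace packs many elements into one message, schematically:
 *
 *	IPSET_ATTR_SETNAME "foo"
 *	IPSET_ATTR_LINENO  <line of first element>
 *	IPSET_ATTR_ADT (nested)
 *	    IPSET_ATTR_DATA (nested)   <- element 1, with its own LINENO
 *	    IPSET_ATTR_DATA (nested)   <- element 2, with its own LINENO
 *
 * which is why call_ad() above hand-builds an NLMSG_ERROR carrying the
 * failing line number instead of relying on the normal nfnetlink ack.
 */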
= find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + + read_lock_bh(&set->lock); + ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); + read_unlock_bh(&set->lock); + /* Userspace can't trigger element to be re-added */ + if (ret == -EAGAIN) + ret = 1; + + return ret > 0 ? 0 : -IPSET_ERR_EXIST; +} + +/* Get headed data of a set */ + +static int +ip_set_header(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + const struct ip_set *set; + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL)) + return -IPSET_ERR_PROTOCOL; + + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + IPSET_CMD_HEADER); + if (!nlh2) + goto nlmsg_failure; + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || + nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || + nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || + nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision)) + goto nla_put_failure; + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +/* Get type data */ + +static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, +}; + +static int +ip_set_type(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + u8 family, min, max; + const char *typename; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_TYPENAME] == NULL || + attr[IPSET_ATTR_FAMILY] == NULL)) + return -IPSET_ERR_PROTOCOL; + + family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); + typename = nla_data(attr[IPSET_ATTR_TYPENAME]); + ret = find_set_type_minmax(typename, family, &min, &max); + if (ret) + return ret; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + IPSET_CMD_TYPE); + if (!nlh2) + goto nlmsg_failure; + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || + nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || + nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || + nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min)) + goto nla_put_failure; + nlmsg_end(skb2, nlh2); + + pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +/* Get protocol version */ + +static const struct nla_policy 
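/* [Illustrative note, not part of this patch: protocol handshake]
 * IPSET_CMD_PROTOCOL is the one command that protocol_failed() does not
 * guard: userspace is expected to ask for the kernel's protocol number
 * first and refuse to continue when it differs from its own
 * IPSET_PROTOCOL, since every other handler rejects mismatches.
 */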
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, +}; + +static int +ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + int ret = 0; + + if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + return -IPSET_ERR_PROTOCOL; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + IPSET_CMD_PROTOCOL); + if (!nlh2) + goto nlmsg_failure; + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) + goto nla_put_failure; + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { + [IPSET_CMD_NONE] = { + .call = ip_set_none, + .attr_count = IPSET_ATTR_CMD_MAX, + }, + [IPSET_CMD_CREATE] = { + .call = ip_set_create, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_create_policy, + }, + [IPSET_CMD_DESTROY] = { + .call = ip_set_destroy, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_FLUSH] = { + .call = ip_set_flush, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_RENAME] = { + .call = ip_set_rename, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname2_policy, + }, + [IPSET_CMD_SWAP] = { + .call = ip_set_swap, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname2_policy, + }, + [IPSET_CMD_LIST] = { + .call = ip_set_dump, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_SAVE] = { + .call = ip_set_dump, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_ADD] = { + .call = ip_set_uadd, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_DEL] = { + .call = ip_set_udel, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_TEST] = { + .call = ip_set_utest, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_HEADER] = { + .call = ip_set_header, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_TYPE] = { + .call = ip_set_type, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_type_policy, + }, + [IPSET_CMD_PROTOCOL] = { + .call = ip_set_protocol, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_protocol_policy, + }, +}; + +static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { + .name = "ip_set", + .subsys_id = NFNL_SUBSYS_IPSET, + .cb_count = IPSET_MSG_MAX, + .cb = ip_set_netlink_subsys_cb, +}; + +/* Interface to iptables/ip6tables */ + +static int +ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) +{ + unsigned int *op; + void *data; + int copylen = *len, ret = 0; + struct net *net = sock_net(sk); + struct ip_set_net *inst = ip_set_pernet(net); + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (optval != SO_IP_SET) + return -EBADF; + if (*len < sizeof(unsigned int)) + return -EINVAL; + + data = vmalloc(*len); + if (!data) + return -ENOMEM; + if (copy_from_user(data, user, *len) != 0) { + ret = -EFAULT; + goto done; + } + op = (unsigned int *) data; + + if (*op < 
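/* [Illustrative note, not part of this patch: legacy sockopt interface]
 * this get-only sockopt is how iptables' set match historically mapped
 * set names to indices. A minimal userspace caller might look like
 * (illustrative, error handling omitted):
 *
 *	struct ip_set_req_get_set req;
 *	socklen_t size = sizeof(req);
 *	int sd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); // CAP_NET_ADMIN
 *
 *	req.op = IP_SET_OP_GET_BYNAME;
 *	req.version = IPSET_PROTOCOL;
 *	strncpy(req.set.name, "foo", IPSET_MAXNAMELEN);
 *	getsockopt(sd, SOL_IP, SO_IP_SET, &req, &size);
 *	// req.set.index now holds the id, or IPSET_INVALID_ID
 */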
IP_SET_OP_VERSION) { + /* Check the version at the beginning of operations */ + struct ip_set_req_version *req_version = data; + if (req_version->version != IPSET_PROTOCOL) { + ret = -EPROTO; + goto done; + } + } + + switch (*op) { + case IP_SET_OP_VERSION: { + struct ip_set_req_version *req_version = data; + + if (*len != sizeof(struct ip_set_req_version)) { + ret = -EINVAL; + goto done; + } + + req_version->version = IPSET_PROTOCOL; + ret = copy_to_user(user, req_version, + sizeof(struct ip_set_req_version)); + goto done; + } + case IP_SET_OP_GET_BYNAME: { + struct ip_set_req_get_set *req_get = data; + ip_set_id_t id; + + if (*len != sizeof(struct ip_set_req_get_set)) { + ret = -EINVAL; + goto done; + } + req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; + nfnl_lock(NFNL_SUBSYS_IPSET); + find_set_and_id(inst, req_get->set.name, &id); + req_get->set.index = id; + nfnl_unlock(NFNL_SUBSYS_IPSET); + goto copy; + } + case IP_SET_OP_GET_FNAME: { + struct ip_set_req_get_set_family *req_get = data; + ip_set_id_t id; + + if (*len != sizeof(struct ip_set_req_get_set_family)) { + ret = -EINVAL; + goto done; + } + req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; + nfnl_lock(NFNL_SUBSYS_IPSET); + find_set_and_id(inst, req_get->set.name, &id); + req_get->set.index = id; + if (id != IPSET_INVALID_ID) + req_get->family = ip_set(inst, id)->family; + nfnl_unlock(NFNL_SUBSYS_IPSET); + goto copy; + } + case IP_SET_OP_GET_BYINDEX: { + struct ip_set_req_get_set *req_get = data; + struct ip_set *set; + + if (*len != sizeof(struct ip_set_req_get_set) || + req_get->set.index >= inst->ip_set_max) { + ret = -EINVAL; + goto done; + } + nfnl_lock(NFNL_SUBSYS_IPSET); + set = ip_set(inst, req_get->set.index); + strncpy(req_get->set.name, set ? set->name : "", + IPSET_MAXNAMELEN); + nfnl_unlock(NFNL_SUBSYS_IPSET); + goto copy; + } + default: + ret = -EBADMSG; + goto done; + } /* end of switch(op) */ + +copy: + ret = copy_to_user(user, data, copylen); + +done: + vfree(data); + if (ret > 0) + ret = 0; + return ret; +} + +static struct nf_sockopt_ops so_set __read_mostly = { + .pf = PF_INET, + .get_optmin = SO_IP_SET, + .get_optmax = SO_IP_SET + 1, + .get = &ip_set_sockfn_get, + .owner = THIS_MODULE, +}; + +static int __net_init +ip_set_net_init(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set **list; + + inst->ip_set_max = max_sets ? 
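/* [Illustrative note, not part of this patch: sizing the per-netns list]
 * each namespace starts with a NULL-initialized array of ip_set_max set
 * pointers; the bound is the max_sets module parameter when given, else
 * CONFIG_IP_SET_MAX, e.g.:
 *
 *	modprobe ip_set max_sets=1024
 *
 * ip_set_create() later grows the array in IP_SET_INC steps, capped at
 * IPSET_INVALID_ID - 1.
 */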
max_sets : CONFIG_IP_SET_MAX; + if (inst->ip_set_max >= IPSET_INVALID_ID) + inst->ip_set_max = IPSET_INVALID_ID - 1; + + list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); + if (!list) + return -ENOMEM; + inst->is_deleted = 0; + rcu_assign_pointer(inst->ip_set_list, list); + return 0; +} + +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + struct ip_set *set = NULL; + ip_set_id_t i; + + inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set != NULL) + ip_set_destroy_set(inst, i); + } + kfree(rcu_dereference_protected(inst->ip_set_list, 1)); +} + +static struct pernet_operations ip_set_net_ops = { + .init = ip_set_net_init, + .exit = ip_set_net_exit, + .id = &ip_set_net_id, + .size = sizeof(struct ip_set_net) +}; + + +static int __init +ip_set_init(void) +{ + int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if (ret != 0) { + pr_err("ip_set: cannot register with nfnetlink.\n"); + return ret; + } + ret = nf_register_sockopt(&so_set); + if (ret != 0) { + pr_err("SO_SET registry failed: %d\n", ret); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + return ret; + } + ret = register_pernet_subsys(&ip_set_net_ops); + if (ret) { + pr_err("ip_set: cannot register pernet_subsys.\n"); + nf_unregister_sockopt(&so_set); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + return ret; + } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); + return 0; +} + +static void __exit +ip_set_fini(void) +{ + unregister_pernet_subsys(&ip_set_net_ops); + nf_unregister_sockopt(&so_set); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + pr_debug("these are the famous last words\n"); +} + +module_init(ip_set_init); +module_exit(ip_set_fini); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c new file mode 100644 index 00000000000..29fb01ddff9 --- /dev/null +++ b/net/netfilter/ipset/ip_set_getport.c @@ -0,0 +1,174 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Get Layer-4 data from the packets */ + +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/sctp.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/ip.h> +#include <net/ipv6.h> + +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/export.h> + +/* We must handle non-linear skbs */ +static bool +get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, + bool src, __be16 *port, u8 *proto) +{ + switch (protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); + if (th == NULL) + /* No choice either */ + return false; + + *port = src ? th->source : th->dest; + break; + } + case IPPROTO_SCTP: { + sctp_sctphdr_t _sh; + const sctp_sctphdr_t *sh; + + sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); + if (sh == NULL) + /* No choice either */ + return false; + + *port = src ? 
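/* [Illustrative note, not part of this patch: non-linear skbs]
 * get_port() relies on skb_header_pointer() throughout because the L4
 * header may live in a paged fragment: it returns a direct pointer when
 * the bytes are linear and otherwise copies them into the caller's
 * on-stack buffer, e.g.:
 *
 *	struct tcphdr _tcph;
 *	const struct tcphdr *th =
 *		skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
 *	if (!th)	// truncated packet
 *		return false;
 */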
sh->source : sh->dest; + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); + if (uh == NULL) + /* No choice either */ + return false; + + *port = src ? uh->source : uh->dest; + break; + } + case IPPROTO_ICMP: { + struct icmphdr _ich; + const struct icmphdr *ic; + + ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); + if (ic == NULL) + return false; + + *port = (__force __be16)htons((ic->type << 8) | ic->code); + break; + } + case IPPROTO_ICMPV6: { + struct icmp6hdr _ich; + const struct icmp6hdr *ic; + + ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); + if (ic == NULL) + return false; + + *port = (__force __be16) + htons((ic->icmp6_type << 8) | ic->icmp6_code); + break; + } + default: + break; + } + *proto = protocol; + + return true; +} + +bool +ip_set_get_ip4_port(const struct sk_buff *skb, bool src, + __be16 *port, u8 *proto) +{ + const struct iphdr *iph = ip_hdr(skb); + unsigned int protooff = ip_hdrlen(skb); + int protocol = iph->protocol; + + /* See comments at tcp_match in ip_tables.c */ + if (protocol <= 0) + return false; + + if (ntohs(iph->frag_off) & IP_OFFSET) + switch (protocol) { + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + /* Port info not available for fragment offset > 0 */ + return false; + default: + /* Other protocols don't have ports, + so we can match fragments */ + *proto = protocol; + return true; + } + + return get_port(skb, protocol, protooff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip4_port); + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +bool +ip_set_get_ip6_port(const struct sk_buff *skb, bool src, + __be16 *port, u8 *proto) +{ + int protoff; + u8 nexthdr; + __be16 frag_off = 0; + + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return false; + + return get_port(skb, nexthdr, protoff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); +#endif + +bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h new file mode 100644 index 00000000000..61c7fb05280 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -0,0 +1,1160 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _IP_SET_HASH_GEN_H +#define _IP_SET_HASH_GEN_H + +#include <linux/rcupdate.h> +#include <linux/jhash.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#ifndef rcu_dereference_bh +#define rcu_dereference_bh(p) rcu_dereference(p) +#endif + +#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) + +/* Hashing which uses arrays to resolve clashing.
The hash table is resized + * (doubled) when searching becomes too long. + * Internally jhash is used with the assumption that the size of the + * stored data is a multiple of sizeof(u32). If storage supports timeout, + * the timeout field must be the last one in the data structure - that field + * is ignored when computing the hash key. + * + * Readers and resizing + * + * Resizing can be triggered by userspace command only, and those + * are serialized by the nfnl mutex. During resizing the set is + * read-locked, so the only possible concurrent operations are + * the kernel side readers. Those must be protected by proper RCU locking. + */ + +/* Number of elements to store in an initial array block */ +#define AHASH_INIT_SIZE 4 +/* Max number of elements to store in an array block */ +#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) + +/* Max number of elements can be tuned */ +#ifdef IP_SET_HASH_WITH_MULTI +#define AHASH_MAX(h) ((h)->ahash_max) + +static inline u8 +tune_ahash_max(u8 curr, u32 multi) +{ + u32 n; + + if (multi < curr) + return curr; + + n = curr + AHASH_INIT_SIZE; + /* Currently, at listing one hash bucket must fit into a message. + * Therefore we have a hard limit here. + */ + return n > curr && n <= 64 ? n : curr; +} +#define TUNE_AHASH_MAX(h, multi) \ + ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#else +#define AHASH_MAX(h) AHASH_MAX_SIZE +#define TUNE_AHASH_MAX(h, multi) +#endif + +/* A hash bucket */ +struct hbucket { + void *value; /* the array of the values */ + u8 size; /* size of the array */ + u8 pos; /* position of the first free entry */ +}; + +/* The hash table: the table size stored here in order to make resizing easy */ +struct htable { + u8 htable_bits; /* size of hash table == 2^htable_bits */ + struct hbucket bucket[0]; /* hashtable buckets */ +}; + +#define hbucket(h, i) (&((h)->bucket[i])) + +#ifndef IPSET_NET_COUNT +#define IPSET_NET_COUNT 1 +#endif + +/* Book-keeping of the prefixes added to the set */ +struct net_prefixes { + u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ + u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */ +}; + +/* Compute the hash table size */ +static size_t +htable_size(u8 hbits) +{ + size_t hsize; + + /* We must fit both into u32 in jhash and size_t */ + if (hbits > 31) + return 0; + hsize = jhash_size(hbits); + if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + < hsize) + return 0; + + return hsize * sizeof(struct hbucket) + sizeof(struct htable); +} + +/* Compute htable_bits from the user input parameter hashsize */ +static u8 +htable_bits(u32 hashsize) +{ + /* Assume that hashsize == 2^htable_bits */ + u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) + /* Round up to the first 2^n value */ + bits = fls(hashsize); + + return bits; +} + +static int +hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) +{ + if (n->pos >= n->size) { + void *tmp; + + if (n->size >= ahash_max) + /* Trigger rehashing */ + return -EAGAIN; + + tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (n->size) { + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + } + n->value = tmp; + n->size += AHASH_INIT_SIZE; + } + return 0; +} + +#ifdef IP_SET_HASH_WITH_NETS +#if IPSET_NET_COUNT > 1 +#define __CIDR(cidr, i) (cidr[i]) +#else +#define __CIDR(cidr, i) (cidr) +#endif +#ifdef IP_SET_HASH_WITH_NETS_PACKED +/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */ +#define CIDR(cidr, i) (__CIDR(cidr, i) 
+ 1) +#else +#define CIDR(cidr, i) (__CIDR(cidr, i)) +#endif + +#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) + +#ifdef IP_SET_HASH_WITH_MULTI +#define NLEN(family) (SET_HOST_MASK(family) + 1) +#else +#define NLEN(family) SET_HOST_MASK(family) +#endif + +#else +#define NLEN(family) 0 +#endif /* IP_SET_HASH_WITH_NETS */ + +#endif /* _IP_SET_HASH_GEN_H */ + +/* Family dependent templates */ + +#undef ahash_data +#undef mtype_data_equal +#undef mtype_do_data_match +#undef mtype_data_set_flags +#undef mtype_data_reset_flags +#undef mtype_data_netmask +#undef mtype_data_list +#undef mtype_data_next +#undef mtype_elem + +#undef mtype_ahash_destroy +#undef mtype_ext_cleanup +#undef mtype_add_cidr +#undef mtype_del_cidr +#undef mtype_ahash_memsize +#undef mtype_flush +#undef mtype_destroy +#undef mtype_gc_init +#undef mtype_same_set +#undef mtype_kadt +#undef mtype_uadt +#undef mtype + +#undef mtype_add +#undef mtype_del +#undef mtype_test_cidrs +#undef mtype_test +#undef mtype_expire +#undef mtype_resize +#undef mtype_head +#undef mtype_list +#undef mtype_gc +#undef mtype_gc_init +#undef mtype_variant +#undef mtype_data_match + +#undef HKEY + +#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal) +#ifdef IP_SET_HASH_WITH_NETS +#define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match) +#else +#define mtype_do_data_match(d) 1 +#endif +#define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags) +#define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem) +#define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags) +#define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask) +#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) +#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) +#define mtype_elem IPSET_TOKEN(MTYPE, _elem) +#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) +#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) +#define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr) +#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) +#define mtype_flush IPSET_TOKEN(MTYPE, _flush) +#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) +#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) +#define mtype MTYPE + +#define mtype_add IPSET_TOKEN(MTYPE, _add) +#define mtype_del IPSET_TOKEN(MTYPE, _del) +#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) +#define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_expire IPSET_TOKEN(MTYPE, _expire) +#define mtype_resize IPSET_TOKEN(MTYPE, _resize) +#define mtype_head IPSET_TOKEN(MTYPE, _head) +#define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_variant IPSET_TOKEN(MTYPE, _variant) +#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) + +#ifndef HKEY_DATALEN +#define HKEY_DATALEN sizeof(struct mtype_elem) +#endif + +#define HKEY(data, initval, htable_bits) \ +(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ + & jhash_mask(htable_bits)) + +#ifndef htype +#define htype HTYPE + +/* The generic hash structure */ +struct htype { + struct htable __rcu *table; /* the hash table */ + u32 maxelem; /* max elements in the hash */ + u32 elements; /* current element (vs timeout) */ + u32 initval; /* random jhash init value */ +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; /* markmask value for mark mask to store */ +#endif + 
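+ /* An element's bucket is selected by HKEY() above: jhash2() seeded
+ * with initval and masked down to htable_bits bits. E.g. with
+ * hashsize 1024, htable_bits is 10, so bucket indices fall in
+ * [0, 1023].
+ */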
struct timer_list gc; /* garbage collection when timeout enabled */ + struct mtype_elem next; /* temporary storage for uadd */ +#ifdef IP_SET_HASH_WITH_MULTI + u8 ahash_max; /* max elements in an array block */ +#endif +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; /* netmask value for subnets to store */ +#endif +#ifdef IP_SET_HASH_WITH_RBTREE + struct rb_root rbtree; +#endif +#ifdef IP_SET_HASH_WITH_NETS + struct net_prefixes nets[0]; /* book-keeping of prefixes */ +#endif +}; +#endif + +#ifdef IP_SET_HASH_WITH_NETS +/* Network cidr size book keeping when the hash stores different + * sized networks */ +static void +mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) +{ + int i, j; + + /* Add in increasing prefix order, so larger cidr first */ + for (i = 0, j = -1; i < nets_length && h->nets[i].nets[n]; i++) { + if (j != -1) + continue; + else if (h->nets[i].cidr[n] < cidr) + j = i; + else if (h->nets[i].cidr[n] == cidr) { + h->nets[i].nets[n]++; + return; + } + } + if (j != -1) { + for (; i > j; i--) { + h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; + h->nets[i].nets[n] = h->nets[i - 1].nets[n]; + } + } + h->nets[i].cidr[n] = cidr; + h->nets[i].nets[n] = 1; +} + +static void +mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) +{ + u8 i, j, net_end = nets_length - 1; + + for (i = 0; i < nets_length; i++) { + if (h->nets[i].cidr[n] != cidr) + continue; + if (h->nets[i].nets[n] > 1 || i == net_end || + h->nets[i + 1].nets[n] == 0) { + h->nets[i].nets[n]--; + return; + } + for (j = i; j < net_end && h->nets[j].nets[n]; j++) { + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].nets[n] = h->nets[j + 1].nets[n]; + } + h->nets[j].nets[n] = 0; + return; + } +} +#endif + +/* Calculate the actual memory size of the set data */ +static size_t +mtype_ahash_memsize(const struct htype *h, const struct htable *t, + u8 nets_length, size_t dsize) +{ + u32 i; + size_t memsize = sizeof(*h) + + sizeof(*t) +#ifdef IP_SET_HASH_WITH_NETS + + sizeof(struct net_prefixes) * nets_length +#endif + + jhash_size(t->htable_bits) * sizeof(struct hbucket); + + for (i = 0; i < jhash_size(t->htable_bits); i++) + memsize += t->bucket[i].size * dsize; + + return memsize; +} + +/* Get the ith element from the array block n */ +#define ahash_data(n, i, dsize) \ + ((struct mtype_elem *)((n)->value + ((i) * (dsize)))) + +static void +mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) +{ + int i; + + for (i = 0; i < n->pos; i++) + ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); +} + +/* Flush a hash type of set: destroy all elements */ +static void +mtype_flush(struct ip_set *set) +{ + struct htype *h = set->data; + struct htable *t; + struct hbucket *n; + u32 i; + + t = rcu_dereference_bh_nfnl(h->table); + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) { + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + n->size = n->pos = 0; + /* FIXME: use slab cache */ + kfree(n->value); + } + } +#ifdef IP_SET_HASH_WITH_NETS + memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); +#endif + h->elements = 0; +} + +/* Destroy the hashtable part of the set */ +static void +mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) +{ + struct hbucket *n; + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) { + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + kfree(n->value); + } + } + + 
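+ /* The bucket value arrays were freed above; finally release the
+ * table itself, which was allocated with ip_set_alloc().
+ */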
ip_set_free(t); +} + +/* Destroy a hash type of set */ +static void +mtype_destroy(struct ip_set *set) +{ + struct htype *h = set->data; + + if (set->extensions & IPSET_EXT_TIMEOUT) + del_timer_sync(&h->gc); + + mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); +#ifdef IP_SET_HASH_WITH_RBTREE + rbtree_destroy(&h->rbtree); +#endif + kfree(h); + + set->data = NULL; +} + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct htype *h = set->data; + + init_timer(&h->gc); + h->gc.data = (unsigned long) set; + h->gc.function = gc; + h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&h->gc); + pr_debug("gc initialized, run in every %u\n", + IPSET_GC_PERIOD(set->timeout)); +} + +static bool +mtype_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct htype *x = a->data; + const struct htype *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + a->timeout == b->timeout && +#ifdef IP_SET_HASH_WITH_NETMASK + x->netmask == y->netmask && +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + x->markmask == y->markmask && +#endif + a->extensions == b->extensions; +} + +/* Delete expired elements from the hashtable */ +static void +mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) +{ + struct htable *t; + struct hbucket *n; + struct mtype_elem *data; + u32 i; + int j; +#ifdef IP_SET_HASH_WITH_NETS + u8 k; +#endif + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, dsize); + if (ip_set_timeout_expired(ext_timeout(data, set))) { + pr_debug("expired %u/%u\n", i, j); +#ifdef IP_SET_HASH_WITH_NETS + for (k = 0; k < IPSET_NET_COUNT; k++) + mtype_del_cidr(h, CIDR(data->cidr, k), + nets_length, k); +#endif + ip_set_ext_destroy(set, data); + if (j != n->pos - 1) + /* Not last one */ + memcpy(data, + ahash_data(n, n->pos - 1, dsize), + dsize); + n->pos--; + h->elements--; + } + } + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!tmp) + /* Still try to delete expired elements */ + continue; + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + n->value = tmp; + } + } + rcu_read_unlock_bh(); +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct htype *h = set->data; + + pr_debug("called\n"); + write_lock_bh(&set->lock); + mtype_expire(set, h, NLEN(set->family), set->dsize); + write_unlock_bh(&set->lock); + + h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&h->gc); +} + +/* Resize a hash: create a new hash table with doubling the hashsize + * and inserting the elements to it. Repeat until we succeed or + * fail due to memory pressures. 
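+ * The new table has 2^(htable_bits + 1) buckets. Elements are rehashed
+ * into it while the set lock is held for reading, the table pointer is
+ * switched with rcu_assign_pointer() and the old table is destroyed
+ * only after synchronize_rcu_bh(), so concurrent RCU readers never see
+ * freed buckets.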
*/ +static int +mtype_resize(struct ip_set *set, bool retried) +{ + struct htype *h = set->data; + struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); + u8 htable_bits = orig->htable_bits; +#ifdef IP_SET_HASH_WITH_NETS + u8 flags; +#endif + struct mtype_elem *data; + struct mtype_elem *d; + struct hbucket *n, *m; + u32 i, j; + int ret; + + /* Try to cleanup once */ + if (SET_WITH_TIMEOUT(set) && !retried) { + i = h->elements; + write_lock_bh(&set->lock); + mtype_expire(set, set->data, NLEN(set->family), set->dsize); + write_unlock_bh(&set->lock); + if (h->elements < i) + return 0; + } + +retry: + ret = 0; + htable_bits++; + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); + if (!htable_bits) { + /* In case we have plenty of memory :-) */ + pr_warning("Cannot increase the hashsize of set %s further\n", + set->name); + return -IPSET_ERR_HASH_FULL; + } + t = ip_set_alloc(sizeof(*t) + + jhash_size(htable_bits) * sizeof(struct hbucket)); + if (!t) + return -ENOMEM; + t->htable_bits = htable_bits; + + read_lock_bh(&set->lock); + for (i = 0; i < jhash_size(orig->htable_bits); i++) { + n = hbucket(orig, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + flags = 0; + mtype_data_reset_flags(data, &flags); +#endif + m = hbucket(t, HKEY(data, h->initval, htable_bits)); + ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); + if (ret < 0) { +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(data, &flags); +#endif + read_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + return ret; + } + d = ahash_data(m, m->pos++, set->dsize); + memcpy(d, data, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(d, &flags); +#endif + } + } + + rcu_assign_pointer(h->table, t); + read_unlock_bh(&set->lock); + + /* Give time to other readers of the set */ + synchronize_rcu_bh(); + + pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, + orig->htable_bits, orig, t->htable_bits, t); + mtype_ahash_destroy(set, orig, false); + + return 0; +} + +/* Add an element to a hash and update the internal counters when succeeded, + * otherwise report the proper error code. 
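+ * -EAGAIN from hbucket_elem_add() is passed back to the caller, which
+ * retries the operation after resizing the hash. When the set is full
+ * and forceadd is enabled, the first entry of the hashed bucket is
+ * reused for the new element.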
*/ +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i, ret = 0; + int j = AHASH_MAX(h) + 1; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 key, multi = 0; + + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) { + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t,key); + if (n->pos) { + /* Choosing the first entry in the array to replace */ + j = 0; + goto reuse_slot; + } + rcu_read_unlock_bh(); + } + if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) + /* FIXME: when set is full, we slow down here */ + mtype_expire(set, h, NLEN(set->family), set->dsize); + + if (h->elements >= h->maxelem) { + if (net_ratelimit()) + pr_warning("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (mtype_data_equal(data, d, &multi)) { + if (flag_exist || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set)))) { + /* Just the extensions could be overwritten */ + j = i; + goto reuse_slot; + } else { + ret = -IPSET_ERR_EXIST; + goto out; + } + } + /* Reuse first timed out entry */ + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set)) && + j != AHASH_MAX(h) + 1) + j = i; + } +reuse_slot: + if (j != AHASH_MAX(h) + 1) { + /* Fill out reused slot */ + data = ahash_data(n, j, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + for (i = 0; i < IPSET_NET_COUNT; i++) { + mtype_del_cidr(h, CIDR(data->cidr, i), + NLEN(set->family), i); + mtype_add_cidr(h, CIDR(d->cidr, i), + NLEN(set->family), i); + } +#endif + ip_set_ext_destroy(set, data); + } else { + /* Use/create a new slot */ + TUNE_AHASH_MAX(h, multi); + ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); + if (ret != 0) { + if (ret == -EAGAIN) + mtype_data_next(&h->next, d); + goto out; + } + data = ahash_data(n, n->pos++, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_add_cidr(h, CIDR(d->cidr, i), NLEN(set->family), + i); +#endif + h->elements++; + } + memcpy(data, d, sizeof(struct mtype_elem)); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_set_flags(data, flags); +#endif + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, set), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(data, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(data, set), ext); + +out: + rcu_read_unlock_bh(); + return ret; +} + +/* Delete an element from the hash: swap it with the last element + * and free up space if possible. 
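+ * The bucket array is shrunk by AHASH_INIT_SIZE entries when at least
+ * that many slots become unused; if the smaller allocation fails, the
+ * old array is simply kept, since the deletion itself already succeeded.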
+ */ +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i, ret = -IPSET_ERR_EXIST; +#ifdef IP_SET_HASH_WITH_NETS + u8 j; +#endif + u32 key, multi = 0; + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set))) + goto out; + if (i != n->pos - 1) + /* Not last one */ + memcpy(data, ahash_data(n, n->pos - 1, set->dsize), + set->dsize); + + n->pos--; + h->elements--; +#ifdef IP_SET_HASH_WITH_NETS + for (j = 0; j < IPSET_NET_COUNT; j++) + mtype_del_cidr(h, CIDR(d->cidr, j), NLEN(set->family), + j); +#endif + ip_set_ext_destroy(set, data); + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * set->dsize, + GFP_ATOMIC); + if (!tmp) { + ret = 0; + goto out; + } + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * set->dsize); + kfree(n->value); + n->value = tmp; + } + ret = 0; + goto out; + } + +out: + rcu_read_unlock_bh(); + return ret; +} + +static inline int +mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, + struct ip_set_ext *mext, struct ip_set *set, u32 flags) +{ + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(data, set), + ext, mext, flags); + return mtype_do_data_match(data); +} + +#ifdef IP_SET_HASH_WITH_NETS +/* Special test function which takes into account the different network + * sizes added to the set */ +static int +mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = rcu_dereference_bh(h->table); + struct hbucket *n; + struct mtype_elem *data; +#if IPSET_NET_COUNT == 2 + struct mtype_elem orig = *d; + int i, j = 0, k; +#else + int i, j = 0; +#endif + u32 key, multi = 0; + u8 nets_length = NLEN(set->family); + + pr_debug("test by nets\n"); + for (; j < nets_length && h->nets[j].nets[0] && !multi; j++) { +#if IPSET_NET_COUNT == 2 + mtype_data_reset_elem(d, &orig); + mtype_data_netmask(d, h->nets[j].cidr[0], false); + for (k = 0; k < nets_length && h->nets[k].nets[1] && !multi; + k++) { + mtype_data_netmask(d, h->nets[k].cidr[1], true); +#else + mtype_data_netmask(d, h->nets[j].cidr[0]); +#endif + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set)) { + if (!ip_set_timeout_expired( + ext_timeout(data, set))) + return mtype_data_match(data, ext, + mext, set, + flags); +#ifdef IP_SET_HASH_WITH_MULTI + multi = 0; +#endif + } else + return mtype_data_match(data, ext, + mext, set, flags); + } +#if IPSET_NET_COUNT == 2 + } +#endif + } + return 0; +} +#endif + +/* Test whether the element is added to the set */ +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + struct mtype_elem *d = value; + struct hbucket *n; + struct mtype_elem *data; + int i, ret = 0; + u32 key, multi = 0; + + 
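+ /* Lookups run under the RCU BH read lock: a concurrent resize may
+ * replace h->table, but the table seen here stays valid until
+ * rcu_read_unlock_bh().
+ */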
rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); +#ifdef IP_SET_HASH_WITH_NETS + /* If we test an IP address and not a network address, + * try all possible network sizes */ + for (i = 0; i < IPSET_NET_COUNT; i++) + if (CIDR(d->cidr, i) != SET_HOST_MASK(set->family)) + break; + if (i == IPSET_NET_COUNT) { + ret = mtype_test_cidrs(set, d, ext, mext, flags); + goto out; + } +#endif + + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (mtype_data_equal(data, d, &multi) && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set)))) { + ret = mtype_data_match(data, ext, mext, set, flags); + goto out; + } + } +out: + rcu_read_unlock_bh(); + return ret; +} + +/* Reply a HEADER request: fill out the header part of the set */ +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct htype *h = set->data; + const struct htable *t; + struct nlattr *nested; + size_t memsize; + + t = rcu_dereference_bh_nfnl(h->table); + memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, + htonl(jhash_size(t->htable_bits))) || + nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) + goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_NETMASK + if (h->netmask != HOST_MASK && + nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + goto nla_put_failure; +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) + goto nla_put_failure; +#endif + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +/* Reply a LIST/SAVE request: dump the elements of the specified set */ +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct htype *h = set->data; + const struct htable *t = rcu_dereference_bh_nfnl(h->table); + struct nlattr *atd, *nested; + const struct hbucket *n; + const struct mtype_elem *e; + u32 first = cb->args[IPSET_CB_ARG0]; + /* We assume that one hash bucket fills into one page */ + void *incomplete; + int i; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); + cb->args[IPSET_CB_ARG0]++) { + incomplete = skb_tail_pointer(skb); + n = hbucket(t, cb->args[IPSET_CB_ARG0]); + pr_debug("cb->arg bucket: %lu, t %p n %p\n", + cb->args[IPSET_CB_ARG0], t, n); + for (i = 0; i < n->pos; i++) { + e = ahash_data(n, i, set->dsize); + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + pr_debug("list hash %lu hbucket %p i %u, data %p\n", + cb->args[IPSET_CB_ARG0], n, i, e); + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (cb->args[IPSET_CB_ARG0] == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_data_list(skb, e)) + goto nla_put_failure; + if (ip_set_put_extensions(skb, set, e, true)) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[IPSET_CB_ARG0] = 
0; + + return 0; + +nla_put_failure: + nlmsg_trim(skb, incomplete); + if (unlikely(first == cb->args[IPSET_CB_ARG0])) { + pr_warning("Can't list set %s: one bucket does not fit into " + "a message. Please report it!\n", set->name); + cb->args[IPSET_CB_ARG0] = 0; + return -EMSGSIZE; + } + ipset_nest_end(skb, atd); + return 0; +} + +static int +IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); + +static int +IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + +static const struct ip_set_type_variant mtype_variant = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .resize = mtype_resize, + .same_set = mtype_same_set, +}; + +#ifdef IP_SET_EMIT_CREATE +static int +IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, + struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; +#endif + u8 hbits; +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; +#endif + size_t hsize; + struct HTYPE *h; + struct htable *t; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + +#ifdef IP_SET_HASH_WITH_MARKMASK + markmask = 0xffffffff; +#endif +#ifdef IP_SET_HASH_WITH_NETMASK + netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + pr_debug("Create set %s with family %s\n", + set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); +#endif + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || +#ifdef IP_SET_HASH_WITH_MARKMASK + !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || +#endif + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + +#ifdef IP_SET_HASH_WITH_NETMASK + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if ((set->family == NFPROTO_IPV4 && netmask > 32) || + (set->family == NFPROTO_IPV6 && netmask > 128) || + netmask == 0) + return -IPSET_ERR_INVALID_NETMASK; + } +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (tb[IPSET_ATTR_MARKMASK]) { + markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + + if ((markmask > 4294967295u) || markmask == 0) + return -IPSET_ERR_INVALID_MARKMASK; + } +#endif + + hsize = sizeof(*h); +#ifdef IP_SET_HASH_WITH_NETS + hsize += sizeof(struct net_prefixes) * + (set->family == NFPROTO_IPV4 ? 
32 : 128); +#endif + h = kzalloc(hsize, GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; +#ifdef IP_SET_HASH_WITH_NETMASK + h->netmask = netmask; +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + h->markmask = markmask; +#endif + get_random_bytes(&h->initval, sizeof(h->initval)); + set->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + t = ip_set_alloc(hsize); + if (!t) { + kfree(h); + return -ENOMEM; + } + t->htable_bits = hbits; + rcu_assign_pointer(h->table, t); + + set->data = h; + if (set->family == NFPROTO_IPV4) { + set->variant = &IPSET_TOKEN(HTYPE, 4_variant); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); + } else { + set->variant = &IPSET_TOKEN(HTYPE, 6_variant); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); + } + if (tb[IPSET_ATTR_TIMEOUT]) { + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (set->family == NFPROTO_IPV4) + IPSET_TOKEN(HTYPE, 4_gc_init)(set, + IPSET_TOKEN(HTYPE, 4_gc)); + else + IPSET_TOKEN(HTYPE, 6_gc_init)(set, + IPSET_TOKEN(HTYPE, 6_gc)); + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(t->htable_bits), + t->htable_bits, h->maxelem, set->data, t); + + return 0; +} +#endif /* IP_SET_EMIT_CREATE */ diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c new file mode 100644 index 00000000000..dd40607f878 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -0,0 +1,315 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counters support */ +/* 2 Comments support */ +#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip"); + +/* Type specific function prefix */ +#define HTYPE hash_ip +#define IP_SET_HASH_WITH_NETMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ip4_elem { + /* Zero valued IP addresses cannot be stored */ + __be32 ip; +}; + +/* Common functions */ + +static inline bool +hash_ip4_data_equal(const struct hash_ip4_elem *e1, + const struct hash_ip4_elem *e2, + u32 *multi) +{ + return e1->ip == e2->ip; +} + +static inline bool +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) +{ + next->ip = e->ip; +} + +#define MTYPE hash_ip4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ip *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + __be32 ip; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); + ip &= ip_set_netmask(h->netmask); + if (ip == 0) + return -EINVAL; + + e.ip = ip; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ip *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, hosts; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + ip &= ip_set_hostmask(h->netmask); + + if (adt == IPSET_TEST) { + e.ip = htonl(ip); + if (e.ip == 0) + return -IPSET_ERR_HASH_ELEM; + return adtfn(set, &e, &ext, &ext, flags); + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + 
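+ /* Step over the range one netmask-sized block at a time; e.g. with
+ * netmask 24, hosts = 2 << 7 = 256 addresses per stored element.
+ */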
hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip += hosts) { + e.ip = htonl(ip); + if (e.ip == 0) + return -IPSET_ERR_HASH_ELEM; + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* IPv6 variant */ + +/* Member elements */ +struct hash_ip6_elem { + union nf_inet_addr ip; +}; + +/* Common functions */ + +static inline bool +hash_ip6_data_equal(const struct hash_ip6_elem *ip1, + const struct hash_ip6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); +} + +static inline void +hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip6_netmask(ip, prefix); +} + +static bool +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_ip6 +#define PF 6 +#define HOST_MASK 128 + +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ip *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ip *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) + return -IPSET_ERR_HASH_ELEM; + + ret = adtfn(set, &e, &ext, &ext, flags); + + return ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static struct ip_set_type hash_ip_type __read_mostly = { + .name = "hash:ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ip_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ip_init(void) +{ + return ip_set_type_register(&hash_ip_type); +} + +static void __exit +hash_ip_fini(void) +{ + ip_set_type_unregister(&hash_ip_type); +} + +module_init(hash_ip_init); +module_exit(hash_ip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c new file mode 100644 index 00000000000..4eff0a29725 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -0,0 +1,321 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,mark type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); +IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,mark"); + +/* Type specific function prefix */ +#define HTYPE hash_ipmark +#define IP_SET_HASH_WITH_MARKMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipmark4_elem { + __be32 ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, + const struct hash_ipmark4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark4_data_list(struct sk_buff *skb, + const struct hash_ipmark4_elem *data) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark4_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark4_elem *d) +{ + next->ip = d->ip; +} + +#define MTYPE hash_ipmark4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip, ip_to = 0; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ip_to = ip = ntohl(e.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + e.ip = htonl(ip); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipmark6_elem { + union nf_inet_addr ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, + const struct hash_ipmark6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark6_data_list(struct sk_buff *skb, + const struct hash_ipmark6_elem *data) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark6_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_ipmark6 +#define PF 6 +#define HOST_MASK 128 +#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + + +static int +hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + + return ret; +} + +static struct ip_set_type hash_ipmark_type __read_mostly = { + .name = "hash:ip,mark", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipmark_create, + .create_policy = { + [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_MARK] = { .type = NLA_U32 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipmark_init(void) +{ + return ip_set_type_register(&hash_ipmark_type); +} + +static void __exit +hash_ipmark_fini(void) +{ + ip_set_type_unregister(&hash_ipmark_type); +} + +module_init(hash_ipmark_init); +module_exit(hash_ipmark_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c new file mode 100644 index 00000000000..7597b82a8b0 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -0,0 +1,390 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Counters support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,port"); + +/* Type specific function prefix */ +#define HTYPE hash_ipport + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Common functions */ + +static inline bool +hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, + const struct hash_ipport4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static bool +hash_ipport4_data_list(struct sk_buff *skb, + const struct hash_ipport4_elem *data) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipport4_data_next(struct hash_ipport4_elem *next, + const struct hash_ipport4_elem *d) +{ + next->ip = d->ip; + next->port = d->port; +} + +#define MTYPE hash_ipport4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipport *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip, ip_to = 0, p = 0, port, port_to; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + e.port = 
nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip = ntohl(e.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + port_to = port = ntohs(e.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) + : port; + for (; p <= port_to; p++) { + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Common functions */ + +static inline bool +hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, + const struct hash_ipport6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static bool +hash_ipport6_data_list(struct sk_buff *skb, + const struct hash_ipport6_elem *data) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipport6_data_next(struct hash_ipport4_elem *next, + const struct hash_ipport6_elem *d) +{ + next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_ipport6 +#define PF 6 +#define HOST_MASK 128 +#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipport *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to; + bool with_ports = false; + int ret; + + if 
(unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(e.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = ntohs(h->next.port); + for (; port <= port_to; port++) { + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static struct ip_set_type hash_ipport_type __read_mostly = { + .name = "hash:ip,port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipport_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipport_init(void) +{ + return ip_set_type_register(&hash_ipport_type); +} + +static void __exit +hash_ipport_fini(void) +{ + ip_set_type_unregister(&hash_ipport_type); +} + +module_init(hash_ipport_init); +module_exit(hash_ipport_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c new file mode 100644 index 00000000000..672655ffd57 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -0,0 +1,402 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,ip type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Counters support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,port,ip"); + +/* Type specific function prefix */ +#define HTYPE hash_ipportip + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipportip4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; +}; + +static inline bool +hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, + const struct hash_ipportip4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->ip2 == ip2->ip2 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static bool +hash_ipportip4_data_list(struct sk_buff *skb, + const struct hash_ipportip4_elem *data) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipportip4_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip4_elem *d) +{ + next->ip = d->ip; + next->port = d->port; +} + +/* Common functions */ +#define MTYPE hash_ipportip4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipportip *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip, ip_to = 0, p = 0, port, port_to; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret 
= ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip = ntohl(e.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + port_to = port = ntohs(e.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) + : port; + for (; p <= port_to; p++) { + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipportip6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Common functions */ + +static inline bool +hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, + const struct hash_ipportip6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static bool +hash_ipportip6_data_list(struct sk_buff *skb, + const struct hash_ipportip6_elem *data) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipportip6_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip6_elem *d) +{ + next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_ipportip6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static 
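/*
 * The (ip, port) walk at the end of the IPv4 add path above resumes from
 * h->next when the core retries the command (retried == true, e.g. after
 * an internal resize), instead of re-adding from the start of the range.
 * The resume pattern in isolation (a sketch; checkpoint stands in for
 * h->next):
 *
 *	struct checkpoint { unsigned int ip, port; };
 *	static void walk(unsigned int ip_from, unsigned int ip_to,
 *			 unsigned int p_from, unsigned int p_to,
 *			 int retried, const struct checkpoint *next)
 *	{
 *		unsigned int ip = retried ? next->ip : ip_from;
 *		for (; ip <= ip_to; ip++) {
 *			unsigned int p = (retried && ip == next->ip)
 *					 ? next->port : p_from;
 *			for (; p <= p_to; p++)
 *				;	// add (ip, p) here
 *		}
 *	}
 */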
int +hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipportip *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(e.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = ntohs(h->next.port); + for (; port <= port_to; port++) { + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static struct ip_set_type hash_ipportip_type __read_mostly = { + .name = "hash:ip,port,ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, + .dimension = IPSET_DIM_THREE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipportip_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipportip_init(void) +{ + return ip_set_type_register(&hash_ipportip_type); +} + +static void __exit +hash_ipportip_fini(void) +{ + ip_set_type_unregister(&hash_ipportip_type); +} + +module_init(hash_ipportip_init); +module_exit(hash_ipportip_fini); diff --git 
a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c new file mode 100644 index 00000000000..7308d84f927 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -0,0 +1,561 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:ip,port,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Range as input support for IPv4 added */ +/* 3 nomatch flag support added */ +/* 4 Counters support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,port,net"); + +/* Type specific function prefix */ +#define HTYPE hash_ipportnet + +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. + */ +#define IP_SET_HASH_WITH_NETS_PACKED +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipportnet4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, + const struct hash_ipportnet4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->ip2 == ip2->ip2 && + ip1->cidr == ip2->cidr && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline int +hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) +{ + elem->ip2 &= ip_set_netmask(cidr); + elem->cidr = cidr - 1; +} + +static bool +hash_ipportnet4_data_list(struct sk_buff *skb, + const struct hash_ipportnet4_elem *data) +{ + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet4_elem *d) +{ + next->ip = d->ip; + next->port = d->port; + next->ip2 = d->ip2; +} + +#define MTYPE hash_ipportnet4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (adt == IPSET_TEST) + e.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + e.ip2 &= ip_set_netmask(e.cidr + 1); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, p = 0, port, port_to; + u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; + } + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + if (adt == 
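/*
 * The element definition above packs the prefix length and the "nomatch"
 * flag into a single byte: cidr is a 7-bit field holding cidr - 1, so
 * /1../32 (or /1../128 for IPv6) map onto 0..31 (0..127) and a /0 simply
 * cannot be represented; nomatch takes the eighth bit. That is the
 * "storing internally cidr - 1, dancing back and forth" mentioned in the
 * comment near the top of this file. The packing in self-contained form
 * (illustrative only):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	struct packed {
 *		uint8_t cidr:7;		// stores real_cidr - 1
 *		uint8_t nomatch:1;
 *	};
 *	int main(void)
 *	{
 *		struct packed p = { .cidr = 32 - 1, .nomatch = 1 };
 *		assert(p.cidr + 1 == 32);	// round-trips the prefix
 *		return 0;
 *	}
 */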
IPSET_TEST || + !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || + tb[IPSET_ATTR_IP2_TO])) { + e.ip = htonl(ip); + e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + port_to = port = ntohs(e.port); + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + ip2_to = ip2_from; + if (tb[IPSET_ATTR_IP2_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); + if (ret) + return ret; + if (ip2_from > ip2_to) + swap(ip2_from, ip2_to); + if (ip2_from + UINT_MAX == ip2_to) + return -IPSET_ERR_HASH_RANGE; + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + e.ip = htonl(ip); + p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) + : port; + for (; p <= port_to; p++) { + e.port = htons(p); + ip2 = retried && + ip == ntohl(h->next.ip) && + p == ntohs(h->next.port) + ? ntohl(h->next.ip2) : ip2_from; + while (!after(ip2, ip2_to)) { + e.ip2 = htonl(ip2); + ip2_last = ip_set_range_to_cidr(ip2, ip2_to, + &cidr); + e.cidr = cidr - 1; + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip2 = ip2_last + 1; + } + } + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipportnet6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, + const struct hash_ipportnet6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && + ip1->cidr == ip2->cidr && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline int +hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip2, cidr); + elem->cidr = cidr - 1; +} + +static bool +hash_ipportnet6_data_list(struct sk_buff *skb, + const struct hash_ipportnet6_elem *data) +{ + u32 flags = data->nomatch ? 
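/*
 * The triple loop above covers an arbitrary second-address range by
 * repeatedly carving off the largest CIDR block that is both aligned at
 * the current address and still inside the range, using
 * ip_set_range_to_cidr(). A userspace reconstruction of that
 * decomposition (my own sketch, not the kernel helper itself):
 *
 *	#include <stdint.h>
 *	static uint32_t range_to_cidr(uint32_t from, uint32_t to, uint8_t *cidr)
 *	{
 *		uint8_t c = 32;
 *		while (c > 0) {
 *			uint64_t size = 1ULL << (32 - (c - 1));
 *			if (from & (uint32_t)(size - 1))
 *				break;	// next larger block is unaligned
 *			if ((uint64_t)from + size - 1 > to)
 *				break;	// next larger block overshoots
 *			c--;
 *		}
 *		*cidr = c;
 *		return from + (uint32_t)((1ULL << (32 - c)) - 1);
 *	}
 *
 * The return value is the last address covered; the caller continues at
 * that address + 1. Example: 10.0.0.3 - 10.0.0.5 decomposes into a /32
 * (.3) followed by a /31 (.4-.5).
 */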
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet6_elem *d) +{ + next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_ipportnet6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (adt == IPSET_TEST) + e.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + ip6_netmask(&e.ip2, e.cidr + 1); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; + } + + ip6_netmask(&e.ip2, e.cidr + 1); + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & 
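/*
 * Unlike the IPv4 variant, this IPv6 add path rejects IP_TO ranges:
 * enumerating an arbitrary 128-bit range is impractical, so only one
 * (ip, cidr) network per command is accepted. The ip6_netmask() calls
 * zero the host bits of the address; a byte-wise reconstruction for
 * illustration (not the kernel helper):
 *
 *	#include <stdint.h>
 *	static void ip6_mask(uint8_t addr[16], uint8_t cidr)
 *	{
 *		for (int i = 0; i < 16; i++) {
 *			if (cidr >= 8) {
 *				cidr -= 8;	// byte fully inside prefix
 *			} else {
 *				addr[i] &= (uint8_t)(0xffu << (8 - cidr));
 *				cidr = 0;	// later bytes become zero
 *			}
 *		}
 *	}
 */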
IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(e.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = ntohs(h->next.port); + for (; port <= port_to; port++) { + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static struct ip_set_type hash_ipportnet_type __read_mostly = { + .name = "hash:ip,port,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | + IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_THREE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipportnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipportnet_init(void) +{ + return ip_set_type_register(&hash_ipportnet_type); +} + +static void __exit +hash_ipportnet_fini(void) +{ + ip_set_type_unregister(&hash_ipportnet_type); +} + +module_init(hash_ipportnet_init); +module_exit(hash_ipportnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c new file mode 100644 index 00000000000..4c7d495783a --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -0,0 +1,397 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Range as input support for IPv4 added */ +/* 2 nomatch flag support added */ +/* 3 Counters support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net"); + +/* Type specific function prefix */ +#define HTYPE hash_net +#define IP_SET_HASH_WITH_NETS + +/* IPv4 variant */ + +/* Member elements */ +struct hash_net4_elem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; + +/* Common functions */ + +static inline bool +hash_net4_data_equal(const struct hash_net4_elem *ip1, + const struct hash_net4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->cidr == ip2->cidr; +} + +static inline int +hash_net4_do_data_match(const struct hash_net4_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +static bool +hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) +{ + u32 flags = data->nomatch ? 
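/*
 * Elements of hash:net are keyed on the (ip, cidr) pair, as the equality
 * helper above shows, so the same address stored under two prefix
 * lengths forms two distinct entries. Matching a packet therefore probes
 * the hash once per prefix length present in the set (tracked in
 * h->nets[]), most specific first so that nomatch entries can shadow
 * wider networks. A conceptual sketch (hash_contains() is hypothetical,
 * not an ipset API):
 *
 *	#include <stdint.h>
 *	extern int hash_contains(uint32_t key, uint8_t cidr);
 *	static int net_match(uint32_t ip, const uint8_t *cidrs, int n)
 *	{
 *		for (int i = 0; i < n; i++) {	// cidrs sorted, longest first
 *			uint32_t mask = cidrs[i] ? ~0u << (32 - cidrs[i]) : 0;
 *			if (hash_contains(ip & mask, cidrs[i]))
 *				return 1;
 *		}
 *		return 0;
 *	}
 */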
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_net4_data_next(struct hash_net4_elem *next, + const struct hash_net4_elem *d) +{ + next->ip = d->ip; +} + +#define MTYPE hash_net4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_net *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (e.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + e.cidr = HOST_MASK; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_net *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net4_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, last; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr || e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret: + ip_set_eexist(ret, flags) ? 
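/*
 * The return idiom below is shared by all nomatch-capable types:
 * ip_set_enomatch() handles a TEST that hit a nomatch-flagged entry, and
 * ip_set_eexist() decides whether "element already exists" should be
 * reported or silently accepted. As far as I can tell the latter reduces
 * to the following check (an assumption about the helper, not a verbatim
 * copy):
 *
 *	static int eexist_ok(int ret, uint32_t flags)
 *	{
 *		// re-adding an existing element is fine only when
 *		// userspace asked for it (ipset add ... -exist)
 *		return ret == -IPSET_ERR_EXIST && (flags & IPSET_FLAG_EXIST);
 *	}
 */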
0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } + if (retried) + ip = ntohl(h->next.ip); + while (!after(ip, ip_to)) { + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip = last + 1; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_net6_elem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; + +/* Common functions */ + +static inline bool +hash_net6_data_equal(const struct hash_net6_elem *ip1, + const struct hash_net6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->cidr == ip2->cidr; +} + +static inline int +hash_net6_do_data_match(const struct hash_net6_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr; +} + +static bool +hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_net6_data_next(struct hash_net4_elem *next, + const struct hash_net6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_net6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_net *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (e.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + e.cidr = HOST_MASK; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net6_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = 
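/*
 * The range walk above iterates with while (!after(ip, ip_to)) rather
 * than ip <= ip_to: before()/after() are the wrap-safe unsigned
 * comparisons the networking headers define for TCP sequence numbers,
 * and they keep the loop terminating even when ip = last + 1 wraps past
 * 255.255.255.255. The trick in isolation:
 *
 *	#include <stdint.h>
 *	static inline int u32_before(uint32_t a, uint32_t b)
 *	{
 *		return (int32_t)(a - b) < 0;	// signed distance, wrap-safe
 *	}
 *	#define u32_after(a, b)	u32_before(b, a)
 */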
ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!e.cidr || e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&e.ip, e.cidr); + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + ret = adtfn(set, &e, &ext, &ext, flags); + + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; +} + +static struct ip_set_type hash_net_type __read_mostly = { + .name = "hash:net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_net_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_net_init(void) +{ + return ip_set_type_register(&hash_net_type); +} + +static void __exit +hash_net_fini(void) +{ + ip_set_type_unregister(&hash_net_type); +} + +module_init(hash_net_init); +module_exit(hash_net_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c new file mode 100644 index 00000000000..db2606805b3 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -0,0 +1,610 @@ +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net,iface type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <linux/rbtree.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 nomatch flag support added */ +/* 2 /0 support added */ +/* 3 Counters support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,iface"); + +/* Interface name rbtree */ + +struct iface_node { + struct rb_node node; + char iface[IFNAMSIZ]; +}; + +#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) + +static void +rbtree_destroy(struct rb_root *root) +{ + struct iface_node *node, *next; + + rbtree_postorder_for_each_entry_safe(node, next, root, node) + kfree(node); + + *root = RB_ROOT; +} + +static int +iface_test(struct rb_root *root, const char **iface) +{ + struct rb_node *n = root->rb_node; + + while (n) { + const char *d = iface_data(n); + int res = strcmp(*iface, d); + + if (res < 0) + n = n->rb_left; + else if (res > 0) + n = n->rb_right; + else { + *iface = d; + return 1; + } + } + return 0; +} + +static int +iface_add(struct rb_root *root, const char **iface) +{ + struct rb_node **n = &(root->rb_node), *p = NULL; + struct iface_node *d; + + while (*n) { + char *ifname = iface_data(*n); + int res = strcmp(*iface, ifname); + + p = *n; + if (res < 0) + n = &((*n)->rb_left); + else if (res > 0) + n = &((*n)->rb_right); + else { + *iface = ifname; + return 0; + } + } + + d = kzalloc(sizeof(*d), GFP_ATOMIC); + if (!d) + return -ENOMEM; + strcpy(d->iface, *iface); + + rb_link_node(&d->node, p, n); + rb_insert_color(&d->node, root); + + *iface = d->iface; + return 0; +} + +/* Type specific function prefix */ +#define HTYPE hash_netiface +#define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_RBTREE +#define IP_SET_HASH_WITH_MULTI + +#define STREQ(a, b) (strcmp(a, b) == 0) + +/* IPv4 variant */ + +struct hash_netiface4_elem_hashed { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; +}; + +/* Member elements */ +struct hash_netiface4_elem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; +}; + +/* Common functions */ + +static inline bool +hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, + const struct hash_netiface4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->cidr == ip2->cidr && + (++*multi) && + ip1->physdev == ip2->physdev && + ip1->iface == ip2->iface; +} + +static inline int +hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) +{ + return elem->nomatch ? 
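/*
 * The rbtree above interns interface names: iface_add() stores each
 * distinct name exactly once and hands back a canonical pointer, which
 * is why the data_equal() helpers below can compare ip1->iface ==
 * ip2->iface instead of calling strcmp() on every lookup. A toy
 * userspace version of the same idea (a bounded pool instead of an
 * rbtree):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#define IFNAMSIZ 16
 *	static const char *intern(const char *name)
 *	{
 *		static char pool[64][IFNAMSIZ];
 *		static int n;
 *		for (int i = 0; i < n; i++)
 *			if (strcmp(pool[i], name) == 0)
 *				return pool[i];
 *		if (n == 64)
 *			return NULL;	// pool exhausted (toy limitation)
 *		snprintf(pool[n], IFNAMSIZ, "%s", name);
 *		return pool[n++];
 *	}
 *
 * After interning, intern("eth0") == intern("eth0") holds, so equality
 * is a single pointer comparison.
 */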
-ENOTEMPTY : 1; +} + +static inline void +hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +static bool +hash_netiface4_data_list(struct sk_buff *skb, + const struct hash_netiface4_elem *data) +{ + u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netiface4_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface4_elem *d) +{ + next->ip = d->ip; +} + +#define MTYPE hash_netiface4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) +#include "ip_set_hash_gen.h" + +static int +hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct hash_netiface *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .elem = 1, + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + int ret; + + if (e.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + e.cidr = HOST_MASK; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); + +#define IFACE(dir) (par->dir ? par->dir->name : NULL) +#define PHYSDEV(dir) (nf_bridge->dir ? nf_bridge->dir->name : NULL) +#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) + + if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { +#ifdef CONFIG_BRIDGE_NETFILTER + const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (!nf_bridge) + return -EINVAL; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; +#else + e.iface = NULL; +#endif + } else + e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); + + if (!e.iface) + return -EINVAL; + ret = iface_test(&h->rbtree, &e.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &e.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct hash_netiface *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, last; + char iface[IFNAMSIZ]; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !tb[IPSET_ATTR_IFACE] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &e.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } else + ip_set_mask_from_to(ip, ip_to, e.cidr); + + if (retried) + ip = ntohl(h->next.ip); + while (!after(ip, ip_to)) { + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip = last + 1; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_netiface6_elem_hashed { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; +}; + +struct hash_netiface6_elem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; +}; + +/* Common functions */ + +static inline bool +hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, + const struct hash_netiface6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->cidr == ip2->cidr && + (++*multi) && + ip1->physdev == ip2->physdev && + ip1->iface == ip2->iface; +} + +static inline int +hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) +{ + return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr; +} + +static bool +hash_netiface6_data_list(struct sk_buff *skb, + const struct hash_netiface6_elem *data) +{ + u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netiface6_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_netiface6 +#define PF 6 +#define HOST_MASK 128 +#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct hash_netiface *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .elem = 1, + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + int ret; + + if (e.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + e.cidr = HOST_MASK; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); + + if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { +#ifdef CONFIG_BRIDGE_NETFILTER + const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (!nf_bridge) + return -EINVAL; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; +#else + e.iface = NULL; +#endif + } else + e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); + + if (!e.iface) + return -EINVAL; + ret = iface_test(&h->rbtree, &e.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &e.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct hash_netiface *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + char iface[IFNAMSIZ]; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !tb[IPSET_ATTR_IFACE] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + ip6_netmask(&e.ip, e.cidr); + + strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &e.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + ret = adtfn(set, &e, &ext, &ext, flags); + + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
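/*
 * Note the CIDR validation in this type: where the other hash types
 * reject a zero prefix, hash:net,iface only checks the upper bound,
 * since /0 support was added in revision 2 (see the revision history at
 * the top of this file). Side by side:
 *
 *	// most hash:* types: /0 is invalid
 *	if (!cidr || cidr > HOST_MASK)
 *		return -IPSET_ERR_INVALID_CIDR;
 *
 *	// hash:net,iface: a /0 catch-all network is allowed
 *	if (cidr > HOST_MASK)
 *		return -IPSET_ERR_INVALID_CIDR;
 */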
0 : ret; +} + +static struct ip_set_type hash_netiface_type __read_mostly = { + .name = "hash:net,iface", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE | + IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_netiface_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netiface_init(void) +{ + return ip_set_type_register(&hash_netiface_type); +} + +static void __exit +hash_netiface_fini(void) +{ + ip_set_type_unregister(&hash_netiface_type); +} + +module_init(hash_netiface_init); +module_exit(hash_netiface_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c new file mode 100644 index 00000000000..3e99987e4bf --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -0,0 +1,481 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,net"); + +/* Type specific function prefix */ +#define HTYPE hash_netnet +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variants */ + +/* Member elements */ +struct hash_netnet4_elem { + union { + __be32 ip[2]; + __be64 ipcmp; + }; + u8 nomatch; + union { + u8 cidr[2]; + u16 ccmp; + }; +}; + +/* Common functions */ + +static inline bool +hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, + const struct hash_netnet4_elem *ip2, + u32 *multi) +{ + return ip1->ipcmp == ip2->ipcmp && + ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) +{ + return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, + struct hash_netnet4_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) +{ + if (inner) { + elem->ip[1] &= ip_set_netmask(cidr); + elem->cidr[1] = cidr; + } else { + elem->ip[0] &= ip_set_netmask(cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netnet4_data_list(struct sk_buff *skb, + const struct hash_netnet4_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; +} + +static inline void +hash_netnet4_data_next(struct hash_netnet4_elem *next, + const struct hash_netnet4_elem *d) +{ + next->ipcmp = d->ipcmp; +} + +#define MTYPE hash_netnet4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); + ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); + e.ip[0] &= ip_set_netmask(e.cidr[0]); + e.ip[1] &= ip_set_netmask(e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, last; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; + u8 cidr, cidr2; + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[0] = cidr; + } + + if (tb[IPSET_ATTR_CIDR2]) { 
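/*
 * The hash:net,net element above overlays both addresses with a single
 * __be64 (ipcmp) and both prefix lengths with a u16 (ccmp), so
 * data_equal() compares the whole four-tuple with two scalar compares.
 * The same trick in miniature (C11 anonymous unions, as the kernel
 * uses; illustrative only):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	struct two_nets {
 *		union { uint32_t ip[2]; uint64_t ipcmp; };
 *		union { uint8_t cidr[2]; uint16_t ccmp; };
 *	};
 *	int main(void)
 *	{
 *		struct two_nets a = { .ip = { 1, 2 }, .cidr = { 24, 16 } };
 *		struct two_nets b = a;
 *		assert(a.ipcmp == b.ipcmp && a.ccmp == b.ccmp);
 *		return 0;
 *	}
 */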
+ cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr2 || cidr2 > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[1] = cidr2; + } + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] && + tb[IPSET_ATTR_IP2_TO])) { + e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } + + ip2_to = ip2_from; + if (tb[IPSET_ATTR_IP2_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); + if (ret) + return ret; + if (ip2_to < ip2_from) + swap(ip2_from, ip2_to); + if (ip2_from + UINT_MAX == ip2_to) + return -IPSET_ERR_HASH_RANGE; + + } + + if (retried) + ip = ntohl(h->next.ip[0]); + + while (!after(ip, ip_to)) { + e.ip[0] = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &cidr); + e.cidr[0] = cidr; + ip2 = (retried && + ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) + : ip2_from; + while (!after(ip2, ip2_to)) { + e.ip[1] = htonl(ip2); + last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); + e.cidr[1] = cidr2; + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip2 = last2 + 1; + } + ip = last + 1; + } + return ret; +} + +/* IPv6 variants */ + +struct hash_netnet6_elem { + union nf_inet_addr ip[2]; + u8 nomatch; + union { + u8 cidr[2]; + u16 ccmp; + }; +}; + +/* Common functions */ + +static inline bool +hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, + const struct hash_netnet6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && + ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && + ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, + struct hash_netnet6_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) +{ + if (inner) { + ip6_netmask(&elem->ip[1], cidr); + elem->cidr[1] = cidr; + } else { + ip6_netmask(&elem->ip[0], cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netnet6_data_list(struct sk_buff *skb, + const struct hash_netnet6_elem *data) +{ + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; +} + +static inline void +hash_netnet6_data_next(struct hash_netnet4_elem *next, + const struct hash_netnet6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_netnet6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || + ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (tb[IPSET_ATTR_CIDR2]) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + + if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || + e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + ret = adtfn(set, &e, &ext, &ext, flags); + + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static struct ip_set_type hash_netnet_type __read_mostly = { + .name = "hash:net,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_netnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netnet_init(void) +{ + return ip_set_type_register(&hash_netnet_type); +} + +static void __exit +hash_netnet_fini(void) +{ + ip_set_type_unregister(&hash_netnet_type); +} + +module_init(hash_netnet_init); +module_exit(hash_netnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c new file mode 100644 index 00000000000..1c645fbd09c --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -0,0 +1,509 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net,port type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Range as input support for IPv4 added */ +/* 3 nomatch flag support added */ +/* 4 Counters support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,port"); + +/* Type specific function prefix */ +#define HTYPE hash_netport +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. 
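+ *
+ * Editorial illustration, not part of the original comment: adding
+ * 192.0.2.0/24 with the nomatch flag would be stored roughly as
+ *
+ *	elem->cidr    = 24 - 1;	(7-bit field, so cidr 0 is unencodable)
+ *	elem->nomatch = 1;	(the bit freed by the packing)
+ *
+ * and every reader of the prefix length undoes the shift with
+ * "cidr + 1", as hash_netport4_data_netmask() and the kadt/uadt
+ * handlers below do.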
+ */ +#define IP_SET_HASH_WITH_NETS_PACKED + +/* IPv4 variant */ + +/* Member elements */ +struct hash_netport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; +}; + +/* Common functions */ + +static inline bool +hash_netport4_data_equal(const struct hash_netport4_elem *ip1, + const struct hash_netport4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->port == ip2->port && + ip1->proto == ip2->proto && + ip1->cidr == ip2->cidr; +} + +static inline int +hash_netport4_do_data_match(const struct hash_netport4_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr - 1; +} + +static bool +hash_netport4_data_list(struct sk_buff *skb, + const struct hash_netport4_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netport4_data_next(struct hash_netport4_elem *next, + const struct hash_netport4_elem *d) +{ + next->ip = d->ip; + next->port = d->port; +} + +#define MTYPE hash_netport4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netport *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (adt == IPSET_TEST) + e.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr + 1); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netport *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to, p = 0, ip = 0, ip_to = 0, last; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) 
|| + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; + } + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; + + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { + e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = port_to = ntohs(e.port); + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port_to < port) + swap(port, port_to); + } + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } else + ip_set_mask_from_to(ip, ip_to, e.cidr + 1); + + if (retried) + ip = ntohl(h->next.ip); + while (!after(ip, ip_to)) { + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &cidr); + e.cidr = cidr - 1; + p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) + : port; + for (; p <= port_to; p++) { + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + ip = last + 1; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_netport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; +}; + +/* Common functions */ + +static inline bool +hash_netport6_data_equal(const struct hash_netport6_elem *ip1, + const struct hash_netport6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->port == ip2->port && + ip1->proto == ip2->proto && + ip1->cidr == ip2->cidr; +} + +static inline int +hash_netport6_do_data_match(const struct hash_netport6_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr - 1; +} + +static bool +hash_netport6_data_list(struct sk_buff *skb, + const struct hash_netport6_elem *data) +{ + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netport6_data_next(struct hash_netport4_elem *next, + const struct hash_netport6_elem *d) +{ + next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_netport6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netport *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + if (adt == IPSET_TEST) + e.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr + 1); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netport *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; + } + ip6_netmask(&e.ip, e.cidr + 1); + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
0 : ret; + } + + port = ntohs(e.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = ntohs(h->next.port); + for (; port <= port_to; port++) { + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static struct ip_set_type hash_netport_type __read_mostly = { + .name = "hash:net,port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_netport_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netport_init(void) +{ + return ip_set_type_register(&hash_netport_type); +} + +static void __exit +hash_netport_fini(void) +{ + ip_set_type_unregister(&hash_netport_type); +} + +module_init(hash_netport_init); +module_exit(hash_netport_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c new file mode 100644 index 00000000000..c0d2ba73f8b --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -0,0 +1,587 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 0 Comments support added */
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netportnet
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_netportnet4_elem {
+	union {
+		__be32 ip[2];
+		__be64 ipcmp;
+	};
+	__be16 port;
+	union {
+		u8 cidr[2];
+		u16 ccmp;
+	};
+	u8 nomatch:1;
+	u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
+			    const struct hash_netportnet4_elem *ip2,
+			    u32 *multi)
+{
+	return ip1->ipcmp == ip2->ipcmp &&
+	       ip1->ccmp == ip2->ccmp &&
+	       ip1->port == ip2->port &&
+	       ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem)
+{
+	return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags)
+{
+	elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
+{
+	swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
+				 struct hash_netportnet4_elem *orig)
+{
+	elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
+			      u8 cidr, bool inner)
+{
+	if (inner) {
+		elem->ip[1] &= ip_set_netmask(cidr);
+		elem->cidr[1] = cidr;
+	} else {
+		elem->ip[0] &= ip_set_netmask(cidr);
+		elem->cidr[0] = cidr;
+	}
+}
+
+static bool
+hash_netportnet4_data_list(struct sk_buff *skb,
+			   const struct hash_netportnet4_elem *data)
+{
+	u32 flags = data->nomatch ?
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netportnet4_data_next(struct hash_netportnet4_elem *next, + const struct hash_netportnet4_elem *d) +{ + next->ipcmp = d->ipcmp; + next->port = d->port; +} + +#define MTYPE hash_netportnet4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]); + e.ip[0] &= ip_set_netmask(e.cidr[0]); + e.ip[1] &= ip_set_netmask(e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; + u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; + bool with_ports = false; + u8 cidr, cidr2; + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[0] = cidr; + } + + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[1] = cidr; + } + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } 
else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { + e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + if (unlikely(ip + UINT_MAX == ip_to)) + return -IPSET_ERR_HASH_RANGE; + } + + port_to = port = ntohs(e.port); + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + ip2_to = ip2_from; + if (tb[IPSET_ATTR_IP2_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); + if (ret) + return ret; + if (ip2_from > ip2_to) + swap(ip2_from, ip2_to); + if (unlikely(ip2_from + UINT_MAX == ip2_to)) + return -IPSET_ERR_HASH_RANGE; + } + + if (retried) + ip = ntohl(h->next.ip[0]); + + while (!after(ip, ip_to)) { + e.ip[0] = htonl(ip); + ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); + e.cidr[0] = cidr; + p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) + : port; + for (; p <= port_to; p++) { + e.port = htons(p); + ip2 = (retried && ip == ntohl(h->next.ip[0]) && + p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) + : ip2_from; + while (!after(ip2, ip2_to)) { + e.ip[1] = htonl(ip2); + ip2_last = ip_set_range_to_cidr(ip2, ip2_to, + &cidr2); + e.cidr[1] = cidr2; + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip2 = ip2_last + 1; + } + } + ip = ip_last + 1; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_netportnet6_elem { + union nf_inet_addr ip[2]; + __be16 port; + union { + u8 cidr[2]; + u16 ccmp; + }; + u8 nomatch:1; + u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, + const struct hash_netportnet6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && + ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && + ip1->ccmp == ip2->ccmp && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) +{ + return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, + struct hash_netportnet6_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, + u8 cidr, bool inner) +{ + if (inner) { + ip6_netmask(&elem->ip[1], cidr); + elem->cidr[1] = cidr; + } else { + ip6_netmask(&elem->ip[0], cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netportnet6_data_list(struct sk_buff *skb, + const struct hash_netportnet6_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netportnet6_data_next(struct hash_netportnet4_elem *next, + const struct hash_netportnet6_elem *d) +{ + next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_netportnet6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6); + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to; + bool with_ports = false; + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO] || 
tb[IPSET_ATTR_IP2_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || + ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (tb[IPSET_ATTR_CIDR2]) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + + if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || + e.cidr[1] > HOST_MASK)) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(e.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = ntohs(h->next.port); + for (; port <= port_to; port++) { + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static struct ip_set_type hash_netportnet_type __read_mostly = { + .name = "hash:net,port,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | + IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_THREE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_netportnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netportnet_init(void) +{ + return ip_set_type_register(&hash_netportnet_type); +} + +static void __exit +hash_netportnet_fini(void) +{ + ip_set_type_unregister(&hash_netportnet_type); +} + +module_init(hash_netportnet_init); 
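+
+/* Editorial usage sketch, not part of this patch: from userspace the
+ * type registered above is driven by ipset(8), e.g.
+ *
+ *	ipset create foo hash:net,port,net
+ *	ipset add foo 192.0.2.0/24,tcp:80,198.51.100.0/24
+ *
+ * which reaches hash_netportnet4_uadt() through the netlink attributes
+ * validated there (IPSET_ATTR_IP, IPSET_ATTR_PORT, IPSET_ATTR_IP2, ...).
+ */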
+module_exit(hash_netportnet_fini); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c new file mode 100644 index 00000000000..3e2317f3cf6 --- /dev/null +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -0,0 +1,685 @@ +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the list:set type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> + +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_list.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counters support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comments support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_list:set"); + +/* Member elements */ +struct set_elem { + ip_set_id_t id; +}; + +struct set_adt_elem { + ip_set_id_t id; + ip_set_id_t refid; + int before; +}; + +/* Type structure */ +struct list_set { + u32 size; /* size of set list array */ + struct timer_list gc; /* garbage collection */ + struct net *net; /* namespace */ + struct set_elem members[0]; /* the set members */ +}; + +#define list_set_elem(set, map, id) \ + (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) + +static int +list_set_ktest(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e; + u32 i, cmdflags = opt->cmdflags; + int ret; + + /* Don't lookup sub-counters at all */ + opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; + if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) + opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + ret = ip_set_test(e->id, skb, par, opt); + if (ret > 0) { + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(e, set), + ext, &opt->ext, + cmdflags); + return ret; + } + } + return 0; +} + +static int +list_set_kadd(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e; + u32 i; + int ret; + + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + ret = ip_set_add(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; +} + +static int +list_set_kdel(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e; + u32 i; + int ret; + + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + ret = ip_set_del(e->id, skb, par, opt); + if (ret == 0) + 
return ret; + } + return 0; +} + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + switch (adt) { + case IPSET_TEST: + return list_set_ktest(set, skb, par, opt, &ext); + case IPSET_ADD: + return list_set_kadd(set, skb, par, opt, &ext); + case IPSET_DEL: + return list_set_kdel(set, skb, par, opt, &ext); + default: + break; + } + return -EINVAL; +} + +static bool +id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) +{ + const struct list_set *map = set->data; + const struct set_elem *e; + + if (i >= map->size) + return 0; + + e = list_set_elem(set, map, i); + return !!(e->id == id && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set)))); +} + +static int +list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, + const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(set, map, i); + + if (e->id != IPSET_INVALID_ID) { + if (i == map->size - 1) { + /* Last element replaced: e.g. add new,before,last */ + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + } else { + struct set_elem *x = list_set_elem(set, map, + map->size - 1); + + /* Last element pushed off */ + if (x->id != IPSET_INVALID_ID) { + ip_set_put_byindex(map->net, x->id); + ip_set_ext_destroy(set, x); + } + memmove(list_set_elem(set, map, i + 1), e, + set->dsize * (map->size - (i + 1))); + /* Extensions must be initialized to zero */ + memset(e, 0, set->dsize); + } + } + + e->id = d->id; + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); + return 0; +} + +static int +list_set_del(struct ip_set *set, u32 i) +{ + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(set, map, i); + + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + + if (i < map->size - 1) + memmove(e, list_set_elem(set, map, i + 1), + set->dsize * (map->size - (i + 1))); + + /* Last element */ + e = list_set_elem(set, map, map->size - 1); + e->id = IPSET_INVALID_ID; + return 0; +} + +static void +set_cleanup_entries(struct ip_set *set) +{ + struct list_set *map = set->data; + struct set_elem *e; + u32 i = 0; + + while (i < map->size) { + e = list_set_elem(set, map, i); + if (e->id != IPSET_INVALID_ID && + ip_set_timeout_expired(ext_timeout(e, set))) + list_set_del(set, i); + /* Check element moved to position i in next loop */ + else + i++; + } +} + +static int +list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + u32 i; + int ret; + + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return 1; + else if (d->before > 0) + ret = id_eq(set, i + 1, d->refid); + else + ret = i > 0 && id_eq(set, i - 1, d->refid); + return ret; + } + return 0; +} + + +static int +list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct list_set *map 
= set->data;
+	struct set_adt_elem *d = value;
+	struct set_elem *e;
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
+	u32 i, ret = 0;
+
+	if (SET_WITH_TIMEOUT(set))
+		set_cleanup_entries(set);
+
+	/* Check already added element */
+	for (i = 0; i < map->size; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			goto insert;
+		else if (e->id != d->id)
+			continue;
+
+		if ((d->before > 0 && !id_eq(set, i + 1, d->refid)) ||
+		    (d->before < 0 &&
+		     (i == 0 || !id_eq(set, i - 1, d->refid))))
+			/* Before/after doesn't match */
+			return -IPSET_ERR_REF_EXIST;
+		if (!flag_exist)
+			/* Can't re-add */
+			return -IPSET_ERR_EXIST;
+		/* Update extensions */
+		ip_set_ext_destroy(set, e);
+
+		if (SET_WITH_TIMEOUT(set))
+			ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+		if (SET_WITH_COUNTER(set))
+			ip_set_init_counter(ext_counter(e, set), ext);
+		if (SET_WITH_COMMENT(set))
+			ip_set_init_comment(ext_comment(e, set), ext);
+		/* Set is already added to the list */
+		ip_set_put_byindex(map->net, d->id);
+		return 0;
+	}
+insert:
+	ret = -IPSET_ERR_LIST_FULL;
+	for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
+				: list_set_add(set, i, d, ext);
+		else if (e->id != d->refid)
+			continue;
+		else if (d->before > 0)
+			ret = list_set_add(set, i, d, ext);
+		else if (i + 1 < map->size)
+			ret = list_set_add(set, i + 1, d, ext);
+	}
+
+	return ret;
+}
+
+static int
+list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	      struct ip_set_ext *mext, u32 flags)
+{
+	struct list_set *map = set->data;
+	struct set_adt_elem *d = value;
+	struct set_elem *e;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			return d->before != 0 ?
-IPSET_ERR_REF_EXIST + : -IPSET_ERR_EXIST; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return list_set_del(set, i); + else if (d->before > 0) { + if (!id_eq(set, i + 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + return list_set_del(set, i); + } else if (i == 0 || !id_eq(set, i - 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + else + return list_set_del(set, i); + } + return -IPSET_ERR_EXIST; +} + +static int +list_set_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct list_set *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + struct ip_set *s; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_NAME] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s); + if (e.id == IPSET_INVALID_ID) + return -IPSET_ERR_NAME; + /* "Loop detection" */ + if (s->type->features & IPSET_TYPE_NAME) { + ret = -IPSET_ERR_LOOP; + goto finish; + } + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + e.before = f & IPSET_FLAG_BEFORE; + } + + if (e.before && !tb[IPSET_ATTR_NAMEREF]) { + ret = -IPSET_ERR_BEFORE; + goto finish; + } + + if (tb[IPSET_ATTR_NAMEREF]) { + e.refid = ip_set_get_byname(map->net, + nla_data(tb[IPSET_ATTR_NAMEREF]), + &s); + if (e.refid == IPSET_INVALID_ID) { + ret = -IPSET_ERR_NAMEREF; + goto finish; + } + if (!e.before) + e.before = -1; + } + if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) + set_cleanup_entries(set); + + ret = adtfn(set, &e, &ext, &ext, flags); + +finish: + if (e.refid != IPSET_INVALID_ID) + ip_set_put_byindex(map->net, e.refid); + if (adt != IPSET_ADD || ret) + ip_set_put_byindex(map->net, e.id); + + return ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static void +list_set_flush(struct ip_set *set) +{ + struct list_set *map = set->data; + struct set_elem *e; + u32 i; + + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id != IPSET_INVALID_ID) { + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + e->id = IPSET_INVALID_ID; + } + } +} + +static void +list_set_destroy(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); + list_set_flush(set); + kfree(map); + + set->data = NULL; +} + +static int +list_set_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct list_set *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->size * set->dsize))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +list_set_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct list_set *map = set->data; + struct nlattr *atd, *nested; + u32 i, first = cb->args[IPSET_CB_ARG0]; + const struct set_elem *e; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[IPSET_CB_ARG0] < map->size; + cb->args[IPSET_CB_ARG0]++) { + i = cb->args[IPSET_CB_ARG0]; + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + goto finish; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (i == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (nla_put_string(skb, IPSET_ATTR_NAME, + ip_set_name_byindex(map->net, e->id))) + goto nla_put_failure; + if (ip_set_put_extensions(skb, set, e, true)) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } +finish: + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[IPSET_CB_ARG0] = 0; + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + if (unlikely(i == first)) { + cb->args[IPSET_CB_ARG0] = 0; + return -EMSGSIZE; + } + ipset_nest_end(skb, atd); + return 0; +} + +static bool +list_set_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct list_set *x = a->data; + const struct list_set *y = b->data; + + return x->size == y->size && + a->timeout == b->timeout && + a->extensions == b->extensions; +} + +static const struct ip_set_type_variant set_variant = { + .kadt = list_set_kadt, + .uadt = list_set_uadt, + .adt = { + [IPSET_ADD] = list_set_uadd, + [IPSET_DEL] = list_set_udel, + [IPSET_TEST] = list_set_utest, + }, + .destroy = list_set_destroy, + .flush = list_set_flush, + .head = list_set_head, + .list = list_set_list, + .same_set = list_set_same_set, +}; + +static void +list_set_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct list_set *map = set->data; + + write_lock_bh(&set->lock); + set_cleanup_entries(set); + write_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&map->gc); +} + +static void +list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct list_set *map = 
set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create list:set type of sets */ + +static bool +init_list_set(struct net *net, struct ip_set *set, u32 size) +{ + struct list_set *map; + struct set_elem *e; + u32 i; + + map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); + if (!map) + return false; + + map->size = size; + map->net = net; + set->data = map; + + for (i = 0; i < size; i++) { + e = list_set_elem(set, map, i); + e->id = IPSET_INVALID_ID; + } + + return true; +} + +static int +list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + u32 size = IP_SET_LIST_DEFAULT_SIZE; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_SIZE]) + size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]); + if (size < IP_SET_LIST_MIN_SIZE) + size = IP_SET_LIST_MIN_SIZE; + + set->variant = &set_variant; + set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); + if (!init_list_set(net, set, size)) + return -ENOMEM; + if (tb[IPSET_ATTR_TIMEOUT]) { + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + list_set_gc_init(set, list_set_gc); + } + return 0; +} + +static struct ip_set_type list_set_type __read_mostly = { + .name = "list:set", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = list_set_create, + .create_policy = { + [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_NAME] = { .type = NLA_STRING, + .len = IPSET_MAXNAMELEN }, + [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING, + .len = IPSET_MAXNAMELEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +list_set_init(void) +{ + return ip_set_type_register(&list_set_type); +} + +static void __exit +list_set_fini(void) +{ + ip_set_type_unregister(&list_set_type); +} + +module_init(list_set_init); +module_exit(list_set_fini); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c new file mode 100644 index 00000000000..04d15fdc99e --- /dev/null +++ b/net/netfilter/ipset/pfxlen.c @@ -0,0 +1,313 @@ +#include <linux/export.h> +#include <linux/netfilter/ipset/pfxlen.h> + +/* + * Prefixlen maps for fast conversions, by Jan Engelhardt. + */ + +#define E(a, b, c, d) \ + {.ip6 = { \ + htonl(a), htonl(b), \ + htonl(c), htonl(d), \ + } } + +/* + * This table works for both IPv4 and IPv6; + * just use prefixlen_netmask_map[prefixlength].ip. 
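+ *
+ * Editorial illustration, not part of this patch: the ip_set_netmask()
+ * helper used by the hash types above is just an index into this
+ * table, so masking an IPv4 address down to its /24 network is a
+ * lookup plus an AND:
+ *
+ *	__be32 net = ip & ip_set_netmask_map[24].ip;
+ *
+ * where .ip aliases the first 32 bits of the union.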
+ */ +const union nf_inet_addr ip_set_netmask_map[] = { + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), 
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_netmask_map); + +#undef E +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32) a, (__force __be32) b, \ + (__force __be32) c, (__force __be32) d, \ + } } + +/* + * This table works for 
both IPv4 and IPv6; + * just use prefixlen_hostmask_map[prefixlength].ip. + */ +const union nf_inet_addr ip_set_hostmask_map[] = { + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 
0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_hostmask_map); + +/* Find the largest network which matches the range from left, in host order. 
 */
+u32
+ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+	u32 last;
+	u8 i;
+
+	for (i = 1; i < 32; i++) {
+		if ((from & ip_set_hostmask(i)) != from)
+			continue;
+		last = from | ~ip_set_hostmask(i);
+		if (!after(last, to)) {
+			*cidr = i;
+			return last;
+		}
+	}
+	*cidr = 32;
+	return from;
+}
+EXPORT_SYMBOL_GPL(ip_set_range_to_cidr);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
new file mode 100644
index 00000000000..0c3b1670b0d
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,281 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+	tristate "IP virtual server support"
+	depends on NET && INET && NETFILTER
+	depends on (NF_CONNTRACK || NF_CONNTRACK=n)
+	---help---
+	  IP Virtual Server support will let you build a high-performance
+	  virtual server based on a cluster of two or more real servers. This
+	  option must be enabled for at least one of the clustered computers
+	  that will take care of intercepting incoming connections to a
+	  single IP address and scheduling them to real servers.
+
+	  Three request dispatching techniques are implemented: virtual
+	  server via NAT, virtual server via tunneling, and virtual server
+	  via direct routing. Several scheduling algorithms can be used to
+	  choose which server a connection is directed to, thus achieving
+	  load balancing among the servers. For more information and its
+	  administration program, please visit the following URL:
+	  <http://www.linuxvirtualserver.org/>.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config	IP_VS_IPV6
+	bool "IPv6 support for IPVS"
+	depends on IPV6 = y || IP_VS = IPV6
+	select IP6_NF_IPTABLES
+	---help---
+	  Add IPv6 support to IPVS.
+
+	  Say Y if unsure.
+
+config	IP_VS_DEBUG
+	bool "IP virtual server debugging"
+	---help---
+	  Say Y here if you want to get additional messages useful in
+	  debugging the IP virtual server code. You can change the debug
+	  level in /proc/sys/net/ipv4/vs/debug_level.
+
+config	IP_VS_TAB_BITS
+	int "IPVS connection table size (the Nth power of 2)"
+	range 8 20
+	default 12
+	---help---
+	  The IPVS connection hash table uses the chaining scheme to handle
+	  hash collisions. Using a big IPVS connection hash table will greatly
+	  reduce conflicts when there are hundreds of thousands of connections
+	  in the hash table.
+
+	  Note that the table size must be a power of 2. The table size will
+	  be 2 raised to the number you enter, which must be between 8 and
+	  20; the default is 12, giving a table size of 4096. Don't choose a
+	  number that is too small, or you will lose performance. You can
+	  adapt the table size to your virtual server application. It is
+	  good to set the table size not far less than the number of
+	  connections per second multiplied by the average duration of a
+	  connection in the table. For example, if your virtual server gets
+	  200 connections per second and a connection lasts 200 seconds on
+	  average, the table size should be not far less than 200x200; a
+	  good choice is 32768 (2**15).
+
+	  Note also that each connection effectively occupies 128 bytes and
+	  each hash entry uses 8 bytes, so you can estimate how much memory
+	  is needed for your box.
+
+	  You can override this value by setting the conn_tab_bits module
+	  parameter, or by appending ip_vs.conn_tab_bits=? to the kernel
+	  command line if IPVS was compiled built-in.
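As a back-of-the-envelope check of the sizing rule above, here is a minimal
userspace sketch (not part of this patch; suggest_conn_tab_bits() is an
illustrative name). It reproduces the 200x200 example from the help text,
reading "not far less than" as "at least half of":

	#include <stdio.h>

	/* Suggest conn_tab_bits from the expected connection rate
	 * (connections per second) and average connection lifetime
	 * (seconds), following the help text above. */
	static unsigned int suggest_conn_tab_bits(unsigned long rate,
						  unsigned long lifetime)
	{
		unsigned long want = rate * lifetime;	/* entries in flight */
		unsigned int bits;

		for (bits = 8; bits < 20; bits++)
			if ((1UL << bits) * 2 >= want)	/* not far less */
				break;
		return bits;				/* Kconfig range 8..20 */
	}

	int main(void)
	{
		unsigned int bits = suggest_conn_tab_bits(200, 200);

		/* ~128 bytes per connection plus 8 bytes per hash entry */
		printf("conn_tab_bits=%u, table size %lu, ~%lu bytes\n",
		       bits, 1UL << bits,
		       200UL * 200UL * 128 + (1UL << bits) * 8);
		return 0;	/* prints conn_tab_bits=15, table size 32768 */
	}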
+
+comment "IPVS transport protocol load balancing support"
+
+config	IP_VS_PROTO_TCP
+	bool "TCP load balancing support"
+	---help---
+	  This option enables support for load balancing the TCP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_UDP
+	bool "UDP load balancing support"
+	---help---
+	  This option enables support for load balancing the UDP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_AH_ESP
+	def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
+
+config	IP_VS_PROTO_ESP
+	bool "ESP load balancing support"
+	---help---
+	  This option enables support for load balancing the ESP
+	  (Encapsulation Security Payload) transport protocol.
+	  Say Y if unsure.
+
+config	IP_VS_PROTO_AH
+	bool "AH load balancing support"
+	---help---
+	  This option enables support for load balancing the AH
+	  (Authentication Header) transport protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_SCTP
+	bool "SCTP load balancing support"
+	select LIBCRC32C
+	---help---
+	  This option enables support for load balancing the SCTP transport
+	  protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config	IP_VS_RR
+	tristate "round-robin scheduling"
+	---help---
+	  The round-robin scheduling algorithm simply directs network
+	  connections to different real servers in a round-robin manner.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_WRR
+	tristate "weighted round-robin scheduling"
+	---help---
+	  The weighted round-robin scheduling algorithm directs network
+	  connections to different real servers based on server weights
+	  in a round-robin manner. Servers with higher weights receive
+	  new connections before those with lower weights, and get more
+	  connections overall; servers with equal weights get an equal
+	  share of the connections.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_LC
+	tristate "least-connection scheduling"
+	---help---
+	  The least-connection scheduling algorithm directs network
+	  connections to the server with the least number of active
+	  connections.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_WLC
+	tristate "weighted least-connection scheduling"
+	---help---
+	  The weighted least-connection scheduling algorithm directs network
+	  connections to the server with the least active connections
+	  normalized by the server weight.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
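"Least active connections normalized by the server weight" is denser than it
sounds; a minimal sketch of the comparison follows (illustrative only -- the
in-tree ip_vs_wlc.c also folds inactive connections into the load figure).
Server i beats server j when conns[i]/weight[i] < conns[j]/weight[j], which
is evaluated by cross-multiplication to stay in integer arithmetic:

	/* Pick the index of the server minimizing conns/weight; servers
	 * with weight 0 are treated as quiesced and skipped. */
	static int wlc_pick(const unsigned int *conns,
			    const unsigned int *weight, int n)
	{
		int i, best = -1;

		for (i = 0; i < n; i++) {
			if (weight[i] == 0)
				continue;
			if (best < 0 ||
			    (unsigned long long)conns[i] * weight[best] <
			    (unsigned long long)conns[best] * weight[i])
				best = i;
		}
		return best;
	}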
+config	IP_VS_LBLC
+	tristate "locality-based least-connection scheduling"
+	---help---
+	  The locality-based least-connection scheduling algorithm is for
+	  destination IP load balancing. It is usually used in cache
+	  clusters. This algorithm normally directs packets destined for an
+	  IP address to its server if the server is alive and not
+	  overloaded. If the server is overloaded (its number of active
+	  connections is larger than its weight) and there is a server at
+	  half of its load, then the weighted least-connection server is
+	  allocated to this IP address.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_LBLCR
+	tristate "locality-based least-connection with replication scheduling"
+	---help---
+	  The locality-based least-connection with replication scheduling
+	  algorithm is also for destination IP load balancing. It is
+	  usually used in cache clusters. It differs from LBLC scheduling
+	  as follows: the load balancer maintains mappings from a target
+	  to a set of server nodes that can serve the target. Requests for
+	  a target are assigned to the least-connection node in the target's
+	  server set. If all the nodes in the server set are overloaded,
+	  it picks a least-connection node in the cluster and adds it
+	  to the server set for the target. If the server set has not been
+	  modified for the specified time, the most loaded node is removed
+	  from the server set, in order to avoid a high degree of
+	  replication.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_DH
+	tristate "destination hashing scheduling"
+	---help---
+	  The destination hashing scheduling algorithm assigns network
+	  connections to the servers by looking up a statically assigned
+	  hash table by their destination IP addresses.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_SH
+	tristate "source hashing scheduling"
+	---help---
+	  The source hashing scheduling algorithm assigns network
+	  connections to the servers by looking up a statically assigned
+	  hash table by their source IP addresses.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_SED
+	tristate "shortest expected delay scheduling"
+	---help---
+	  The shortest expected delay scheduling algorithm assigns network
+	  connections to the server with the shortest expected delay. The
+	  expected delay that the job will experience is (Ci + 1) / Ui if
+	  sent to the ith server, in which Ci is the number of connections
+	  on the ith server and Ui is the fixed service rate (weight)
+	  of the ith server.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_NQ
+	tristate "never queue scheduling"
+	---help---
+	  The never queue scheduling algorithm adopts a two-speed model.
+	  When there is an idle server available, the job will be sent to
+	  the idle server, instead of waiting for a fast one. When there
+	  is no idle server available, the job will be sent to the server
+	  that minimizes its expected delay (the Shortest Expected Delay
+	  scheduling algorithm).
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+comment 'IPVS SH scheduler'
+
+config	IP_VS_SH_TAB_BITS
+	int "IPVS source hashing table size (the Nth power of 2)"
+	range 4 20
+	default 8
+	---help---
+	  The source hashing scheduler maps source IPs to destinations
+	  stored in a hash table. This table is tiled by each destination
+	  until all slots in the table are filled. When using weights to
+	  allow destinations to receive more connections, the table is
+	  tiled an amount proportional to the weights specified. The table
+	  needs to be large enough to effectively fit all the destinations
+	  multiplied by their respective weights.
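The "tiling" in the help text above is easiest to see in code. A minimal
userspace sketch of building and probing such a table follows (illustrative
only, not the in-tree ip_vs_sh.c; it assumes every weight is at least 1):

	#define SH_TAB_BITS	8			/* the default above */
	#define SH_TAB_SIZE	(1 << SH_TAB_BITS)

	/* Cycle over the destinations, giving each one weight[d] consecutive
	 * slots per round, until every slot in the table is filled. */
	static void sh_fill_table(int *tab, int ndest, const int *weight)
	{
		int slot = 0, d = 0;

		while (slot < SH_TAB_SIZE) {
			int w = weight[d % ndest];

			while (w-- > 0 && slot < SH_TAB_SIZE)
				tab[slot++] = d % ndest;
			d++;
		}
	}

	/* Lookup: hash the source address into the table; any reasonably
	 * uniform hash of the source IP works. */
	static int sh_pick(const int *tab, unsigned int saddr)
	{
		return tab[(saddr * 2654435761u) >> (32 - SH_TAB_BITS)];
	}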
+comment 'IPVS application helper'
+
+config	IP_VS_FTP
+	tristate "FTP protocol helper"
+	depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
+		NF_CONNTRACK_FTP
+	select IP_VS_NFCT
+	---help---
+	  FTP is a protocol that transfers IP addresses and/or port numbers
+	  in its payload. In a virtual server via Network Address
+	  Translation, the IP address and port number of a real server
+	  cannot be sent to clients inside FTP connections directly, so an
+	  FTP protocol helper is required to track the connection and
+	  mangle the payload back to that of the virtual service.
+
+	  If you want to compile it into the kernel, say Y. To compile it
+	  as a module, choose M here. If unsure, say N.
+
+config	IP_VS_NFCT
+	bool "Netfilter connection tracking"
+	depends on NF_CONNTRACK
+	---help---
+	  The Netfilter connection tracking support allows the IPVS
+	  connection state to be exported to the Netfilter framework
+	  for filtering purposes.
+
+config	IP_VS_PE_SIP
+	tristate "SIP persistence engine"
+	depends on IP_VS_PROTO_UDP
+	depends on NF_CONNTRACK_SIP
+	---help---
+	  Allow persistence based on the SIP Call-ID.
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 00000000000..34ee602ddb6
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,40 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
+ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
+		ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 00000000000..dfd7b65b3d2
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,627 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that the ip_vs_app module handles the reverse direction (incoming
+ * requests and outgoing responses).
+ * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <net/net_namespace.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +#include <net/ip_vs.h> + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +static DEFINE_MUTEX(__ip_vs_app_mutex); + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(net, inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s App %s:%u registered\n", + pp->name, inc->name, ntohs(inc->port)); + + return 0; + + out: + ip_vs_app_inc_destroy(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(net, inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, ntohs(inc->port)); + + list_del(&inc->a_list); + + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(net, app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* Register application for netns */ 
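The register_ip_vs_app() function in the next hunk, together with
register_ip_vs_app_inc() above, forms the registration API that application
helpers drive. A condensed sketch of the expected calling sequence, modeled
loosely on ip_vs_ftp (error paths trimmed; the single port 21 is
illustrative):

	static struct ip_vs_app ftp_app = {
		.name		= "ftp",
		.type		= IP_VS_APP_TYPE_FTP,
		.protocol	= IPPROTO_TCP,
		.module		= THIS_MODULE,
		/* .pkt_out/.pkt_in would mangle the FTP payload */
	};

	static int __net_init ftp_app_net_init(struct net *net)
	{
		struct ip_vs_app *app;
		int ret;

		/* register the per-netns copy of the application */
		app = register_ip_vs_app(net, &ftp_app);
		if (IS_ERR(app))
			return PTR_ERR(app);

		/* one incarnation per listening port */
		ret = register_ip_vs_app_inc(net, app, IPPROTO_TCP, 21);
		if (ret)
			unregister_ip_vs_app(net, &ftp_app);
		return ret;
	}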
+struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_app *a; + int err = 0; + + if (!ipvs) + return ERR_PTR(-ENOENT); + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry(a, &ipvs->app_list, a_list) { + if (!strcmp(app->name, a->name)) { + err = -EEXIST; + goto out_unlock; + } + } + a = kmemdup(app, sizeof(*app), GFP_KERNEL); + if (!a) { + err = -ENOMEM; + goto out_unlock; + } + INIT_LIST_HEAD(&a->incs_list); + list_add(&a->a_list, &ipvs->app_list); + /* increase the module use count */ + ip_vs_use_count_inc(); + +out_unlock: + mutex_unlock(&__ip_vs_app_mutex); + + return err ? ERR_PTR(err) : a; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() + */ +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_app *a, *anxt, *inc, *nxt; + + if (!ipvs) + return; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { + if (app && strcmp(app->name, a->name)) + continue; + list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { + ip_vs_app_inc_release(net, inc); + } + + list_del(&a->a_list); + kfree(a); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + } + + mutex_unlock(&__ip_vs_app_mutex); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "%s(): added delta (%d) to seq\n", + __func__, vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. 
+ */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "%s(): subtracted delta " + "(%d) from ack_seq\n", __func__, vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "%s(): subtracted " + "previous_delta (%d) from ack_seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned int flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock_bh(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock_bh(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. 
+ */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ipvs->app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(ipvs, 0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_app_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +int __net_init ip_vs_app_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->app_list); + proc_create("ip_vs_app", 0, 
net->proc_net, &ip_vs_app_fops);
+	return 0;
+}
+
+void __net_exit ip_vs_app_net_cleanup(struct net *net)
+{
+	unregister_ip_vs_app(net, NULL /* all */);
+	remove_proc_entry("ip_vs_app", net->proc_net);
+}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
new file mode 100644
index 00000000000..610e19c0e13
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1376 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system. IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Much of the code here is taken from the IP MASQ code of
+ * kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>		/* for proc_net_* */
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS	12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+*/ +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct hlist_head *ip_vs_conn_tab __read_mostly; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd __read_mostly; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 5 +#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) +#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) + +struct ip_vs_aligned_lock +{ + spinlock_t l; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +/* lock array for conn table */ +static struct ip_vs_aligned_lock +__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; + +static inline void ct_write_lock_bh(unsigned int key) +{ + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock_bh(unsigned int key) +{ + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + + +/* + * Returns hash value for IPVS connection entry + */ +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, + const union nf_inet_addr *addr, + __be16 port) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), + (__force u32)port, proto, ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; +#endif + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; +} + +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) +{ + const union nf_inet_addr *addr; + __be16 port; + + if (p->pe_data && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; + + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); +} + +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +{ + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); + + if (cp->pe) { + p.pe = cp->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + + return ip_vs_conn_hashkey_param(&p, false); +} + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. + * returns bool success. 
+ */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned int hash; + int ret; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return 0; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); + ret = 1; + } else { + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); + ret = 0; + } + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. Caller should hold conn reference. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned int hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret; + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + ret = false; + /* Decrease refcnt and unlink conn only if we are last user */ + if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } else + ret = atomic_read(&cp->refcnt) ? false : true; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + unsigned int hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey_param(p, false); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; + /* HIT */ + rcu_read_unlock(); + return cp; + } + } + + rcu_read_unlock(); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? 
"hit" : "not hit"); + + return cp; +} + +static int +ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + int inverse, struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + struct net *net = skb_net(skb); + + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); + if (pptr == NULL) + return 1; + + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); + return 0; +} + +struct ip_vs_conn * +ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + return NULL; + + return ip_vs_conn_in_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) +{ + unsigned int hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey_param(p, false); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } + continue; + } + + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->vport == cp->vport && p->cport == cp->cport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (__ip_vs_conn_get(cp)) + goto out; + } + } + cp = NULL; + + out: + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? "hit" : "not hit"); + + return cp; +} + +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) +{ + unsigned int hash; + struct ip_vs_conn *cp, *ret=NULL; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey_param(p, true); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport == cp->cport && p->cport == cp->dport && + cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; + /* HIT */ + ret = cp; + break; + } + } + + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? 
"hit" : "not hit"); + + return ret; +} + +struct ip_vs_conn * +ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + return NULL; + + return ip_vs_conn_out_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? + 0 : cp->timeout; + mod_timer(&cp->timer, jiffies+t); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock_bh(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock_bh(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + unsigned int conn_flags; + __u32 flags; + + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + ip_vs_dest_hold(dest); + + conn_flags = atomic_read(&dest->conn_flags); + if (cp->protocol != IPPROTO_UDP) + conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; + /* Bind with the destination and its corresponding transmitter */ + if (flags & IP_VS_CONN_F_SYNC) { + /* if the connection is not template and is created + * by sync, preserve the activity flag. 
+ */ + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); + } + flags |= conn_flags; + cp->flags = flags; + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the persistent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + rcu_read_lock(); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock_bh(&cp->lock); + if (cp->dest) { + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + + ip_vs_bind_dest(cp, dest); + spin_unlock_bh(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + rcu_read_unlock(); +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. 
+ */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the persistent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + ip_vs_dest_put(dest); +} + +static int expire_quiescent_template(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL + return ipvs->sysctl_expire_quiescent_template && + (atomic_read(&dest->weight) == 0); +#else + return 0; +#endif +} + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + expire_quiescent_template(ipvs, dest)) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + __ip_vs_conn_put(ct); + return 0; + } + return 1; +} + +static void ip_vs_conn_rcu_free(struct rcu_head *head) +{ + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); + + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct net *net = ip_vs_conn_net(cp); + struct netns_ipvs *ipvs = net_ipvs(net); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { + /* delete the timer if it is activated by other users */ + del_timer(&cp->timer); + + /* does anybody control me? 
*/ + if (cp->control) + ip_vs_control_del(cp); + + if (cp->flags & IP_VS_CONN_F_NFCT) { + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + atomic_dec(&ipvs->conn_count); + return; + } + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt), + atomic_read(&cp->n_control)); + + atomic_inc(&cp->refcnt); + cp->timeout = 60*HZ; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + + ip_vs_conn_put(cp); +} + +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + */ +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. + */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(const struct ip_vs_conn_param *p, + const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, + struct ip_vs_dest *dest, __u32 fwmark) +{ + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(p->net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + p->protocol); + + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NULL; + } + + INIT_HLIST_NODE(&cp->c_list); + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + ip_vs_conn_net_set(cp, p->net); + cp->af = p->af; + cp->protocol = p->protocol; + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ + ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->vaddr, p->vaddr); + cp->vport = p->vport; + ip_vs_addr_set(p->af, &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + cp->fwmark = fwmark; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; + } + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. 
+ */
+	atomic_set(&cp->refcnt, 1);
+
+	cp->control = NULL;
+	atomic_set(&cp->n_control, 0);
+	atomic_set(&cp->in_pkts, 0);
+
+	cp->packet_xmit = NULL;
+	cp->app = NULL;
+	cp->app_data = NULL;
+	/* reset struct ip_vs_seq */
+	cp->in_seq.delta = 0;
+	cp->out_seq.delta = 0;
+
+	atomic_inc(&ipvs->conn_count);
+	if (flags & IP_VS_CONN_F_NO_CPORT)
+		atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+	/* Bind the connection with a destination server */
+	cp->dest = NULL;
+	ip_vs_bind_dest(cp, dest);
+
+	/* Set its state and timeout */
+	cp->state = 0;
+	cp->old_state = 0;
+	cp->timeout = 3*HZ;
+	cp->sync_endtime = jiffies & ~3UL;
+
+	/* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+	if (p->af == AF_INET6)
+		ip_vs_bind_xmit_v6(cp);
+	else
+#endif
+		ip_vs_bind_xmit(cp);
+
+	if (unlikely(pd && atomic_read(&pd->appcnt)))
+		ip_vs_bind_app(cp, pd->pp);
+
+	/*
+	 * Allow conntrack to be preserved. By default, conntrack
+	 * is created and destroyed for every packet.
+	 * Sometimes keeping conntrack can be useful for
+	 * IP_VS_CONN_F_ONE_PACKET too.
+	 */
+
+	if (ip_vs_conntrack_enabled(ipvs))
+		cp->flags |= IP_VS_CONN_F_NFCT;
+
+	/* Hash it in the ip_vs_conn_tab finally */
+	ip_vs_conn_hash(cp);
+
+	return cp;
+}
+
+/*
+ *	/proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+	struct seq_net_private	p;
+	struct hlist_head	*l;
+};
+
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct ip_vs_iter_state *iter = seq->private;
+
+	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+			/* __ip_vs_conn_get() is not needed by
+			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+			 */
+			if (pos-- == 0) {
+				iter->l = &ip_vs_conn_tab[idx];
+				return cp;
+			}
+		}
+		cond_resched_rcu();
+	}
+
+	return NULL;
+}
+
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct ip_vs_iter_state *iter = seq->private;
+
+	iter->l = NULL;
+	rcu_read_lock();
+	return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_conn *cp = v;
+	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_node *e;
+	struct hlist_head *l = iter->l;
+	int idx;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_conn_array(seq, 0);
+
+	/* more on same hash chain?
*/ + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) + return hlist_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + while (++idx < ip_vs_conn_tab_size) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + cond_resched_rcu(); + } + iter->l = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + if (cp->pe_data) { + pe_data[0] = ' '; + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->pe->show_pe_data(cp, pe_data + len); + } + pe_data[len] = '\0'; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%pI6 %04X %-11s %7lu%s\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + &cp->daddr.in6, ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ, pe_data); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu%s\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ, pe_data); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state)); +} + +static const struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const char *ip_vs_origin_name(unsigned int flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + &cp->daddr.in6, ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + 
(cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state)); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(struct net *net) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + + rcu_read_lock(); + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + /* connection template */ + continue; + if (!ip_vs_conn_net_eq(cp, net)) + continue; + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else if (cp->protocol == IPPROTO_SCTP) { + switch (cp->state) { + case IP_VS_SCTP_S_INIT1: + case IP_VS_SCTP_S_INIT: + break; + case IP_VS_SCTP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); + } + } + cond_resched_rcu(); + } + rcu_read_unlock(); +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(struct net *net) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + struct netns_ipvs *ipvs = net_ipvs(net); + +flush_again: + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (!ip_vs_conn_net_eq(cp, net)) + continue; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { + IP_VS_DBG(4, "del conn template\n"); + 
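/* Expire the controlling template too: the extra reference
+				 * taken on cp above keeps cp->control valid
+				 * while cp_c is being expired.
+				 */
+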
ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
+			}
+		}
+		cond_resched_rcu();
+	}
+	rcu_read_unlock();
+
+	/* the counter may not be zero, because some conn entries may still
+	   be run by the slow timer handler or be unhashed but still referenced */
+	if (atomic_read(&ipvs->conn_count) != 0) {
+		schedule();
+		goto flush_again;
+	}
+}
+
+/*
+ * per netns init and exit
+ */
+int __net_init ip_vs_conn_net_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	atomic_set(&ipvs->conn_count, 0);
+
+	proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+	proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+	return 0;
+}
+
+void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+{
+	/* flush all the connection entries first */
+	ip_vs_conn_flush(net);
+	remove_proc_entry("ip_vs_conn", net->proc_net);
+	remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+}
+
+int __init ip_vs_conn_init(void)
+{
+	int idx;
+
+	/* Compute size and mask */
+	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
+	/*
+	 * Allocate the connection hash table and initialize its list heads
+	 */
+	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
+	if (!ip_vs_conn_tab)
+		return -ENOMEM;
+
+	/* Allocate ip_vs_conn slab cache */
+	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+					      sizeof(struct ip_vs_conn), 0,
+					      SLAB_HWCACHE_ALIGN, NULL);
+	if (!ip_vs_conn_cachep) {
+		vfree(ip_vs_conn_tab);
+		return -ENOMEM;
+	}
+
+	pr_info("Connection hash table configured "
+		"(size=%d, memory=%ldKbytes)\n",
+		ip_vs_conn_tab_size,
+		(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
+	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+		  sizeof(struct ip_vs_conn));
+
+	for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
+
+	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
+		spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
+	}
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+	return 0;
+}
+
+void ip_vs_conn_cleanup(void)
+{
+	/* Wait for all ip_vs_conn_rcu_free() callbacks to complete */
+	rcu_barrier();
+	/* Release the empty cache */
+	kmem_cache_destroy(ip_vs_conn_cachep);
+	vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
new file mode 100644
index 00000000000..e6836755c45
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,2105 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/sctp.h> +#include <linux/icmp.h> +#include <linux/slab.h> + +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> +#include <net/ip6_checksum.h> +#include <net/netns/generic.h> /* net_generic() */ + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#include <linux/netfilter_ipv6.h> +#include <net/ip6_route.h> +#endif + +#include <net/ip_vs.h> + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + +static int ip_vs_net_id __read_mostly; +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned int proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_SCTP: + return "SCTP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%u", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + rcu_read_unlock(); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); 
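+			/* Byte counters are 64-bit and are therefore updated
+			 * inside the u64_stats sequence, so that readers on
+			 * 32-bit systems see a consistent value.
+			 */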
+ s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + rcu_read_unlock(); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(cp->dest->stats.cpustats); + s->ustats.conns++; + + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.conns++; + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.conns++; +} + + +static inline void +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + if (likely(pd->pp->state_transition)) + pd->pp->state_transition(cp, direction, skb, pd); +} + +static inline int +ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, + struct sk_buff *skb, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + vport, p); + p->pe = rcu_dereference(svc->pe); + if (p->pe && p->pe->fill_param) + return p->pe->fill_param(p, skb); + + return 0; +} + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + struct sk_buff *skb, __be16 src_port, __be16 dst_port, + int *ignored, struct ip_vs_iphdr *iph) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __be16 dport = 0; /* destination port to forward */ + unsigned int flags; + struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + union nf_inet_addr snet; /* source network of the client, + after masking */ + + /* Mask saddr with the netmask to adjust template granularity */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + (__force __u32) svc->netmask); + else +#endif + snet.ip = iph->saddr.ip & svc->netmask; + + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, &snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP + * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> + * is created for other persistent services. 
+ */
+	{
+		int protocol = iph->protocol;
+		const union nf_inet_addr *vaddr = &iph->daddr;
+		__be16 vport = 0;
+
+		if (dst_port == svc->port) {
+			/* non-FTP template:
+			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+			 * FTP template:
+			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
+			 */
+			if (svc->port != FTPPORT)
+				vport = dst_port;
+		} else {
+			/* Note: persistent fwmark-based services and
+			 * persistent port zero service are handled here.
+			 * fwmark template:
+			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+			 * port zero template:
+			 * <protocol,caddr,0,vaddr,0,daddr,0>
+			 */
+			if (svc->fwmark) {
+				protocol = IPPROTO_IP;
+				vaddr = &fwmark;
+			}
+		}
+		/* return *ignored = -1 so NF_DROP can be used */
+		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+						  vaddr, vport, &param) < 0) {
+			*ignored = -1;
+			return NULL;
+		}
+	}
+
+	/* Check if a template already exists */
+	ct = ip_vs_ct_in_get(&param);
+	if (!ct || !ip_vs_check_template(ct)) {
+		struct ip_vs_scheduler *sched;
+
+		/*
+		 * No template found or the dest of the connection
+		 * template is not available.
+		 * return *ignored=0 i.e. ICMP and NF_DROP
+		 */
+		sched = rcu_dereference(svc->scheduler);
+		dest = sched->schedule(svc, skb, iph);
+		if (!dest) {
+			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			kfree(param.pe_data);
+			*ignored = 0;
+			return NULL;
+		}
+
+		if (dst_port == svc->port && svc->port != FTPPORT)
+			dport = dest->port;
+
+		/* Create a template
+		 * This adds param.pe_data to the template,
+		 * and thus param.pe_data will be destroyed
+		 * when the template expires */
+		ct = ip_vs_conn_new(&param, &dest->addr, dport,
+				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
+		if (ct == NULL) {
+			kfree(param.pe_data);
+			*ignored = -1;
+			return NULL;
+		}
+
+		ct->timeout = svc->timeout;
+	} else {
+		/* set destination with the found template */
+		dest = ct->dest;
+		kfree(param.pe_data);
+	}
+
+	dport = dst_port;
+	if (dport == svc->port && dest->port)
+		dport = dest->port;
+
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph->protocol == IPPROTO_UDP) ?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
+	/*
+	 *    Create a new connection according to the template
+	 */
+	ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
+			      src_port, &iph->daddr, dst_port, &param);
+
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
+	if (cp == NULL) {
+		ip_vs_conn_put(ct);
+		*ignored = -1;
+		return NULL;
+	}
+
+	/*
+	 *    Add its control
+	 */
+	ip_vs_control_add(cp, ct);
+	ip_vs_conn_put(ct);
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  IPVS main scheduling function
+ *  It selects a server according to the virtual service, and
+ *  creates a connection entry.
+ *  Protocols supported: TCP, UDP
+ *
+ *  Usage of *ignored
+ *
+ * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
+ *       svc/scheduler decides that this packet should be accepted with
+ *       NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 :   scheduler can not find destination, so try bypass or
+ *       return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 :  scheduler tried to schedule but fatal error occurred, eg.
+ *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ *       failure such as missing Call-ID, ENOMEM on skb_linearize
+ *       or pe_data. In this case we should return NF_DROP without
+ *       any attempts to send ICMP with ip_vs_leave.
+ */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *ignored, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_protocol *pp = pd->pp; + struct ip_vs_conn *cp = NULL; + struct ip_vs_scheduler *sched; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr; + unsigned int flags; + + *ignored = 1; + /* + * IPv6 frags, only the first hit here. + */ + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); + if (pptr == NULL) + return NULL; + + /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + iph); + + *ignored = 0; + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + pr_err("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a connection entry. + */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], &iph->daddr, + pptr[1], &p); + cp = ip_vs_conn_new(&p, &dest->addr, + dest->port ? dest->port : pptr[1], + flags, dest, skb->mark); + if (!cp) { + *ignored = -1; + return NULL; + } + } + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. 
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
+{
+	__be16 _ports[2], *pptr;
+#ifdef CONFIG_SYSCTL
+	struct net *net;
+	struct netns_ipvs *ipvs;
+	int unicast;
+#endif
+
+	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+	if (pptr == NULL) {
+		return NF_DROP;
+	}
+
+#ifdef CONFIG_SYSCTL
+	net = skb_net(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
+	else
+#endif
+		unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
+
+	/* if it is a fwmark-based service, the cache_bypass sysctl is enabled
+	   and the destination is a non-local unicast, then create
+	   a cache_bypass connection entry */
+	ipvs = net_ipvs(net);
+	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+		int ret;
+		struct ip_vs_conn *cp;
+		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				      iph->protocol == IPPROTO_UDP) ?
+				      IP_VS_CONN_F_ONE_PACKET : 0;
+		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
+
+		/* create a new connection entry */
+		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+					      &iph->saddr, pptr[0],
+					      &iph->daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL, skb->mark);
+			if (!cp)
+				return NF_DROP;
+		}
+
+		/* statistics */
+		ip_vs_in_stats(cp, skb);
+
+		/* set state */
+		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+
+		/* transmit the first SYN packet */
+		ret = cp->packet_xmit(skb, cp, pd->pp, iph);
+		/* do not touch skb anymore */
+
+		atomic_inc(&cp->in_pkts);
+		ip_vs_conn_put(cp);
+		return ret;
+	}
+#endif
+
+	/*
+	 * When a virtual ftp service is used, packets destined
+	 * for other services on the VIP may get here (except for
+	 * services listed in the ipvs table); pass such packets,
+	 * because it is not the job of ipvs to decide to drop them.
+	 */
+	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
+		return NF_ACCEPT;
+
+	/*
+	 * Notify the client that the destination is unreachable, and
+	 * release the socket buffer.
+	 * Since we are at the IP layer, the TCP socket is never actually
+	 * created, so a TCP RST packet cannot be sent; instead,
+	 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. 
--WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net_ = dev_net(skb_dst(skb)->dev); + + skb->dev = net_->loopback_dev; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + } else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + +#ifdef CONFIG_SYSCTL + +static int sysctl_snat_reroute(struct sk_buff *skb) +{ + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + return ipvs->sysctl_snat_reroute; +} + +static int sysctl_nat_icmp_send(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + return ipvs->sysctl_nat_icmp_send; +} + +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_expire_nodest_conn; +} + +#else + +static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } +static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } + +#endif + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + +static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err; + + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +static int ip_vs_route_me_harder(int af, struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) + return 1; + } else +#endif + if ((sysctl_snat_reroute(skb) || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + return 1; + + return 0; +} + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || + IPPROTO_SCTP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = 0; + unsigned int offs = 0; /* header offset*/ + int protocol; + 
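/* The outer ICMPv6 header and the embedded IPv6 header are
+	 * located below with ipv6_find_hdr(); fragoffs records whether
+	 * the embedded packet is a non-first fragment.
+	 */
+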
struct icmp6hdr *icmph; + struct ipv6hdr *ciph; + unsigned short fragoffs; + + ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); + icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); + offs = icmp_offset + sizeof(struct icmp6hdr); + ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); + + protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP/SCTP port */ + if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol)) { + __be16 *ports = (void *)(skb_network_header(skb) + offs); + + IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, + ntohs(inout ? ports[1] : ports[0]), + ntohs(inout ? cp->vport : cp->dport)); + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + pr_err("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + if (ip_vs_route_me_harder(af, skb)) + goto out; + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. 
+ */ +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); + + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, ciph.len, ihl); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum, struct ip_vs_iphdr *ipvsh) +{ + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + union nf_inet_addr snet; + unsigned int writable; + + *related = 1; + ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); + if (ic == NULL) + return NF_DROP; + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { + *related = 0; + return NF_ACCEPT; + } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. 
+ */
+	if (ipvsh->flags & IP6_FH_F_FRAG)
+		return NF_DROP;
+
+	IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  &ipvsh->saddr, &ipvsh->daddr);
+
+	/* Now find the contained IP header */
+	ciph.len = ipvsh->len + sizeof(_icmph);
+	ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+	if (ip6h == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+	ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
+	ciph.daddr.in6 = ip6h->daddr;
+	/* skip possible IPv6 exthdrs of contained IPv6 packet */
+	ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+	if (ciph.protocol < 0)
+		return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+	pp = ip_vs_proto_get(ciph.protocol);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	snet.in6 = ciph.saddr.in6;
+	writable = ciph.len;
+	return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
+				    pp, writable, sizeof(struct ipv6hdr));
+}
+#endif
+
+/*
+ * Check whether the SCTP chunk is an ABORT chunk
+ */
+static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
+{
+	sctp_chunkhdr_t *sch, schunk;
+	sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
+				 sizeof(schunk), &schunk);
+	if (sch == NULL)
+		return 0;
+	if (sch->type == SCTP_CID_ABORT)
+		return 1;
+	return 0;
+}
+
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+	struct tcphdr _tcph, *th;
+
+	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+	return th->rst;
+}
+
+static inline bool is_new_conn(const struct sk_buff *skb,
+			       struct ip_vs_iphdr *iph)
+{
+	switch (iph->protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr _tcph, *th;
+
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		if (th == NULL)
+			return false;
+		return th->syn;
+	}
+	case IPPROTO_SCTP: {
+		sctp_chunkhdr_t *sch, schunk;
+
+		sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+					 sizeof(schunk), &schunk);
+		if (sch == NULL)
+			return false;
+		return sch->type == SCTP_CID_INIT;
+	}
+	default:
+		return false;
+	}
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_protocol *pp = pd->pp;
+
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+
+	if (!skb_make_writable(skb, iph->len))
+		goto drop;
+
+	/* mangle the packet */
+	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
+		goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+	else
+#endif
+	{
+		ip_hdr(skb)->saddr = cp->vaddr.ip;
+		ip_send_check(ip_hdr(skb));
+	}
+
+	/*
+	 * nf_iterate does not expect change in the skb->dst->dev.
+	 * It looks like it is not fatal to enable this code for hooks
+	 * where our handlers are at the end of the chain list and
+	 * when all next handlers use skb->dst->dev and not outdev.
+	 * It will definitely route properly the inout NAT traffic
+	 * when multiple paths are used.
+	 */
+
+	/* For policy routing, packets originating from this
+	 * machine itself may be routed differently to packets
+	 * passing through.  We want this packet to be routed as
+	 * if it came from this machine itself.  So re-compute
+	 * the routing information.
+ */ + if (ip_vs_route_me_harder(af, skb)) + goto drop; + + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + ip_vs_conn_put(cp); + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + LeaveFunction(11); + return NF_STOLEN; +} + +/* + * Check if outgoing packet belongs to the established ip_vs_conn. + */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) +{ + struct net *net = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + + EnterFunction(11); + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + ip_vs_fill_iph_skb(af, skb, &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum, &iph); + + if (related) + return verdict; + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); + + if (related) + return verdict; + } + + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) + return NF_ACCEPT; + pp = pd->pp; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET) +#endif + if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + + ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(af, skb, &iph, 0); + + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph); + if (sysctl_nat_icmp_send(net) && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = frag_safe_skb_hp(skb, iph.len, + sizeof(_ports), _ports, &iph); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; +} + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. 
+ * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET); +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(ops->hooknum, skb, AF_INET6); +} + +#endif + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct net *net = NULL; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, offset2, ihl, verdict; + bool ipip; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. 
+ */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + net = skb_net(skb); + + /* Special case for errors for IPIP packets */ + ipip = false; + if (cih->protocol == IPPROTO_IPIP) { + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + /* Error for our IPIP must arrive at LOCAL_IN */ + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) + return NF_ACCEPT; + offset += cih->ihl * 4; + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + ipip = true; + } + + pd = ip_vs_proto_data_get(net, cih->protocol); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); + + offset2 = offset; + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; + offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. + * For IPIP this is error for request, not for reply. + */ + cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", + &iph->saddr); + goto out; + } + + if (ipip) { + __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; + + /* Update the MTU */ + if (ic->type == ICMP_DEST_UNREACH && + ic->code == ICMP_FRAG_NEEDED) { + struct ip_vs_dest *dest = cp->dest; + u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; + + /* Strip outer IP and ICMP, go to IPIP header */ + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; + offset2 -= ihl + sizeof(_icmph); + skb_reset_network_header(skb); + IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); + ipv4_update_pmtu(skb, dev_net(skb->dev), + mtu, 0, 0, 0, 0); + /* Client uses PMTUD? */ + if (!(frag_off & htons(IP_DF))) + goto ignore_ipip; + /* Prefer the resulting PMTU */ + if (dest) { + struct ip_vs_dest_dst *dest_dst; + + rcu_read_lock(); + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + rcu_read_unlock(); + } + if (mtu > 68 + sizeof(struct iphdr)) + mtu -= sizeof(struct iphdr); + info = htonl(mtu); + } + /* Strip outer IP, ICMP and IPIP, go to IP header of + * original request. 
+ */ + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; + skb_reset_network_header(skb); + IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + type, code, ntohl(info)); + icmp_send(skb, type, code, info); + /* ICMP can be shorter but anyways, account it */ + ip_vs_out_stats(cp, skb); + +ignore_ipip: + consume_skb(skb); + verdict = NF_STOLEN; + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || + IPPROTO_SCTP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum, struct ip_vs_iphdr *iph) +{ + struct net *net = NULL; + struct ipv6hdr _ip6h, *ip6h; + struct icmp6hdr _icmph, *ic; + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offs_ciph, writable, verdict; + + *related = 1; + + ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph); + if (ic == NULL) + return NF_DROP; + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { + *related = 0; + return NF_ACCEPT; + } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (iph->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); + + /* Now find the contained IP header */ + ciph.len = iph->len + sizeof(_icmph); + offs_ciph = ciph.len; /* Save ip header offset */ + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, ciph.protocol); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Cannot handle fragmented embedded protocol */ + if (ciph.fragoffs) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, + "Checking incoming ICMPv6 for"); + + /* The embedded headers contain source and dest in reverse order + * if not from localhost + */ + cp = pp->conn_in_get(AF_INET6, skb, &ciph, + (hooknum == NF_INET_LOCAL_OUT) ? 
0 : 1); + + if (!cp) + return NF_ACCEPT; + /* VS/TUN, VS/DR and LOCALNODE just let it go */ + if ((hooknum == NF_INET_LOCAL_OUT) && + (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { + __ip_vs_conn_put(cp); + return NF_ACCEPT; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + + /* Need to mangle contained IPv6 header in ICMPv6 packet */ + writable = ciph.len; + if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || + IPPROTO_SCTP == ciph.protocol) + writable += 2 * sizeof(__u16); /* Also mangle ports */ + + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) +{ + struct net *net; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + int ret, pkts; + struct netns_ipvs *ipvs; + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset + */ + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iph_skb(af, skb, &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); + return NF_ACCEPT; + } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + ip_vs_fill_iph_skb(af, skb, &iph); + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, + &iph); + + if (related) + return verdict; + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + } + + /* Protocol supported? 
*/
+	pd = ip_vs_proto_data_get(net, iph.protocol);
+	if (unlikely(!pd))
+		return NF_ACCEPT;
+	pp = pd->pp;
+	/*
+	 * Check if the packet belongs to an existing connection entry
+	 */
+	cp = pp->conn_in_get(af, skb, &iph, 0);
+
+	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
+	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
+	    is_new_conn(skb, &iph)) {
+		ip_vs_conn_expire_now(cp);
+		__ip_vs_conn_put(cp);
+		cp = NULL;
+	}
+
+	if (unlikely(!cp) && !iph.fragoffs) {
+		/* No (second) fragments need to enter here, as nf_defrag_ipv6
+		 * replayed fragment zero will already have created the cp
+		 */
+		int v;
+
+		/* Schedule and create new connection entry into &cp */
+		if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+			return v;
+	}
+
+	if (unlikely(!cp)) {
+		/* sorry, all this trouble for a no-hit :) */
+		IP_VS_DBG_PKT(12, af, pp, skb, 0,
+			      "ip_vs_in: packet continues traversal as normal");
+		if (iph.fragoffs) {
+			/* Fragment that couldn't be mapped to a conn entry
+			 * is missing module nf_defrag_ipv6
+			 */
+			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+			IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+		}
+		return NF_ACCEPT;
+	}
+
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
+	/* Check the server status */
+	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		/* the destination server is not available */
+
+		if (sysctl_expire_nodest_conn(ipvs)) {
+			/* try to expire the connection immediately */
+			ip_vs_conn_expire_now(cp);
+		}
+		/* don't restart its timer, and silently
+		   drop the packet. */
+		__ip_vs_conn_put(cp);
+		return NF_DROP;
+	}
+
+	ip_vs_in_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+	if (cp->packet_xmit)
+		ret = cp->packet_xmit(skb, cp, pp, &iph);
+		/* do not touch skb anymore */
+	else {
+		IP_VS_DBG_RL("warning: packet_xmit is null");
+		ret = NF_ACCEPT;
+	}
+
+	/* Increase its packet counter and check whether it needs
+	 * to be synchronized
+	 *
+	 * Sync the connection if it is about to close, to
+	 * encourage the standby servers to update the connection's timeout
+	 *
+	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
+ */ + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + pkts = sysctl_sync_threshold(ipvs); + else + pkts = atomic_add_return(1, &cp->in_pkts); + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, pkts); + + ip_vs_conn_put(cp); + return ret; +} + +/* + * AF_INET handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET); +} + +/* + * AF_INET handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET); +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * AF_INET6 handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET6); +} + +/* + * AF_INET6 handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(ops->hooknum, skb, AF_INET6); +} + +#endif + + +/* + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When a fwmark-based virtual service is used, such as a transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + struct net *net; + struct netns_ipvs *ipvs; + + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + return ip_vs_in_icmp(skb, &r, ops->hooknum); +} + +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + struct net *net; + struct netns_ipvs *ipvs; + struct ip_vs_iphdr iphdr; + + ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + if (iphdr.protocol != IPPROTO_ICMPV6) + return NF_ACCEPT; + + /* ipvs enabled in this netns ? 
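+ * A backup-only or not yet enabled netns must leave the packet alone.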
*/ + net = skb_net(skb); + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); +} +#endif + + +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 2, + }, + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_remote_request4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 1, + }, + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_local_reply4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 2, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 2, + }, + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_remote_request6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 1, + }, + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_local_reply6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 2, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, +#endif +}; +/* + * Initialize IP Virtual Server netns mem. 
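+ * Runs once per network namespace; the sub-systems below are brought up + * in order and unwound in reverse if any step fails.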
+ */ +static int __net_init __ip_vs_init(struct net *net) +{ + struct netns_ipvs *ipvs; + + ipvs = net_generic(net, ip_vs_net_id); + if (ipvs == NULL) + return -ENOMEM; + + /* Hold the beast until a service is registered */ + ipvs->enable = 0; + ipvs->net = net; + /* Counters used for creating unique names */ + ipvs->gen = atomic_read(&ipvs_netns_cnt); + atomic_inc(&ipvs_netns_cnt); + net->ipvs = ipvs; + + if (ip_vs_estimator_net_init(net) < 0) + goto estimator_fail; + + if (ip_vs_control_net_init(net) < 0) + goto control_fail; + + if (ip_vs_protocol_net_init(net) < 0) + goto protocol_fail; + + if (ip_vs_app_net_init(net) < 0) + goto app_fail; + + if (ip_vs_conn_net_init(net) < 0) + goto conn_fail; + + if (ip_vs_sync_net_init(net) < 0) + goto sync_fail; + + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", + sizeof(struct netns_ipvs), ipvs->gen); + return 0; +/* + * Error handling + */ + +sync_fail: + ip_vs_conn_net_cleanup(net); +conn_fail: + ip_vs_app_net_cleanup(net); +app_fail: + ip_vs_protocol_net_cleanup(net); +protocol_fail: + ip_vs_control_net_cleanup(net); +control_fail: + ip_vs_estimator_net_cleanup(net); +estimator_fail: + net->ipvs = NULL; + return -ENOMEM; +} + +static void __net_exit __ip_vs_cleanup(struct net *net) +{ + ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ + ip_vs_conn_net_cleanup(net); + ip_vs_app_net_cleanup(net); + ip_vs_protocol_net_cleanup(net); + ip_vs_control_net_cleanup(net); + ip_vs_estimator_net_cleanup(net); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + net->ipvs = NULL; +} + +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(net); + LeaveFunction(2); +} + +static struct pernet_operations ipvs_core_ops = { + .init = __ip_vs_init, + .exit = __ip_vs_cleanup, + .id = &ip_vs_net_id, + .size = sizeof(struct netns_ipvs), +}; + +static struct pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ret = ip_vs_control_init(); + if (ret < 0) { + pr_err("can't setup control.\n"); + goto exit; + } + + ip_vs_protocol_init(); + + ret = ip_vs_conn_init(); + if (ret < 0) { + pr_err("can't setup connection table.\n"); + goto cleanup_protocol; + } + + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_conn; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) { + pr_err("can't register hooks.\n"); + goto cleanup_dev; + } + + ret = ip_vs_register_nl_ioctl(); + if (ret < 0) { + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_hooks; + } + + pr_info("ipvs loaded.\n"); + + return ret; + +cleanup_hooks: + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); +cleanup_conn: + ip_vs_conn_cleanup(); +cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); +exit: + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + ip_vs_unregister_nl_ioctl(); + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ + ip_vs_conn_cleanup(); + ip_vs_protocol_cleanup(); + 
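+ /* control state is freed last, reversing the init order */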
ip_vs_control_cleanup(); + pr_info("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 00000000000..581a6584ed0 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,3909 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/workqueue.h> +#include <linux/swap.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/mutex.h> + +#include <net/net_namespace.h> +#include <linux/nsproxy.h> +#include <net/ip.h> +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#include <net/ip6_route.h> +#endif +#include <net/route.h> +#include <net/sock.h> +#include <net/genetlink.h> + +#include <asm/uaccess.h> + +#include <net/ip_vs.h> + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* sysctl variables */ + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); + + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
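+ * A local address routes via a loopback device, which is what the + * IFF_LOOPBACK check below relies on.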
+ */ +static bool __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) +{ + struct flowi6 fl6 = { + .daddr = *addr, + }; + struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); + bool is_local; + + is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); + + dst_release(dst); + return is_local; +} +#endif + +#ifdef CONFIG_SYSCTL +/* + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs + */ +static void update_defense_level(struct netns_ipvs *ipvs) +{ + struct sysinfo i; + static int old_secure_tcp; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need to adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < ipvs->sysctl_amemthresh); + + local_bh_disable(); + + /* drop_entry */ + spin_lock(&ipvs->dropentry_lock); + switch (ipvs->sysctl_drop_entry) { + case 0: + atomic_set(&ipvs->dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ipvs->dropentry, 1); + ipvs->sysctl_drop_entry = 2; + } else { + atomic_set(&ipvs->dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ipvs->dropentry, 1); + } else { + atomic_set(&ipvs->dropentry, 0); + ipvs->sysctl_drop_entry = 1; + } + break; + case 3: + atomic_set(&ipvs->dropentry, 1); + break; + } + spin_unlock(&ipvs->dropentry_lock); + + /* drop_packet */ + spin_lock(&ipvs->droppacket_lock); + switch (ipvs->sysctl_drop_packet) { + case 0: + ipvs->drop_rate = 0; + break; + case 1: + if (nomem) { + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + ipvs->sysctl_drop_packet = 2; + } else { + ipvs->drop_rate = 0; + } + break; + case 2: + if (nomem) { + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + } else { + ipvs->drop_rate = 0; + ipvs->sysctl_drop_packet = 1; + } + break; + case 3: + ipvs->drop_rate = ipvs->sysctl_am_droprate; + break; + } + spin_unlock(&ipvs->droppacket_lock); + + /* secure_tcp */ + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + ipvs->sysctl_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + ipvs->sysctl_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = ipvs->sysctl_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(ipvs, + ipvs->sysctl_secure_tcp > 1); + spin_unlock(&ipvs->securetcp_lock); + + local_bh_enable(); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ + +static void defense_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, defense_work.work); + + update_defense_level(ipvs); + if (atomic_read(&ipvs->dropentry)) + ip_vs_random_dropentry(ipvs->net); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); +} +#endif + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + 
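+ /* paired with ip_vs_use_count_inc(); drops the module reference */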
module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by <protocol, addr, port> */ +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + + +/* + * Returns hash value for virtual service + */ +static inline unsigned int +ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port) +{ + register unsigned int porth = ntohs(port); + __be32 addr_fold = addr->ip; + __u32 ahash; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + ahash = ntohl(addr_fold); + ahash ^= ((size_t) net >> 8); + + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +{ + return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned int hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + &svc->addr, svc->port); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from svc_table / svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + pr_err("%s(): request for unhash flagged, called from %pF\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the svc_table table */ + hlist_del_rcu(&svc->s_list); + } else { + /* Remove it from the svc_fwm_table table */ + hlist_del_rcu(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {netns, proto,addr,port} in the service table. 
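+ * Caller must hold the RCU read lock; a hit is returned without taking + * a reference.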
+ */ +static inline struct ip_vs_service * +__ip_vs_service_find(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + unsigned int hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); + + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol) + && net_eq(svc->net, net)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) +{ + unsigned int hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(net, fwmark); + + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af + && net_eq(svc->net, net)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + +/* Find service, called under RCU lock */ +struct ip_vs_service * +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + struct netns_ipvs *ipvs = net_ipvs(net); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark) { + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (svc) + goto out; + } + + /* + * Check the table hashed by <protocol,addr,port> + * for "full" addressed entries + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ipvs->ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ipvs->nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); + } + + out: + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? 
"hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + rcu_assign_pointer(dest->svc, svc); +} + +static void ip_vs_service_free(struct ip_vs_service *svc) +{ + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); +} + +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} + +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{ + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + if (do_delay) + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); + else + ip_vs_service_free(svc); + } +} + + +/* + * Returns hash value for real service + */ +static inline unsigned int ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned int porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +{ + unsigned int hash; + + if (dest->in_rs_table) + return; + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; +} + +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the rs_table table. + */ + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; + } +} + +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + rcu_read_lock(); + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { + /* HIT */ + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; +} + +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Created to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * Called under RCU lock, no refcnt is returned. 
+ */ +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, + const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol, __u32 fwmark, + __u32 flags) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + __be16 port = dport; + + svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); + if (!svc) + return NULL; + if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) + port = 0; + dest = ip_vs_lookup_dest(svc, daddr, port); + if (!dest) + dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); + return dest; +} + +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is that when the dest is + * temporarily down (either by the administrator or by a monitor program), + * it can be picked back from the trash, the remaining connections to it + * can continue, and its counting information remains useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest; + struct netns_ipvs *ipvs = net_ipvs(svc->net); + + /* + * Find the destination in trash + */ + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && + dest->vport == svc->port))) { + /* HIT */ + list_del(&dest->t_list); + ip_vs_dest_hold(dest); + goto out; + } + } + + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; +} + +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + + __ip_vs_dst_cache_reset(dest); + __ip_vs_svc_put(svc, false); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} + +/* + * Clean up all the destinations in the trash + * Called by ip_vs_control_cleanup() + * + * When ip_vs_control_cleanup() is activated by the ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 0, so we simply release them here. 
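+ * The trash timer is stopped first, so nothing else walks the list.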
+ */ +static void ip_vs_trash_cleanup(struct net *net) +{ + struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); + + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } +} + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c + + spin_lock_bh(&src->lock); + + IP_VS_SHOW_STATS_COUNTER(conns); + IP_VS_SHOW_STATS_COUNTER(inpkts); + IP_VS_SHOW_STATS_COUNTER(outpkts); + IP_VS_SHOW_STATS_COUNTER(inbytes); + IP_VS_SHOW_STATS_COUNTER(outbytes); + + ip_vs_read_estimator(dst, src); + + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + + /* get current counters as zero point, rates are zeroed */ + +#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c + + IP_VS_ZERO_STATS_COUNTER(conns); + IP_VS_ZERO_STATS_COUNTER(inpkts); + IP_VS_ZERO_STATS_COUNTER(outpkts); + IP_VS_ZERO_STATS_COUNTER(inbytes); + IP_VS_ZERO_STATS_COUNTER(outbytes); + + ip_vs_zero_estimator(stats); + + spin_unlock_bh(&stats->lock); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, + struct ip_vs_dest_user_kern *udest, int add) +{ + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_service *old_svc; + struct ip_vs_scheduler *sched; + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; + conn_flags |= IP_VS_CONN_F_INACTIVE; + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in rs_table if not present. + * For now only for NAT! 
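+ * rs_table is what ip_vs_has_real_service() consults to recognize + * replies coming back from a real server.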
+ */ + ip_vs_rs_hash(ipvs, dest); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + old_svc = rcu_dereference_protected(dest->svc, 1); + if (!old_svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (old_svc != svc) { + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + __ip_vs_svc_put(old_svc, true); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; + + spin_lock_bh(&dest->dst_lock); + __ip_vs_dst_cache_reset(dest); + spin_unlock_bh(&dest->dst_lock); + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (add) { + ip_vs_start_estimator(svc->net, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); + svc->num_dests++; + if (sched->add_dest) + sched->add_dest(svc, dest); + } else { + if (sched->upd_dest) + sched->upd_dest(svc, dest); + } +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned int atype, i; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(svc->net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); + if (dest == NULL) + return -ENOMEM; + + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) + goto err_alloc; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_dest_stats; + ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); + u64_stats_init(&ip_vs_dest_stats->syncp); + } + + dest->af = svc->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 1); + + INIT_HLIST_NODE(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest, 1); + + *dest_p = dest; + + LeaveFunction(2); + return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* We use function that requires RCU lock */ + rcu_read_lock(); + dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); + + if (dest != NULL) { + IP_VS_DBG(1, "%s(): dest already exists\n", __func__); + return -EEXIST; + } + + /* 
+ * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + __ip_vs_update_dest(svc, dest, udest, 1); + ret = 0; + } else { + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + } + LeaveFunction(2); + + return ret; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* We use function that requires RCU lock */ + rcu_read_lock(); + dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest, 0); + LeaveFunction(2); + + return 0; +} + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, + bool cleanup) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_stop_estimator(net, &dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + ip_vs_rs_unhash(dest); + + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash without reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = 0; + spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_dest_put(dest); +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. 
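+ * RCU readers walking svc->destinations may still see the entry until + * a grace period has elapsed.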
+ */ + list_del_rcu(&dest->n_list); + svc->num_dests--; + + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched->del_dest) + sched->del_dest(svc, dest); + } +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + /* We use function that requires RCU lock */ + rcu_read_lock(); + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + rcu_read_unlock(); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): destination not found!\n", __func__); + return -ENOENT; + } + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + /* + * Delete the destination + */ + __ip_vs_del_dest(svc->net, dest, false); + + LeaveFunction(2); + + return 0; +} + +static void ip_vs_dest_trash_expire(unsigned long data) +{ + struct net *net = (struct net *) data; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest *dest, *next; + unsigned long now = jiffies; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + if (atomic_read(&dest->refcnt) > 0) + continue; + if (dest->idle_start) { + if (time_before(now, dest->idle_start + + IP_VS_DEST_TRASH_PERIOD)) + continue; + } else { + dest->idle_start = max(1UL, now); + continue; + } + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + spin_unlock(&ipvs->dest_trash_lock); +} + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0, i; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; + struct ip_vs_service *svc = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); + ret = -ENOENT; + goto out_err; + } + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } + } +#endif + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); + if (svc == NULL) { + IP_VS_DBG(1, "%s(): no memory\n", __func__); + ret = -ENOMEM; + goto out_err; + } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) { + ret = -ENOMEM; + goto out_err; + } + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_stats; + ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); + u64_stats_init(&ip_vs_stats->syncp); + } + + + /* I'm the first user of the service */ + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = 
u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + svc->net = net; + + INIT_LIST_HEAD(&svc->destinations); + spin_lock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ipvs->nullsvc_counter); + + ip_vs_start_estimator(net, &svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services++; + + /* Hash the service into the service table */ + ip_vs_svc_hash(svc); + + *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; + return 0; + + + out_err: + if (svc != NULL) { + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); + } + ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); + return -ENOENT; + } + old_sched = sched; + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } + } +#endif + + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + /* Bind the new scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + old_sched = sched; + goto out; + } + /* Unbind the old scheduler on success */ + ip_vs_unbind_scheduler(svc, old_sched); + } + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) + rcu_assign_pointer(svc->pe, pe); + +out: + ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); + return ret; +} + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! 
+ * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = net_ipvs(svc->net); + + pr_info("%s: enter\n", __func__); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services--; + + ip_vs_stop_estimator(svc->net, &svc->stats); + + /* Unbind scheduler */ + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); + ip_vs_scheduler_put(old_sched); + + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); + ip_vs_pe_put(old_pe); + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(svc->net, dest, cleanup); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ipvs->nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + __ip_vs_svc_put(svc, true); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Unlink a service from list and try to delete it if its refcnt reached 0 + */ +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) +{ + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); + /* + * Unhash it from the service table + */ + ip_vs_svc_unhash(svc); + + __ip_vs_del_service(svc, cleanup); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + ip_vs_unlink_service(svc, false); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(struct net *net, bool cleanup) +{ + int idx; + struct ip_vs_service *svc; + struct hlist_node *n; + + /* + * Flush the service table hashed by <netns,protocol,addr,port> + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc, cleanup); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc, cleanup); + } + } + + return 0; +} + +/* + * Delete service by {netns} in the service table. 
+ * Called by __ip_vs_cleanup() + */ +void ip_vs_service_net_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net, true); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} + +/* Put all references for device (dst_cache) */ +static inline void +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) +{ + struct ip_vs_dest_dst *dest_dst; + + spin_lock_bh(&dest->dst_lock); + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + __ip_vs_dst_cache_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_DOWN || !ipvs) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); + } + } + } + + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); + } + } + + } + } + + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); + } + spin_unlock_bh(&ipvs->dest_trash_lock); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + return 0; +} + +static int ip_vs_zero_all(struct net *net) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&net_ipvs(net)->tot_stats); + return 0; +} + +#ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3; + +static int +proc_do_defense_mode(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(net_ipvs(net)); + } + } + return rc; +} + +static int +proc_do_sync_threshold(struct ctl_table *table, int write, + void __user 
*buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || + (valp[0] >= valp[1] && valp[1]))) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + +static int +proc_do_sync_mode(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 1)) { + /* Restore the correct value */ + *valp = val; + } + } + return rc; +} + +static int +proc_do_sync_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (*valp < 1 || !is_power_of_2(*valp)) { + /* Restore the correct value */ + *valp = val; + } + } + return rc; +} + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * aligning with the netns init in ip_vs_control_net_init() + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "am_droprate", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_entry", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "secure_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "snat_reroute", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_sync_mode, + }, + { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_sync_ports, + }, + { + .procname = "sync_persist_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_qlen_max", + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_sctp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = 
proc_do_sync_threshold, + }, + { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + { + .procname = "nat_icmp_send", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "pmtu_disc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if 0 + { + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif + { } +}; + +#endif + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct seq_net_private p; /* Do not move this, netns depends upon it*/ + struct hlist_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. 
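+ * The file appears as /proc/net/ip_vs: one line per virtual service, + * followed by one line per destination.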
+ * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned int flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net) && pos-- == 0) { + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { + if (net_eq(svc->net, net) && pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct hlist_node *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [%pI6]:%04X %s ", + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, + ntohs(svc->port), + sched->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + sched->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } else { + seq_printf(seq, "FWM 
%08X %s %s", + svc->fwmark, sched->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [%pI6]:%04X" + " %-7s %-6d %-10d %-10d\n", + &dest->addr.in6, + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); +} + +static const struct file_operations ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats_user show; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, + show.inpkts, show.outpkts, + (unsigned long long) show.inbytes, + (unsigned long long) show.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8X %8X %8X %16X %16X\n", + show.cps, show.inpps, show.outpps, + show.inbps, show.outbps); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_show); +} + +static const struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; + struct ip_vs_stats_user rates; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); + unsigned int start; + __u64 inbytes, outbytes; + + do { + start = u64_stats_fetch_begin_irq(&u->syncp); + inbytes = u->ustats.inbytes; + outbytes = u->ustats.outbytes; + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); + + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, 
(__u64)inbytes, + (__u64)outbytes); + } + + spin_lock_bh(&tot_stats->lock); + + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", + tot_stats->ustats.conns, tot_stats->ustats.inpkts, + tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); + + ip_vs_read_estimator(&rates, tot_stats); + + spin_unlock_bh(&tot_stats->lock); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8X %8X %8X %16X %16X\n", + rates.cps, + rates.inpps, + rates.outpps, + rates.inbps, + rates.outbps); + + return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void 
ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + memset(udest, 0, sizeof(*udest)); + + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + struct net *net = sock_net(sk); + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) + return -EINVAL; + if (len < 0 || len > MAX_ARG_LEN) + return -EINVAL; + if (len != set_arglen[SET_CMDID(cmd)]) { + pr_err("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Handle daemons since they have another lock */ + if (cmd == IP_VS_SO_SET_STARTDAEMON || + cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + + if (mutex_lock_interruptible(&ipvs->sync_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + if (cmd == IP_VS_SO_SET_STARTDAEMON) + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); + else + ret = stop_sync_thread(net, dm->state); + mutex_unlock(&ipvs->sync_mutex); + goto out_dec; + } + + if (mutex_lock_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(net, false); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(net); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && + usvc.protocol != IPPROTO_SCTP) { + pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n", + usvc.protocol, &usvc.addr.ip, + ntohs(usvc.port), usvc.sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by <protocol, addr, port> or fwmark */ + rcu_read_lock(); + if (usvc.fwmark == 0) + svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + rcu_read_unlock(); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; 
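+ /* an ADD for an already configured service is refused */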
+ else + ret = ip_vs_add_service(net, &usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + break; + default: + ret = -EINVAL; + } + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(src->scheduler, 1); + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(struct net *net, + const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || !net_eq(svc->net, net)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || !net_eq(svc->net, net)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } +out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + rcu_read_lock(); + if (get->fwmark) + svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); + else + svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + get->port); + rcu_read_unlock(); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + memset(&entry, 0, sizeof(entry)); + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + 
ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + memset(u, 0, sizeof (*u)); + +#ifdef CONFIG_IP_VS_PROTO_TCP + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + u->udp_timeout = + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + unsigned int copylen; + struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + BUG_ON(!net); + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) + return -EINVAL; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + pr_err("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + copylen = get_arglen[GET_CMDID(cmd)]; + if (copylen > 128) + return -EINVAL; + + if (copy_from_user(arg, user, copylen) != 0) + return -EFAULT; + /* + * Handle daemons first since it has its own locking + */ + if (cmd == IP_VS_SO_GET_DAEMON) { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (mutex_lock_interruptible(&ipvs->sync_mutex)) + return -ERESTARTSYS; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; + } + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + mutex_unlock(&ipvs->sync_mutex); + return ret; + } + + if (mutex_lock_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case 
IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = ip_vs_conn_tab_size; + info.num_services = ipvs->num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(net, get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + rcu_read_lock(); + if (entry->fwmark) + svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); + else + svc = __ip_vs_service_find(net, AF_INET, + entry->protocol, &addr, + entry->port); + rcu_read_unlock(); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(net, get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + 
[IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct ip_vs_stats_user ustats; + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + ip_vs_copy_stats(&ustats, stats); + + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + goto nla_put_failure; + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) + goto nla_put_failure; + if (svc->fwmark) { + if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) + goto nla_put_failure; + } else { + if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || + nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + goto nla_put_failure; + } + + sched = rcu_dereference_protected(svc->scheduler, 1); + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || + nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || + nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + 
nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct net *net = skb_sknet(skb); + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start || !net_eq(svc->net, net)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start || !net_eq(svc->net, net)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct net *net, + struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry, + struct ip_vs_service **ret_svc) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + struct ip_vs_service *svc; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = nla_get_u16(nla_af); +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_be16(nla_port); + usvc->fwmark = 0; + } + + rcu_read_lock(); + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + rcu_read_unlock(); + *ret_svc = svc; + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags 
from service if it already exists */ + if (svc) + usvc->flags = svc->flags; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_be32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, + struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + int ret; + + ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + return ret ? ERR_PTR(ret) : svc; +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, + (atomic_read(&dest->conn_flags) & + IP_VS_CONN_F_FWD_MASK)) || + nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, + atomic_read(&dest->weight)) || + nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns))) + goto nla_put_failure; + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net = skb_sknet(skb); + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, 
ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + memset(udest, 0, sizeof(*udest)); + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_be16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + goto nla_put_failure; + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = skb_sknet(skb); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ipvs->master_mcast_ifn, + ipvs->master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ipvs->backup_mcast_ifn, + ipvs->backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&ipvs->sync_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, 
&t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(net, &t); +} + +static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) +{ + int ret = 0, cmd; + struct net *net; + struct netns_ipvs *ipvs; + + net = skb_sknet(skb); + ipvs = net_ipvs(net); + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + mutex_lock(&ipvs->sync_mutex); + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(net, daemon_attrs); + else + ret = ip_vs_genl_del_daemon(net, daemon_attrs); +out: + mutex_unlock(&ipvs->sync_mutex); + } + return ret; +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + struct net *net; + + net = skb_sknet(skb); + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(net, false); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(net, info->attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(net); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(net, &usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc, &svc); + if (ret) + goto out; + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. 
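+ * (IPVS_CMD_DEL_DEST only needs the identifying address and port,
+ * so the weight/threshold attributes are not required for it.)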
*/ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(net, &usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + /* do not use svc, it can be freed */ + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + struct net *net; + + net = skb_sknet(skb); + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + pr_err("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(net, + info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); +#ifdef CONFIG_IP_VS_PROTO_TCP + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, + t.tcp_timeout) || + nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout)) + goto nla_put_failure; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) + goto nla_put_failure; +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, + IP_VS_VERSION_CODE) || + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size)) + goto nla_put_failure; + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + goto out; + +nla_put_failure: + pr_err("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static const struct genl_ops ip_vs_genl_ops[] = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + 
.dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_daemon, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_daemon, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + return genl_register_family_with_ops(&ip_vs_genl_family, + ip_vs_genl_ops); +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + +/* + * per netns intit/exit func. 
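+ * (each namespace except init_net gets its own kmemdup'ed copy of
+ * the vs_vars sysctl table, so the tunables are per-netns)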
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ctl_table *tbl; + + atomic_set(&ipvs->dropentry, 0); + spin_lock_init(&ipvs->dropentry_lock); + spin_lock_init(&ipvs->droppacket_lock); + spin_lock_init(&ipvs->securetcp_lock); + + if (!net_eq(net, &init_net)) { + tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; + } else + tbl = vs_vars; + /* Initialize sysctl defaults */ + idx = 0; + ipvs->sysctl_amemthresh = 1024; + tbl[idx++].data = &ipvs->sysctl_amemthresh; + ipvs->sysctl_am_droprate = 10; + tbl[idx++].data = &ipvs->sysctl_am_droprate; + tbl[idx++].data = &ipvs->sysctl_drop_entry; + tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT + tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif + tbl[idx++].data = &ipvs->sysctl_secure_tcp; + ipvs->sysctl_snat_reroute = 1; + tbl[idx++].data = &ipvs->sysctl_snat_reroute; + ipvs->sysctl_sync_ver = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; + tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; + tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; + tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; + ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; + ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; + tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; + tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + ipvs->sysctl_pmtu_disc = 1; + tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; + + + ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); + if (ipvs->sysctl_hdr == NULL) { + if (!net_eq(net, &init_net)) + kfree(tbl); + return -ENOMEM; + } + ip_vs_start_estimator(net, &ipvs->tot_stats); + ipvs->sysctl_tbl = tbl; + /* Schedule defense work */ + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + + return 0; +} + +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); +} + +#else + +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } + +#endif + +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + +int __net_init ip_vs_control_net_init(struct net *net) +{ + int i, idx; + struct netns_ipvs *ipvs = net_ipvs(net); + + /* 
Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, + (unsigned long) net); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + + /* procfs stats */ + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats.cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ipvs_tot_stats; + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + u64_stats_init(&ipvs_tot_stats->syncp); + } + + spin_lock_init(&ipvs->tot_stats.lock); + + proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, net->proc_net, + &ip_vs_stats_percpu_fops); + + if (ip_vs_control_net_init_sysctl(net)) + goto err; + + return 0; + +err: + free_percpu(ipvs->tot_stats.cpustats); + return -ENOMEM; +} + +void __net_exit ip_vs_control_net_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_trash_cleanup(net); + ip_vs_control_net_cleanup_sysctl(net); + remove_proc_entry("ip_vs_stats_percpu", net->proc_net); + remove_proc_entry("ip_vs_stats", net->proc_net); + remove_proc_entry("ip_vs", net->proc_net); + free_percpu(ipvs->tot_stats.cpustats); +} + +int __init ip_vs_register_nl_ioctl(void) +{ + int ret; + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + pr_err("cannot register sockopt.\n"); + goto err_sock; + } + + ret = ip_vs_genl_register(); + if (ret) { + pr_err("cannot register Generic Netlink interface.\n"); + goto err_genl; + } + return 0; + +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: + return ret; +} + +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? */ + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + unregister_netdevice_notifier(&ip_vs_dst_notifier); + LeaveFunction(2); +} diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c new file mode 100644 index 00000000000..c3b84546ea9 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -0,0 +1,276 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell <proellt@gmx.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. 
The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include <net/ip_vs.h> + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) +{ + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + bool empty; + + b = &s->buckets[0]; + p = &svc->destinations; + empty = list_empty(p); + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. 
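+ * Every bucket drops its ip_vs_dest reference and is reset to NULL.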
+ */ +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) +{ + int i; + struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; + + b = &s->buckets[0]; + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_state *s; + + /* allocate the DH table for this service */ + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + + svc->sched_data = s; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); + + return 0; +} + + +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_state *s = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(s); + + /* release the table itself */ + kfree_rcu(s, rcu_head); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); +} + + +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_dh_state *s = svc->sched_data; + + /* assign the hash buckets with the updated service */ + ip_vs_dh_reassign(s, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_state *s; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .add_dest = ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 00000000000..1425e9a924c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,211 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * 
modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * Affected data: est_list and est_lock. + * estimation_timer() runs with timer per netns. + * get_stats()) do the per cpu summing. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/types.h> +#include <linux/interrupt.h> +#include <linux/sysctl.h> +#include <linux/list.h> + +#include <net/ip_vs.h> + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, + struct ip_vs_cpu_stats __percpu *stats) +{ + int i; + bool add = false; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); + unsigned int start; + __u64 inbytes, outbytes; + if (add) { + sum->conns += s->ustats.conns; + sum->inpkts += s->ustats.inpkts; + sum->outpkts += s->ustats.outpkts; + do { + start = u64_stats_fetch_begin(&s->syncp); + inbytes = s->ustats.inbytes; + outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->inbytes += inbytes; + sum->outbytes += outbytes; + } else { + add = true; + sum->conns = s->ustats.conns; + sum->inpkts = s->ustats.inpkts; + sum->outpkts = s->ustats.outpkts; + do { + start = u64_stats_fetch_begin(&s->syncp); + sum->inbytes = s->ustats.inbytes; + sum->outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry(&s->syncp, start)); + } + } +} + + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + struct net *net = (struct net *)arg; + struct netns_ipvs *ipvs; + + ipvs = net_ipvs(net); + spin_lock(&ipvs->est_lock); + list_for_each_entry(e, &ipvs->est_list, list) { + s = container_of(e, struct ip_vs_stats, est); + + spin_lock(&s->lock); + ip_vs_read_cpu_stats(&s->ustats, s->cpustats); + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns) << 9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps) >> 2; + + rate = (n_inpkts - e->last_inpkts) << 9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps) >> 2; + + rate = (n_outpkts - e->last_outpkts) << 9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps) >> 2; + + rate = (n_inbytes - 
e->last_inbytes) << 4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps) >> 2; + + rate = (n_outbytes - e->last_outbytes) << 4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps) >> 2; + spin_unlock(&s->lock); + } + spin_unlock(&ipvs->est_lock); + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +} + +void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&ipvs->est_lock); + list_del(&est->list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + struct ip_vs_stats_user *u = &stats->ustats; + + /* reset counters, caller must hold the stats->lock lock */ + est->last_inbytes = u->inbytes; + est->last_outbytes = u->outbytes; + est->last_conns = u->conns; + est->last_inpkts = u->inpkts; + est->last_outpkts = u->outpkts; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_stats_user *dst, + struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e = &stats->est; + + dst->cps = (e->cps + 0x1FF) >> 10; + dst->inpps = (e->inpps + 0x1FF) >> 10; + dst->outpps = (e->outpps + 0x1FF) >> 10; + dst->inbps = (e->inbps + 0xF) >> 5; + dst->outbps = (e->outbps + 0xF) >> 5; +} + +int __net_init ip_vs_estimator_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + return 0; +} + +void __net_exit ip_vs_estimator_net_cleanup(struct net *net) +{ + del_timer_sync(&net_ipvs(net)->est_timer); +} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 00000000000..77c173282f3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,501 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. 
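Before moving on to the FTP helper, the estimator above is worth a closer look. All of its arithmetic is fixed-point: rates are kept scaled (2^10 for cps/pps, 2^5 for bps) so that the EWMA avgrate = avgrate*(1-W) + rate*W with W = 2^-2 reduces to one subtract and one shift, and ip_vs_read_estimator() undoes the scaling with rounding. Counters are sampled every 2 seconds, so the per-second scaled rate is (delta << 10) / 2, i.e. delta << 9; ip_vs_read_cpu_stats() uses the u64_stats seqcount retry loop so the 64-bit byte counters read consistently even on 32-bit CPUs. A minimal user-space sketch of the same fixed-point smoothing (hypothetical counter values, not kernel code):

    #include <stdio.h>

    /* Fixed-point EWMA as used above: rate scaled by 2^10, W = 2^-2.
     * Counters are sampled every 2 s, so the per-second rate needs a
     * further divide by 2 -- hence (delta << 10) / 2 == delta << 9. */
    static long ewma_update(long avg, unsigned int delta_2s)
    {
        long rate = (long)delta_2s << 9;     /* pkts/s, scaled by 2^10 */

        return avg + ((rate - avg) >> 2);    /* avg += (rate - avg) * 2^-2 */
    }

    int main(void)
    {
        unsigned int deltas[] = { 2000, 2000, 4000, 4000, 0 }; /* pkts per 2 s tick */
        long avg = 0;
        unsigned int i;

        for (i = 0; i < 5; i++) {
            avg = ewma_update(avg, deltas[i]);
            /* decode the way ip_vs_read_estimator() does: round, unscale */
            printf("tick %u: ~%ld pkts/s\n", i, (avg + 0x1FF) >> 10);
        }
        return 0;
    }

With a steady 2000 packets per tick the decoded value converges toward 1000 pkts/s, which is exactly the smoothing behaviour the comment block describes.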
+ * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/gfp.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <asm/unaligned.h> + +#include <net/ip_vs.h> + + +#define SERVER_STRING "227 " +#define CLIENT_STRING "PORT" + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static unsigned int ports_count = 1; +static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, ushort, &ports_count, 0444); +MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern", ignoring before "skip" and terminated with + * the "term" character. + * <addr,port> is in network order. + */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, + char skip, char term, + __be32 *addr, __be16 *port, + char **start, char **end) +{ + char *s, c; + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + s = data + plen; + if (skip) { + int found = 0; + + for (;; s++) { + if (s == data_limit) + return -1; + if (!found) { + if (*s == skip) + found = 1; + } else if (*s != skip) { + break; + } + } + } + + for (data = s; ; data++) { + if (data == data_limit) + return -1; + if (*data == term) + break; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = s; ; data++) { + c = *data; + if (c == term) + break; + if (c >= '0' && c <= '9') { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *start = s; + *addr = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); + return 1; +} + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. 
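A note on the parser above: ip_vs_ftp_get_addrport() accumulates six decimal fields separated by commas; the first four bytes form the IPv4 address and the last two the port, both left in network order via get_unaligned(). A self-contained sketch of the same decoding, with a NUL-terminated input and host-order output for brevity (unlike the bounds-checked kernel version, which scans raw packet data):

    #include <stdio.h>
    #include <stdint.h>

    /* Parse "a,b,c,d,p1,p2" (FTP 227/PORT notation) into address and
     * port.  Returns 0 on success.  Hypothetical helper, simplified
     * from the kernel parser above. */
    static int parse_ftp_addrport(const char *s, uint32_t *addr, uint16_t *port)
    {
        unsigned int f[6] = { 0 };
        int i = 0;

        for (; *s; s++) {
            if (*s >= '0' && *s <= '9') {
                f[i] = f[i] * 10 + (*s - '0');
                if (f[i] > 255)
                    return -1;
            } else if (*s == ',' && i < 5) {
                i++;
            } else {
                return -1;    /* unexpected character */
            }
        }
        if (i != 5)
            return -1;
        *addr = f[0] << 24 | f[1] << 16 | f[2] << 8 | f[3];  /* host order */
        *port = (uint16_t)(f[4] << 8 | f[5]);
        return 0;
    }

    int main(void)
    {
        uint32_t a;
        uint16_t p;

        if (!parse_ftp_addrport("192,168,0,10,4,1", &a, &p))
            printf("%u.%u.%u.%u:%u\n", a >> 24, (a >> 16) & 255,
                   (a >> 8) & 255, a & 255, p);  /* 192.168.0.10:1025 */
        return 0;
    }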
+ */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + union nf_inet_addr from; + __be16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned int buf_len; + int ret = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct net *net; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, + '(', ')', + &from.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n", + &from.ip, ntohs(port), &cp->caddr.ip, 0); + + /* + * Now update or create an connection entry for it + */ + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } + if (!n_cp) { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), + AF_INET, IPPROTO_TCP, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from.ip = n_cp->vaddr.ip; + port = n_cp->vport; + snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&from.ip)[0], + ((unsigned char *)&from.ip)[1], + ((unsigned char *)&from.ip)[2], + ((unsigned char *)&from.ip)[3], + ntohs(port) >> 8, + ntohs(port) & 0xFF); + + buf_len = strlen(buf); + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) { + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. + */ + rcu_read_lock(); + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + iph->ihl * 4, + start-data, end-start, + buf, buf_len); + rcu_read_unlock(); + if (ret) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + IPPROTO_TCP, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } + } + + /* + * Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. + */ + + net = skb_net(skb); + cp->app_data = NULL; + ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. 
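On the rewrite side, ip_vs_ftp_out() re-encodes the chosen virtual address in the same comma notation. The 24-byte buffer is sized for the worst case, "255,255,255,255,255,255" (23 characters plus the terminating NUL), and nf_nat_mangle_tcp_packet() then splices the new string over the old one, absorbing any length difference. A sketch of just the encoding step (addresses and ports are made up):

    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
        uint8_t vaddr[4] = { 203, 0, 113, 7 };  /* new (virtual) address */
        uint16_t vport = 50001;                 /* new port, host order */
        char buf[24];          /* "255,255,255,255,255,255" + NUL */
        int len;

        len = snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
                       vaddr[0], vaddr[1], vaddr[2], vaddr[3],
                       vport >> 8, vport & 0xFF);
        assert(len > 0 && (size_t)len < sizeof(buf));  /* always fits */
        printf("227 Entering Passive Mode (%s).\n", buf);
        return 0;
    }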
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+                        struct sk_buff *skb, int *diff)
+{
+    struct iphdr *iph;
+    struct tcphdr *th;
+    char *data, *data_start, *data_limit;
+    char *start, *end;
+    union nf_inet_addr to;
+    __be16 port;
+    struct ip_vs_conn *n_cp;
+    struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+    /* This application helper doesn't work with IPv6 yet,
+     * so turn this into a no-op for IPv6 packets
+     */
+    if (cp->af == AF_INET6)
+        return 1;
+#endif
+
+    /* no diff required for incoming packets */
+    *diff = 0;
+
+    /* Only useful for established sessions */
+    if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+        return 1;
+
+    /* Linear packets are much easier to deal with. */
+    if (!skb_make_writable(skb, skb->len))
+        return 0;
+
+    /*
+     * Detecting whether it is passive
+     */
+    iph = ip_hdr(skb);
+    th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+    /* Since there may be OPTIONS in the TCP packet and HLEN gives
+       the header length in 32-bit words, it is accurate to
+       calculate the data address as th + HLEN*4 */
+    data = data_start = (char *)th + (th->doff << 2);
+    data_limit = skb_tail_pointer(skb);
+
+    while (data <= data_limit - 6) {
+        if (strnicmp(data, "PASV\r\n", 6) == 0) {
+            /* Passive mode on */
+            IP_VS_DBG(7, "got PASV at %td of %td\n",
+                      data - data_start,
+                      data_limit - data_start);
+            cp->app_data = &ip_vs_ftp_pasv;
+            return 1;
+        }
+        data++;
+    }
+
+    /*
+     * To support a virtual FTP server, the scenario is as follows:
+     * FTP client ----> Load Balancer ----> FTP server
+     * First detect the port number in the application data,
+     * then create a new connection entry for the coming data
+     * connection.
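One detail worth calling out in the code that follows: for active FTP the data connection originates from the server on the port directly below the control port (20 when the control port is 21), which is why the template connection below is built with htons(ntohs(cp->vport) - 1) and htons(ntohs(cp->dport) - 1). A trivial illustration of the convention:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>    /* htons/ntohs */

    int main(void)
    {
        uint16_t vport = htons(21);    /* FTP control port, network order */

        /* The data-connection template uses the adjacent lower port,
         * just as ip_vs_ftp_in() computes it. */
        printf("control %u -> data %u\n", ntohs(vport), ntohs(vport) - 1);
        return 0;
    }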
+ */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + ' ', '\r', &to.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n", + ip_vs_proto_name(iph->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, 0); + + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + } + + /* + * Move tunnel to listen state + */ + net = skb_net(skb); + ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + +/* + * per netns ip_vs_ftp initialization + */ +static int __net_init __ip_vs_ftp_init(struct net *net) +{ + int i, ret; + struct ip_vs_app *app; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + app = register_ip_vs_app(net, &ip_vs_ftp); + if (IS_ERR(app)) + return PTR_ERR(app); + + for (i = 0; i < ports_count; i++) { + if (!ports[i]) + continue; + ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); + if (ret) + goto err_unreg; + pr_info("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + return 0; + +err_unreg: + unregister_ip_vs_app(net, &ip_vs_ftp); + return ret; +} +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + unregister_ip_vs_app(net, &ip_vs_ftp); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; + +static int __init ip_vs_ftp_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns on error */ + return rv; +} + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 00000000000..547ff33c1ef --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,633 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitialized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * + * return n; + * + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing + * me to write this module. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/jiffies.h> + +/* for sysctl */ +#include <linux/fs.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct hlist_node list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ + bool dead; +}; + + +/* + * IPVS LBLC sysctl table + */ +#ifdef CONFIG_SYSCTL +static struct ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static void ip_vs_lblc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_lblc_entry *en = container_of(head, + struct ip_vs_lblc_entry, + rcu_head); + + ip_vs_dest_put_and_free(en->dest); + kfree(en); +} + +static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) +{ + hlist_del_rcu(&en->list); + call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); +} + +/* + * Returns hash value for IPVS LBLC entry + */ 
+static inline unsigned int +ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); + + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* Get ip_vs_lblc_entry associated with supplied parameters. */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned int hash = ip_vs_lblc_hashkey(af, addr); + struct ip_vs_lblc_entry *en; + + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under spin lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(dest->af, tbl, daddr); + if (en) { + if (en->dest == dest) + return en; + ip_vs_lblc_del(en); + } + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; + + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; + + ip_vs_dest_hold(dest); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; + int i; + + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblc_del(en); + atomic_dec(&tbl->entries); + } + } + spin_unlock_bh(&svc->sched_lock); +} + +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblc_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; + unsigned long now = jiffies; + int i, j; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + + sysctl_lblc_expiration(svc))) + continue; + + ip_vs_lblc_del(en); + atomic_dec(&tbl->entries); + } + spin_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. 
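The bucket selection above (and its LBLCR twin later in this diff) is classic Knuth multiplicative hashing: fold the address to 32 bits (the four IPv6 words XORed together, or the IPv4 address itself), multiply by the golden-ratio prime 2654435761 (close to 2^32 divided by the golden ratio), and mask down to IP_VS_LBLC_TAB_BITS. A standalone sketch, taking a host-order input for brevity where the kernel applies ntohl() first:

    #include <stdio.h>
    #include <stdint.h>

    #define TAB_BITS 10
    #define TAB_MASK ((1u << TAB_BITS) - 1)

    /* Knuth multiplicative hash: 2654435761 is a prime near
     * 2^32 / golden-ratio.  The masked low bits of the product give
     * the bucket in a power-of-two sized table. */
    static unsigned int hashkey(uint32_t addr)
    {
        return (addr * 2654435761u) & TAB_MASK;
    }

    int main(void)
    {
        uint32_t base = 0xc0a80001;    /* 192.168.0.1 */
        unsigned int i;

        for (i = 0; i < 4; i++)    /* adjacent addresses scatter widely */
            printf("%#lx -> bucket %u\n",
                   (unsigned long)(base + i), hashkey(base + i));
        return 0;
    }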
+ */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_del(en); + atomic_dec(&tbl->entries); + goal--; + } + spin_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + tbl->dead = 0; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(svc); + + /* release the table itself */ + kfree_rcu(tbl, rcu_head); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
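Two bits of arithmetic in this file deserve a worked example. The GC pass above aims to free goal = (entries - max_size)*4/3 entries, clamped to max_size/2 per run. And the scheduling comment here is the core trick of the whole weighted least-connection family: instead of dividing, compare loh/lweight against doh/dweight as the cross products loh*dweight versus doh*lweight, which is exact for positive weights; the (__s64) casts in the loop that completes this search keep the products from overflowing 32 bits. A minimal sketch of that comparison (figures are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    struct dest { const char *name; int overhead; int weight; };

    /* Pick the least loaded server by overhead/weight without floats:
     * a/b < c/d  <=>  a*d < c*b  for positive b, d.  Widen to 64 bits
     * before multiplying, as the kernel does with (__s64) casts. */
    static const struct dest *least_loaded(const struct dest *d, int n)
    {
        const struct dest *least = NULL;
        int i;

        for (i = 0; i < n; i++) {
            if (d[i].weight <= 0)    /* weight 0 means quiesced */
                continue;
            if (!least || (int64_t)d[i].overhead * least->weight <
                          (int64_t)least->overhead * d[i].weight)
                least = &d[i];
        }
        return least;
    }

    int main(void)
    {
        struct dest d[] = {
            { "rs1", 900, 3 },    /* 300 per weight unit */
            { "rs2", 500, 2 },    /* 250 -> wins */
            { "rs3", 100, 0 },    /* quiesced */
        };
        const struct dest *best = least_loaded(d, 3);

        printf("%s\n", best ? best->name : "none");
        return 0;
    }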
+ */ + nextstage: + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLC: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry_rcu(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + dest = en->dest; + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + } + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + +/* + * per netns init. 
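Tying ip_vs_lblc_schedule() back to the pseudo code at the top of the file: a cache hit is trusted only while the remembered server is available, weighted and not overloaded; otherwise the scheduler falls through to the weighted least-connection pick and writes it back under sched_lock. A compressed, runnable model of that decision (all names are stand-ins, not kernel symbols):

    #include <stdio.h>

    struct dest { const char *name; int weight; int overloaded; int conns; };

    static struct dest pool[2] = { { "rs1", 1, 0, 10 }, { "rs2", 1, 0, 3 } };
    static struct dest *cache[1];  /* one-bucket stand-in for the lblc table */

    static int usable(const struct dest *d)
    {
        return d->weight > 0 && !d->overloaded;
    }

    static struct dest *least_conn(void)
    {
        struct dest *best = NULL;
        int i;

        for (i = 0; i < 2; i++)        /* weights are equal here, so   */
            if (usable(&pool[i]) &&    /* plain least-connections      */
                (!best || pool[i].conns < best->conns))
                best = &pool[i];
        return best;
    }

    static struct dest *lblc_pick(unsigned int key)
    {
        struct dest *d = cache[key];

        if (d && usable(d))
            return d;       /* cache hit: preserve locality */
        d = least_conn();   /* miss or unusable: re-schedule */
        if (d)
            cache[key] = d; /* ... and remember the mapping */
        return d;
    }

    int main(void)
    {
        printf("%s\n", lblc_pick(0)->name);  /* rs2 (fewer conns), cached */
        pool[1].overloaded = 1;              /* rs2 becomes overloaded */
        printf("%s\n", lblc_pick(0)->name);  /* falls back to rs1 */
        return 0;
    }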
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblc_ctl_table[0].procname = NULL; + + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblc_ops); + return ret; +} + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); + rcu_barrier(); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 00000000000..3f21a2f47de --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,818 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. 
+ * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n <- {weighted least-conn node}; + * add n to serverSet[dest_ip]; + * if |serverSet[dest_ip]| > 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <linux/slab.h> + +/* for sysctl */ +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <net/net_namespace.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_set_elem { + struct list_head list; /* list link */ + struct ip_vs_dest *dest; /* destination server */ + struct rcu_head rcu_head; +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct list_head list; /* destination list */ +}; + + +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) +{ + struct ip_vs_dest_set_elem *e; + + if (check) { + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + return; + } + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) + return; + + ip_vs_dest_hold(dest); + e->dest = dest; + + list_add_rcu(&e->list, &set->list); + atomic_inc(&set->size); + + set->lastmod = jiffies; +} + +static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_set_elem *e; + + e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); + ip_vs_dest_put_and_free(e->dest); + kfree(e); +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_set_elem *e; + + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) { + /* HIT */ + atomic_dec(&set->size); + set->lastmod = jiffies; + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); + break; + } + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_set_elem *e, *ep; + + list_for_each_entry_safe(e, ep, &set->list, list) { + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); + } +} + +/* get weighted 
least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry_rcu(e, &set->list, list) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + list_for_each_entry_continue_rcu(e, &set->list, list) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if (((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry(e, &set->list, list) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = ip_vs_dest_conn_overhead(most); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + list_for_each_entry_continue(e, &set->list, list) { + dest = e->dest; + doh = ip_vs_dest_conn_overhead(dest); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if (((__s64)moh * atomic_read(&dest->weight) < + (__s64)doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct hlist_node list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ + bool dead; +}; + + +#ifdef CONFIG_SYSCTL +/* + * IPVS LBLCR sysctl table + */ + +static struct ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = NULL, + .maxlen = sizeof(int), + .mode = 
0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + hlist_del_rcu(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree_rcu(en, rcu_head); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned int +ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* Get ip_vs_lblcr_entry associated with supplied parameters. */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned int hash = ip_vs_lblcr_hashkey(af, addr); + struct ip_vs_lblcr_entry *en; + + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under spin lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(dest->af, tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; + + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; + + /* initialize its dest set */ + atomic_set(&(en->set.size), 0); + INIT_LIST_HEAD(&en->set.list); + + ip_vs_dest_set_insert(&en->set, dest, false); + + ip_vs_lblcr_hash(tbl, en); + return en; + } + + ip_vs_dest_set_insert(&en->set, dest, true); + + return en; +} + + +/* + * Flush all the entries of the specified table. 
+ */ +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + int i; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; + + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblcr_free(en); + } + } + spin_unlock_bh(&svc->sched_lock); +} + +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblcr_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_after(en->lastuse + + sysctl_lblcr_expiration(svc), now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + spin_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + spin_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + tbl->dead = 0; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, 
ip_vs_lblcr_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(svc); + + /* release the table itself */ + kfree_rcu(tbl, rcu_head); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. 
+ */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry_rcu(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct ip_vs_dest *dest; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); + if (en) { + en->lastuse = jiffies; + + /* Get the least loaded destination */ + dest = ip_vs_dest_set_min(&en->set); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_lblcr_expiration(svc))) { + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; + + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) + goto out; + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* Update our cache entry */ + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); + goto out; + } + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + +/* + * per netns init. 
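The replication part is what separates LBLCR from LBLC: a miss adds the newly scheduled server to the per-destination set rather than overwriting it, and once the set has held more than one server for longer than lblcr_expiration, the busiest member is dropped again. A sketch of that grow-then-shrink behaviour on a plain array (illustrative only; the kernel keeps an RCU list and counts overhead, not raw connections):

    #include <stdio.h>

    #define EXPIRE 5    /* stand-in for sysctl_lblcr_expiration, in ticks */

    struct set { int srv[8]; int conns[8]; int n; long lastmod; };

    static void set_insert(struct set *s, int srv, long now)
    {
        int i;

        for (i = 0; i < s->n; i++)
            if (s->srv[i] == srv)
                return;            /* already replicated */
        s->srv[s->n] = srv;
        s->conns[s->n++] = 0;
        s->lastmod = now;
    }

    /* Drop the most loaded replica once the set has been >1 for a while. */
    static void set_trim(struct set *s, long now)
    {
        int i, max = 0;

        if (s->n <= 1 || now - s->lastmod <= EXPIRE)
            return;
        for (i = 1; i < s->n; i++)
            if (s->conns[i] > s->conns[max])
                max = i;
        s->srv[max] = s->srv[--s->n];    /* remove the busiest member */
        s->conns[max] = s->conns[s->n];
        s->lastmod = now;
    }

    int main(void)
    {
        struct set s = { .n = 0 };

        set_insert(&s, 1, 0);   /* first server for this destination */
        set_insert(&s, 2, 1);   /* overload forced a second replica */
        s.conns[0] = 40;        /* server 1 is now the busy one */
        set_trim(&s, 10);       /* expired: busiest replica is dropped */
        printf("left: %d server(s), srv %d\n", s.n, s.srv[0]);
        return 0;
    }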
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblcr_ctl_table[0].procname = NULL; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblcr_ops); + return ret; +} + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); + rcu_barrier(); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 00000000000..2bdcb1cf212 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,93 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. 
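A worked example of the overhead used by the loop below: least-connections weights active connections far more heavily than inactive ones (the overhead shifts activeconns left before adding inactconns), so a server draining many half-closed sessions can still beat one with a few live connections, and weight 0 merely quiesces a server rather than removing it. The shift factor here is illustrative; see ip_vs_dest_conn_overhead() for the real one:

    #include <stdio.h>

    /* Illustrative overhead: active connections dominate inactive ones. */
    static unsigned int overhead(unsigned int active, unsigned int inact)
    {
        return (active << 8) + inact;
    }

    int main(void)
    {
        /* 3 active/500 inactive vs 10 active/0 inactive */
        printf("rs1=%u rs2=%u -> rs1 wins\n",
               overhead(3, 500), overhead(10, 0));
        return 0;
    }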
+ */ + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_dest_conn_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (!least) + ip_vs_scheduler_err(svc, "no destination available"); + else + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " + "inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 00000000000..5882bbfd198 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,299 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * + * Authors: + * Ben North <ben@redfrontdoor.org> + * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels + * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match + * + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... 
iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/ip_vs.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> + + +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* Applications may adjust TCP seqs */ + if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && + !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. 
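Before the expectation callback below, a worked view of what ip_vs_update_conntrack() above changes is useful. For a NAT-ed connection CIP->VIP that IPVS forwards to RIP, the original-direction tuple stays CIP->VIP, but the reply tuple must become RIP->CIP so the real server's answers match the same conntrack entry. An illustration with hypothetical addresses, using plain structs rather than the kernel tuple types:

    #include <stdio.h>

    struct tuple { const char *src, *dst; unsigned short sport, dport; };

    int main(void)
    {
        /* Client 10.0.0.5 -> VIP 192.0.2.1:80, scheduled to RIP 10.1.1.7:8080 */
        struct tuple orig  = { "10.0.0.5", "192.0.2.1", 40000, 80 };
        struct tuple reply = { "192.0.2.1", "10.0.0.5", 80, 40000 };

        /* outin direction: replies really come from the real server, so
         * the reply tuple's source is rewritten to daddr/dport (the RIP),
         * as ip_vs_update_conntrack() does for IP_VS_CONN_F_MASQ. */
        reply.src = "10.1.1.7";
        reply.sport = 8080;

        printf("orig : %s:%u -> %s:%u (untouched)\n",
               orig.src, orig.sport, orig.dst, orig.dport);
        printf("reply: %s:%u -> %s:%u (altered)\n",
               reply.src, reply.sport, reply.dst, reply.dport);
        return 0;
    }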
+ */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(&p); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? 
&cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 00000000000..961a6de9bb2 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,142 @@ +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static inline int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. 
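+ *
+ * Worked example (made-up numbers): a server with 3 active
+ * connections gets overhead 3 + 1 = 4, while a truly idle one gets 1.
+ * The scheduler below short-circuits on activeconns == 0, so an idle
+ * server wins immediately; otherwise candidates are compared by
+ * overhead/weight using the cross-multiplied integer form
+ *
+ *	loh * weight(dest) > doh * weight(least)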
+ */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *least = NULL; + int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 00000000000..1a82b29ce8e --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,111 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <asm/string.h> +#include <linux/kmod.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* semaphore for IPVS PEs. 
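+ * It is in fact a mutex: writers serialize list updates with it while
+ * readers walk the list under RCU. A PE module registers itself like
+ * this (sketch only; the names are made up):
+ *
+ *	static struct ip_vs_pe my_pe = {
+ *		.name = "my_pe",
+ *		.module = THIS_MODULE,
+ *		.n_list = LIST_HEAD_INIT(my_pe.n_list),
+ *		.fill_param = my_fill_param,
+ *	};
+ *	register_ip_vs_pe(&my_pe);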
*/ +static DEFINE_MUTEX(ip_vs_pe_mutex); + +/* Get pe in the pe list by name */ +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if (pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + rcu_read_unlock(); + return pe; + } + if (pe->module) + module_put(pe->module); + } + rcu_read_unlock(); + + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe by name */ + pe = __ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = __ip_vs_pe_getbyname(name); + } + + return pe; +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&ip_vs_pe_mutex); + /* Make sure that the pe with this name doesn't exist + * in the pe list. + */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + mutex_unlock(&ip_vs_pe_mutex); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + mutex_lock(&ip_vs_pe_mutex); + /* Remove it from the d-linked pe list */ + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c new file mode 100644 index 00000000000..bed5f704252 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -0,0 +1,171 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +#ifdef CONFIG_IP_VS_DEBUG +static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, + const char *callid, size_t callid_len, + int *idx) +{ + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); + memcpy(buf + *idx, callid, len); + buf[*idx+len] = '\0'; + *idx += len + 1; + return buf + *idx - len; +} + +#define IP_VS_DEBUG_CALLID(callid, len) \ + ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ + callid, len, &ip_vs_dbg_idx) +#endif + +static int get_callid(const char *dptr, unsigned int dataoff, + unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen) +{ + /* Find callid */ + while (1) { + int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen, + SIP_HDR_CALL_ID, matchoff, + matchlen); + if (ret > 0) + break; + if (!ret) + return -EINVAL; + dataoff += *matchoff; + } + + /* Too large is useless */ + if (*matchlen > IP_VS_PEDATA_MAXLEN) + return 
-EINVAL; + + /* SIP headers are always followed by a line terminator */ + if (*matchoff + *matchlen == datalen) + return -EINVAL; + + /* RFC 2543 allows lines to be terminated with CR, LF or CRLF, + * RFC 3261 allows only CRLF, we support both. */ + if (*(dptr + *matchoff + *matchlen) != '\r' && + *(dptr + *matchoff + *matchlen) != '\n') + return -EINVAL; + + IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n", + IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen), + *matchlen); + return 0; +} + +static int +ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) +{ + struct ip_vs_iphdr iph; + unsigned int dataoff, datalen, matchoff, matchlen; + const char *dptr; + int retc; + + ip_vs_fill_iph_skb(p->af, skb, &iph); + + /* Only useful with UDP */ + if (iph.protocol != IPPROTO_UDP) + return -EINVAL; + /* todo: IPv6 fragments: + * I think this only should be done for the first fragment. /HS + */ + dataoff = iph.len + sizeof(struct udphdr); + + if (dataoff >= skb->len) + return -EINVAL; + retc = skb_linearize(skb); + if (retc < 0) + return retc; + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + + if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) + return -EINVAL; + + /* N.B: pe_data is only set on success, + * this allows fallback to the default persistence logic on failure + */ + p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + + p->pe_data_len = matchlen; + + return 0; +} + +static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct) + +{ + bool ret = false; + + if (ct->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && + /* protocol should only be IPPROTO_IP if + * d_addr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + p->vaddr, &ct->vaddr) && + ct->vport == p->vport && + ct->flags & IP_VS_CONN_F_TEMPLATE && + ct->protocol == p->protocol && + ct->pe_data && ct->pe_data_len == p->pe_data_len && + !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) + ret = true; + + IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? 
"hit" : "not hit"); + + return ret; +} + +static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, + u32 initval, bool inverse) +{ + return jhash(p->pe_data, p->pe_data_len, initval); +} + +static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) +{ + memcpy(buf, cp->pe_data, cp->pe_data_len); + return cp->pe_data_len; +} + +static struct ip_vs_pe ip_vs_sip_pe = +{ + .name = "sip", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list), + .fill_param = ip_vs_sip_fill_param, + .ct_match = ip_vs_sip_ct_match, + .hashkey_raw = ip_vs_sip_hashkey_raw, + .show_pe_data = ip_vs_sip_show_pe_data, +}; + +static int __init ip_vs_sip_init(void) +{ + return register_ip_vs_pe(&ip_vs_sip_pe); +} + +static void __exit ip_vs_sip_cleanup(void) +{ + unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); +} + +module_init(ip_vs_sip_init); +module_exit(ip_vs_sip_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 00000000000..939f7fbe9b4 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,409 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/gfp.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. 
+ */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + +/* + * register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); + + if (!pd) + return -ENOMEM; + + pd->pp = pp; /* For speed issues */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink an free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } + + return 0; +} + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + +/* + * unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data **pd_p; + unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(net, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} + +/* + * get ip_vs_protocol object by its proto. 
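+ *
+ * The lookup walks bucket IP_VS_PROTO_HASH(proto), e.g. IPPROTO_TCP
+ * (6) hashes to bucket 6, IPPROTO_SCTP (132) to 132 & 31 = 4 and
+ * IPPROTO_ESP (50) to 50 & 31 = 18. Typical use (illustration):
+ *
+ *	struct ip_vs_protocol *pp = ip_vs_proto_get(IPPROTO_TCP);
+ *
+ * which yields the registered handler, or NULL if that protocol
+ * support was not compiled in.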
+ */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned int hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} +EXPORT_SYMBOL(ip_vs_proto_get); + +/* + * get ip_vs_protocol object data by netns and proto + */ +static struct ip_vs_proto_data * +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ + struct ip_vs_proto_data *pd; + unsigned int hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + return __ipvs_proto_data_get(ipvs, proto); +} +EXPORT_SYMBOL(ip_vs_proto_data_get); + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) +{ + struct ip_vs_proto_data *pd; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_KERNEL); +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, const char *const *names, + const char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return (IPPROTO_IP == proto) ? 
"NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI4->%pI4", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI4:%u->%pI4:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI6c->%pI6c", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI6c:%u->%pI6c:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + +/* + * per network name-space init + */ +int __net_init ip_vs_protocol_net_init(struct net *net) +{ + int i, ret; + static struct ip_vs_protocol *protos[] = { +#ifdef CONFIG_IP_VS_PROTO_TCP + &ip_vs_protocol_tcp, +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + &ip_vs_protocol_udp, +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + &ip_vs_protocol_sctp, +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + &ip_vs_protocol_ah, +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + &ip_vs_protocol_esp, +#endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } + return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; +} + +void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(net, pd); + } +} + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + REGISTER_PROTOCOL(&ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); 
+#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + pr_info("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000000..5de3dd312c0 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,165 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 + * Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + +static void +ah_esp_conn_fill_param_proto(struct net *net, int af, + const struct ip_vs_iphdr *iph, int inverse, + struct ip_vs_conn_param *p) +{ + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} + +static struct ip_vs_conn * +ah_esp_conn_in_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + int inverse) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); + + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_in_get(&p); + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, int inverse) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); + + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_out_get(&p); + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. 
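+ * Scheduling happens on the ISAKMP (UDP port 500) control traffic
+ * instead; the conn_in/out_get helpers above then map an AH/ESP
+ * packet onto that template, e.g. (addresses made up) ESP
+ * 192.0.2.1 -> 192.0.2.9 is looked up as if it were
+ * UDP 192.0.2.1:500 -> 192.0.2.9:500.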
+ */ + *verdict = NF_ACCEPT; + return 0; +} + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c new file mode 100644 index 00000000000..2f7ea756404 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -0,0 +1,591 @@ +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <net/ip.h> +#include <net/ip6_checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/sctp/checksum.h> +#include <net/ip_vs.h> + +static int +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct net *net; + struct ip_vs_service *svc; + struct netns_ipvs *ipvs; + sctp_chunkhdr_t _schunkh, *sch; + sctp_sctphdr_t *sh, _sctph; + + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh == NULL) { + *verdict = NF_DROP; + return 0; + } + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch == NULL) { + *verdict = NF_DROP; + return 0; + } + + net = skb_net(skb); + ipvs = net_ipvs(net); + rcu_read_lock(); + if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, sh->dest))) { + int ignored; + + if (ip_vs_todrop(ipvs)) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + rcu_read_unlock(); + *verdict = NF_DROP; + return 0; + } + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
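+ * ip_vs_schedule() reports back through 'ignored': as handled
+ * below, ignored == 0 with no connection means "ask ip_vs_leave()"
+ * (bypass or ICMP error), ignored < 0 means drop, and ignored > 0
+ * means the packet is simply not for us and passes with NF_ACCEPT.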
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + /* NF_ACCEPT */ + return 1; +} + +static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph, + unsigned int sctphoff) +{ + sctph->checksum = sctp_compute_cksum(skb, sctphoff); + skb->ip_summed = CHECKSUM_UNNECESSARY; +} + +static int +sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff = iph->len; + bool payload_csum = false; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + ret = ip_vs_app_pkt_out(cp, skb); + if (ret == 0) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + + /* Only update csum if we really have to */ + if (sctph->source != cp->vport || payload_csum || + skb->ip_summed == CHECKSUM_PARTIAL) { + sctph->source = cp->vport; + sctp_nat_csum(skb, sctph, sctphoff); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + return 1; +} + +static int +sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff = iph->len; + bool payload_csum = false; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + ret = ip_vs_app_pkt_in(cp, skb); + if (ret == 0) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + + /* Only update csum if we really have to */ + if (sctph->dest != cp->dport || payload_csum || + (skb->ip_summed == CHECKSUM_PARTIAL && + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { + sctph->dest = cp->dport; + sctp_nat_csum(skb, sctph, sctphoff); + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + return 1; +} + +static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int sctphoff; + struct sctphdr *sh, _sctph; + __le32 cmp, val; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + cmp = sh->checksum; + val = sctp_compute_cksum(skb, sctphoff); + + if (val != cmp) { + /* CRC failure, dump it. 
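+ * Note that SCTP is protected by a CRC32c (RFC 4960) rather than
+ * the Internet checksum, hence the recomputation with
+ * sctp_compute_cksum() instead of the csum_* helpers used for
+ * TCP/UDP.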
*/ + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + return 1; +} + +enum ipvs_sctp_event_t { + IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ + IP_VS_SCTP_INIT, + IP_VS_SCTP_INIT_ACK, + IP_VS_SCTP_COOKIE_ECHO, + IP_VS_SCTP_COOKIE_ACK, + IP_VS_SCTP_SHUTDOWN, + IP_VS_SCTP_SHUTDOWN_ACK, + IP_VS_SCTP_SHUTDOWN_COMPLETE, + IP_VS_SCTP_ERROR, + IP_VS_SCTP_ABORT, + IP_VS_SCTP_EVENT_LAST +}; + +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { + [SCTP_CID_DATA] = IP_VS_SCTP_DATA, + [SCTP_CID_INIT] = IP_VS_SCTP_INIT, + [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, + [SCTP_CID_SACK] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, + [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, + [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, + [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, + [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, + [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, + [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, + [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, + [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, + [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, +}; + +/* SCTP States: + * See RFC 2960, 4. SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. + * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ + +#define sNO IP_VS_SCTP_S_NONE +#define sI1 IP_VS_SCTP_S_INIT1 +#define sIN IP_VS_SCTP_S_INIT +#define sCS IP_VS_SCTP_S_COOKIE_SENT +#define sCR IP_VS_SCTP_S_COOKIE_REPLIED +#define sCW IP_VS_SCTP_S_COOKIE_WAIT +#define sCO IP_VS_SCTP_S_COOKIE +#define sCE IP_VS_SCTP_S_COOKIE_ECHOED +#define sES IP_VS_SCTP_S_ESTABLISHED +#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT +#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ IP_VS_SCTP_S_REJECTED +#define sCL IP_VS_SCTP_S_CLOSED + +static const __u8 sctp_states + [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { + { /* INPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCS, 
sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* OUTPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, +/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, +/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* INPUT-ONLY */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, +}; + +#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) + +/* Timeout table[state] */ +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_LAST] = 2 * HZ, +}; + +static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT1] = "INIT1", + [IP_VS_SCTP_S_INIT] = "INIT", + [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", + [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", + [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", + [IP_VS_SCTP_S_COOKIE] = "COOKIE", + [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", + [IP_VS_SCTP_S_REJECTED] = "REJECTED", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + 
[IP_VS_SCTP_S_LAST] = "BUG!", +}; + + +static const char *sctp_state_name(int state) +{ + if (state >= IP_VS_SCTP_S_LAST) + return "ERR!"; + if (sctp_state_name_table[state]) + return sctp_state_name_table[state]; + return "?"; +} + +static inline void +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, const struct sk_buff *skb) +{ + sctp_chunkhdr_t _sctpch, *sch; + unsigned char chunk_type; + int event, next_state; + int ihl, cofs; + +#ifdef CONFIG_IP_VS_IPV6 + ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + ihl = ip_hdrlen(skb); +#endif + + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); + if (sch == NULL) + return; + + chunk_type = sch->type; + /* + * Section 3: Multiple chunks can be bundled into one SCTP packet + * up to the MTU size, except for the INIT, INIT ACK, and + * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with + * any other chunk in a packet. + * + * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control + * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be + * bundled with an ABORT, but they MUST be placed before the ABORT + * in the SCTP packet or they will be ignored by the receiver. + */ + if ((sch->type == SCTP_CID_COOKIE_ECHO) || + (sch->type == SCTP_CID_COOKIE_ACK)) { + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) + chunk_type = sch->type; + } + } + + event = (chunk_type < sizeof(sctp_events)) ? + sctp_events[chunk_type] : IP_VS_SCTP_DATA; + + /* Update direction to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (direction == IP_VS_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + direction = IP_VS_DIR_INPUT_ONLY; + } + + next_state = sctp_states[direction][event][cp->state]; + + if (next_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((direction == IP_VS_DIR_OUTPUT) ? + "output " : "input "), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + sctp_state_name(cp->state), + sctp_state_name(next_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state == IP_VS_SCTP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? 
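+ * (pd should always be set here; the static table is
+ * only a safety net.)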
*/ + cp->timeout = sctp_timeouts[cp->state = next_state]; +} + +static void +sctp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, struct ip_vs_proto_data *pd) +{ + spin_lock_bh(&cp->lock); + set_sctp_state(pd, cp, direction, skb); + spin_unlock_bh(&cp->lock); +} + +static inline __u16 sctp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) + & SCTP_APP_TAB_MASK; +} + +static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + hash = sctp_app_hashkey(port); + + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->appcnt); +out: + + return ret; +} + +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); +} + +static int sctp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + /* Lookup application incarnations and bind the right one */ + hash = sctp_app_hashkey(cp->vport); + + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + rcu_read_unlock(); +out: + return result; +} + +/* --------------------------------------------- + * timeouts is netns related now. 
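+ * Each netns gets a private copy of the timeout table through
+ * ip_vs_create_timeout_table() (a kmemdup of sctp_timeouts above),
+ * so tuning timeouts in one namespace cannot affect another.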
+ * --------------------------------------------- + */ +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; +} + +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + +struct ip_vs_protocol ip_vs_protocol_sctp = { + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, + .unregister_app = sctp_unregister_app, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .csum_check = sctp_csum_check, + .state_name = sctp_state_name, + .state_transition = sctp_state_transition, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 00000000000..e3a697234a9 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,712 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <net/ip6_checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + +static int +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct net *net; + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + struct netns_ipvs *ipvs; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + net = skb_net(skb); + ipvs = net_ipvs(net); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + rcu_read_lock(); + if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, th->dest))) { + int ignored; + + if (ip_vs_todrop(ipvs)) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + rcu_read_unlock(); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
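+ * Note the guard above: normally only SYN segments (without RST)
+ * are scheduled, but with sysctl_sloppy_tcp set a connection can
+ * also be created from a mid-stream packet, e.g. so that a backup
+ * director can adopt established connections after a failover.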
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + /* NF_ACCEPT */ + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); + else +#endif + tcph->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct tcphdr *tcph; + unsigned int tcphoff = iph->len; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!(ret = ip_vs_app_pkt_out(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct tcphdr *tcph; + unsigned int tcphoff = iph->len; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. 
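+ * (CHECKSUM_UNNECESSARY/CHECKSUM_PARTIAL: hardware has already
+ * verified the checksum or it will be filled in on transmit.)
+ * Also note the deliberate fall-through above from CHECKSUM_NONE
+ * into CHECKSUM_COMPLETE once skb->csum has been computed.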
*/ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? 
tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + pd->tcp_state_table = (on ? 
tcp_states_dos : tcp_states); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; +} + +/* + * Handle state transitions + */ +static void +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? 
ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+ int ihl = ip_hdrlen(skb);
+#endif
+
+ th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return;
+
+ spin_lock_bh(&cp->lock);
+ set_tcp_state(pd, cp, direction, th);
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
+ & TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ hash = tcp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+
+ out:
+ return ret;
+}
+
+
+static void
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = tcp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
+ out:
+ return result;
+}
+
+
+/*
+ * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&cp->lock);
+ cp->state = IP_VS_TCP_S_LISTEN;
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
+ spin_unlock_bh(&cp->lock);
+}
+
+/* ---------------------------------------------
+ * timeouts are netns related now.
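+ * Each netns allocates its own copy of the timeout table below.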
+ * --------------------------------------------- + */ +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + pd->tcp_state_table = tcp_states; + return 0; +} + +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 00000000000..b62a3c0ff9b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,499 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/udp.h> + +#include <net/ip_vs.h> +#include <net/ip.h> +#include <net/ip6_checksum.h> + +static int +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct net *net; + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + net = skb_net(skb); + rcu_read_lock(); + svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, uh->dest); + if (svc) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + rcu_read_unlock(); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
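+ * (If ip_vs_schedule() does not create one, 'ignored' selects the
+ * verdict below: ip_vs_leave() when it is zero, NF_DROP when negative.)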
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + /* NF_ACCEPT */ + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); +} + + +static int +udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct udphdr *udph; + unsigned int udphoff = iph->len; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!(ret = ip_vs_app_pkt_out(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct udphdr *udph; + unsigned int udphoff = iph->len; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + + hash = udp_app_hashkey(port); + + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + return ret; +} + + +static void +udp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + rcu_read_unlock(); + + out: + return result; +} + 
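A side note on the fast path above: when only addresses and ports change, the ones-complement checksum can be fixed up from the old and new values alone (RFC 1624), with no pass over the payload. A minimal user-space sketch of the idea, with made-up example values (csum16_update() is hypothetical, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

/* HC' = ~(~HC + ~m + m') -- RFC 1624, equation 3 */
static uint16_t csum16_update(uint16_t check, uint16_t old, uint16_t new)
{
    uint32_t sum = (uint16_t)~check;

    sum += (uint16_t)~old;
    sum += new;
    while (sum >> 16)   /* fold the carries back into 16 bits */
        sum = (sum & 0xffff) + (sum >> 16);
    return ~sum;
}

int main(void)
{
    /* e.g. rewriting a source port from 8080 to 80 under NAT */
    printf("0x%04x\n", csum16_update(0x1c46, 8080, 80));
    return 0;
}

The ip_vs_check_diff2/4/16 helpers used by tcp_fast_csum_update() and udp_fast_csum_update() apply the same arithmetic over __wsum values; the UDP variants additionally map a resulting zero checksum to CSUM_MANGLED_0, since 0 means "no checksum" for UDP over IPv4.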
+
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
+ [IP_VS_UDP_S_LAST] = 2*HZ,
+};
+
+static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = "UDP",
+ [IP_VS_UDP_S_LAST] = "BUG!",
+};
+
+static const char * udp_state_name(int state)
+{
+ if (state >= IP_VS_UDP_S_LAST)
+ return "ERR!";
+ return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static void
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+}
+
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+ .name = "UDP",
+ .protocol = IPPROTO_UDP,
+ .num_states = IP_VS_UDP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
+ .conn_schedule = udp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = udp_snat_handler,
+ .dnat_handler = udp_dnat_handler,
+ .csum_check = udp_csum_check,
+ .state_transition = udp_state_transition,
+ .state_name = udp_state_name,
+ .register_app = udp_register_app,
+ .unregister_app = udp_unregister_app,
+ .app_conn_bind = udp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
new file mode 100644
index 00000000000..176b87c35e3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,130 @@
+/*
+ * IPVS: Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_rr_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
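+ * (After list_del, p->next->prev points at the entry that preceded
+ * the deleted dest, so the round-robin pointer can resume from there.)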
+ */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); + + do { + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; + } + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. + */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + out: + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), + .init_service = ip_vs_rr_init_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 00000000000..4dbcda6258b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,255 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <asm/string.h> +#include <linux/kmod.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + +EXPORT_SYMBOL(ip_vs_scheduler_err); +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + pr_err("%s(): init error\n", __func__); + return ret; + } + } + rcu_assign_pointer(svc->scheduler, scheduler); + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) +{ + struct ip_vs_scheduler *cur_sched; + + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; + + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can not be set to NULL */ +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); + + mutex_lock(&ip_vs_sched_mutex); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + mutex_unlock(&ip_vs_sched_mutex); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + mutex_unlock(&ip_vs_sched_mutex); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler && scheduler->module) + module_put(scheduler->module); +} + +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ + struct ip_vs_scheduler *sched; + + sched = rcu_dereference(svc->scheduler); + if (svc->fwmark) { + IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", + sched->name, svc->fwmark, svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 + } else if (svc->af == AF_INET6) { + IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", + sched->name, ip_vs_proto_name(svc->protocol), + &svc->addr.in6, ntohs(svc->port), msg); +#endif + } else { + IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", + sched->name, ip_vs_proto_name(svc->protocol), + &svc->addr.ip, ntohs(svc->port), msg); + } +} + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + 
if (!scheduler->name) {
+ pr_err("%s(): NULL scheduler_name\n", __func__);
+ return -EINVAL;
+ }
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ mutex_lock(&ip_vs_sched_mutex);
+
+ if (!list_empty(&scheduler->n_list)) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] scheduler already linked\n",
+ __func__, scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Make sure that the scheduler with this name doesn't exist
+ * in the scheduler list.
+ */
+ list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+ if (strcmp(scheduler->name, sched->name) == 0) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] scheduler already exists "
+ "in the system\n", __func__, scheduler->name);
+ return -EINVAL;
+ }
+ }
+ /*
+ * Add it into the d-linked scheduler list
+ */
+ list_add(&scheduler->n_list, &ip_vs_schedulers);
+ mutex_unlock(&ip_vs_sched_mutex);
+
+ pr_info("[%s] scheduler registered.\n", scheduler->name);
+
+ return 0;
+}
+
+
+/*
+ * Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ if (!scheduler) {
+ pr_err("%s(): NULL arg\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&ip_vs_sched_mutex);
+ if (list_empty(&scheduler->n_list)) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ pr_err("%s(): [%s] scheduler is not in the list. failed\n",
+ __func__, scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Remove it from the d-linked scheduler list
+ */
+ list_del(&scheduler->n_list);
+ mutex_unlock(&ip_vs_sched_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] scheduler unregistered.\n", scheduler->name);
+
+ return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
new file mode 100644
index 00000000000..e446b9fa742
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,143 @@
+/*
+ * IPVS: Shortest Expected Delay scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy in which each
+ * job does what is in its own best interest, i.e. joins the queue that
+ * would minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC when scheduling big jobs on large heterogeneous systems (where
+ * the server weights vary a lot).
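+ *
+ * For example (illustrative numbers only): with weights U = (4, 1) and
+ * active connection counts C = (2, 0), SED compares (2+1)/4 = 0.75
+ * against (0+1)/1 = 1 and picks the busy but faster server 1, while a
+ * plain Ci/Ui comparison would pick the idle server 2.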
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Shortest Expected Delay scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_sed_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_sed_dest_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "SED: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+ .name = "sed",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+ .schedule = ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
new file mode 100644
index 00000000000..cc65b2f42cd
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,385 @@
+/*
+ * IPVS: Source Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm selects the server by the hash key of the source IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[src_ip];
+ * if (n is dead) OR
+ * (n is overloaded) or (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array.
If the sh scheduler is used in a cache cluster, it is good to
+ * combine it with the cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ * The destination weight attribute can be used to control the
+ * distribution of connections to the destinations in servernode. The
+ * greater the weight, the more connections the destination
+ * will receive.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
+
+/*
+ * IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS 8
+#endif
+#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+};
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/*
+ * Returns hash value for IPVS SH entry
+ */
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, unsigned int offset)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
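+ * (Each probe rehashes the same address and port with a new offset,
+ * so the walk is itself deterministic for a given source.)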
+ */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* first try the dest it's supposed to go to */ + ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + dest = rcu_dereference(s->buckets[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + /* if the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; + hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); + dest = rcu_dereference(s->buckets[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, "SH: selected unavailable " + "server %s:%d (offset %d), reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + int d_count; + bool empty; + + b = &s->buckets[0]; + p = &svc->destinations; + empty = list_empty(p); + d_count = 0; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); + + IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", + i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + atomic_read(&dest->weight)); + + /* Don't move to next dest until filling weight */ + if (++d_count >= atomic_read(&dest->weight)) { + p = p->next; + d_count = 0; + } + + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. 
+ */ +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) +{ + int i; + struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; + + b = &s->buckets[0]; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_state *s; + + /* allocate the SH table for this service */ + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + + svc->sched_data = s; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); + + return 0; +} + + +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_state *s = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(s); + + /* release the table itself */ + kfree_rcu(s, rcu_head); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); +} + + +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_sh_state *s = svc->sched_data; + + /* assign the hash buckets with the updated service */ + ip_vs_sh_reassign(s, svc); + + return 0; +} + + +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 port; + struct tcphdr _tcph, *th; + struct udphdr _udph, *uh; + sctp_sctphdr_t _sctph, *sh; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (unlikely(th == NULL)) + return 0; + port = th->source; + break; + case IPPROTO_UDP: + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (unlikely(uh == NULL)) + return 0; + port = uh->source; + break; + case IPPROTO_SCTP: + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(sh == NULL)) + return 0; + port = sh->source; + break; + default: + port = 0; + } + + return port; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_state *s; + __be16 port = 0; + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) + port = ip_vs_sh_get_port(skb, iph); + + s = (struct ip_vs_sh_state *) svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) + dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + else + dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->saddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, + .schedule = ip_vs_sh_schedule, +}; 
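A side note on the scheduler as a whole, as an illustrative user-space model (not part of the patch; the server names, weights and client address below are made up, and byte-order handling plus the fallback walk are elided): SH reduces to filling the 256-entry bucket array in proportion to destination weights, then indexing it with a multiplicative hash of the source address.

#include <stdint.h>
#include <stdio.h>

#define SH_TAB_SIZE 256
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

struct dest {
    const char *name;   /* hypothetical real server */
    int weight;
};

/* same golden-ratio multiplicative hash as ip_vs_sh_hashkey() */
static unsigned int sh_hashkey(uint32_t saddr, uint16_t port,
                               unsigned int offset)
{
    return (offset + (port + saddr) * 2654435761UL) & SH_TAB_MASK;
}

int main(void)
{
    struct dest dests[2] = { { "rs1", 3 }, { "rs2", 1 } };
    const struct dest *buckets[SH_TAB_SIZE];
    int i, d = 0, d_count = 0;

    /* weighted fill, as in ip_vs_sh_reassign(): stay on one dest
     * until its weight is consumed, then move to the next one */
    for (i = 0; i < SH_TAB_SIZE; i++) {
        buckets[i] = &dests[d];
        if (++d_count >= dests[d].weight) {
            d = (d + 1) % 2;
            d_count = 0;
        }
    }

    /* the same client always hashes to the same bucket/server */
    printf("%s\n", buckets[sh_hashkey(0xc0a80001, 0, 0)]->name);
    return 0;
}

Note that a quiesced (weight 0) destination can still own buckets after a reassign; it is is_unavailable() that keeps ip_vs_sh_get() from returning it.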
+
+
+static int __init ip_vs_sh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
new file mode 100644
index 00000000000..db801263ee9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,1948 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version 1 is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handling IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ * Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
+
+static struct lock_class_key __ipvs_sync_key;
+/*
+ * IPVS sync connection entry
+ * Version 0, i.e. original version.
+ */
+struct ip_vs_sync_conn_v0 {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __be16 flags; /* status flags */
+ __be16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq.
struct */ +}; + +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. 
options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
+struct ip_vs_sync_thread_data {
+ struct net *net;
+ struct socket *sock;
+ char *buf;
+ int id;
+};
+
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+ The master multicasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ ~ . ~
+ | . |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (n) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
+*/
+
+#define SYNC_MESG_HEADER_LEN 4
+#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
+ __u8 nr_conns;
+ __u8 syncid;
+ __be16 size;
+
+ /* ip_vs_sync_conn entries start here */
+};
+
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __be16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
+
+struct ip_vs_sync_buff {
+ struct list_head list;
+ unsigned long firstuse;
+
+ /* pointers for the message data */
+ struct ip_vs_sync_mesg *mesg;
+ unsigned char *head;
+ unsigned char *end;
+};
+
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs,
struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ms->sync_queue)) { + sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); + } else { + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, + list); + list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; + } + spin_unlock_bh(&ipvs->sync_lock); + + return sb; +} + +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ + sb->mesg->version = SYNC_PROTO_VER; + sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); + sb->mesg->nr_conns = 0; + sb->mesg->spare = 0; + sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); + sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb = ms->sync_buff; + + spin_lock(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) + wake_up_process(ms->master_thread); + } else + ip_vs_sync_buff_release(sb); + spin_unlock(&ipvs->sync_lock); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_buff_lock); + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); + } else + sb = NULL; + spin_unlock_bh(&ipvs->sync_buff_lock); + return sb; +} + +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) +{ + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; +} + +/* + * Create a new sync buffer for Version 0 proto. 
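+ * (The v0 header carries only nr_conns, syncid and size; it has no
+ * version field, which is why byte 0 of the v1 header must stay zero.)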
+ */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg_v0 *mesg; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; + mesg->nr_conns = 0; + mesg->syncid = ipvs->master_syncid; + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); + sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); + sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +/* Check if connection is controlled by persistence */ +static inline bool in_persistence(struct ip_vs_conn *cp) +{ + for (cp = cp->control; cp; cp = cp->control) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + return true; + } + return false; +} + +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) + return 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | + (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | + (1 << IP_VS_SCTP_S_CLOSED)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. 
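+ * E.g. with sync_refresh_period=10s and a 3 minute timeout, a long
+ * living connection is re-announced at most once per 10s, plus up to
+ * sync_retries retries spaced sync_refresh_period/8 apart.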
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
+
+/*
+ * Version 0, can be switched on via the sync_version sysctl.
+ * Add an ip_vs_conn information into the current sync_buff.
+ */
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
+ int len;
+
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ ms->sync_buff = buff;
+ }
+
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
+
+ /* copy members */
+ s->reserved = 0;
+ s->protocol = cp->protocol;
+ s->cport = cp->cport;
+ s->vport = cp->vport;
+ s->dport = cp->dport;
+ s->caddr = cp->caddr.ip;
+ s->vaddr = cp->vaddr.ip;
+ s->daddr = cp->daddr.ip;
+ s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->state = htons(cp->state);
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ struct ip_vs_sync_conn_options *opt =
+ (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(opt, &cp->in_seq, sizeof(*opt));
+ }
+
+ m->nr_conns++;
+ m->size = htons(ntohs(m->size) + len);
+ buff->head += len;
+
+ /* check if there is space for the next one */
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ }
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ /* synchronize its controller if it has one */
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ /* cp was advanced to the controller above, so sync cp itself */
+ ip_vs_sync_conn(net, cp, pkts);
+ }
+}
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
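+ * Falls back to ip_vs_sync_conn_v0() when the sync_version sysctl is 0.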
+ * Sending Version 1 messages + */ +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m; + union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Handle old version of the protocol */ + if (sysctl_sync_ver(ipvs) == 0) { + ip_vs_sync_conn_v0(net, cp, pkts); + return; + } + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } + + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + pad = 0; + } + } + + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + ms->sync_buff = buff; + m = buff->mesg; + } + + p = buff->head; + buff->head += pad + len; + m->size = htons(ntohs(m->size) + pad + len); + /* Add ev. padding from prev. sync_conn */ + while (pad--) + *(p++) = 0; + + s = (union ip_vs_sync_conn *)p; + + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? 
STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); + m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + s->v6.caddr = cp->caddr.in6; + s->v6.vaddr = cp->vaddr.in6; + s->v6.daddr = cp->daddr.in6; + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; + } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + + spin_unlock_bh(&ipvs->sync_buff_lock); + +control: + /* synchronize its controller if it has */ + cp = cp->control; + if (!cp) + return; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + goto sloop; +} + +/* + * fill_param used by version 1 + */ +static inline int +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(net, af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(net, af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + if (p->pe->module) + module_put(p->pe->module); + return -ENOMEM; + } + p->pe_data_len = pe_data_len; + } + return 0; +} + +/* + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. 
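+ * On update, flags from the master are masked with
+ * IP_VS_CONN_F_BACKUP_UPD_MASK, so that local bits such as
+ * IP_VS_CONN_F_HASHED are preserved.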
+ */ +static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, + unsigned int flags, unsigned int state, + unsigned int protocol, unsigned int type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(param); + else + cp = ip_vs_ct_in_get(param); + + if (cp) { + /* Free pe_data */ + kfree(param->pe_data); + + dest = cp->dest; + spin_lock_bh(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); + } else { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + rcu_read_lock(); + dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, + param->vport, protocol, fwmark, flags); + + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + rcu_read_unlock(); + if (!cp) { + if (param->pe_data) + kfree(param->pe_data); + IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); + return; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); + cp->state = state; + cp->old_state = cp->state; + /* + * For Ver 0 messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * Ver 1 messages style. + * - No problem. 
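+ * A timeout of zero on the wire means: use the per-protocol
+ * timeout table below (3 minutes as a last resort).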
+ */ + if (timeout) { + if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) + timeout = MAX_SCHEDULE_TIMEOUT / HZ; + cp->timeout = timeout*HZ; + } else { + struct ip_vs_proto_data *pd; + + pd = ip_vs_proto_data_get(net, protocol); + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) + cp->timeout = pd->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + } + ip_vs_conn_put(cp); +} + +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(struct net *net, const char *buffer, + const size_t buflen) +{ + struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + char *p; + int i; + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); + for (i=0; i<m->nr_conns; i++) { + unsigned int flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); + return; + } + s = (struct ip_vs_sync_conn_v0 *) p; + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + if (state > 0) { + IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", + state); + state = 0; + } + } + + ip_vs_conn_fill_param(net, AF_INET, s->protocol, + (const union nf_inet_addr *)&s->caddr, + s->cport, + (const union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + + /* Send timeout as Zero */ + ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt); + } +} + +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, + __u32 *opt_flags, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_sync_conn_options *topt; + + topt = (struct ip_vs_sync_conn_options *)p; + + if (plen != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); + return -EINVAL; + } + if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { + IP_VS_DBG(2, "BACKUP, conn options found twice\n"); + return -EINVAL; + } + ntoh_seq(&topt->in_seq, &opt->in_seq); + ntoh_seq(&topt->out_seq, &opt->out_seq); + *opt_flags |= IPVS_OPT_F_SEQ_DATA; + return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, + __u8 **data, unsigned int maxlen, + __u32 *opt_flags, __u32 flag) +{ + if (plen > maxlen) { + IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); + return -EINVAL; + } + if (*opt_flags & flag) { + IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); + return -EINVAL; + } + *data_len = plen; + *data = p; + *opt_flags |= flag; + return 0; +} +/* + * Process a Version 1 sync. 
connection + */ +static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +{ + struct ip_vs_sync_conn_options opt; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + __u32 flags; + unsigned int af, state, pe_data_len=0, pe_name_len=0; + __u8 *pe_data=NULL, *pe_name=NULL; + __u32 opt_flags=0; + int retc=0; + + s = (union ip_vs_sync_conn *) p; + + if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); + retc = 10; + goto out; +#endif + } else if (!s->v4.type) { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } else { + return -10; + } + if (p > msg_end) + return -20; + + /* Process optional params check Type & Len. */ + while (p < msg_end) { + int ptype; + int plen; + + if (p+2 > msg_end) + return -30; + ptype = *(p++); + plen = *(p++); + + if (!plen || ((p + plen) > msg_end)) + return -40; + /* Handle seq option p = param data */ + switch (ptype & ~IPVS_OPT_F_PARAM) { + case IPVS_OPT_SEQ_DATA: + if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) + return -50; + break; + + case IPVS_OPT_PE_DATA: + if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, + IP_VS_PEDATA_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_DATA)) + return -60; + break; + + case IPVS_OPT_PE_NAME: + if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, + IP_VS_PENAME_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_NAME)) + return -70; + break; + + default: + /* Param data mandatory ? */ + if (!(ptype & IPVS_OPT_F_PARAM)) { + IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", + ptype & ~IPVS_OPT_F_PARAM); + retc = 20; + goto out; + } + } + p += plen; /* Next option */ + } + + /* Get flags and Mask off unsupported */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", + s->v4.protocol); + retc = 30; + goto out; + } + if (state >= pp->num_states) { + IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", + pp->name, state); + retc = 40; + goto out; + } + } else { + /* protocol in templates is not used for state/timeout */ + if (state > 0) { + IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", + state); + state = 0; + } + } + if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + pe_data_len, pe_name, pe_name_len)) { + retc = 50; + goto out; + } + /* If only IPv4, just silent skip IPv6 */ + if (af == AF_INET) + ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, + ntohl(s->v4.timeout), ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#ifdef CONFIG_IP_VS_IPV6 + else + ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, + ntohl(s->v6.timeout), ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#endif + return 0; + /* Error exit */ +out: + IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); + return retc; + +} +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. 
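+ * Each Version 1 sub-message carries its own length in ver_size and
+ * is padded to a 32 bit boundary.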
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+
+ if (buflen != ntohs(m2->size)) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned int size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
+ return;
+ }
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
+ return;
+ }
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
+ }
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
+ }
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
+ }
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
+ }
+}
+
+
+/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
+ * Setup loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+ lock_sock(sk);
+ inet->mc_loop = loop ?
1 : 0;
+ release_sock(sk);
+}
+
+/*
+ * Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+ lock_sock(sk);
+ inet->mc_ttl = ttl;
+ release_sock(sk);
+}
+
+/*
+ * Specify default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+ struct net_device *dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ inet->mc_index = dev->ifindex;
+ /* inet->mc_addr = 0; */
+ release_sock(sk);
+
+ return 0;
+}
+
+
+/*
+ * Set the maximum length of sync message according to the
+ * specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net_device *dev;
+ int num;
+
+ if (sync_state == IP_VS_STATE_MASTER) {
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
+ return -ENODEV;
+
+ num = (dev->mtu - sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+ IP_VS_DBG(7, "setting the maximum length of sync sending "
+ "message %d.\n", ipvs->send_mesg_maxlen);
+ } else if (sync_state == IP_VS_STATE_BACKUP) {
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
+ return -ENODEV;
+
+ ipvs->recv_mesg_maxlen = dev->mtu -
+ sizeof(struct iphdr) - sizeof(struct udphdr);
+ IP_VS_DBG(7, "setting the maximum length of sync receiving "
+ "message %d.\n", ipvs->recv_mesg_maxlen);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Join a multicast group.
+ * The group is specified by a class D multicast address 224.0.0.0/4
+ * in the in_addr structure passed in as a parameter.
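+ * The user space equivalent would be, e.g.:
+ * setsockopt(sock, SOL_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));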
+ */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct net *net = sock_net(sk); + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net *net = sock_net(sock->sk); + struct net_device *dev; + __be32 addr; + struct sockaddr_in sin; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + pr_err("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %pI4\n", + ifname, &addr); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket *make_send_sock(struct net *net, int id) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; + struct socket *sock; + int result; + + /* First create a socket move it to right name space later */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. 
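+ * The socket is then set up for link-local multicast: loopback of
+ * its own datagrams is disabled and the TTL is set to 1.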
+ */ + sk_change_net(sock->sk, net); + result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + if (result < 0) { + pr_err("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); + + result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + if (result < 0) { + pr_err("Error binding address of the mcast interface\n"); + goto error; + } + + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { + pr_err("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + +error: + sk_release_kernel(sock->sk); + return ERR_PTR(result); +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket *make_receive_sock(struct net *net, int id) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); + + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { + pr_err("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ipvs->backup_mcast_ifn); + if (result < 0) { + pr_err("Error joining to the multicast group\n"); + goto error; + } + + return sock; + +error: + sk_release_kernel(sock->sk); + return ERR_PTR(result); +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static int +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + int ret; + + msize = ntohs(msg->size); + + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); + + if (len < 0) + return len; + + LeaveFunction(7); + return len; +} + +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct 
ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ms->master_thread); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; + struct ip_vs_sync_buff *sb; + + pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d, id = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; + } + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + /* (Ab)use interruptible sleep to avoid increasing + * the load avg. + */ + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop()); + if (unlikely(kthread_should_stop())) + goto done; + } + ip_vs_sync_buff_release(sb); + } + +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + + /* clean up the sync_buff queue */ + while ((sb = sb_dequeue(ipvs, ms))) + ip_vs_sync_buff_release(sb); + __set_current_state(TASK_RUNNING); + + /* clean up the current sync_buff */ + sb = get_curr_sync_buff(ipvs, ms, 0); + if (sb) + ip_vs_sync_buff_release(sb); + + /* release the sending multicast socket */ + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo); + + return 0; +} + + +static int sync_thread_backup(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + int len; + + pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d, id = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); + + while (!kthread_should_stop()) { + wait_event_interruptible(*sk_sleep(tinfo->sock->sk), + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + + /* do we have data now? 
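+ * Drain the queue without blocking; ip_vs_receive() passes
+ * MSG_DONTWAIT, so -EAGAIN simply means the queue is empty.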
*/ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + ipvs->recv_mesg_maxlen); + if (len <= 0) { + if (len != -EAGAIN) + pr_err("receiving message error\n"); + break; + } + + ip_vs_process_message(tinfo->net, tinfo->buf, len); + } + } + + /* release the sending multicast socket */ + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + + return 0; +} + + +int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +{ + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **array = NULL, *task; + struct socket *sock; + struct netns_ipvs *ipvs = net_ipvs(net); + char *name; + int (*threadfn)(void *data); + int id, count; + int result = -ENOMEM; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn_v0)); + + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; + + if (state == IP_VS_STATE_MASTER) { + if (ipvs->ms) + return -EEXIST; + + strlcpy(ipvs->master_mcast_ifn, mcast_ifn, + sizeof(ipvs->master_mcast_ifn)); + ipvs->master_syncid = syncid; + name = "ipvs-m:%d:%d"; + threadfn = sync_thread_master; + } else if (state == IP_VS_STATE_BACKUP) { + if (ipvs->backup_threads) + return -EEXIST; + + strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, + sizeof(ipvs->backup_mcast_ifn)); + ipvs->backup_syncid = syncid; + name = "ipvs-b:%d:%d"; + threadfn = sync_thread_backup; + } else { + return -EINVAL; + } + + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; + + ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } else { + array = kzalloc(count * sizeof(struct task_struct *), + GFP_KERNEL); + if (!array) + goto out; + } + set_sync_mesg_maxlen(net, state); + + tinfo = NULL; + for (id = 0; id < count; id++) { + if (state == IP_VS_STATE_MASTER) + sock = make_send_sock(net, id); + else + sock = make_receive_sock(net, id); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto outtinfo; + } + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outsocket; + tinfo->net = net; + tinfo->sock = sock; + if (state == IP_VS_STATE_BACKUP) { + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto outtinfo; + } else { + tinfo->buf = NULL; + } + tinfo->id = id; + + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + tinfo = NULL; + if (state == IP_VS_STATE_MASTER) + ipvs->ms[id].master_thread = task; + else + array[id] = task; + } + + /* mark as active */ + + if (state == IP_VS_STATE_BACKUP) + ipvs->backup_threads = array; + spin_lock_bh(&ipvs->sync_buff_lock); + ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); + + /* increase the module use count */ + ip_vs_use_count_inc(); + + return 0; + +outsocket: + sk_release_kernel(sock->sk); + +outtinfo: + if (tinfo) { + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + } + count = id; + while (count-- > 0) { + if (state == IP_VS_STATE_MASTER) + 
kthread_stop(ipvs->ms[count].master_thread); + else + kthread_stop(array[count]); + } + kfree(array); + +out: + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } + return result; +} + + +int stop_sync_thread(struct net *net, int state) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct task_struct **array; + int id; + int retc = -EINVAL; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + + if (state == IP_VS_STATE_MASTER) { + if (!ipvs->ms) + return -ESRCH; + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ms->master_thread)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(ms->master_thread); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!ipvs->backup_threads) + return -ESRCH; + + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + array = ipvs->backup_threads; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(array[id])); + ret = kthread_stop(array[id]); + if (retc >= 0) + retc = ret; + } + kfree(array); + ipvs->backup_threads = NULL; + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return retc; +} + +/* + * Initialize data struct for each netns + */ +int __net_init ip_vs_sync_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); + return 0; +} + +void ip_vs_sync_net_cleanup(struct net *net) +{ + int retc; + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); + mutex_unlock(&ipvs->sync_mutex); +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 00000000000..b5b4650d50a --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,115 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+ .name = "wlc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+ .schedule = ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 00000000000..0546cd572d6
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,270 @@
+/*
+ * IPVS: Weighted Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wrr_update_svc
+ * Julian Anastasov : fixed the bug of returning destination
+ * with weight 0 when all weights are zero
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/gcd.h>
+
+#include <net/ip_vs.h>
+
+/* The WRR algorithm depends on some calculations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As a result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, so it is possible for some dest weight to be reduced
+ * below di, which is bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+ struct ip_vs_dest *cl; /* current dest or head */
+ int cw; /* current weight */
+ int mw; /* maximum weight */
+ int di; /* decreasing interval */
+ struct rcu_head rcu_head;
+};
+
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g ? g : 1;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
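+ * The raw maximum is later reduced with (di - 1), e.g. for weights
+ * {4, 2}: di = 2, mw = 4 - (2 - 1) = 3, cw steps through 3 and 1 and
+ * the dests are picked in a 2:1 ratio.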
+ */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->weight); + if (new_weight > weight) + weight = new_weight; + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); + if (mark == NULL) + return -ENOMEM; + + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); + mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; + svc->sched_data = mark; + + return 0; +} + + +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + /* + * Release the mark variable + */ + kfree_rcu(mark, rcu_head); +} + + +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); + mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *last, *stop = NULL; + struct ip_vs_wrr_mark *mark = svc->sched_data; + bool last_pass = false, restarted = false; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ + while (1) { + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; + } + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. 
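+ * 'stop' marks the dest where that second traversal has to end.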
+ */ + stop = last; + } + } + +found: + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + mark->cl = dest; + + out: + spin_unlock_bh(&svc->sched_lock); + return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 00000000000..73ba1cc7a88 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1266 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Changes:
+ *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP they can also be called
+ * from FORWARD
+ * - not all connections have a destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * - we can use dst without a ref while sending in an RCU section, we
+ * take a ref when returning NF_ACCEPT for a NAT-ed packet via loopback
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h> /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+enum {
+ IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */
+ IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
+ IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
+ * local
+ */
+ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
+ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
+};
+
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
+/*
+ * Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
+
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+}
+
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
+
+ if (!dest_dst)
+ return NULL;
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
+ return NULL;
+ return dest_dst;
+}
+
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+ if (IP6CB(skb)->frag_max_size) {
+ /* frag_max_size tells us that this packet has been
+ * defragmented by the netfilter IPv6 conntrack module.
+ */
+ if (IP6CB(skb)->frag_max_size > mtu)
+ return true; /* largest fragment violates the MTU */
+ } else if (skb->len > mtu && !skb_is_gso(skb)) {
+ return true; /* packet size violates the MTU */
+ }
+ return false;
+}
+
+/* Get route to daddr, update *saddr, optionally bind route to saddr */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ?
*saddr : 0; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; + +retry: + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + /* Invalid saddr ? */ + if (PTR_ERR(rt) == -EINVAL && *saddr && + rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { + *saddr = 0; + flowi4_update_output(&fl4, 0, 0, daddr, 0); + goto retry; + } + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); + return NULL; + } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + ip_rt_put(rt); + *saddr = fl4.saddr; + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + loop++; + goto retry; + } + *saddr = fl4.saddr; + return rt; +} + +/* Get route to destination or remote server */ +static int +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, int rt_mode, __be32 *ret_saddr) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest_dst *dest_dst; + struct rtable *rt; /* Route to the other host */ + struct rtable *ort; /* Original route */ + struct iphdr *iph; + __be16 df; + int mtu; + int local, noref = 1; + + if (dest) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); + if (!rt) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; + } + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); + } + daddr = dest->addr.ip; + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + } else { + __be32 saddr = htonl(INADDR_ANY); + + noref = 0; + + /* For such unconfigured boxes avoid many route lookups + * for performance reasons because we do not remember saddr + */ + rt_mode &= ~IP_VS_RT_MODE_CONNECT; + rt = do_output_route4(net, daddr, rt_mode, &saddr); + if (!rt) + goto err_unreach; + if (ret_saddr) + *ret_saddr = saddr; + } + + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? 
+ "local":"non-local", &daddr); + goto err_put; + } + iph = ip_hdr(skb); + if (likely(!local)) { + if (unlikely(ipv4_is_loopback(iph->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI4 to non-local address, dest: %pI4\n", + &iph->saddr, &daddr); + goto err_put; + } + } else { + ort = skb_rtable(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !(ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to " + "local requires NAT method, dest: %pI4\n", + &iph->daddr, &daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; + } + + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + df = iph->frag_off & htons(IP_DF); + } else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; + } + ort = skb_rtable(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + /* MTU check allowed? */ + df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; + } + + /* MTU checking */ + if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; +} + +#ifdef CONFIG_IP_VS_IPV6 + +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} + +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm) +{ + struct dst_entry *dst; + struct flowi6 fl6 = { + .daddr = *daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6.daddr, 0, &fl6.saddr) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + dst = NULL; + goto out_err; + } + } + *ret_saddr = fl6.saddr; + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + +/* + * Get route to destination or remote server + */ +static int +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct ip_vs_dest_dst *dest_dst; + struct rt6_info *rt; /* Route to the other host */ + struct rt6_info *ort; /* Original route */ + struct dst_entry *dst; + int mtu; + int local, noref = 1; + + if (dest) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { + u32 cookie; + + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest_dst->dst_saddr.in6, + do_xfrm); + 
if (!dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; + } + rt = (struct rt6_info *) dst; + cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest_dst->dst_saddr.in6, + atomic_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + } else { + noref = 0; + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + if (!dst) + goto err_unreach; + rt = (struct rt6_info *) dst; + } + + local = __ip_vs_is_local_route6(rt); + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", + local ? "local":"non-local", daddr); + goto err_put; + } + if (likely(!local)) { + if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI6c to non-local address, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err_put; + } + } else { + ort = (struct rt6_info *) skb_dst(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !__ip_vs_is_local_route6(ort)) { + IP_VS_DBG_RL("Redirect from non-local address %pI6c " + "to local requires NAT method, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->daddr, daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; + } + + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + ort = (struct rt6_info *) skb_dst(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + } + + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; +} +#endif + + +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) +{ + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset(skb); + skb_forward_csum(skb); + } + return ret; +} + +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, 
skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} + +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + /* we do not touch skb and do not need pskb ptr */ + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct iphdr *iph = ip_hdr(skb); + + EnterFunction(10); + + rcu_read_lock(); + if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, + NULL) < 0) + goto tx_error; + + ip_send_check(iph); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + EnterFunction(10); + + rcu_read_lock(); + if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) + goto tx_error; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct rtable *rt; /* Route to the other host */ + int local, rc, was_input; + + EnterFunction(10); + + rcu_read_lock(); + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): " 
+ "stopping DNAT to local address"); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to loopback address"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct iphdr))) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) + goto tx_error; + ip_hdr(skb)->daddr = cp->daddr.ip; + ip_send_check(ip_hdr(skb)); + + IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); + + LeaveFunction(10); + return rc; + + tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct rt6_info *rt; /* Route to the other host */ + int local, rc; + + EnterFunction(10); + + rcu_read_lock(); + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. 
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->ignore_df = 1;
+
+	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+	rcu_read_unlock();
+
+	LeaveFunction(10);
+	return rc;
+
+tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+	rcu_read_unlock();
+	return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet whose
+ *   destination will be set to cp->daddr. Most of the code in this
+ *   function is taken from ipip.c.
+ *
+ *   It is used in a VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, process
+ *   the request and return the response packets directly to the client
+ *   without passing through the load balancer. This can greatly
+ *   increase the scalability of the virtual server.
+ *
+ *   Used for ANY protocol
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+	struct rtable *rt;			/* Route to the other host */
+	__be32 saddr;				/* Source for tunnel */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *old_iph = ip_hdr(skb);
+	u8     tos = old_iph->tos;
+	__be16 df;
+	struct iphdr  *iph;			/* Our new IP header */
+	unsigned int max_headroom;		/* The extra header space needed */
+	int ret, local;
+
+	EnterFunction(10);
+
+	rcu_read_lock();
+	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				   IP_VS_RT_MODE_LOCAL |
+				   IP_VS_RT_MODE_NON_LOCAL |
+				   IP_VS_RT_MODE_CONNECT |
+				   IP_VS_RT_MODE_TUNNEL, &saddr);
+	if (local < 0)
+		goto tx_error;
+	if (local) {
+		rcu_read_unlock();
+		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+	}
+
+	rt = skb_rtable(skb);
+	tdev = rt->dst.dev;
+
+	/* Copy DF, reset fragment offset and MF */
+	df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+
+		if (!new_skb)
+			goto tx_error;
+		consume_skb(skb);
+		skb = new_skb;
+		old_iph = ip_hdr(skb);
+	}
+
+	skb->transport_header = skb->network_header;
+
+	/* fix old IP header checksum */
+	ip_send_check(old_iph);
+
+	skb_push(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/*
+	 *	Push down and install the IPIP header.
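
Just above, the tunnel transmitter made sure the skb had enough headroom for the egress device's link-layer header plus one struct iphdr before pushing the outer header, reallocating otherwise. A toy sketch of that check, assuming Ethernet numbers (names are ours):

#include <stdio.h>

#define ETH_HLEN	14	/* Ethernet link-layer header */
#define OUTER_IPV4_HDR	20	/* header about to be pushed */

/* Mirror of the headroom test: expand unless the existing headroom
 * covers the new device's link-layer header plus the outer IP header. */
static int needs_realloc(int headroom, int ll_reserved)
{
	return headroom < ll_reserved + OUTER_IPV4_HDR;
}

int main(void)
{
	printf("%d\n", needs_realloc(16, ETH_HLEN));	/* 1: must expand */
	printf("%d\n", needs_realloc(64, ETH_HLEN));	/* 0: fits */
	return 0;
}
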
+ */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = cp->daddr.ip; + iph->saddr = saddr; + iph->ttl = old_iph->ttl; + ip_select_ident(skb, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ret = ip_vs_tunnel_xmit_prepare(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + rcu_read_unlock(); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int ret, local; + + EnterFunction(10); + + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); + } + + rt = (struct rt6_info *) skb_dst(skb); + tdev = rt->dst.dev; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + + if (!new_skb) + goto tx_error; + consume_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = skb->network_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* + * Push down and install the IPIP header. 
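
The IPv6 header install that follows copies the inner payload_len and then adds sizeof(struct ipv6hdr), because the Payload Length field counts everything after the 40-byte header it belongs to: the outer payload is the entire inner packet. A one-line sketch of the arithmetic:

#include <stdint.h>
#include <stdio.h>

#define IPV6_HDR_LEN	40	/* fixed IPv6 header size */

/* Outer Payload Length = inner payload plus one full inner header. */
static uint16_t outer_payload_len(uint16_t inner_payload_len)
{
	return inner_payload_len + IPV6_HDR_LEN;
}

int main(void)
{
	/* inner packet: 40-byte header + 1000 bytes of payload */
	printf("%u\n", outer_payload_len(1000));	/* 1040 */
	return 0;
}
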
+ */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len; + be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = cp->daddr.in6; + iph->saddr = saddr; + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ret = ip_vs_tunnel_xmit_prepare(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + rcu_read_unlock(); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + int local; + + EnterFunction(10); + + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); + } + + ip_send_check(ip_hdr(skb)); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + int local; + + EnterFunction(10); + + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); + } + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error: + kfree_skb(skb); + rcu_read_unlock(); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *iph) +{ + struct rtable *rt; /* Route to the other host */ + int rc; + int local; + int rt_mode, was_input; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp, iph); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + was_input = rt_is_input_route(skb_rtable(skb)); + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? 
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); + + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); + goto out; + + tx_error: + kfree_skb(skb); + rcu_read_unlock(); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) +{ + struct rt6_info *rt; /* Route to the other host */ + int rc; + int local; + int rt_mode; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp, ipvsh); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? + IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? 
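
Throughout these transmitters the routing helpers take an IP_VS_RT_MODE_* bitmask and reject any route whose kind is not in the mask; for ICMP arriving from the FORWARD hook, only non-local destinations are permitted, as selected above. A compact sketch of that gate (flag values are ours):

#include <stdio.h>

enum {
	RT_MODE_LOCAL		= 1,	/* local destination acceptable */
	RT_MODE_NON_LOCAL	= 2,	/* remote destination acceptable */
};

/* The kind of route actually found must be within the caller's mask. */
static int route_allowed(int is_local, int rt_mode)
{
	return ((is_local ? RT_MODE_LOCAL : RT_MODE_NON_LOCAL) & rt_mode) != 0;
}

int main(void)
{
	/* FORWARD-hook ICMP: local destinations are refused */
	printf("%d\n", route_allowed(1, RT_MODE_NON_LOCAL));	/* 0 */
	printf("%d\n", route_allowed(0, RT_MODE_NON_LOCAL));	/* 1 */
	return 0;
}
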
 */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI6\n",
+			  __func__, &cp->daddr.in6);
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error;
+
+	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->ignore_df = 1;
+
+	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+	rcu_read_unlock();
+	goto out;
+
+tx_error:
+	kfree_skb(skb);
+	rcu_read_unlock();
+	rc = NF_STOLEN;
+out:
+	LeaveFunction(10);
+	return rc;
+}
+#endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
new file mode 100644
index 00000000000..a4b5e2a435a
--- /dev/null
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -0,0 +1,133 @@
+/* Accounting handling for netfilter. */
+
+/*
+ * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+static bool nf_ct_acct __read_mostly;
+
+module_param_named(acct, nf_ct_acct, bool, 0644);
+MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table acct_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_acct",
+		.data		= &init_net.ct.sysctl_acct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+#endif /* CONFIG_SYSCTL */
+
+unsigned int
+seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
+{
+	struct nf_conn_acct *acct;
+	struct nf_conn_counter *counter;
+
+	acct = nf_conn_acct_find(ct);
+	if (!acct)
+		return 0;
+
+	counter = acct->counter;
+	return seq_printf(s, "packets=%llu bytes=%llu ",
+			  (unsigned long long)atomic64_read(&counter[dir].packets),
+			  (unsigned long long)atomic64_read(&counter[dir].bytes));
+};
+EXPORT_SYMBOL_GPL(seq_print_acct);
+
+static struct nf_ct_ext_type acct_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_acct),
+	.align	= __alignof__(struct nf_conn_acct),
+	.id	= NF_CT_EXT_ACCT,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	table[0].data = &net->ct.sysctl_acct;
+
+	/* Don't export sysctls to unprivileged users */
+	if (net->user_ns != &init_user_ns)
+		table[0].procname = NULL;
+
+	net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
+							 table);
+	if (!net->ct.acct_sysctl_header) {
+		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
+		goto out_register;
+	}
+	return 0;
+
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.acct_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.acct_sysctl_header);
+	kfree(table);
+}
+#else
+static int nf_conntrack_acct_init_sysctl(struct net
*net) +{ + return 0; +} + +static void nf_conntrack_acct_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_acct_pernet_init(struct net *net) +{ + net->ct.sysctl_acct = nf_ct_acct; + return nf_conntrack_acct_init_sysctl(net); +} + +void nf_conntrack_acct_pernet_fini(struct net *net) +{ + nf_conntrack_acct_fini_sysctl(net); +} + +int nf_conntrack_acct_init(void) +{ + int ret = nf_ct_extend_register(&acct_extend); + if (ret < 0) + pr_err("nf_conntrack_acct: Unable to register extension\n"); + return ret; +} + +void nf_conntrack_acct_fini(void) +{ + nf_ct_extend_unregister(&acct_extend); +} diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index b8869eab765..b8b95f4027c 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -2,6 +2,7 @@ * * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> * based on HW's ip_conntrack_irc.c as well as other modules + * (C) 2006 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -16,6 +17,7 @@ #include <linux/in.h> #include <linux/udp.h> #include <linux/netfilter.h> +#include <linux/gfp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_expect.h> @@ -30,14 +32,16 @@ MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_amanda"); +MODULE_ALIAS_NFCT_HELPER("amanda"); module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); module_param(ts_algo, charp, 0400); MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); -unsigned int (*nf_nat_amanda_hook)(struct sk_buff **pskb, +unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned int matchoff, unsigned int matchlen, struct nf_conntrack_expect *exp) @@ -53,7 +57,7 @@ enum amanda_strings { }; static struct { - char *string; + const char *string; size_t len; struct ts_config *ts; } search[] __read_mostly = { @@ -79,7 +83,7 @@ static struct { }, }; -static int amanda_help(struct sk_buff **pskb, +static int amanda_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) @@ -91,7 +95,6 @@ static int amanda_help(struct sk_buff **pskb, char pbuf[sizeof("65535")], *tmp; u_int16_t len; __be16 port; - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; int ret = NF_ACCEPT; typeof(nf_nat_amanda_hook) nf_nat_amanda; @@ -101,25 +104,24 @@ static int amanda_help(struct sk_buff **pskb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - nf_ct_refresh(ct, *pskb, master_timeout * HZ); + nf_ct_refresh(ct, skb, master_timeout * HZ); /* No data? 
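
A few lines further on, the amanda helper copies at most five digits of matched text into a small buffer, terminates it, converts it, and treats port 0 or an over-long digit run as a failure. A user-space sketch of that step (strtoul stands in for the kernel's simple_strtoul; names are ours):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Copy up to five digits, NUL-terminate, convert; 0 means "no port". */
static unsigned short parse_port(const char *text, size_t avail)
{
	char pbuf[sizeof("65535")];
	size_t len = avail < sizeof(pbuf) - 1 ? avail : sizeof(pbuf) - 1;
	unsigned long port;

	memcpy(pbuf, text, len);
	pbuf[len] = '\0';
	port = strtoul(pbuf, NULL, 10);
	return (port == 0 || port > 65535) ? 0 : (unsigned short)port;
}

int main(void)
{
	printf("%u\n", parse_port("10080 DATA", 5));	/* 10080 */
	printf("%u\n", parse_port("0", 1));		/* 0: rejected */
	return 0;
}
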
*/ dataoff = protoff + sizeof(struct udphdr); - if (dataoff >= (*pskb)->len) { - if (net_ratelimit()) - printk("amanda_help: skblen = %u\n", (*pskb)->len); + if (dataoff >= skb->len) { + net_err_ratelimited("amanda_help: skblen = %u\n", skb->len); return NF_ACCEPT; } memset(&ts, 0, sizeof(ts)); - start = skb_find_text(*pskb, dataoff, (*pskb)->len, + start = skb_find_text(skb, dataoff, skb->len, search[SEARCH_CONNECT].ts, &ts); if (start == UINT_MAX) goto out; start += dataoff + search[SEARCH_CONNECT].len; memset(&ts, 0, sizeof(ts)); - stop = skb_find_text(*pskb, start, (*pskb)->len, + stop = skb_find_text(skb, start, skb->len, search[SEARCH_NEWLINE].ts, &ts); if (stop == UINT_MAX) goto out; @@ -127,13 +129,13 @@ static int amanda_help(struct sk_buff **pskb, for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { memset(&ts, 0, sizeof(ts)); - off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); + off = skb_find_text(skb, start, stop, search[i].ts, &ts); if (off == UINT_MAX) continue; off += start + search[i].len; len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); - if (skb_copy_bits(*pskb, off, pbuf, len)) + if (skb_copy_bits(skb, off, pbuf, len)) break; pbuf[len] = '\0'; @@ -142,55 +144,56 @@ static int amanda_help(struct sk_buff **pskb, if (port == 0 || len > 5) break; - exp = nf_conntrack_expect_alloc(ct); + exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - nf_conntrack_expect_init(exp, family, - &tuple->src.u3, &tuple->dst.u3, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_TCP, NULL, &port); nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook); if (nf_nat_amanda && ct->status & IPS_NAT_MASK) - ret = nf_nat_amanda(pskb, ctinfo, off - dataoff, - len, exp); - else if (nf_conntrack_expect_related(exp) != 0) + ret = nf_nat_amanda(skb, ctinfo, protoff, + off - dataoff, len, exp); + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - nf_conntrack_expect_put(exp); + } + nf_ct_expect_put(exp); } out: return ret; } +static const struct nf_conntrack_expect_policy amanda_exp_policy = { + .max_expected = 3, + .timeout = 180, +}; + static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { { .name = "amanda", - .max_expected = 3, - .timeout = 180, .me = THIS_MODULE, .help = amanda_help, .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(10080), + .tuple.src.u.udp.port = cpu_to_be16(10080), .tuple.dst.protonum = IPPROTO_UDP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.udp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, + .expect_policy = &amanda_exp_policy, }, { .name = "amanda", - .max_expected = 3, - .timeout = 180, .me = THIS_MODULE, .help = amanda_help, .tuple.src.l3num = AF_INET6, - .tuple.src.u.udp.port = __constant_htons(10080), + .tuple.src.u.udp.port = cpu_to_be16(10080), .tuple.dst.protonum = IPPROTO_UDP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.udp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, + .expect_policy = &amanda_exp_policy, }, }; @@ -208,13 +211,14 @@ static int __init nf_conntrack_amanda_init(void) { int ret, i; - ret = -ENOMEM; for (i = 0; i < ARRAY_SIZE(search); i++) { search[i].ts = textsearch_prepare(ts_algo, search[i].string, search[i].len, GFP_KERNEL, TS_AUTOLOAD); - if (search[i].ts == NULL) + if (IS_ERR(search[i].ts)) { + 
ret = PTR_ERR(search[i].ts); goto err1; + } } ret = nf_conntrack_helper_register(&amanda_helper[0]); if (ret < 0) @@ -227,10 +231,9 @@ static int __init nf_conntrack_amanda_init(void) err2: nf_conntrack_helper_unregister(&amanda_helper[0]); err1: - for (; i >= 0; i--) { - if (search[i].ts) - textsearch_destroy(search[i].ts); - } + while (--i >= 0) + textsearch_destroy(search[i].ts); + return ret; } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c new file mode 100644 index 00000000000..4e99cca6161 --- /dev/null +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -0,0 +1,82 @@ +/* + * broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <net/route.h> +#include <linux/inetdevice.h> +#include <linux/skbuff.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout) +{ + struct nf_conntrack_expect *exp; + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct nf_conn_help *help = nfct_help(ct); + __be32 mask = 0; + + /* we're only interested in locally generated packets */ + if (skb->sk == NULL) + goto out; + if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) + goto out; + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + goto out; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (in_dev != NULL) { + for_primary_ifa(in_dev) { + if (ifa->ifa_broadcast == iph->daddr) { + mask = ifa->ifa_mask; + break; + } + } endfor_ifa(in_dev); + } + rcu_read_unlock(); + + if (mask == 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + goto out; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + + exp->mask.src.u3.ip = mask; + exp->mask.src.u.udp.port = htons(0xFFFF); + + exp->expectfn = NULL; + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->class = NF_CT_EXPECT_CLASS_DEFAULT; + exp->helper = NULL; + + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); + + nf_ct_refresh(ct, skb, timeout * HZ); +out: + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e8b5c2d7db6..1f4f954c4b4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -5,6 +5,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,6 +15,7 @@ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> +#include <linux/sched.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> @@ -29,219 +31,138 @@ #include <linux/netdevice.h> #include 
<linux/socket.h> #include <linux/mm.h> +#include <linux/nsproxy.h> +#include <linux/rculist_nulls.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> #define NF_CONNTRACK_VERSION "0.5.0" -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -DEFINE_RWLOCK(nf_conntrack_lock); -EXPORT_SYMBOL_GPL(nf_conntrack_lock); - -/* nf_conntrack_standalone needs this */ -atomic_t nf_conntrack_count = ATOMIC_INIT(0); -EXPORT_SYMBOL_GPL(nf_conntrack_count); - -void (*nf_conntrack_destroyed)(struct nf_conn *conntrack); -EXPORT_SYMBOL_GPL(nf_conntrack_destroyed); - -unsigned int nf_conntrack_htable_size __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); - -int nf_conntrack_max __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_max); - -struct list_head *nf_conntrack_hash __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash); - -struct nf_conn nf_conntrack_untracked __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_untracked); - -unsigned int nf_ct_log_invalid __read_mostly; -LIST_HEAD(unconfirmed); -static int nf_conntrack_vmalloc __read_mostly; - -static unsigned int nf_conntrack_next_id; +int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) __read_mostly; +EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); -DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); +__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; +EXPORT_SYMBOL_GPL(nf_conntrack_locks); -/* - * This scheme offers various size of "struct nf_conn" dependent on - * features(helper, nat, ...) - */ - -#define NF_CT_FEATURES_NAMELEN 256 -static struct { - /* name of slab cache. 
printed in /proc/slabinfo */ - char *name; - - /* size of slab cache */ - size_t size; - - /* slab cache pointer */ - struct kmem_cache *cachep; - - /* allocated slab cache + modules which uses this slab cache */ - int use; - -} nf_ct_cache[NF_CT_F_NUM]; +__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); -/* protect members of nf_ct_cache except of "use" */ -DEFINE_RWLOCK(nf_ct_cache_lock); - -/* This avoids calling kmem_cache_create() with same name simultaneously */ -static DEFINE_MUTEX(nf_ct_cache_mutex); - -static int nf_conntrack_hash_rnd_initted; -static unsigned int nf_conntrack_hash_rnd; - -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - unsigned int size, unsigned int rnd) +static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) { - unsigned int a, b; - - a = jhash2(tuple->src.u3.all, ARRAY_SIZE(tuple->src.u3.all), - (tuple->src.l3num << 16) | tuple->dst.protonum); - b = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), - (tuple->src.u.all << 16) | tuple->dst.u.all); - - return jhash_2words(a, b, rnd) % size; + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + spin_unlock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_unlock(&nf_conntrack_locks[h2]); } -static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) +/* return true if we need to recompute hashes (in case hash table was resized) */ +static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, + unsigned int h2, unsigned int sequence) { - return __hash_conntrack(tuple, nf_conntrack_htable_size, - nf_conntrack_hash_rnd); + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + if (h1 <= h2) { + spin_lock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_lock_nested(&nf_conntrack_locks[h2], + SINGLE_DEPTH_NESTING); + } else { + spin_lock(&nf_conntrack_locks[h2]); + spin_lock_nested(&nf_conntrack_locks[h1], + SINGLE_DEPTH_NESTING); + } + if (read_seqcount_retry(&net->ct.generation, sequence)) { + nf_conntrack_double_unlock(h1, h2); + return true; + } + return false; } -int nf_conntrack_register_cache(u_int32_t features, const char *name, - size_t size) +static void nf_conntrack_all_lock(void) { - int ret = 0; - char *cache_name; - struct kmem_cache *cachep; - - DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n", - features, name, size); - - if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) { - DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n", - features); - return -EINVAL; - } + int i; - mutex_lock(&nf_ct_cache_mutex); - - write_lock_bh(&nf_ct_cache_lock); - /* e.g: multiple helpers are loaded */ - if (nf_ct_cache[features].use > 0) { - DEBUGP("nf_conntrack_register_cache: already resisterd.\n"); - if ((!strncmp(nf_ct_cache[features].name, name, - NF_CT_FEATURES_NAMELEN)) - && nf_ct_cache[features].size == size) { - DEBUGP("nf_conntrack_register_cache: reusing.\n"); - nf_ct_cache[features].use++; - ret = 0; - } else - ret = -EBUSY; - - write_unlock_bh(&nf_ct_cache_lock); - mutex_unlock(&nf_ct_cache_mutex); - return ret; - } - write_unlock_bh(&nf_ct_cache_lock); + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_nested(&nf_conntrack_locks[i], i); +} - /* - * The memory space for name of slab cache must be alive until - * cache is destroyed. 
- */ - cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC); - if (cache_name == NULL) { - DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n"); - ret = -ENOMEM; - goto out_up_mutex; - } +static void nf_conntrack_all_unlock(void) +{ + int i; - if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN) - >= NF_CT_FEATURES_NAMELEN) { - printk("nf_conntrack_register_cache: name too long\n"); - ret = -EINVAL; - goto out_free_name; - } + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_unlock(&nf_conntrack_locks[i]); +} - cachep = kmem_cache_create(cache_name, size, 0, 0, - NULL, NULL); - if (!cachep) { - printk("nf_conntrack_register_cache: Can't create slab cache " - "for the features = 0x%x\n", features); - ret = -ENOMEM; - goto out_free_name; - } +unsigned int nf_conntrack_htable_size __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); - write_lock_bh(&nf_ct_cache_lock); - nf_ct_cache[features].use = 1; - nf_ct_cache[features].size = size; - nf_ct_cache[features].cachep = cachep; - nf_ct_cache[features].name = cache_name; - write_unlock_bh(&nf_ct_cache_lock); +unsigned int nf_conntrack_max __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_max); - goto out_up_mutex; +DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -out_free_name: - kfree(cache_name); -out_up_mutex: - mutex_unlock(&nf_ct_cache_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_conntrack_register_cache); +unsigned int nf_conntrack_hash_rnd __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); -/* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */ -void nf_conntrack_unregister_cache(u_int32_t features) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) { - struct kmem_cache *cachep; - char *name; + unsigned int n; - /* - * This assures that kmem_cache_create() isn't called before destroying - * slab cache. + /* The direction must be ignored, so we hash everything up to the + * destination ports (which is a multiple of 4) and treat the last + * three bytes manually. 
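
The bucket mapping introduced alongside this hash scales the full 32-bit value by the table size with a multiply and a shift, ((u64)hash * size) >> 32, instead of a modulo, trading an integer division for a multiplication. A standalone sketch showing the endpoints land where expected:

#include <stdint.h>
#include <stdio.h>

/* ((u64)hash * size) >> 32 maps a uniform 32-bit hash onto [0, size). */
static uint32_t hash_bucket(uint32_t hash, uint32_t size)
{
	return (uint32_t)(((uint64_t)hash * size) >> 32);
}

int main(void)
{
	uint32_t size = 16384;

	printf("%u\n", hash_bucket(0x00000000u, size));	/* 0 */
	printf("%u\n", hash_bucket(0xffffffffu, size));	/* 16383 */
	printf("%u\n", hash_bucket(0x80000000u, size));	/* 8192 */
	return 0;
}
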
*/ - DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features); - mutex_lock(&nf_ct_cache_mutex); + n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); + return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + (((__force __u16)tuple->dst.u.all << 16) | + tuple->dst.protonum)); +} - write_lock_bh(&nf_ct_cache_lock); - if (--nf_ct_cache[features].use > 0) { - write_unlock_bh(&nf_ct_cache_lock); - mutex_unlock(&nf_ct_cache_mutex); - return; - } - cachep = nf_ct_cache[features].cachep; - name = nf_ct_cache[features].name; - nf_ct_cache[features].cachep = NULL; - nf_ct_cache[features].name = NULL; - nf_ct_cache[features].size = 0; - write_unlock_bh(&nf_ct_cache_lock); +static u32 __hash_bucket(u32 hash, unsigned int size) +{ + return ((u64)hash * size) >> 32; +} - synchronize_net(); +static u32 hash_bucket(u32 hash, const struct net *net) +{ + return __hash_bucket(hash, net->ct.htable_size); +} - kmem_cache_destroy(cachep); - kfree(name); +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + u16 zone, unsigned int size) +{ + return __hash_bucket(hash_conntrack_raw(tuple, zone), size); +} - mutex_unlock(&nf_ct_cache_mutex); +static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, zone, net->ct.htable_size); } -EXPORT_SYMBOL_GPL(nf_conntrack_unregister_cache); -int +bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, unsigned int dataoff, @@ -251,11 +172,11 @@ nf_ct_get_tuple(const struct sk_buff *skb, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - NF_CT_TUPLE_U_BLANK(tuple); + memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) - return 0; + return false; tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; @@ -264,17 +185,45 @@ nf_ct_get_tuple(const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_ct_get_tuple); -int +bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, + u_int16_t l3num, struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + unsigned int protoff; + u_int8_t protonum; + int ret; + + rcu_read_lock(); + + l3proto = __nf_ct_l3proto_find(l3num); + ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); + if (ret != NF_ACCEPT) { + rcu_read_unlock(); + return false; + } + + l4proto = __nf_ct_l4proto_find(l3num, protonum); + + ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, + l3proto, l4proto); + + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); + +bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - NF_CT_TUPLE_U_BLANK(inverse); + memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; if (l3proto->invert_tuple(inverse, orig) == 0) - return 0; + return false; inverse->dst.dir = !orig->dst.dir; @@ -286,155 +235,379 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); static void clean_from_lists(struct nf_conn *ct) { - DEBUGP("clean_from_lists(%p)\n", ct); - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list); + pr_debug("clean_from_lists(%p)\n", ct); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); /* Destroy all 
pending expectations */ nf_ct_remove_expectations(ct); } +/* must be called with local_bh_disable */ +static void nf_ct_add_to_dying_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) dying list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->dying); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) unconfirmed list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->unconfirmed); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* We overload first tuple to link into unconfirmed or dying list.*/ + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&pcpu->lock); +} + static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - struct nf_conn_help *help = nfct_help(ct); + struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; - typeof(nf_conntrack_destroyed) destroyed; - DEBUGP("destroy_conntrack(%p)\n", ct); + pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); - nf_conntrack_event(IPCT_DESTROY, ct); - set_bit(IPS_DYING_BIT, &ct->status); - - if (help && help->helper && help->helper->destroy) - help->helper->destroy(ct); - - /* To make sure we don't get any weird locking issues here: - * destroy_conntrack() MUST NOT be called with a write lock - * to nf_conntrack_lock!!! -HW */ rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) l4proto->destroy(ct); - destroyed = rcu_dereference(nf_conntrack_destroyed); - if (destroyed) - destroyed(ct); - rcu_read_unlock(); - write_lock_bh(&nf_conntrack_lock); + local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, - * too. */ + * too. + */ nf_ct_remove_expectations(ct); - /* We overload first tuple to link into unconfirmed list. 
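
When the destroy event cannot be delivered, death_by_event() above re-arms its timer a random interval in [0, sysctl_events_retry_timeout) from now, so a burst of failing connections does not retry in lockstep. A sketch of that delay computation (rand() stands in for prandom_u32(); names are ours):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Next retry instant: now plus a random delay in [0, retry_timeout). */
static unsigned long next_retry(unsigned long now,
				unsigned long retry_timeout)
{
	return now + ((unsigned long)rand() % retry_timeout);
}

int main(void)
{
	srand((unsigned int)time(NULL));
	printf("%lu\n", next_retry(100000UL, 250UL)); /* 100000..100249 */
	return 0;
}
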
*/ - if (!nf_ct_is_confirmed(ct)) { - BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - } + nf_ct_del_from_dying_or_unconfirmed_list(ct); - NF_CT_STAT_INC(delete); - write_unlock_bh(&nf_conntrack_lock); + NF_CT_STAT_INC(net, delete); + local_bh_enable(); if (ct->master) nf_ct_put(ct->master); - DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); nf_conntrack_free(ct); } -static void death_by_timeout(unsigned long ul_conntrack) +static void nf_ct_delete_from_lists(struct nf_conn *ct) { - struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); + unsigned int hash, reply_hash; + u16 zone = nf_ct_zone(ct); + unsigned int sequence; + + nf_ct_helper_destroy(ct); + + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); - write_lock_bh(&nf_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. */ - NF_CT_STAT_INC(delete_list); clean_from_lists(ct); - write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + + nf_ct_add_to_dying_list(ct); + + NF_CT_STAT_INC(net, delete_list); + local_bh_enable(); +} + +static void death_by_event(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); + + if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { + /* bad luck, let's retry again */ + ecache->timeout.expires = jiffies + + (prandom_u32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ecache->timeout); + return; + } + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); nf_ct_put(ct); } -struct nf_conntrack_tuple_hash * -__nf_conntrack_find(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack) +static void nf_ct_dying_timeout(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); + + /* set a new timer to retry event delivery */ + setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); + ecache->timeout.expires = jiffies + + (prandom_u32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ecache->timeout); +} + +bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) +{ + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); + + if (!nf_ct_is_dying(ct) && + unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, + portid, report) < 0)) { + /* destroy event was not delivered */ + nf_ct_delete_from_lists(ct); + nf_ct_dying_timeout(ct); + return false; + } + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_delete_from_lists(ct); + nf_ct_put(ct); + return true; +} +EXPORT_SYMBOL_GPL(nf_ct_delete); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); +} + +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, + const struct nf_conntrack_tuple *tuple, + u16 zone) +{ + struct nf_conn *ct = 
nf_ct_tuplehash_to_ctrack(h); + + /* A conntrack can be recreated with the equal tuple, + * so we need to check that the conntrack is confirmed + */ + return nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone && + nf_ct_is_confirmed(ct); +} + +/* + * Warning : + * - Caller must take a reference on returned object + * and recheck nf_ct_tuple_equal(tuple, &h->tuple) + */ +static struct nf_conntrack_tuple_hash * +____nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; - unsigned int hash = hash_conntrack(tuple); + struct hlist_nulls_node *n; + unsigned int bucket = hash_bucket(hash, net); - list_for_each_entry(h, &nf_conntrack_hash[hash], list) { - if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && - nf_ct_tuple_equal(tuple, &h->tuple)) { - NF_CT_STAT_INC(found); + /* Disable BHs the entire time since we normally need to disable them + * at least once for the stats anyway. + */ + local_bh_disable(); +begin: + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { + if (nf_ct_key_equal(h, tuple, zone)) { + NF_CT_STAT_INC(net, found); + local_bh_enable(); return h; } - NF_CT_STAT_INC(searched); + NF_CT_STAT_INC(net, searched); } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(n) != bucket) { + NF_CT_STAT_INC(net, search_restart); + goto begin; + } + local_bh_enable(); return NULL; } -EXPORT_SYMBOL_GPL(__nf_conntrack_find); /* Find a connection corresponding to a tuple. */ -struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack) +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; - read_lock_bh(&nf_conntrack_lock); - h = __nf_conntrack_find(tuple, ignored_conntrack); - if (h) - atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); - read_unlock_bh(&nf_conntrack_lock); + rcu_read_lock(); +begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) + h = NULL; + else { + if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { + nf_ct_put(ct); + goto begin; + } + } + } + rcu_read_unlock(); return h; } + +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, - unsigned int repl_hash) + unsigned int reply_hash) { - ct->id = ++nf_conntrack_next_id; - list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list, - &nf_conntrack_hash[hash]); - list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list, - &nf_conntrack_hash[repl_hash]); + struct net *net = nf_ct_net(ct); + + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.hash[hash]); + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &net->ct.hash[reply_hash]); } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) { - unsigned int hash, repl_hash; + struct net *net = 
nf_ct_net(ct); + unsigned int hash, reply_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + u16 zone; + unsigned int sequence; + + zone = nf_ct_zone(ct); + + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + smp_wmb(); + /* The caller holds a reference to this object */ + atomic_set(&ct->ct_general.use, 2); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); + NF_CT_STAT_INC(net, insert); + local_bh_enable(); + return 0; - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); +out: + nf_conntrack_double_unlock(hash, reply_hash); + NF_CT_STAT_INC(net, insert_failed); + local_bh_enable(); + return -EEXIST; +} +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); - write_lock_bh(&nf_conntrack_lock); - __nf_conntrack_hash_insert(ct, hash, repl_hash); - write_unlock_bh(&nf_conntrack_lock); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ + struct ct_pcpu *pcpu; + + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + + /* add this conntrack to the (per cpu) tmpl list */ + local_bh_disable(); + tmpl->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); + + spin_lock(&pcpu->lock); + /* Overload tuple linked list to put us in template list. */ + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->tmpl); + spin_unlock_bh(&pcpu->lock); } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); /* Confirm a connection given skb; places it in hash table */ int -__nf_conntrack_confirm(struct sk_buff **pskb) +__nf_conntrack_confirm(struct sk_buff *skb) { - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; + struct nf_conn_tstamp *tstamp; + struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; + struct net *net; + u16 zone; + unsigned int sequence; - ct = nf_ct_get(*pskb, &ctinfo); + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); /* ipt_REJECT uses nf_conntrack_attach to attach related ICMP/TCP RST packets in other direction. 
Actual packet @@ -443,61 +616,95 @@ __nf_conntrack_confirm(struct sk_buff **pskb) if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return NF_ACCEPT; - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + zone = nf_ct_zone(ct); + local_bh_disable(); + + do { + sequence = read_seqcount_begin(&net->ct.generation); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related - connections for unconfirmed conns. But packet copies and - REJECT will give spurious warnings here. */ + * connections for unconfirmed conns. But packet copies and + * REJECT will give spurious warnings here. + */ /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ - /* No external references means noone else could have - confirmed us. */ + /* No external references means no one else could have + * confirmed us. + */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); - DEBUGP("Confirming conntrack %p\n", ct); - - write_lock_bh(&nf_conntrack_lock); + pr_debug("Confirming conntrack %p\n", ct); + /* We have to check the DYING flag inside the lock to prevent + a race against nf_ct_get_next_corpse() possibly called from + user context, else we insert an already 'dead' hash, blocking + further use of that particular connection -JM */ + + if (unlikely(nf_ct_is_dying(ct))) { + nf_conntrack_double_unlock(hash, reply_hash); + local_bh_enable(); + return NF_ACCEPT; + } /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - list_for_each_entry(h, &nf_conntrack_hash[hash], list) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple)) + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - list_for_each_entry(h, &nf_conntrack_hash[repl_hash], list) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple)) + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - /* Remove from unconfirmed list */ - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + nf_ct_del_from_dying_or_unconfirmed_list(ct); - __nf_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout.expires += jiffies; add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); - set_bit(IPS_CONFIRMED_BIT, &ct->status); - NF_CT_STAT_INC(insert); - write_unlock_bh(&nf_conntrack_lock); + ct->status |= IPS_CONFIRMED; + + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + if (skb->tstamp.tv64 == 0) + __net_timestamp(skb); + + tstamp->start = ktime_to_ns(skb->tstamp); + } + /* Since the lookup is lockless, hash insertion must be done after + * starting the timer and setting the CONFIRMED bit. The RCU barriers + * guarantee that no other CPU can find the conntrack before the above + * stores are visible. 
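
Both nf_conntrack_hash_check_insert() and __nf_conntrack_confirm() above hash first and lock second, retrying when net->ct.generation shows that a resize slipped in between. A compressed userspace sketch of the pattern, assuming one lock per bucket and a plain atomic counter in place of the kernel's seqcount (the kernel locks two buckets, original and reply, the same way):

    #include <pthread.h>
    #include <stdatomic.h>

    #define NBUCKETS 1024
    static pthread_mutex_t locks[NBUCKETS];  /* assumed initialized */
    static atomic_uint     generation;       /* bumped by a resizer */

    /* Pick and lock a bucket; retry if a table resize ran between
     * computing the index and acquiring the lock. */
    static unsigned int lock_bucket(unsigned long hash)
    {
        for (;;) {
            unsigned int seq    = atomic_load(&generation);
            unsigned int bucket = hash % NBUCKETS; /* size may change */

            pthread_mutex_lock(&locks[bucket]);
            if (atomic_load(&generation) == seq)
                return bucket;                     /* index still valid */
            pthread_mutex_unlock(&locks[bucket]);  /* raced: retry */
        }
    }
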
+ */ + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); + NF_CT_STAT_INC(net, insert); + local_bh_enable(); + help = nfct_help(ct); if (help && help->helper) - nf_conntrack_event_cache(IPCT_HELPER, *pskb); -#ifdef CONFIG_NF_NAT_NEEDED - if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || - test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_NATINFO, *pskb); -#endif + nf_conntrack_event_cache(IPCT_HELPER, ct); + nf_conntrack_event_cache(master_ct(ct) ? - IPCT_RELATED : IPCT_NEW, *pskb); + IPCT_RELATED : IPCT_NEW, ct); return NF_ACCEPT; out: - NF_CT_STAT_INC(insert_failed); - write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + NF_CT_STAT_INC(net, insert_failed); + local_bh_enable(); return NF_DROP; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -508,230 +715,321 @@ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { + struct net *net = nf_ct_net(ignored_conntrack); struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *ct; + u16 zone = nf_ct_zone(ignored_conntrack); + unsigned int hash = hash_conntrack(net, zone, tuple); - read_lock_bh(&nf_conntrack_lock); - h = __nf_conntrack_find(tuple, ignored_conntrack); - read_unlock_bh(&nf_conntrack_lock); + /* Disable BHs the entire time since we need to disable them at + * least once for the stats anyway. + */ + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (ct != ignored_conntrack && + nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone) { + NF_CT_STAT_INC(net, found); + rcu_read_unlock_bh(); + return 1; + } + NF_CT_STAT_INC(net, searched); + } + rcu_read_unlock_bh(); - return h != NULL; + return 0; } EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); +#define NF_CT_EVICTION_RANGE 8 + /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. 
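
The lockless walks above can be dragged onto a different chain when an entry is recycled and re-hashed mid-traversal; the hlist_nulls terminator encodes the owning bucket so such a reader detects the detour, which is what the get_nulls_value() restart check earlier does. A self-contained sketch of the encoding, with made-up helpers:

    #include <stdbool.h>
    #include <stdint.h>

    struct node { struct node *next; unsigned long key; };

    /* The terminator is an odd pointer encoding the bucket number;
     * real nodes are even (aligned), so one bit tells them apart. */
    static bool is_a_nulls(const struct node *p)
    { return (uintptr_t)p & 1; }
    static unsigned long nulls_value(const struct node *p)
    { return (uintptr_t)p >> 1; }
    static struct node *make_nulls(unsigned long bucket) /* chain init */
    { return (struct node *)((bucket << 1) | 1); }

    static struct node *find(struct node **table, unsigned long bucket,
                             unsigned long key)
    {
        struct node *n;
    begin:
        for (n = table[bucket]; !is_a_nulls(n); n = n->next)
            if (n->key == key)
                return n;
        if (nulls_value(n) != bucket) /* ended on a foreign chain */
            goto begin;               /* entry moved under us: restart */
        return NULL;
    }
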
*/ -static int early_drop(struct list_head *chain) +static noinline int early_drop(struct net *net, unsigned int _hash) { - /* Traverse backwards: gives us oldest, which is roughly LRU */ + /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; struct nf_conn *ct = NULL, *tmp; + struct hlist_nulls_node *n; + unsigned int i = 0, cnt = 0; int dropped = 0; + unsigned int hash, sequence; + spinlock_t *lockp; + + local_bh_disable(); +restart: + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_bucket(_hash, net); + for (; i < net->ct.htable_size; i++) { + lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (read_seqcount_retry(&net->ct.generation, sequence)) { + spin_unlock(lockp); + goto restart; + } + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], + hnnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); + if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && + !nf_ct_is_dying(tmp) && + atomic_inc_not_zero(&tmp->ct_general.use)) { + ct = tmp; + break; + } + cnt++; + } - read_lock_bh(&nf_conntrack_lock); - list_for_each_entry_reverse(h, chain, list) { - tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) { - ct = tmp; - atomic_inc(&ct->ct_general.use); + hash = (hash + 1) % net->ct.htable_size; + spin_unlock(lockp); + + if (ct || cnt >= NF_CT_EVICTION_RANGE) break; - } + } - read_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (!ct) return dropped; if (del_timer(&ct->timeout)) { - death_by_timeout((unsigned long)ct); - dropped = 1; - NF_CT_STAT_INC_ATOMIC(early_drop); + if (nf_ct_delete(ct, 0, 0)) { + dropped = 1; + NF_CT_STAT_INC_ATOMIC(net, early_drop); + } } nf_ct_put(ct); return dropped; } +void init_nf_conntrack_hash_rnd(void) +{ + unsigned int rand; + + /* + * Why not initialize nf_conntrack_rnd in a "init()" function ? + * Because there isn't enough entropy when system initializing, + * and we initialize it as late as possible. + */ + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&nf_conntrack_hash_rnd, 0, rand); +} + static struct nf_conn * -__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, +__nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, - const struct nf_conntrack_l3proto *l3proto, - u_int32_t features) + gfp_t gfp, u32 hash) { - struct nf_conn *conntrack = NULL; - struct nf_conntrack_helper *helper; + struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd_initted)) { - get_random_bytes(&nf_conntrack_hash_rnd, 4); - nf_conntrack_hash_rnd_initted = 1; + if (unlikely(!nf_conntrack_hash_rnd)) { + init_nf_conntrack_hash_rnd(); + /* recompute the hash as nf_conntrack_hash_rnd is initialized */ + hash = hash_conntrack_raw(orig, zone); } /* We don't want any race condition at early drop stage */ - atomic_inc(&nf_conntrack_count); - - if (nf_conntrack_max - && atomic_read(&nf_conntrack_count) > nf_conntrack_max) { - unsigned int hash = hash_conntrack(orig); - /* Try dropping from this hash chain. 
*/ - if (!early_drop(&nf_conntrack_hash[hash])) { - atomic_dec(&nf_conntrack_count); - if (net_ratelimit()) - printk(KERN_WARNING - "nf_conntrack: table full, dropping" - " packet.\n"); + atomic_inc(&net->ct.count); + + if (nf_conntrack_max && + unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { + if (!early_drop(net, hash)) { + atomic_dec(&net->ct.count); + net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); } } - /* find features needed by this conntrack. */ - features |= l3proto->get_features(orig); - - /* FIXME: protect helper list per RCU */ - read_lock_bh(&nf_conntrack_lock); - helper = __nf_ct_helper_find(repl); - /* NAT might want to assign a helper later */ - if (helper || features & NF_CT_F_NAT) - features |= NF_CT_F_HELP; - read_unlock_bh(&nf_conntrack_lock); - - DEBUGP("nf_conntrack_alloc: features=0x%x\n", features); - - read_lock_bh(&nf_ct_cache_lock); - - if (unlikely(!nf_ct_cache[features].use)) { - DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n", - features); - goto out; - } - - conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC); - if (conntrack == NULL) { - DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n"); - goto out; + /* + * Do not use kmem_cache_zalloc(), as this cache uses + * SLAB_DESTROY_BY_RCU. + */ + ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + if (ct == NULL) { + atomic_dec(&net->ct.count); + return ERR_PTR(-ENOMEM); } - - memset(conntrack, 0, nf_ct_cache[features].size); - conntrack->features = features; - atomic_set(&conntrack->ct_general.use, 1); - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* + * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next + * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. + */ + memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, + offsetof(struct nf_conn, proto) - + offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); + spin_lock_init(&ct->lock); + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* save hash for reusing when confirming */ + *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; /* Don't set timer yet: wait for confirmation */ - setup_timer(&conntrack->timeout, death_by_timeout, - (unsigned long)conntrack); - read_unlock_bh(&nf_ct_cache_lock); + setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); + write_pnet(&ct->ct_net, net); +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (zone) { + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); + if (!nf_ct_zone) + goto out_free; + nf_ct_zone->id = zone; + } +#endif + /* Because we use RCU lookups, we set ct_general.use to zero before + * this is inserted in any list. 
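
Because the cache is SLAB_DESTROY_BY_RCU, __nf_conntrack_alloc() above cannot simply zero the whole object: a concurrent reader may still be chasing the list pointers of the previous incarnation. Hence the offsetof() arithmetic that clears only the middle of the struct. A toy illustration with a hypothetical layout:

    #include <stddef.h>
    #include <string.h>

    struct fake_ct {
        int           refcnt;       /* 0 == free; readers test-and-inc */
        void         *hnnode[2];    /* chain pointers: never memset()  */
        unsigned long mark, status; /* start of the cleared region ... */
        unsigned long timeout;      /* ... everything here is zeroed   */
        char          proto[32];    /* set up later by the l4 tracker  */
    };

    static void reinit(struct fake_ct *ct)
    {
        /* Same offsetof() arithmetic as __nf_conntrack_alloc(): zero
         * only [mark, proto), leaving the linkage and proto alone. */
        memset((char *)ct + offsetof(struct fake_ct, mark), 0,
               offsetof(struct fake_ct, proto) -
               offsetof(struct fake_ct, mark));
        ct->refcnt = 0; /* stays dead until insertion publishes it */
    }
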
+ */ + atomic_set(&ct->ct_general.use, 0); + return ct; - return conntrack; -out: - read_unlock_bh(&nf_ct_cache_lock); - atomic_dec(&nf_conntrack_count); - return conntrack; +#ifdef CONFIG_NF_CONNTRACK_ZONES +out_free: + atomic_dec(&net->ct.count); + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + return ERR_PTR(-ENOMEM); +#endif } -struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_tuple *repl) +struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conn *ct; - - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(orig->src.l3num); - ct = __nf_conntrack_alloc(orig, repl, l3proto, 0); - rcu_read_unlock(); - - return ct; + return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); } EXPORT_SYMBOL_GPL(nf_conntrack_alloc); -void nf_conntrack_free(struct nf_conn *conntrack) +void nf_conntrack_free(struct nf_conn *ct) { - u_int32_t features = conntrack->features; - NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM); - DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features, - conntrack); - kmem_cache_free(nf_ct_cache[features].cachep, conntrack); - atomic_dec(&nf_conntrack_count); + struct net *net = nf_ct_net(ct); + + /* A freed object has refcnt == 0, that's + * the golden rule for SLAB_DESTROY_BY_RCU + */ + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + + nf_ct_ext_destroy(ct); + nf_ct_ext_free(ct); + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + smp_mb__before_atomic(); + atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); + /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static struct nf_conntrack_tuple_hash * -init_conntrack(const struct nf_conntrack_tuple *tuple, +init_conntrack(struct net *net, struct nf_conn *tmpl, + const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, u32 hash) { - struct nf_conn *conntrack; + struct nf_conn *ct; + struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; - struct nf_conntrack_expect *exp; - u_int32_t features = 0; + struct nf_conntrack_ecache *ecache; + struct nf_conntrack_expect *exp = NULL; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + struct nf_conn_timeout *timeout_ext; + unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { - DEBUGP("Can't invert tuple.\n"); + pr_debug("Can't invert tuple.\n"); return NULL; } - read_lock_bh(&nf_conntrack_lock); - exp = __nf_conntrack_expect_find(tuple); - if (exp && exp->helper) - features = NF_CT_F_HELP; - read_unlock_bh(&nf_conntrack_lock); + ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, + hash); + if (IS_ERR(ct)) + return (struct nf_conntrack_tuple_hash *)ct; - conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto, features); - if (conntrack == NULL || IS_ERR(conntrack)) { - DEBUGP("Can't allocate conntrack.\n"); - return (struct nf_conntrack_tuple_hash *)conntrack; + if (tmpl && nfct_synproxy(tmpl)) { + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); } - if (!l4proto->new(conntrack, skb, dataoff)) { - nf_conntrack_free(conntrack); - DEBUGP("init conntrack: can't track with proto module\n"); + timeout_ext = tmpl ? 
nf_ct_timeout_find(tmpl) : NULL; + if (timeout_ext) + timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); + else + timeouts = l4proto->get_timeouts(net); + + if (!l4proto->new(ct, skb, dataoff, timeouts)) { + nf_conntrack_free(ct); + pr_debug("init conntrack: can't track with proto module\n"); return NULL; } - write_lock_bh(&nf_conntrack_lock); - exp = find_expectation(tuple); + if (timeout_ext) + nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC); + + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + nf_ct_labels_ext_add(ct); + + ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; + nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC); + + local_bh_disable(); + if (net->ct.expect_count) { + spin_lock(&nf_conntrack_expect_lock); + exp = nf_ct_find_expectation(net, zone, tuple); + if (exp) { + pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + ct, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &ct->status); + /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ + ct->master = exp->master; + if (exp->helper) { + help = nf_ct_helper_ext_add(ct, exp->helper, + GFP_ATOMIC); + if (help) + rcu_assign_pointer(help->helper, exp->helper); + } - if (exp) { - DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", - conntrack, exp); - /* Welcome, Mr. Bond. We've been expecting you... */ - __set_bit(IPS_EXPECTED_BIT, &conntrack->status); - conntrack->master = exp->master; - if (exp->helper) - nfct_help(conntrack)->helper = exp->helper; #ifdef CONFIG_NF_CONNTRACK_MARK - conntrack->mark = exp->master->mark; + ct->mark = exp->master->mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK - conntrack->secmark = exp->master->secmark; + ct->secmark = exp->master->secmark; #endif - nf_conntrack_get(&conntrack->master->ct_general); - NF_CT_STAT_INC(expect_new); - } else { - struct nf_conn_help *help = nfct_help(conntrack); - - if (help) - help->helper = __nf_ct_helper_find(&repl_tuple); - NF_CT_STAT_INC(new); + NF_CT_STAT_INC(net, expect_new); + } + spin_unlock(&nf_conntrack_expect_lock); + } + if (!exp) { + __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); + NF_CT_STAT_INC(net, new); } - /* Overload tuple linked list to put us in unconfirmed list. */ - list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + /* Now it is inserted into the unconfirmed list, bump refcount */ + nf_conntrack_get(&ct->ct_general); + nf_ct_add_to_unconfirmed_list(ct); - write_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (exp) { if (exp->expectfn) - exp->expectfn(conntrack, exp); - nf_conntrack_expect_put(exp); + exp->expectfn(ct, exp); + nf_ct_expect_put(exp); } - return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; + return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; } /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct nf_conn * -resolve_normal_ct(struct sk_buff *skb, +resolve_normal_ct(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, @@ -743,18 +1041,22 @@ resolve_normal_ct(struct sk_buff *skb, struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; + u16 zone = tmpl ? 
nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, &tuple, l3proto, l4proto)) { - DEBUGP("resolve_normal_ct: Can't get tuple\n"); + pr_debug("resolve_normal_ct: Can't get tuple\n"); return NULL; } /* look for tuple match */ - h = nf_conntrack_find_get(&tuple, NULL); + hash = hash_conntrack_raw(&tuple, zone); + h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { - h = init_conntrack(&tuple, l3proto, l4proto, skb, dataoff); + h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + skb, dataoff, hash); if (!h) return NULL; if (IS_ERR(h)) @@ -764,19 +1066,20 @@ resolve_normal_ct(struct sk_buff *skb, /* It exists; we have (non-exclusive) reference. */ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { - *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + *ctinfo = IP_CT_ESTABLISHED_REPLY; /* Please set reply bit if this packet OK */ *set_reply = 1; } else { /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - DEBUGP("nf_conntrack_in: normal packet for %p\n", ct); + pr_debug("nf_conntrack_in: normal packet for %p\n", ct); *ctinfo = IP_CT_ESTABLISHED; } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - DEBUGP("nf_conntrack_in: related packet for %p\n", ct); + pr_debug("nf_conntrack_in: related packet for %p\n", + ct); *ctinfo = IP_CT_RELATED; } else { - DEBUGP("nf_conntrack_in: new packet for %p\n", ct); + pr_debug("nf_conntrack_in: new packet for %p\n", ct); *ctinfo = IP_CT_NEW; } *set_reply = 0; @@ -787,81 +1090,116 @@ resolve_normal_ct(struct sk_buff *skb, } unsigned int -nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) +nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, + struct sk_buff *skb) { - struct nf_conn *ct; + struct nf_conn *ct, *tmpl = NULL; enum ip_conntrack_info ctinfo; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts; unsigned int dataoff; u_int8_t protonum; int set_reply = 0; int ret; - /* Previously seen (loopback or untracked)? Ignore. */ - if ((*pskb)->nfct) { - NF_CT_STAT_INC_ATOMIC(ignore); - return NF_ACCEPT; + if (skb->nfct) { + /* Previously seen (loopback or untracked)? Ignore. */ + tmpl = (struct nf_conn *)skb->nfct; + if (!nf_ct_is_template(tmpl)) { + NF_CT_STAT_INC_ATOMIC(net, ignore); + return NF_ACCEPT; + } + skb->nfct = NULL; } /* rcu_read_lock()ed by nf_hook_slow */ - l3proto = __nf_ct_l3proto_find((u_int16_t)pf); - - if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { - DEBUGP("not prepared to track yet or error occured\n"); - return -ret; + l3proto = __nf_ct_l3proto_find(pf); + ret = l3proto->get_l4proto(skb, skb_network_offset(skb), + &dataoff, &protonum); + if (ret <= 0) { + pr_debug("not prepared to track yet or error occurred\n"); + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = -ret; + goto out; } - l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum); + l4proto = __nf_ct_l4proto_find(pf, protonum); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter * core what to do with the packet. 
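
resolve_normal_ct() above reduces ctinfo selection to three inputs: the direction of the matching tuple, the SEEN_REPLY bit, and the EXPECTED bit. The same decision as a stand-alone table; enum names are shortened for the sketch:

    enum ctinfo { CT_NEW, CT_RELATED, CT_ESTABLISHED, CT_ESTABLISHED_REPLY };

    static enum ctinfo classify(int dir_is_reply, int seen_reply, int expected)
    {
        if (dir_is_reply)
            return CT_ESTABLISHED_REPLY; /* reply tuple matched */
        if (seen_reply)
            return CT_ESTABLISHED;       /* two-way traffic already seen */
        if (expected)
            return CT_RELATED;           /* born from an expectation */
        return CT_NEW;
    }
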
*/ - if (l4proto->error != NULL && - (ret = l4proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) { - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); - return -ret; + if (l4proto->error != NULL) { + ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo, + pf, hooknum); + if (ret <= 0) { + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = -ret; + goto out; + } + /* ICMP[v6] protocol trackers may assign one conntrack. */ + if (skb->nfct) + goto out; } - ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, l4proto, - &set_reply, &ctinfo); + ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, + l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ - NF_CT_STAT_INC_ATOMIC(invalid); - return NF_ACCEPT; + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = NF_ACCEPT; + goto out; } if (IS_ERR(ct)) { /* Too stressed to deal. */ - NF_CT_STAT_INC_ATOMIC(drop); - return NF_DROP; + NF_CT_STAT_INC_ATOMIC(net, drop); + ret = NF_DROP; + goto out; } - NF_CT_ASSERT((*pskb)->nfct); + NF_CT_ASSERT(skb->nfct); + + /* Decide what timeout policy we want to apply to this flow. */ + timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - ret = l4proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); - if (ret < 0) { + ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts); + if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ - DEBUGP("nf_conntrack_in: Can't track with proto module\n"); - nf_conntrack_put((*pskb)->nfct); - (*pskb)->nfct = NULL; - NF_CT_STAT_INC_ATOMIC(invalid); - return -ret; + pr_debug("nf_conntrack_in: Can't track with proto module\n"); + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; + NF_CT_STAT_INC_ATOMIC(net, invalid); + if (ret == -NF_DROP) + NF_CT_STAT_INC_ATOMIC(net, drop); + ret = -ret; + goto out; } if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, *pskb); + nf_conntrack_event_cache(IPCT_REPLY, ct); +out: + if (tmpl) { + /* Special case: we have to repeat this hook, assign the + * template again to this packet. We assume that this packet + * has no conntrack assigned. This is used by nf_ct_tcp. 
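
Twice above, the code turns a non-positive return from an l4proto handler into a netfilter verdict by negating it. A small sketch of that sign convention; NF_DROP and NF_ACCEPT carry their usual values (0 and 1), the handler itself is a stand-in:

    #define NF_DROP   0
    #define NF_ACCEPT 1

    /* A stand-in l4 handler: a non-positive return means "verdict,
     * negated", exactly the convention the callers above decode. */
    static int l4_packet(int invalid)
    {
        return invalid ? -NF_DROP : NF_ACCEPT;
    }

    static int conntrack_in(int invalid)
    {
        int ret = l4_packet(invalid);

        if (ret <= 0)    /* error/unclean: flip the sign back ... */
            return -ret; /* ... -(-NF_DROP) == NF_DROP            */
        return ret;      /* normal case: NF_ACCEPT                */
    }
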
*/ + if (ret == NF_REPEAT) + skb->nfct = (struct nf_conntrack *)tmpl; + else + nf_ct_put(tmpl); + } return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_in); -int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig) +bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig) { - int ret; + bool ret; rcu_read_lock(); ret = nf_ct_invert_tuple(inverse, orig, @@ -880,22 +1218,19 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, { struct nf_conn_help *help = nfct_help(ct); - write_lock_bh(&nf_conntrack_lock); /* Should be unconfirmed, so not in hash table yet */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); - DEBUGP("Altering reply tuple of %p to ", ct); - NF_CT_DUMP_TUPLE(newreply); + pr_debug("Altering reply tuple of %p to ", ct); + nf_ct_dump_tuple(newreply); ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (!ct->master && help && help->expecting == 0) { - struct nf_conntrack_helper *helper; - helper = __nf_ct_helper_find(newreply); - if (helper) - memset(&help->help, 0, sizeof(help->help)); - help->helper = helper; - } - write_unlock_bh(&nf_conntrack_lock); + if (ct->master || (help && !hlist_empty(&help->expectations))) + return; + + rcu_read_lock(); + __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); @@ -906,105 +1241,125 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, unsigned long extra_jiffies, int do_acct) { - int event = 0; - NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); - write_lock_bh(&nf_conntrack_lock); - /* Only update if this is not a fixed timeout */ - if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { - write_unlock_bh(&nf_conntrack_lock); - return; - } + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + goto acct; /* If not in hash table, timer will not be active yet */ if (!nf_ct_is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - event = IPCT_REFRESH; } else { unsigned long newtime = jiffies + extra_jiffies; /* Only update the timeout if the new timeout is at least HZ jiffies from the old timeout. Need del_timer for race avoidance (may already be dying). 
*/ - if (newtime - ct->timeout.expires >= HZ - && del_timer(&ct->timeout)) { - ct->timeout.expires = newtime; - add_timer(&ct->timeout); - event = IPCT_REFRESH; - } + if (newtime - ct->timeout.expires >= HZ) + mod_timer_pending(&ct->timeout, newtime); } -#ifdef CONFIG_NF_CT_ACCT +acct: if (do_acct) { - ct->counters[CTINFO2DIR(ctinfo)].packets++; - ct->counters[CTINFO2DIR(ctinfo)].bytes += - skb->len - skb_network_offset(skb); + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; - if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) - || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) - event |= IPCT_COUNTER_FILLING; + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); + } } -#endif +} +EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); - write_unlock_bh(&nf_conntrack_lock); +bool __nf_ct_kill_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + int do_acct) +{ + if (do_acct) { + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(skb->len - skb_network_offset(skb), + &counter[CTINFO2DIR(ctinfo)].bytes); + } + } - /* must be unlocked when calling event cache */ - if (event) - nf_conntrack_event_cache(event, skb); + if (del_timer(&ct->timeout)) { + ct->timeout.function((unsigned long)ct); + return true; + } + return false; } -EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); +EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#ifdef CONFIG_NF_CONNTRACK_ZONES +static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_zone), + .align = __alignof__(struct nf_conntrack_zone), + .id = NF_CT_EXT_ZONE, +}; +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/mutex.h> - /* Generic function for tcp/udp/sctp/dccp and alike. 
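
The rewritten __nf_ct_refresh_acct() above drops the del_timer()/add_timer() pair for mod_timer_pending(), which never rearms a timer that has already fired, and keeps the old rule of ignoring updates worth less than HZ jiffies. The rule in isolation, with a deliberately crude timer model:

    #define HZ 100                /* ticks per second, as on many arches */
    static unsigned long jiffies; /* the tick counter, modeled crudely  */

    struct timer { unsigned long expires; int pending; };

    /* Only rearms a timer that has not fired yet, so a connection
     * already being torn down is never resurrected. */
    static void mod_timer_pending(struct timer *t, unsigned long newtime)
    {
        if (t->pending)
            t->expires = newtime;
    }

    static void refresh(struct timer *t, unsigned long extra_jiffies)
    {
        unsigned long newtime = jiffies + extra_jiffies;

        /* Skip updates that gain less than HZ jiffies; the unsigned
         * subtraction keeps the test wraparound-safe. */
        if (newtime - t->expires >= HZ)
            mod_timer_pending(t, newtime);
    }
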
This needs to be * in ip_conntrack_core, since we don't want the protocols to autoload * or depend on ctnetlink */ -int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb, +int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), - &tuple->src.u.tcp.port); - NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), - &tuple->dst.u.tcp.port); + if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || + nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) + goto nla_put_failure; return 0; -nfattr_failure: +nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); -static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), - [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t) +const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, + [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, }; +EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); -int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[], +int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t) { - if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) return -EINVAL; - if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) - return -EINVAL; - - t->src.u.tcp.port = *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); - t->dst.u.tcp.port = *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); return 0; } -EXPORT_SYMBOL_GPL(nf_ct_port_nfattr_to_tuple); +EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); + +int nf_ct_port_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} +EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. */ -void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -1012,7 +1367,7 @@ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) /* This ICMP is in reverse direction to the packet which caused it */ ct = nf_ct_get(skb, &ctinfo); if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) - ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + ctinfo = IP_CT_RELATED_REPLY; else ctinfo = IP_CT_RELATED; @@ -1021,55 +1376,66 @@ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) nskb->nfctinfo = ctinfo; nf_conntrack_get(nskb->nfct); } -EXPORT_SYMBOL_GPL(__nf_conntrack_attach); - -static inline int -do_iter(const struct nf_conntrack_tuple_hash *i, - int (*iter)(struct nf_conn *i, void *data), - void *data) -{ - return iter(nf_ct_tuplehash_to_ctrack(i), data); -} /* Bring out ya dead! 
*/ static struct nf_conn * -get_next_corpse(int (*iter)(struct nf_conn *i, void *data), +get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; + struct hlist_nulls_node *n; + int cpu; + spinlock_t *lockp; + + for (; *bucket < net->ct.htable_size; (*bucket)++) { + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; + local_bh_disable(); + spin_lock(lockp); + if (*bucket < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } + } + spin_unlock(lockp); + local_bh_enable(); + } - write_lock_bh(&nf_conntrack_lock); - for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { - list_for_each_entry(h, &nf_conntrack_hash[*bucket], list) { + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) - goto found; + set_bit(IPS_DYING_BIT, &ct->status); } + spin_unlock_bh(&pcpu->lock); } - list_for_each_entry(h, &unconfirmed, list) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - write_unlock_bh(&nf_conntrack_lock); return NULL; found: atomic_inc(&ct->ct_general.use); - write_unlock_bh(&nf_conntrack_lock); + spin_unlock(lockp); + local_bh_enable(); return ct; } -void -nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) +void nf_ct_iterate_cleanup(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data, u32 portid, int report) { struct nf_conn *ct; unsigned int bucket = 0; - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); + nf_ct_delete(ct, portid, report); + /* ... else the timer will get him soon. */ nf_ct_put(ct); @@ -1082,201 +1448,430 @@ static int kill_all(struct nf_conn *i, void *data) return 1; } -static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) +void nf_ct_free_hashtable(void *hash, unsigned int size) { - if (vmalloced) + if (is_vmalloc_addr(hash)) vfree(hash); else free_pages((unsigned long)hash, - get_order(sizeof(struct list_head) * size)); + get_order(sizeof(struct hlist_head) * size)); } +EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush(void) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { - nf_ct_iterate_cleanup(kill_all, NULL); + nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report); } -EXPORT_SYMBOL_GPL(nf_conntrack_flush); +EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void nf_conntrack_cleanup(void) +static void nf_ct_release_dying_list(struct net *net) { - int i; - - rcu_assign_pointer(ip_ct_attach, NULL); + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct hlist_nulls_node *n; + int cpu; - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... 
*/ - synchronize_net(); + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - nf_ct_event_cache_flush(); - i_see_dead_people: - nf_conntrack_flush(); - if (atomic_read(&nf_conntrack_count) != 0) { - schedule(); - goto i_see_dead_people; + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&pcpu->lock); } - /* wait until all references to nf_conntrack_untracked are dropped */ - while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) - schedule(); +} - rcu_assign_pointer(nf_ct_destroy, NULL); +static int untrack_refs(void) +{ + int cnt = 0, cpu; - for (i = 0; i < NF_CT_F_NUM; i++) { - if (nf_ct_cache[i].use == 0) - continue; + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); - NF_CT_ASSERT(nf_ct_cache[i].use == 1); - nf_ct_cache[i].use = 1; - nf_conntrack_unregister_cache(i); + cnt += atomic_read(&ct->ct_general.use) - 1; } - kmem_cache_destroy(nf_conntrack_expect_cachep); - free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, - nf_conntrack_htable_size); + return cnt; +} + +void nf_conntrack_cleanup_start(void) +{ + RCU_INIT_POINTER(ip_ct_attach, NULL); +} + +void nf_conntrack_cleanup_end(void) +{ + RCU_INIT_POINTER(nf_ct_destroy, NULL); + while (untrack_refs() > 0) + schedule(); +#ifdef CONFIG_NF_CONNTRACK_ZONES + nf_ct_extend_unregister(&nf_ct_zone_extend); +#endif nf_conntrack_proto_fini(); + nf_conntrack_seqadj_fini(); + nf_conntrack_labels_fini(); + nf_conntrack_helper_fini(); + nf_conntrack_timeout_fini(); + nf_conntrack_ecache_fini(); + nf_conntrack_tstamp_fini(); + nf_conntrack_acct_fini(); + nf_conntrack_expect_fini(); } -static struct list_head *alloc_hashtable(int size, int *vmalloced) +/* + * Mishearing the voices in his head, our hero wonders how he's + * supposed to kill the mall. + */ +void nf_conntrack_cleanup_net(struct net *net) { - struct list_head *hash; - unsigned int i; + LIST_HEAD(single); - *vmalloced = 0; - hash = (void*)__get_free_pages(GFP_KERNEL, - get_order(sizeof(struct list_head) - * size)); + list_add(&net->exit_list, &single); + nf_conntrack_cleanup_net_list(&single); +} + +void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) +{ + int busy; + struct net *net; + + /* + * This makes sure all current packets have passed through + * netfilter framework. Roll on, two-stage module + * delete... 
+ */ + synchronize_net(); +i_see_dead_people: + busy = 0; + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); + nf_ct_release_dying_list(net); + if (atomic_read(&net->ct.count) != 0) + busy = 1; + } + if (busy) { + schedule(); + goto i_see_dead_people; + } + + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_proto_pernet_fini(net); + nf_conntrack_helper_pernet_fini(net); + nf_conntrack_ecache_pernet_fini(net); + nf_conntrack_tstamp_pernet_fini(net); + nf_conntrack_acct_pernet_fini(net); + nf_conntrack_expect_pernet_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); + free_percpu(net->ct.stat); + free_percpu(net->ct.pcpu_lists); + } +} + +void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) +{ + struct hlist_nulls_head *hash; + unsigned int nr_slots, i; + size_t sz; + + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); + nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + sz = nr_slots * sizeof(struct hlist_nulls_head); + hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + get_order(sz)); if (!hash) { - *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); - hash = vmalloc(sizeof(struct list_head) * size); + hash = vzalloc(sz); } - if (hash) - for (i = 0; i < size; i++) - INIT_LIST_HEAD(&hash[i]); + if (hash && nulls) + for (i = 0; i < nr_slots; i++) + INIT_HLIST_NULLS_HEAD(&hash[i], i); return hash; } +EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); -int set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) { - int i, bucket, hashsize, vmalloced; - int old_vmalloced, old_size; - int rnd; - struct list_head *hash, *old_hash; + int i, bucket, rc; + unsigned int hashsize, old_size; + struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; /* On boot, we can set this without any fancy locking. */ if (!nf_conntrack_htable_size) return param_set_uint(val, kp); - hashsize = simple_strtol(val, NULL, 0); + rc = kstrtouint(val, 0, &hashsize); + if (rc) + return rc; if (!hashsize) return -EINVAL; - hash = alloc_hashtable(hashsize, &vmalloced); + hash = nf_ct_alloc_hashtable(&hashsize, 1); if (!hash) return -ENOMEM; - /* We have to rehahs for the new table anyway, so we also can - * use a newrandom seed */ - get_random_bytes(&rnd, 4); - - write_lock_bh(&nf_conntrack_lock); - for (i = 0; i < nf_conntrack_htable_size; i++) { - while (!list_empty(&nf_conntrack_hash[i])) { - h = list_entry(nf_conntrack_hash[i].next, - struct nf_conntrack_tuple_hash, list); - list_del(&h->list); - bucket = __hash_conntrack(&h->tuple, hashsize, rnd); - list_add_tail(&h->list, &hash[bucket]); + local_bh_disable(); + nf_conntrack_all_lock(); + write_seqcount_begin(&init_net.ct.generation); + + /* Lookups in the old hash might happen in parallel, which means we + * might get false negatives during connection lookup. New connections + * created because of a false negative won't make it into the hash + * though since that required taking the locks. 
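
nf_ct_alloc_hashtable() above rounds the requested bucket count up to fill whole pages, since __get_free_pages() hands out page multiples anyway. The rounding rule as a sketch; page and pointer sizes are assumed here, not queried:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL         /* assumed, not queried      */
    #define SLOT_SIZE sizeof(void *) /* one list head per bucket  */

    static unsigned long round_buckets(unsigned long want)
    {
        unsigned long per_page = PAGE_SIZE / SLOT_SIZE; /* 512 on LP64 */

        /* roundup(want, per_page): a partially used page would be
         * wasted, so turn the slack into extra buckets instead. */
        return (want + per_page - 1) / per_page * per_page;
    }

    int main(void)
    {
        printf("%lu -> %lu\n", 1000UL, round_buckets(1000)); /* 1024 */
        return 0;
    }
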
+ */ + + for (i = 0; i < init_net.ct.htable_size; i++) { + while (!hlist_nulls_empty(&init_net.ct.hash[i])) { + h = hlist_nulls_entry(init_net.ct.hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); + ct = nf_ct_tuplehash_to_ctrack(h); + hlist_nulls_del_rcu(&h->hnnode); + bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), + hashsize); + hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = nf_conntrack_htable_size; - old_vmalloced = nf_conntrack_vmalloc; - old_hash = nf_conntrack_hash; + old_size = init_net.ct.htable_size; + old_hash = init_net.ct.hash; - nf_conntrack_htable_size = hashsize; - nf_conntrack_vmalloc = vmalloced; - nf_conntrack_hash = hash; - nf_conntrack_hash_rnd = rnd; - write_unlock_bh(&nf_conntrack_lock); + init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; + init_net.ct.hash = hash; - free_conntrack_hash(old_hash, old_vmalloced, old_size); + write_seqcount_end(&init_net.ct.generation); + nf_conntrack_all_unlock(); + local_bh_enable(); + + nf_ct_free_hashtable(old_hash, old_size); return 0; } +EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); -module_param_call(hashsize, set_hashsize, param_get_uint, +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -int __init nf_conntrack_init(void) +void nf_ct_untracked_status_or(unsigned long bits) { - int ret; + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(nf_conntrack_untracked, cpu).status |= bits; +} +EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); + +int nf_conntrack_init_start(void) +{ + int max_factor = 8; + int i, ret, cpu; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_init(&nf_conntrack_locks[i]); /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB - * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ if (!nf_conntrack_htable_size) { nf_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) - / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 8192; - if (nf_conntrack_htable_size < 16) - nf_conntrack_htable_size = 16; + = (((totalram_pages << PAGE_SHIFT) / 16384) + / sizeof(struct hlist_head)); + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 16384; + if (nf_conntrack_htable_size < 32) + nf_conntrack_htable_size = 32; + + /* Use a max. factor of four by default to get the same max as + * with the old struct list_heads. When a table size is given + * we use the old value of 8 to avoid reducing the max. + * entries. 
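
The boot-time defaults above give one hash bucket per 16384 bytes of RAM, clamp the result to [32, 16384], and cap entries at four buckets' worth (eight when the admin pins the table size). A sketch of the arithmetic; the divisor depends on the host's pointer width, so absolute numbers differ across arches:

    /* One bucket per 16384 bytes of RAM, divided by the per-bucket
     * pointer size, clamped as in the code above. */
    static unsigned int default_buckets(unsigned long long ram_bytes)
    {
        unsigned long long b = ram_bytes / 16384 / sizeof(void *);

        if (ram_bytes > 1ULL << 30) /* machines above 1GB */
            b = 16384;
        if (b < 32)
            b = 32;
        return (unsigned int)b;
    }
    /* default max entries = 4 * default_buckets(ram) */
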
*/ + max_factor = 4; } - nf_conntrack_max = 8 * nf_conntrack_htable_size; + nf_conntrack_max = max_factor * nf_conntrack_htable_size; - printk("nf_conntrack version %s (%u buckets, %d max)\n", + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); - nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size, - &nf_conntrack_vmalloc); - if (!nf_conntrack_hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_out; - } + ret = nf_conntrack_expect_init(); + if (ret < 0) + goto err_expect; - ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic", - sizeof(struct nf_conn)); - if (ret < 0) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_free_hash; - } + ret = nf_conntrack_acct_init(); + if (ret < 0) + goto err_acct; - nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect", - sizeof(struct nf_conntrack_expect), - 0, 0, NULL, NULL); - if (!nf_conntrack_expect_cachep) { - printk(KERN_ERR "Unable to create nf_expect slab cache\n"); - goto err_free_conntrack_slab; - } + ret = nf_conntrack_tstamp_init(); + if (ret < 0) + goto err_tstamp; - ret = nf_conntrack_proto_init(); + ret = nf_conntrack_ecache_init(); if (ret < 0) - goto out_free_expect_slab; + goto err_ecache; - /* For use by REJECT target */ - rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach); - rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + ret = nf_conntrack_timeout_init(); + if (ret < 0) + goto err_timeout; + + ret = nf_conntrack_helper_init(); + if (ret < 0) + goto err_helper; + + ret = nf_conntrack_labels_init(); + if (ret < 0) + goto err_labels; + + ret = nf_conntrack_seqadj_init(); + if (ret < 0) + goto err_seqadj; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + ret = nf_ct_extend_register(&nf_ct_zone_extend); + if (ret < 0) + goto err_extend; +#endif + ret = nf_conntrack_proto_init(); + if (ret < 0) + goto err_proto; - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ - atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* Set up fake conntrack: to never be deleted, not in any hashes */ + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + write_pnet(&ct->ct_net, &init_net); + atomic_set(&ct->ct_general.use, 1); + } /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); + return 0; +err_proto: +#ifdef CONFIG_NF_CONNTRACK_ZONES + nf_ct_extend_unregister(&nf_ct_zone_extend); +err_extend: +#endif + nf_conntrack_seqadj_fini(); +err_seqadj: + nf_conntrack_labels_fini(); +err_labels: + nf_conntrack_helper_fini(); +err_helper: + nf_conntrack_timeout_fini(); +err_timeout: + nf_conntrack_ecache_fini(); +err_ecache: + nf_conntrack_tstamp_fini(); +err_tstamp: + nf_conntrack_acct_fini(); +err_acct: + nf_conntrack_expect_fini(); +err_expect: return ret; +} + +void nf_conntrack_init_end(void) +{ + /* For use by REJECT target */ + RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); + RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); +} -out_free_expect_slab: - kmem_cache_destroy(nf_conntrack_expect_cachep); -err_free_conntrack_slab: - nf_conntrack_unregister_cache(NF_CT_F_BASIC); -err_free_hash: - free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, - nf_conntrack_htable_size); -err_out: - return -ENOMEM; +/* + * We need to use special "null" values, not used in hash table + */ +#define 
UNCONFIRMED_NULLS_VAL ((1<<30)+0) +#define DYING_NULLS_VAL ((1<<30)+1) +#define TEMPLATE_NULLS_VAL ((1<<30)+2) + +int nf_conntrack_init_net(struct net *net) +{ + int ret = -ENOMEM; + int cpu; + + atomic_set(&net->ct.count, 0); + seqcount_init(&net->ct.generation); + + net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); + if (!net->ct.pcpu_lists) + goto err_stat; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_init(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); + } + + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) + goto err_pcpu_lists; + + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + if (!net->ct.slabname) + goto err_slabname; + + net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!net->ct.nf_conntrack_cachep) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + goto err_cache; + } + + net->ct.htable_size = nf_conntrack_htable_size; + net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); + if (!net->ct.hash) { + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_hash; + } + ret = nf_conntrack_expect_pernet_init(net); + if (ret < 0) + goto err_expect; + ret = nf_conntrack_acct_pernet_init(net); + if (ret < 0) + goto err_acct; + ret = nf_conntrack_tstamp_pernet_init(net); + if (ret < 0) + goto err_tstamp; + ret = nf_conntrack_ecache_pernet_init(net); + if (ret < 0) + goto err_ecache; + ret = nf_conntrack_helper_pernet_init(net); + if (ret < 0) + goto err_helper; + ret = nf_conntrack_proto_pernet_init(net); + if (ret < 0) + goto err_proto; + return 0; + +err_proto: + nf_conntrack_helper_pernet_fini(net); +err_helper: + nf_conntrack_ecache_pernet_fini(net); +err_ecache: + nf_conntrack_tstamp_pernet_fini(net); +err_tstamp: + nf_conntrack_acct_pernet_fini(net); +err_acct: + nf_conntrack_expect_pernet_fini(net); +err_expect: + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); +err_hash: + kmem_cache_destroy(net->ct.nf_conntrack_cachep); +err_cache: + kfree(net->ct.slabname); +err_slabname: + free_percpu(net->ct.stat); +err_pcpu_lists: + free_percpu(net->ct.pcpu_lists); +err_stat: + return ret; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 6bd421df2db..1df17614656 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -1,8 +1,10 @@ /* Event cache for netfilter. 
*/ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> +/* + * (C) 2005 Harald Welte <laforge@gnumonks.org> + * (C) 2005 Patrick McHardy <kaber@trash.net> + * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,101 +18,244 @@ #include <linux/stddef.h> #include <linux/err.h> #include <linux/percpu.h> -#include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> -ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain); -EXPORT_SYMBOL_GPL(nf_conntrack_chain); - -ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain); -EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain); - -DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); -EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); +static DEFINE_MUTEX(nf_ct_ecache_mutex); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ -static inline void -__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) +void nf_ct_deliver_cached_events(struct nf_conn *ct) { - if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) - && ecache->events) - atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events, - ecache->ct); - - ecache->events = 0; - nf_ct_put(ecache->ct); - ecache->ct = NULL; + struct net *net = nf_ct_net(ct); + unsigned long events, missed; + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + struct nf_ct_event item; + int ret; + + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (notify == NULL) + goto out_unlock; + + e = nf_ct_ecache_find(ct); + if (e == NULL) + goto out_unlock; + + events = xchg(&e->cache, 0); + + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) + goto out_unlock; + + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. */ + missed = e->missed; + + if (!((events | missed) & e->ctmask)) + goto out_unlock; + + item.ct = ct; + item.portid = 0; + item.report = 0; + + ret = notify->fcn(events | missed, &item); + + if (likely(ret >= 0 && !missed)) + goto out_unlock; + + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + +out_unlock: + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); -/* Deliver all cached events for a particular conntrack. 
This is called - * by code prior to async packet handling for freeing the skb */ -void nf_ct_deliver_cached_events(const struct nf_conn *ct) +int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *new) { - struct nf_conntrack_ecache *ecache; + int ret; + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); + ret = 0; - local_bh_disable(); - ecache = &__get_cpu_var(nf_conntrack_ecache); - if (ecache->ct == ct) - __nf_ct_deliver_cached_events(ecache); - local_bh_enable(); +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; } -EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -/* Deliver cached events for old pending events, if current conntrack != old */ -void __nf_ct_event_cache_init(struct nf_conn *ct) +void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *new) { - struct nf_conntrack_ecache *ecache; - - /* take care of delivering potentially old events */ - ecache = &__get_cpu_var(nf_conntrack_ecache); - BUG_ON(ecache->ct == ct); - if (ecache->ct) - __nf_ct_deliver_cached_events(ecache); - /* initialize for this conntrack/packet */ - ecache->ct = ct; - nf_conntrack_get(&ct->ct_general); + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + BUG_ON(notify != new); + RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); } -EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -/* flush the event cache - touches other CPU's data and must not be called - * while packets are still passing through the code */ -void nf_ct_event_cache_flush(void) +int nf_ct_expect_register_notifier(struct net *net, + struct nf_exp_event_notifier *new) { - struct nf_conntrack_ecache *ecache; - int cpu; + int ret; + struct nf_exp_event_notifier *notify; - for_each_possible_cpu(cpu) { - ecache = &per_cpu(nf_conntrack_ecache, cpu); - if (ecache->ct) - nf_ct_put(ecache->ct); + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; } + rcu_assign_pointer(net->ct.nf_expect_event_cb, new); + ret = 0; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; } +EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); -int nf_conntrack_register_notifier(struct notifier_block *nb) +void nf_ct_expect_unregister_notifier(struct net *net, + struct nf_exp_event_notifier *new) { - return atomic_notifier_chain_register(&nf_conntrack_chain, nb); + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + BUG_ON(notify != new); + RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); } -EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); + +#define NF_CT_EVENTS_DEFAULT 1 +static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; +static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; -int 
nf_conntrack_unregister_notifier(struct notifier_block *nb) +#ifdef CONFIG_SYSCTL +static struct ctl_table event_sysctl_table[] = { + { + .procname = "nf_conntrack_events", + .data = &init_net.ct.sysctl_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_events_retry_timeout", + .data = &init_net.ct.sysctl_events_retry_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type event_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_ecache), + .align = __alignof__(struct nf_conntrack_ecache), + .id = NF_CT_EXT_ECACHE, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_event_init_sysctl(struct net *net) { - return atomic_notifier_chain_unregister(&nf_conntrack_chain, nb); + struct ctl_table *table; + + table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_events; + table[1].data = &net->ct.sysctl_events_retry_timeout; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.event_sysctl_header = + register_net_sysctl(net, "net/netfilter", table); + if (!net->ct.event_sysctl_header) { + printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.event_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.event_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +int nf_conntrack_ecache_pernet_init(struct net *net) +{ + net->ct.sysctl_events = nf_ct_events; + net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + return nf_conntrack_event_init_sysctl(net); +} + +void nf_conntrack_ecache_pernet_fini(struct net *net) +{ + nf_conntrack_event_fini_sysctl(net); } -EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_conntrack_expect_register_notifier(struct notifier_block *nb) +int nf_conntrack_ecache_init(void) { - return atomic_notifier_chain_register(&nf_conntrack_expect_chain, nb); + int ret = nf_ct_extend_register(&event_extend); + if (ret < 0) + pr_err("nf_ct_event: Unable to register event extension.\n"); + return ret; } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_register_notifier); -int nf_conntrack_expect_unregister_notifier(struct notifier_block *nb) +void nf_conntrack_ecache_fini(void) { - return atomic_notifier_chain_unregister(&nf_conntrack_expect_chain, nb); + nf_ct_extend_unregister(&event_extend); } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_unregister_notifier); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 117cbfdb910..f87e8f68ad4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (c) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 
as @@ -19,81 +20,130 @@ #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/moduleparam.h> +#include <linux/export.h> +#include <net/net_namespace.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_zones.h> -LIST_HEAD(nf_conntrack_expect_list); -EXPORT_SYMBOL_GPL(nf_conntrack_expect_list); +unsigned int nf_ct_expect_hsize __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); -struct kmem_cache *nf_conntrack_expect_cachep __read_mostly; -static unsigned int nf_conntrack_expect_next_id; +unsigned int nf_ct_expect_max __read_mostly; + +static struct kmem_cache *nf_ct_expect_cachep __read_mostly; /* nf_conntrack_expect helper functions */ -void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); NF_CT_ASSERT(master_help); NF_CT_ASSERT(!timer_pending(&exp->timeout)); - list_del(&exp->list); - NF_CT_STAT_INC(expect_delete); - master_help->expecting--; - nf_conntrack_expect_put(exp); + hlist_del_rcu(&exp->hnode); + net->ct.expect_count--; + + hlist_del(&exp->lnode); + master_help->expecting[exp->class]--; + + nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); + nf_ct_expect_put(exp); + + NF_CT_STAT_INC(net, expect_delete); } -EXPORT_SYMBOL_GPL(nf_ct_unlink_expect); +EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); -static void expectation_timed_out(unsigned long ul_expect) +static void nf_ct_expectation_timed_out(unsigned long ul_expect) { struct nf_conntrack_expect *exp = (void *)ul_expect; - write_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); - write_unlock_bh(&nf_conntrack_lock); - nf_conntrack_expect_put(exp); + spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_put(exp); +} + +static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +{ + unsigned int hash; + + if (unlikely(!nf_conntrack_hash_rnd)) { + init_nf_conntrack_hash_rnd(); + } + + hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), + (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | + (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + return ((u64)hash * nf_ct_expect_hsize) >> 32; } struct nf_conntrack_expect * -__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple) +__nf_ct_expect_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; + unsigned int h; + + if (!net->ct.expect_count) + return NULL; - list_for_each_entry(i, &nf_conntrack_expect_list, list) { - if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) + h = nf_ct_expect_dst_hash(tuple); + hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + nf_ct_zone(i->master) == zone) return i; } return NULL; } -EXPORT_SYMBOL_GPL(__nf_conntrack_expect_find); +EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. 
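[Aside, not part of the patch] nf_ct_expect_dst_hash() above maps the 32-bit jhash2() result onto a bucket with ((u64)hash * hsize) >> 32 rather than hash % hsize: a multiply and shift instead of a division, which still spreads a uniform 32-bit hash evenly over [0, hsize). A small runnable sketch of just that mapping (the table size is arbitrary):

#include <stdint.h>
#include <stdio.h>

/* multiply-shift bucket mapping, as used instead of "hash % size" */
static unsigned int bucket(uint32_t hash, unsigned int size)
{
	return ((uint64_t)hash * size) >> 32;
}

int main(void)
{
	unsigned int size = 256;	/* hypothetical nf_ct_expect_hsize */
	uint32_t samples[] = { 0x0, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu };

	for (unsigned int i = 0; i < 4; i++)
		printf("hash %#010x -> bucket %3u of %u\n",
		       samples[i], bucket(samples[i], size), size);
	return 0;
}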
*/ struct nf_conntrack_expect * -nf_conntrack_expect_find_get(const struct nf_conntrack_tuple *tuple) +nf_ct_expect_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; - read_lock_bh(&nf_conntrack_lock); - i = __nf_conntrack_expect_find(tuple); - if (i) - atomic_inc(&i->use); - read_unlock_bh(&nf_conntrack_lock); + rcu_read_lock(); + i = __nf_ct_expect_find(net, zone, tuple); + if (i && !atomic_inc_not_zero(&i->use)) + i = NULL; + rcu_read_unlock(); return i; } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_find_get); +EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * -find_expectation(const struct nf_conntrack_tuple *tuple) +nf_ct_find_expectation(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_expect *exp; + struct nf_conntrack_expect *i, *exp = NULL; + unsigned int h; + + if (!net->ct.expect_count) + return NULL; - exp = __nf_conntrack_expect_find(tuple); + h = nf_ct_expect_dst_hash(tuple); + hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { + if (!(i->flags & NF_CT_EXPECT_INACTIVE) && + nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + nf_ct_zone(i->master) == zone) { + exp = i; + break; + } + } if (!exp) return NULL; @@ -105,6 +155,18 @@ find_expectation(const struct nf_conntrack_tuple *tuple) if (!nf_ct_is_confirmed(exp->master)) return NULL; + /* Avoid race with other CPUs, that for exp->master ct, is + * about to invoke ->destroy(), or nf_ct_delete() via timeout + * or early_drop(). + * + * The atomic_inc_not_zero() check tells: If that fails, we + * know that the ct is being destroyed. If it succeeds, we + * can be sure the ct cannot disappear underneath. + */ + if (unlikely(nf_ct_is_dying(exp->master) || + !atomic_inc_not_zero(&exp->master->ct_general.use))) + return NULL; + if (exp->flags & NF_CT_EXPECT_PERMANENT) { atomic_inc(&exp->use); return exp; @@ -112,6 +174,8 @@ find_expectation(const struct nf_conntrack_tuple *tuple) nf_ct_unlink_expect(exp); return exp; } + /* Undo exp->master refcnt increase, if del_timer() failed */ + nf_ct_put(exp->master); return NULL; } @@ -119,19 +183,22 @@ find_expectation(const struct nf_conntrack_tuple *tuple) /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { - struct nf_conntrack_expect *i, *tmp; struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *next; /* Optimization: most connection never expect any others. 
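[Aside, not part of the patch] Both nf_ct_expect_find_get() and nf_ct_find_expectation() above lean on the same lockless idiom: under rcu_read_lock(), take a reference only if the refcount is still nonzero (atomic_inc_not_zero()); a zero count means the last reference is already gone and the object is waiting for its RCU-deferred free, so it must not be revived. A userspace reconstruction of that primitive with C11 atomics (the kernel's own implementation differs, this just shows the semantics):

#include <stdatomic.h>
#include <stdbool.h>

/* take a reference only while the object is still alive */
static bool inc_not_zero(atomic_uint *refcnt)
{
	unsigned int old = atomic_load(refcnt);

	while (old != 0) {
		/* CAS retries update 'old' automatically on failure */
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;	/* got a reference */
	}
	return false;			/* object is dying; caller sees NULL */
}

int main(void)
{
	atomic_uint use = 1;

	(void)inc_not_zero(&use);	/* succeeds: count 1 -> 2 */
	atomic_store(&use, 0);		/* object is being destroyed */
	return inc_not_zero(&use);	/* fails, returns 0 */
}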
*/ - if (!help || help->expecting == 0) + if (!help) return; - list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) { - if (i->master == ct && del_timer(&i->timeout)) { - nf_ct_unlink_expect(i); - nf_conntrack_expect_put(i); + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); } } + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); @@ -141,63 +208,48 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, { /* Part covered by intersection of masks must be unequal, otherwise they clash */ - struct nf_conntrack_tuple intersect_mask; + struct nf_conntrack_tuple_mask intersect_mask; int count; - intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; - intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all; - intersect_mask.dst.protonum = a->mask.dst.protonum - & b->mask.dst.protonum; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } - for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ - intersect_mask.dst.u3.all[count] = - a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count]; - } - return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { - return a->master == b->master - && nf_ct_tuple_equal(&a->tuple, &b->tuple) - && nf_ct_tuple_equal(&a->mask, &b->mask); + return a->master == b->master && a->class == b->class && + nf_ct_tuple_equal(&a->tuple, &b->tuple) && + nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + nf_ct_zone(a->master) == nf_ct_zone(b->master); } /* Generally a bad idea to call this: could have matched already. */ -void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp) +void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { - struct nf_conntrack_expect *i; - - write_lock_bh(&nf_conntrack_lock); - /* choose the oldest expectation to evict */ - list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { - if (expect_matches(i, exp) && del_timer(&i->timeout)) { - nf_ct_unlink_expect(i); - write_unlock_bh(&nf_conntrack_lock); - nf_conntrack_expect_put(i); - return; - } + spin_lock_bh(&nf_conntrack_expect_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); } - write_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } -EXPORT_SYMBOL_GPL(nf_conntrack_unexpect_related); +EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. 
During the conntrack destruction, the expectations are * always killed before the conntrack itself */ -struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me) +struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; - new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC); + new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; @@ -205,12 +257,13 @@ struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me) atomic_set(&new->use, 1); return new; } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_alloc); +EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); -void nf_conntrack_expect_init(struct nf_conntrack_expect *exp, int family, - union nf_conntrack_address *saddr, - union nf_conntrack_address *daddr, - u_int8_t proto, __be16 *src, __be16 *dst) +void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, + u_int8_t family, + const union nf_inet_addr *saddr, + const union nf_inet_addr *daddr, + u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; @@ -220,12 +273,11 @@ void nf_conntrack_expect_init(struct nf_conntrack_expect *exp, int family, len = 16; exp->flags = 0; + exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; - exp->mask.src.l3num = 0xFFFF; - exp->mask.dst.protonum = 0xFF; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); @@ -242,107 +294,115 @@ void nf_conntrack_expect_init(struct nf_conntrack_expect *exp, int family, memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } - if (daddr) { - memcpy(&exp->tuple.dst.u3, daddr, len); - if (sizeof(exp->tuple.dst.u3) > len) - /* address needs to be cleared for nf_ct_tuple_equal */ - memset((void *)&exp->tuple.dst.u3 + len, 0x00, - sizeof(exp->tuple.dst.u3) - len); - memset(&exp->mask.dst.u3, 0xFF, len); - if (sizeof(exp->mask.dst.u3) > len) - memset((void *)&exp->mask.dst.u3 + len, 0x00, - sizeof(exp->mask.dst.u3) - len); - } else { - memset(&exp->tuple.dst.u3, 0x00, sizeof(exp->tuple.dst.u3)); - memset(&exp->mask.dst.u3, 0x00, sizeof(exp->mask.dst.u3)); - } - if (src) { - exp->tuple.src.u.all = (__force u16)*src; - exp->mask.src.u.all = 0xFFFF; + exp->tuple.src.u.all = *src; + exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } - if (dst) { - exp->tuple.dst.u.all = (__force u16)*dst; - exp->mask.dst.u.all = 0xFFFF; - } else { - exp->tuple.dst.u.all = 0; - exp->mask.dst.u.all = 0; - } + memcpy(&exp->tuple.dst.u3, daddr, len); + if (sizeof(exp->tuple.dst.u3) > len) + /* address needs to be cleared for nf_ct_tuple_equal */ + memset((void *)&exp->tuple.dst.u3 + len, 0x00, + sizeof(exp->tuple.dst.u3) - len); + + exp->tuple.dst.u.all = *dst; + +#ifdef CONFIG_NF_NAT_NEEDED + memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); + memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); +#endif } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_init); +EXPORT_SYMBOL_GPL(nf_ct_expect_init); -void nf_conntrack_expect_put(struct nf_conntrack_expect *exp) +static void nf_ct_expect_free_rcu(struct rcu_head *head) +{ + struct nf_conntrack_expect *exp; + + exp = container_of(head, struct nf_conntrack_expect, rcu); + kmem_cache_free(nf_ct_expect_cachep, exp); +} + +void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (atomic_dec_and_test(&exp->use)) - kmem_cache_free(nf_conntrack_expect_cachep, exp); + call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_put); 
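[Aside, not part of the patch] nf_ct_expect_put() above no longer frees directly; the last put hands the embedded rcu_head to call_rcu(), and the callback recovers the surrounding expectation with container_of() before kmem_cache_free(). A plain-C sketch of that pointer arithmetic (the struct here is a stand-in; in the kernel the callback runs only after an RCU grace period):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head { struct rcu_head *next; void (*func)(struct rcu_head *); };

struct expect {
	int use;		/* refcount, already dropped to zero */
	struct rcu_head rcu;	/* embedded callback handle */
};

static void expect_free(struct rcu_head *head)
{
	/* step back from the embedded member to the containing object */
	struct expect *exp = container_of(head, struct expect, rcu);
	printf("freeing expect at %p (use=%d)\n", (void *)exp, exp->use);
}

int main(void)
{
	struct expect e = { .use = 0 };
	expect_free(&e.rcu);	/* what the deferred RCU callback would do */
	return 0;
}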
+EXPORT_SYMBOL_GPL(nf_ct_expect_put); -static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) +static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); - - atomic_inc(&exp->use); - master_help->expecting++; - list_add(&exp->list, &nf_conntrack_expect_list); - - setup_timer(&exp->timeout, expectation_timed_out, (unsigned long)exp); - exp->timeout.expires = jiffies + master_help->helper->timeout * HZ; + struct nf_conntrack_helper *helper; + struct net *net = nf_ct_exp_net(exp); + unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + + /* two references : one for hash insert, one for the timer */ + atomic_add(2, &exp->use); + + hlist_add_head(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + + hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + net->ct.expect_count++; + + setup_timer(&exp->timeout, nf_ct_expectation_timed_out, + (unsigned long)exp); + helper = rcu_dereference_protected(master_help->helper, + lockdep_is_held(&nf_conntrack_expect_lock)); + if (helper) { + exp->timeout.expires = jiffies + + helper->expect_policy[exp->class].timeout * HZ; + } add_timer(&exp->timeout); - exp->id = ++nf_conntrack_expect_next_id; - atomic_inc(&exp->use); - NF_CT_STAT_INC(expect_create); + NF_CT_STAT_INC(net, expect_create); + return 0; } /* Race with expectations being used means we could have none to find; OK. */ -static void evict_oldest_expect(struct nf_conn *master) +static void evict_oldest_expect(struct nf_conn *master, + struct nf_conntrack_expect *new) { - struct nf_conntrack_expect *i; + struct nf_conn_help *master_help = nfct_help(master); + struct nf_conntrack_expect *exp, *last = NULL; - list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { - if (i->master == master) { - if (del_timer(&i->timeout)) { - nf_ct_unlink_expect(i); - nf_conntrack_expect_put(i); - } - break; - } + hlist_for_each_entry(exp, &master_help->expectations, lnode) { + if (exp->class == new->class) + last = exp; } -} - -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ - struct nf_conn_help *master_help = nfct_help(i->master); - - if (!del_timer(&i->timeout)) - return 0; - i->timeout.expires = jiffies + master_help->helper->timeout*HZ; - add_timer(&i->timeout); - return 1; + if (last && del_timer(&last->timeout)) { + nf_ct_unlink_expect(last); + nf_ct_expect_put(last); + } } -int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) +static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) { + const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); - int ret; - - NF_CT_ASSERT(master_help); - - write_lock_bh(&nf_conntrack_lock); - list_for_each_entry(i, &nf_conntrack_expect_list, list) { + struct nf_conntrack_helper *helper; + struct net *net = nf_ct_exp_net(expect); + struct hlist_node *next; + unsigned int h; + int ret = 1; + + if (!master_help) { + ret = -ESHUTDOWN; + goto out; + } + h = nf_ct_expect_dst_hash(&expect->tuple); + hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. 
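[Aside, not part of the patch] Note why evict_oldest_expect() above keeps the *last* matching entry: expectations are inserted with hlist_add_head(), so the per-helper list is newest-first and a full walk necessarily ends on the oldest entry of that class. A tiny runnable model of that ordering:

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

/* prepend, like hlist_add_head() */
static struct node *add_head(struct node *head, int id)
{
	struct node *n = malloc(sizeof(*n));
	n->id = id;
	n->next = head;
	return n;
}

int main(void)
{
	struct node *head = NULL, *last = NULL;

	for (int id = 1; id <= 3; id++)
		head = add_head(head, id);	/* list is now 3 -> 2 -> 1 */

	for (struct node *n = head; n; n = n->next)
		last = n;			/* walk to the tail */

	printf("evict candidate: id %d (the oldest)\n", last->id); /* prints 1 */
	return 0;
}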
*/ - if (refresh_timer(i)) { - ret = 0; - goto out; + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_ct_expect_put(i); + break; } } else if (expect_clash(i, expect)) { ret = -EBUSY; @@ -350,61 +410,123 @@ int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) } } /* Will be over limit? */ - if (master_help->helper->max_expected && - master_help->expecting >= master_help->helper->max_expected) - evict_oldest_expect(master); + helper = rcu_dereference_protected(master_help->helper, + lockdep_is_held(&nf_conntrack_expect_lock)); + if (helper) { + p = &helper->expect_policy[expect->class]; + if (p->max_expected && + master_help->expecting[expect->class] >= p->max_expected) { + evict_oldest_expect(master, expect); + if (master_help->expecting[expect->class] + >= p->max_expected) { + ret = -EMFILE; + goto out; + } + } + } - nf_conntrack_expect_insert(expect); - nf_conntrack_expect_event(IPEXP_NEW, expect); - ret = 0; + if (net->ct.expect_count >= nf_ct_expect_max) { + net_warn_ratelimited("nf_conntrack: expectation table full\n"); + ret = -EMFILE; + } out: - write_unlock_bh(&nf_conntrack_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conntrack_expect_related); -#ifdef CONFIG_PROC_FS -static void *exp_seq_start(struct seq_file *s, loff_t *pos) +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + u32 portid, int report) { - struct list_head *e = &nf_conntrack_expect_list; - loff_t i; + int ret; - /* strange seq_file api calls stop even if we fail, - * thus we need to grab lock since stop unlocks */ - read_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); + ret = __nf_ct_expect_check(expect); + if (ret <= 0) + goto out; - if (list_empty(e)) - return NULL; + ret = nf_ct_expect_insert(expect); + if (ret < 0) + goto out; + spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); + return ret; +out: + spin_unlock_bh(&nf_conntrack_expect_lock); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); + +#ifdef CONFIG_NF_CONNTRACK_PROCFS +struct ct_expect_iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *ct_expect_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + struct hlist_node *n; + + for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { + n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + if (n) + return n; + } + return NULL; +} + +static struct hlist_node *ct_expect_get_next(struct seq_file *seq, + struct hlist_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; - for (i = 0; i <= *pos; i++) { - e = e->next; - if (e == &nf_conntrack_expect_list) + head = rcu_dereference(hlist_next_rcu(head)); + while (head == NULL) { + if (++st->bucket >= nf_ct_expect_hsize) return NULL; + head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } - return e; + return head; } -static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { - struct list_head *e = v; + struct hlist_node *head = ct_expect_get_first(seq); - ++*pos; - e = e->next; + if (head) + while (pos && (head = ct_expect_get_next(seq, head))) + pos--; + return pos ? 
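[Aside, not part of the patch] The /proc iterator above (ct_expect_get_first/ct_expect_get_next) is a two-level cursor: advance within the current bucket's list, and when it runs out, scan forward to the next non-empty bucket. A self-contained sketch of the same shape, with plain arrays standing in for the RCU hash table:

#include <stdio.h>

#define HSIZE 4

struct node { const char *name; struct node *next; };

static struct node *first(struct node *hash[], unsigned int *bucket)
{
	for (*bucket = 0; *bucket < HSIZE; (*bucket)++)
		if (hash[*bucket])
			return hash[*bucket];
	return NULL;
}

static struct node *next(struct node *hash[], unsigned int *bucket,
			 struct node *cur)
{
	cur = cur->next;
	while (!cur) {			/* bucket exhausted: find the next one */
		if (++(*bucket) >= HSIZE)
			return NULL;
		cur = hash[*bucket];
	}
	return cur;
}

int main(void)
{
	struct node b3 = { "exp-c", NULL }, b1b = { "exp-b", NULL };
	struct node b1a = { "exp-a", &b1b };
	struct node *hash[HSIZE] = { NULL, &b1a, NULL, &b3 };
	unsigned int bucket;

	for (struct node *n = first(hash, &bucket); n; n = next(hash, &bucket, n))
		printf("bucket %u: %s\n", bucket, n->name);
	return 0;
}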
NULL : head; +} - if (e == &nf_conntrack_expect_list) - return NULL; +static void *exp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ct_expect_get_idx(seq, *pos); +} - return e; +static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return ct_expect_get_next(seq, v); } -static void exp_seq_stop(struct seq_file *s, void *v) +static void exp_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - read_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { - struct nf_conntrack_expect *expect = v; + struct nf_conntrack_expect *expect; + struct nf_conntrack_helper *helper; + struct hlist_node *n = v; + char *delim = ""; + + expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) @@ -418,10 +540,30 @@ static int exp_seq_show(struct seq_file *s, void *v) __nf_ct_l3proto_find(expect->tuple.src.l3num), __nf_ct_l4proto_find(expect->tuple.src.l3num, expect->tuple.dst.protonum)); + + if (expect->flags & NF_CT_EXPECT_PERMANENT) { + seq_printf(s, "PERMANENT"); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_INACTIVE) { + seq_printf(s, "%sINACTIVE", delim); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_USERSPACE) + seq_printf(s, "%sUSERSPACE", delim); + + helper = rcu_dereference(nfct_help(expect->master)->helper); + if (helper) { + seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); + if (helper->expect_policy[expect->class].name) + seq_printf(s, "/%s", + helper->expect_policy[expect->class].name); + } + return seq_putc(s, '\n'); } -static struct seq_operations exp_seq_ops = { +static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, @@ -430,14 +572,85 @@ static struct seq_operations exp_seq_ops = { static int exp_open(struct inode *inode, struct file *file) { - return seq_open(file, &exp_seq_ops); + return seq_open_net(inode, file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); } -const struct file_operations exp_file_ops = { +static const struct file_operations exp_file_ops = { .owner = THIS_MODULE, .open = exp_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release + .release = seq_release_net, }; -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ + +static int exp_proc_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_PROCFS + struct proc_dir_entry *proc; + + proc = proc_create("nf_conntrack_expect", 0440, net->proc_net, + &exp_file_ops); + if (!proc) + return -ENOMEM; +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ + return 0; +} + +static void exp_proc_remove(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_PROCFS + remove_proc_entry("nf_conntrack_expect", net->proc_net); +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ +} + +module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); + +int nf_conntrack_expect_pernet_init(struct net *net) +{ + int err = -ENOMEM; + + net->ct.expect_count = 0; + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (net->ct.expect_hash == NULL) + goto err1; + + err = exp_proc_init(net); + if (err < 0) + goto err2; + + return 0; +err2: + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +err1: + return err; +} + +void nf_conntrack_expect_pernet_fini(struct net *net) +{ + exp_proc_remove(net); + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +} + +int 
nf_conntrack_expect_init(void) +{ + if (!nf_ct_expect_hsize) { + nf_ct_expect_hsize = nf_conntrack_htable_size / 256; + if (!nf_ct_expect_hsize) + nf_ct_expect_hsize = 1; + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL); + if (!nf_ct_expect_cachep) + return -ENOMEM; + return 0; +} + +void nf_conntrack_expect_fini(void) +{ + rcu_barrier(); /* Wait for call_rcu() before destroy */ + kmem_cache_destroy(nf_ct_expect_cachep); +} diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c new file mode 100644 index 00000000000..1a9545965c0 --- /dev/null +++ b/net/netfilter/nf_conntrack_extend.c @@ -0,0 +1,191 @@ +/* Structure dynamic extension infrastructure + * Copyright (C) 2004 Rusty Russell IBM Corporation + * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/rcupdate.h> +#include <linux/slab.h> +#include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack_extend.h> + +static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM]; +static DEFINE_MUTEX(nf_ct_ext_type_mutex); + +void __nf_ct_ext_destroy(struct nf_conn *ct) +{ + unsigned int i; + struct nf_ct_ext_type *t; + struct nf_ct_ext *ext = ct->ext; + + for (i = 0; i < NF_CT_EXT_NUM; i++) { + if (!__nf_ct_ext_exist(ext, i)) + continue; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[i]); + + /* Here the nf_ct_ext_type might have been unregisterd. + * I.e., it has responsible to cleanup private + * area in all conntracks when it is unregisterd. + */ + if (t && t->destroy) + t->destroy(ct); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(__nf_ct_ext_destroy); + +static void * +nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, + size_t var_alloc_len, gfp_t gfp) +{ + unsigned int off, len; + struct nf_ct_ext_type *t; + size_t alloc_size; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[id]); + BUG_ON(t == NULL); + off = ALIGN(sizeof(struct nf_ct_ext), t->align); + len = off + t->len + var_alloc_len; + alloc_size = t->alloc_size + var_alloc_len; + rcu_read_unlock(); + + *ext = kzalloc(alloc_size, gfp); + if (!*ext) + return NULL; + + (*ext)->offset[id] = off; + (*ext)->len = len; + + return (void *)(*ext) + off; +} + +void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, + size_t var_alloc_len, gfp_t gfp) +{ + struct nf_ct_ext *old, *new; + int i, newlen, newoff; + struct nf_ct_ext_type *t; + + /* Conntrack must not be confirmed to avoid races on reallocation. 
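[Aside, not part of the patch] The sizing rule in nf_conntrack_expect_init() above derives the expectation hash from the main conntrack table: expect_hsize defaults to htable_size / 256 (floored at 1), and the global cap expect_max is four times that. A quick illustration for a few plausible conntrack table sizes (the real value depends on system memory):

#include <stdio.h>

int main(void)
{
	unsigned int sizes[] = { 128, 16384, 65536 };

	for (int i = 0; i < 3; i++) {
		unsigned int hsize = sizes[i] / 256;
		if (!hsize)
			hsize = 1;	/* never let the table collapse to 0 */
		printf("htable %6u -> expect_hsize %3u, expect_max %4u\n",
		       sizes[i], hsize, hsize * 4);
	}
	return 0;
}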
*/ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + + old = ct->ext; + if (!old) + return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp); + + if (__nf_ct_ext_exist(old, id)) + return NULL; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[id]); + BUG_ON(t == NULL); + + newoff = ALIGN(old->len, t->align); + newlen = newoff + t->len + var_alloc_len; + rcu_read_unlock(); + + new = __krealloc(old, newlen, gfp); + if (!new) + return NULL; + + if (new != old) { + for (i = 0; i < NF_CT_EXT_NUM; i++) { + if (!__nf_ct_ext_exist(old, i)) + continue; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[i]); + if (t && t->move) + t->move((void *)new + new->offset[i], + (void *)old + old->offset[i]); + rcu_read_unlock(); + } + kfree_rcu(old, rcu); + ct->ext = new; + } + + new->offset[id] = newoff; + new->len = newlen; + memset((void *)new + newoff, 0, newlen - newoff); + return (void *)new + newoff; +} +EXPORT_SYMBOL(__nf_ct_ext_add_length); + +static void update_alloc_size(struct nf_ct_ext_type *type) +{ + int i, j; + struct nf_ct_ext_type *t1, *t2; + enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1; + + /* unnecessary to update all types */ + if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) { + min = type->id; + max = type->id; + } + + /* This assumes that extended areas in conntrack for the types + whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */ + for (i = min; i <= max; i++) { + t1 = rcu_dereference_protected(nf_ct_ext_types[i], + lockdep_is_held(&nf_ct_ext_type_mutex)); + if (!t1) + continue; + + t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + + t1->len; + for (j = 0; j < NF_CT_EXT_NUM; j++) { + t2 = rcu_dereference_protected(nf_ct_ext_types[j], + lockdep_is_held(&nf_ct_ext_type_mutex)); + if (t2 == NULL || t2 == t1 || + (t2->flags & NF_CT_EXT_F_PREALLOC) == 0) + continue; + + t1->alloc_size = ALIGN(t1->alloc_size, t2->align) + + t2->len; + } + } +} + +/* This MUST be called in process context. */ +int nf_ct_extend_register(struct nf_ct_ext_type *type) +{ + int ret = 0; + + mutex_lock(&nf_ct_ext_type_mutex); + if (nf_ct_ext_types[type->id]) { + ret = -EBUSY; + goto out; + } + + /* This ensures that nf_ct_ext_create() can allocate enough area + before updating alloc_size */ + type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align) + + type->len; + rcu_assign_pointer(nf_ct_ext_types[type->id], type); + update_alloc_size(type); +out: + mutex_unlock(&nf_ct_ext_type_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_extend_register); + +/* This MUST be called in process context. 
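[Aside, not part of the patch] The extension area grown by __nf_ct_ext_add_length() above is laid out by the same arithmetic every time: a new extension starts at the old total length aligned up to its own alignment, and the total grows to offset + length. A sketch of that ALIGN() bookkeeping; the extension names, sizes and alignments below are made up for illustration:

#include <stdio.h>

/* power-of-two align-up, mirroring the kernel's ALIGN() */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int len = 16;		/* say, sizeof(struct nf_ct_ext) */
	struct { const char *name; unsigned int size, align; } exts[] = {
		{ "helper",  24, 8 },	/* hypothetical extensions */
		{ "ecache",  12, 4 },
		{ "counter", 32, 8 },
	};

	for (int i = 0; i < 3; i++) {
		unsigned int off = ALIGN_UP(len, exts[i].align);
		printf("%-8s offset %2u, len %2u -> total %2u\n",
		       exts[i].name, off, exts[i].size, off + exts[i].size);
		len = off + exts[i].size;
	}
	return 0;
}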
*/ +void nf_ct_extend_unregister(struct nf_ct_ext_type *type) +{ + mutex_lock(&nf_ct_ext_type_mutex); + RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL); + update_alloc_size(type); + mutex_unlock(&nf_ct_ext_type_mutex); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} +EXPORT_SYMBOL_GPL(nf_ct_extend_unregister); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index a186799f654..b8a0924064e 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,6 +14,7 @@ #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/ip.h> +#include <linux/slab.h> #include <linux/ipv6.h> #include <linux/ctype.h> #include <linux/inet.h> @@ -29,6 +31,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp connection tracking helper"); MODULE_ALIAS("ip_conntrack_ftp"); +MODULE_ALIAS_NFCT_HELPER("ftp"); /* This is slow, but it's simple. --RR */ static char *ftp_buffer; @@ -40,28 +43,26 @@ static u_int16_t ports[MAX_PORTS]; static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); -static int loose; +static bool loose; module_param(loose, bool, 0600); -unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb, +unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, enum nf_ct_ftp_type type, + unsigned int protoff, unsigned int matchoff, unsigned int matchlen, - struct nf_conntrack_expect *exp, - u32 *seq); + struct nf_conntrack_expect *exp); EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); -static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); +static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, - char); + char, unsigned int *); static struct ftp_search { const char *pattern; @@ -69,7 +70,7 @@ static struct ftp_search { char skip; char term; enum nf_ct_ftp_type ftptype; - int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); } search[IP_CT_DIR_MAX][2] = { [IP_CT_DIR_ORIGINAL] = { { @@ -93,10 +94,8 @@ static struct ftp_search { { .pattern = "227 ", .plen = sizeof("227 ") - 1, - .skip = '(', - .term = ')', .ftptype = NF_CT_FTP_PASV, - .getnum = try_rfc959, + .getnum = try_rfc1123, }, { .pattern = "229 ", @@ -135,23 +134,25 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[], i++; else { /* Unexpected character; true if it's the - terminator and we're finished. */ - if (*data == term && i == array_size - 1) + terminator (or we don't care about one) + and we're finished. 
*/ + if ((*data == term || !term) && i == array_size - 1) return len; - DEBUGP("Char %u (got %u nums) `%u' unexpected\n", - len, i, *data); + pr_debug("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); return 0; } } - DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); - + pr_debug("Failed to fill %u numbers separated by %c\n", + array_size, sep); return 0; } /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ static int try_rfc959(const char *data, size_t dlen, - struct nf_conntrack_man *cmd, char term) + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) { int length; u_int32_t array[6]; @@ -166,6 +167,33 @@ static int try_rfc959(const char *data, size_t dlen, return length; } +/* + * From RFC 1123: + * The format of the 227 reply to a PASV command is not + * well standardized. In particular, an FTP client cannot + * assume that the parentheses shown on page 40 of RFC-959 + * will be present (and in fact, Figure 3 on page 43 omits + * them). Therefore, a User-FTP program that interprets + * the PASV reply must scan the reply for the first digit + * of the host and port numbers. + */ +static int try_rfc1123(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) +{ + int i; + for (i = 0; i < dlen; i++) + if (isdigit(data[i])) + break; + + if (i == dlen) + return 0; + + *offset += i; + + return try_rfc959(data + i, dlen - i, cmd, 0, offset); +} + /* Grab port: number up to delimiter */ static int get_port(const char *data, int start, size_t dlen, char delim, __be16 *port) @@ -179,13 +207,13 @@ static int get_port(const char *data, int start, size_t dlen, char delim, if (tmp_port == 0) break; *port = htons(tmp_port); - DEBUGP("get_port: return %d\n", tmp_port); + pr_debug("get_port: return %d\n", tmp_port); return i + 1; } else if (data[i] >= '0' && data[i] <= '9') tmp_port = tmp_port*10 + data[i] - '0'; else { /* Some other crap */ - DEBUGP("get_port: invalid char.\n"); + pr_debug("get_port: invalid char.\n"); break; } } @@ -194,7 +222,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim, /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, - char term) + char term, unsigned int *offset) { char delim; int length; @@ -202,22 +230,22 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, then delimiter again. 
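[Aside, not part of the patch] The try_rfc1123() hunk above implements exactly what the quoted RFC text prescribes: don't trust the parentheses in a 227 reply, scan forward to the first digit, then read six comma-separated numbers (four address octets plus the port split into high and low bytes). A standalone sketch of that parse, with sscanf() standing in for the kernel's try_number():

#include <ctype.h>
#include <stdio.h>

int main(void)
{
	/* works with or without the optional parentheses */
	const char *reply = "227 Entering Passive Mode (192,168,1,10,197,143)";
	const char *p = reply + 4;	/* find_pattern() already matched "227 " */
	unsigned int n[6];

	while (*p && !isdigit((unsigned char)*p))
		p++;			/* the RFC 1123 "scan for first digit" */

	if (sscanf(p, "%u,%u,%u,%u,%u,%u",
		   &n[0], &n[1], &n[2], &n[3], &n[4], &n[5]) == 6)
		printf("data connection: %u.%u.%u.%u port %u\n",
		       n[0], n[1], n[2], n[3], n[4] * 256 + n[5]);
	return 0;
}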
*/ if (dlen <= 3) { - DEBUGP("EPRT: too short\n"); + pr_debug("EPRT: too short\n"); return 0; } delim = data[0]; if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { - DEBUGP("try_eprt: invalid delimitter.\n"); + pr_debug("try_eprt: invalid delimitter.\n"); return 0; } if ((cmd->l3num == PF_INET && data[1] != '1') || (cmd->l3num == PF_INET6 && data[1] != '2')) { - DEBUGP("EPRT: invalid protocol number.\n"); + pr_debug("EPRT: invalid protocol number.\n"); return 0; } - DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim); + pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim); if (data[1] == '1') { u_int32_t array[4]; @@ -235,22 +263,23 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, if (length == 0) return 0; - DEBUGP("EPRT: Got IP address!\n"); + pr_debug("EPRT: Got IP address!\n"); /* Start offset includes initial "|1|", and trailing delimiter */ return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); } /* Returns 0, or length of numbers: |||6446| */ static int try_epsv_response(const char *data, size_t dlen, - struct nf_conntrack_man *cmd, char term) + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) { char delim; /* Three delimiters. */ if (dlen <= 3) return 0; delim = data[0]; - if (isdigit(delim) || delim < 33 || delim > 126 - || data[1] != delim || data[2] != delim) + if (isdigit(delim) || delim < 33 || delim > 126 || + data[1] != delim || data[2] != delim) return 0; return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); @@ -264,11 +293,12 @@ static int find_pattern(const char *data, size_t dlen, unsigned int *numlen, struct nf_conntrack_man *cmd, int (*getnum)(const char *, size_t, - struct nf_conntrack_man *, char)) + struct nf_conntrack_man *, char, + unsigned int *)) { - size_t i; + size_t i = plen; - DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); if (dlen == 0) return 0; @@ -283,33 +313,35 @@ static int find_pattern(const char *data, size_t dlen, #if 0 size_t i; - DEBUGP("ftp: string mismatch\n"); + pr_debug("ftp: string mismatch\n"); for (i = 0; i < plen; i++) { - DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", - i, data[i], data[i], - pattern[i], pattern[i]); + pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); } #endif return 0; } - DEBUGP("Pattern matches!\n"); + pr_debug("Pattern matches!\n"); /* Now we've found the constant string, try to skip to the 'skip' character */ - for (i = plen; data[i] != skip; i++) - if (i == dlen - 1) return -1; + if (skip) { + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; - /* Skip over the last character */ - i++; + /* Skip over the last character */ + i++; + } - DEBUGP("Skipped up to `%c'!\n", skip); + pr_debug("Skipped up to `%c'!\n", skip); *numoff = i; - *numlen = getnum(data + i, dlen - i, cmd, term); + *numlen = getnum(data + i, dlen - i, cmd, term, numoff); if (!*numlen) return -1; - DEBUGP("Match succeeded!\n"); + pr_debug("Match succeeded!\n"); return 1; } @@ -325,71 +357,74 @@ static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir) } /* We don't update if it's older than what we have. 
*/ -static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir, +static void update_nl_seq(struct nf_conn *ct, u32 nl_seq, + struct nf_ct_ftp_master *info, int dir, struct sk_buff *skb) { - unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + unsigned int i, oldest; /* Look for oldest: if we find exact match, we're done. */ for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { if (info->seq_aft_nl[dir][i] == nl_seq) return; - - if (oldest == info->seq_aft_nl_num[dir] - || before(info->seq_aft_nl[dir][i], oldest)) - oldest = i; } if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); - } else if (oldest != NUM_SEQ_TO_REMEMBER) { - info->seq_aft_nl[dir][oldest] = nl_seq; - nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else { + if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1])) + oldest = 0; + else + oldest = 1; + + if (after(nl_seq, info->seq_aft_nl[dir][oldest])) + info->seq_aft_nl[dir][oldest] = nl_seq; } } -static int help(struct sk_buff **pskb, +static int help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { unsigned int dataoff, datalen; - struct tcphdr _tcph, *th; - char *fb_ptr; + const struct tcphdr *th; + struct tcphdr _tcph; + const char *fb_ptr; int ret; u32 seq; int dir = CTINFO2DIR(ctinfo); - unsigned int matchlen, matchoff; - struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info; + unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); + struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; + union nf_inet_addr *daddr; struct nf_conntrack_man cmd = {}; unsigned int i; int found = 0, ends_in_nl; typeof(nf_nat_ftp_hook) nf_nat_ftp; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) { + pr_debug("ftp: Conntrackinfo = %u\n", ctinfo); return NF_ACCEPT; } - th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) return NF_ACCEPT; dataoff = protoff + th->doff * 4; /* No data? */ - if (dataoff >= (*pskb)->len) { - DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, - (*pskb)->len); + if (dataoff >= skb->len) { + pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, + skb->len); return NF_ACCEPT; } - datalen = (*pskb)->len - dataoff; + datalen = skb->len - dataoff; spin_lock_bh(&nf_ftp_lock); - fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer); + fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); BUG_ON(fb_ptr == NULL); ends_in_nl = (fb_ptr[datalen - 1] == '\n'); @@ -397,19 +432,26 @@ static int help(struct sk_buff **pskb, /* Look up to see if we're just after a \n. */ if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* We're picking up this, clear flags and let it continue */ + if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) { + ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP; + goto skip_nl_seq; + } + /* Now if this ends in \n, update ftp info. */ - DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", - ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", - ct_ftp_info->seq_aft_nl[dir][0], - ct_ftp_info->seq_aft_nl_num[dir] > 1 ? 
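[Aside, not part of the patch] The rewritten update_nl_seq() above keeps only two line-start sequence numbers per direction; once both slots are in use, a newer value overwrites whichever stored value is older. A sketch of that replacement policy, with plain integer comparisons standing in for the kernel's wraparound-safe before()/after():

#include <stdio.h>

#define NUM_SEQ_TO_REMEMBER 2

static void update(unsigned int seq[NUM_SEQ_TO_REMEMBER], unsigned int *num,
		   unsigned int nl_seq)
{
	for (unsigned int i = 0; i < *num; i++)
		if (seq[i] == nl_seq)
			return;			/* already known */

	if (*num < NUM_SEQ_TO_REMEMBER) {
		seq[(*num)++] = nl_seq;		/* free slot */
	} else {
		unsigned int oldest = seq[0] < seq[1] ? 0 : 1;
		if (nl_seq > seq[oldest])
			seq[oldest] = nl_seq;	/* evict the older entry */
	}
}

int main(void)
{
	unsigned int seq[NUM_SEQ_TO_REMEMBER], num = 0;

	update(seq, &num, 1000);
	update(seq, &num, 2000);
	update(seq, &num, 3000);		/* evicts 1000 */
	printf("remembered: %u %u\n", seq[0], seq[1]);	/* 3000 2000 */
	return 0;
}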
"" : "(UNSET)", - ct_ftp_info->seq_aft_nl[dir][1]); + pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][0], + ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][1]); ret = NF_ACCEPT; goto out_update_nl; } +skip_nl_seq: /* Initialize IP/IPv6 addr to expected address (it's not mentioned in EPSV responses) */ - cmd.l3num = ct->tuplehash[dir].tuple.src.l3num; + cmd.l3num = nf_ct_l3num(ct); memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all)); @@ -429,10 +471,8 @@ static int help(struct sk_buff **pskb, connection tracking, not packet filtering. However, it is necessary for accurate tracking in this case. */ - if (net_ratelimit()) - printk("conntrack_ftp: partial %s %u+%u\n", - search[dir][i].pattern, - ntohl(th->seq), datalen); + nf_ct_helper_log(skb, ct, "partial matching of `%s'", + search[dir][i].pattern); ret = NF_DROP; goto out; } else if (found == 0) { /* No match */ @@ -440,12 +480,13 @@ static int help(struct sk_buff **pskb, goto out_update_nl; } - DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", - (int)matchlen, fb_ptr + matchoff, - matchlen, ntohl(th->seq) + matchoff); + pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + matchlen, fb_ptr + matchoff, + matchlen, ntohl(th->seq) + matchoff); - exp = nf_conntrack_expect_alloc(ct); + exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -453,10 +494,10 @@ static int help(struct sk_buff **pskb, /* We refer to the reverse direction ("!dir") tuples here, * because we're expecting something in the other direction. * Doesn't matter unless NAT is happening. */ - exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3; + daddr = &ct->tuplehash[!dir].tuple.dst.u3; /* Update the ftp info */ - if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) && + if ((cmd.l3num == nf_ct_l3num(ct)) && memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all))) { /* Enrico Scholz's passive FTP to partially RNAT'd ftp @@ -464,14 +505,13 @@ static int help(struct sk_buff **pskb, different IP address. Simply don't record it for NAT. 
*/ if (cmd.l3num == PF_INET) { - DEBUGP("conntrack_ftp: NOT RECORDING: " NIPQUAD_FMT " != " NIPQUAD_FMT "\n", - NIPQUAD(cmd.u3.ip), - NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip)); + pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n", + &cmd.u3.ip, + &ct->tuplehash[dir].tuple.src.u3.ip); } else { - DEBUGP("conntrack_ftp: NOT RECORDING: " NIP6_FMT " != " NIP6_FMT "\n", - NIP6(*((struct in6_addr *)cmd.u3.ip6)), - NIP6(*((struct in6_addr *)ct->tuplehash[dir] - .tuple.src.u3.ip6))); + pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n", + cmd.u3.ip6, + ct->tuplehash[dir].tuple.src.u3.ip6); } /* Thanks to Cristiano Lincoln Mattos @@ -482,67 +522,60 @@ static int help(struct sk_buff **pskb, ret = NF_ACCEPT; goto out_put_expect; } - memcpy(&exp->tuple.dst.u3, &cmd.u3.all, - sizeof(exp->tuple.dst.u3)); + daddr = &cmd.u3; } - exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3; - exp->tuple.src.l3num = cmd.l3num; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.u.tcp.port = cmd.u.tcp.port; - exp->tuple.dst.protonum = IPPROTO_TCP; - - exp->mask = (struct nf_conntrack_tuple) - { .src = { .l3num = 0xFFFF, - .u = { .tcp = { 0 }}, - }, - .dst = { .protonum = 0xFF, - .u = { .tcp = { __constant_htons(0xFFFF) }}, - }, - }; - if (cmd.l3num == PF_INET) { - exp->mask.src.u3.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u3.ip = htonl(0xFFFFFFFF); - } else { - memset(exp->mask.src.u3.ip6, 0xFF, - sizeof(exp->mask.src.u3.ip6)); - memset(exp->mask.dst.u3.ip6, 0xFF, - sizeof(exp->mask.src.u3.ip6)); - } - - exp->expectfn = NULL; - exp->helper = NULL; - exp->flags = 0; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num, + &ct->tuplehash[!dir].tuple.src.u3, daddr, + IPPROTO_TCP, NULL, &cmd.u.tcp.port); /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook); if (nf_nat_ftp && ct->status & IPS_NAT_MASK) - ret = nf_nat_ftp(pskb, ctinfo, search[dir][i].ftptype, - matchoff, matchlen, exp, &seq); + ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype, + protoff, matchoff, matchlen, exp); else { /* Can't expect this? Best to drop packet now. */ - if (nf_conntrack_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - else + } else ret = NF_ACCEPT; } out_put_expect: - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info, dir, *pskb); + update_nl_seq(ct, seq, ct_ftp_info, dir, skb); out: spin_unlock_bh(&nf_ftp_lock); return ret; } -static struct nf_conntrack_helper ftp[MAX_PORTS][2]; -static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")]; +static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) +{ + struct nf_ct_ftp_master *ftp = nfct_help_data(ct); + + /* This conntrack has been injected from user-space, always pick up + * sequence tracking. Otherwise, the first FTP command after the + * failover breaks. + */ + ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP; + ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP; + return 0; +} + +static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; + +static const struct nf_conntrack_expect_policy ftp_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; /* don't make this __exit, since it's called from __init ! 
*/ static void nf_conntrack_ftp_fini(void) @@ -553,9 +586,9 @@ static void nf_conntrack_ftp_fini(void) if (ftp[i][j].me == NULL) continue; - DEBUGP("nf_ct_ftp: unregistering helper for pf: %d " - "port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); + pr_debug("nf_ct_ftp: unregistering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_helper_unregister(&ftp[i][j]); } } @@ -566,7 +599,6 @@ static void nf_conntrack_ftp_fini(void) static int __init nf_conntrack_ftp_init(void) { int i, j = -1, ret = 0; - char *tmpname; ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) @@ -581,29 +613,25 @@ static int __init nf_conntrack_ftp_init(void) ftp[i][0].tuple.src.l3num = PF_INET; ftp[i][1].tuple.src.l3num = PF_INET6; for (j = 0; j < 2; j++) { + ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master); ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; - ftp[i][j].mask.src.l3num = 0xFFFF; - ftp[i][j].mask.src.u.tcp.port = htons(0xFFFF); - ftp[i][j].mask.dst.protonum = 0xFF; - ftp[i][j].max_expected = 1; - ftp[i][j].timeout = 5 * 60; /* 5 Minutes */ + ftp[i][j].expect_policy = &ftp_exp_policy; ftp[i][j].me = THIS_MODULE; ftp[i][j].help = help; - tmpname = &ftp_names[i][j][0]; + ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr; if (ports[i] == FTP_PORT) - sprintf(tmpname, "ftp"); + sprintf(ftp[i][j].name, "ftp"); else - sprintf(tmpname, "ftp-%d", ports[i]); - ftp[i][j].name = tmpname; + sprintf(ftp[i][j].name, "ftp-%d", ports[i]); - DEBUGP("nf_ct_ftp: registering helper for pf: %d " - "port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); + pr_debug("nf_ct_ftp: registering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); ret = nf_conntrack_helper_register(&ftp[i][j]); if (ret) { - printk("nf_ct_ftp: failed to register helper " - " for pf: %d port: %d\n", + printk(KERN_ERR "nf_ct_ftp: failed to register" + " helper for pf: %d port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_ftp_fini(); return ret; diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index f6fad713d48..bcd5ed6b713 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -87,7 +87,7 @@ typedef struct field_t { unsigned char ub; unsigned short attr; unsigned short offset; - struct field_t *fields; + const struct field_t *fields; } field_t; /* Bit Stream */ @@ -96,37 +96,37 @@ typedef struct { unsigned char *beg; unsigned char *end; unsigned char *cur; - unsigned bit; + unsigned int bit; } bitstr_t; /* Tool Functions */ -#define INC_BIT(bs) if((++bs->bit)>7){bs->cur++;bs->bit=0;} -#define INC_BITS(bs,b) if((bs->bit+=b)>7){bs->cur+=bs->bit>>3;bs->bit&=7;} -#define BYTE_ALIGN(bs) if(bs->bit){bs->cur++;bs->bit=0;} -#define CHECK_BOUND(bs,n) if(bs->cur+(n)>bs->end)return(H323_ERROR_BOUND) -static unsigned get_len(bitstr_t * bs); -static unsigned get_bit(bitstr_t * bs); -static unsigned get_bits(bitstr_t * bs, unsigned b); -static unsigned get_bitmap(bitstr_t * bs, unsigned b); -static unsigned get_uint(bitstr_t * bs, int b); +#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} +#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} +#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} +#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) +static unsigned int get_len(bitstr_t *bs); +static unsigned int get_bit(bitstr_t *bs); +static unsigned int get_bits(bitstr_t *bs, 
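[Aside, not part of the patch] With the helper name now an embedded array rather than a pointer into a side table, the registration loop above writes the name in place: the default port keeps the bare "ftp", any other configured port gets a "-<port>" suffix. The naming rule in isolation:

#include <stdio.h>

#define FTP_PORT 21

int main(void)
{
	unsigned short ports[] = { 21, 2121 };	/* sample module parameters */
	char name[sizeof("ftp-65535")];		/* worst-case name length */

	for (int i = 0; i < 2; i++) {
		if (ports[i] == FTP_PORT)
			snprintf(name, sizeof(name), "ftp");
		else
			snprintf(name, sizeof(name), "ftp-%d", ports[i]);
		printf("helper name: %s\n", name);
	}
	return 0;
}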
unsigned int b); +static unsigned int get_bitmap(bitstr_t *bs, unsigned int b); +static unsigned int get_uint(bitstr_t *bs, int b); /* Decoder Functions */ -static int decode_nul(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_bool(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_oid(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_int(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_enum(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_bitstr(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_numstr(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_octstr(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_bmpstr(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_seq(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level); -static int decode_choice(bitstr_t * bs, field_t * f, char *base, int level); +static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level); /* Decoder Functions Vector */ -typedef int (*decoder_t) (bitstr_t *, field_t *, char *, int); -static decoder_t Decoders[] = { +typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int); +static const decoder_t Decoders[] = { decode_nul, decode_bool, decode_oid, @@ -150,9 +150,9 @@ static decoder_t Decoders[] = { * Functions ****************************************************************************/ /* Assume bs is aligned && v < 16384 */ -unsigned get_len(bitstr_t * bs) +static unsigned int get_len(bitstr_t *bs) { - unsigned v; + unsigned int v; v = *bs->cur++; @@ -166,9 +166,9 @@ unsigned get_len(bitstr_t * bs) } /****************************************************************************/ -unsigned get_bit(bitstr_t * bs) +static unsigned int get_bit(bitstr_t *bs) { - unsigned b = (*bs->cur) & (0x80 >> bs->bit); + unsigned int b = (*bs->cur) & (0x80 >> bs->bit); INC_BIT(bs); @@ -177,9 +177,9 @@ unsigned get_bit(bitstr_t * bs) /****************************************************************************/ /* Assume b <= 8 */ -unsigned get_bits(bitstr_t * bs, unsigned b) +static unsigned int get_bits(bitstr_t *bs, unsigned int b) { - unsigned v, l; + unsigned int v, l; v = (*bs->cur) & (0xffU >> bs->bit); l = b + bs->bit; @@ -203,9 +203,9 @@ unsigned get_bits(bitstr_t * bs, unsigned b) /****************************************************************************/ /* Assume b <= 32 */ 
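[Aside, not part of the patch] The whole H.323 PER decoder sits on a small MSB-first bit reader: a byte cursor plus a bit offset, exactly the bitstr_t shape above. A userspace reconstruction of a get_bits()-style reader for b <= 8 (written independently here, so treat it as an illustration of the technique rather than the kernel's exact code):

#include <stdio.h>

struct bitstr {
	const unsigned char *cur;
	unsigned int bit;	/* 0..7, bits already consumed in *cur */
};

static unsigned int get_bits(struct bitstr *bs, unsigned int b)
{
	unsigned int v = *bs->cur & (0xffu >> bs->bit);	/* unread bits of *cur */
	unsigned int l = b + bs->bit;

	if (l < 8) {		/* value fits in the current byte */
		v >>= 8 - l;
		bs->bit = l;
	} else if (l == 8) {	/* value ends exactly on a byte boundary */
		bs->cur++;
		bs->bit = 0;
	} else {		/* value spans into the next byte */
		v <<= 8;
		v += *(++bs->cur);
		v >>= 16 - l;
		bs->bit = l - 8;
	}
	return v;
}

int main(void)
{
	const unsigned char buf[] = { 0xA5, 0x3C };	/* 10100101 00111100 */
	struct bitstr bs = { buf, 0 };
	unsigned int v1 = get_bits(&bs, 3);	/* 101    -> 5 */
	unsigned int v2 = get_bits(&bs, 5);	/* 00101  -> 5 */
	unsigned int v3 = get_bits(&bs, 6);	/* 001111 -> 15 */

	printf("%u %u %u\n", v1, v2, v3);
	return 0;
}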
-unsigned get_bitmap(bitstr_t * bs, unsigned b) +static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) { - unsigned v, l, shift, bytes; + unsigned int v, l, shift, bytes; if (!b) return 0; @@ -213,18 +213,18 @@ unsigned get_bitmap(bitstr_t * bs, unsigned b) l = bs->bit + b; if (l < 8) { - v = (unsigned) (*bs->cur) << (bs->bit + 24); + v = (unsigned int)(*bs->cur) << (bs->bit + 24); bs->bit = l; } else if (l == 8) { - v = (unsigned) (*bs->cur++) << (bs->bit + 24); + v = (unsigned int)(*bs->cur++) << (bs->bit + 24); bs->bit = 0; } else { for (bytes = l >> 3, shift = 24, v = 0; bytes; bytes--, shift -= 8) - v |= (unsigned) (*bs->cur++) << shift; + v |= (unsigned int)(*bs->cur++) << shift; if (l < 32) { - v |= (unsigned) (*bs->cur) << shift; + v |= (unsigned int)(*bs->cur) << shift; v <<= bs->bit; } else if (l > 32) { v <<= bs->bit; @@ -242,9 +242,9 @@ unsigned get_bitmap(bitstr_t * bs, unsigned b) /**************************************************************************** * Assume bs is aligned and sizeof(unsigned int) == 4 ****************************************************************************/ -unsigned get_uint(bitstr_t * bs, int b) +static unsigned int get_uint(bitstr_t *bs, int b) { - unsigned v = 0; + unsigned int v = 0; switch (b) { case 4: @@ -264,7 +264,8 @@ unsigned get_uint(bitstr_t * bs, int b) } /****************************************************************************/ -int decode_nul(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_nul(bitstr_t *bs, const struct field_t *f, + char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -272,7 +273,8 @@ int decode_nul(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_bool(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_bool(bitstr_t *bs, const struct field_t *f, + char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -283,7 +285,8 @@ int decode_bool(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_oid(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_oid(bitstr_t *bs, const struct field_t *f, + char *base, int level) { int len; @@ -299,9 +302,10 @@ int decode_oid(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_int(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_int(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned len; + unsigned int len; PRINT("%*.s%s", level * TAB_SIZE, " ", f->name); @@ -318,9 +322,9 @@ int decode_int(bitstr_t * bs, field_t * f, char *base, int level) len = get_bits(bs, 2) + 1; BYTE_ALIGN(bs); if (base && (f->attr & DECODE)) { /* timeToLive */ - unsigned v = get_uint(bs, len) + f->lb; + unsigned int v = get_uint(bs, len) + f->lb; PRINT(" = %u", v); - *((unsigned *) (base + f->offset)) = v; + *((unsigned int *)(base + f->offset)) = v; } bs->cur += len; break; @@ -342,7 +346,8 @@ int decode_int(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_enum(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_enum(bitstr_t *bs, const struct field_t *f, + char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -357,9 +362,10 @@ int 
decode_enum(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_bitstr(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_bitstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned len; + unsigned int len; PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -390,9 +396,10 @@ int decode_bitstr(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_numstr(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_numstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned len; + unsigned int len; PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -407,9 +414,10 @@ int decode_numstr(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_octstr(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_octstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned len; + unsigned int len; PRINT("%*.s%s", level * TAB_SIZE, " ", f->name); @@ -424,7 +432,7 @@ int decode_octstr(bitstr_t * bs, field_t * f, char *base, int level) bs->cur[0], bs->cur[1], bs->cur[2], bs->cur[3], bs->cur[4] * 256 + bs->cur[5])); - *((unsigned *) (base + f->offset)) = + *((unsigned int *)(base + f->offset)) = bs->cur - bs->buf; } } @@ -455,9 +463,10 @@ int decode_octstr(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_bmpstr(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned len; + unsigned int len; PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -480,11 +489,12 @@ int decode_bmpstr(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_seq(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_seq(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned ext, bmp, i, opt, len = 0, bmp2, bmp2_len; + unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len; int err; - field_t *son; + const struct field_t *son; unsigned char *beg = NULL; PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -498,7 +508,7 @@ int decode_seq(bitstr_t * bs, field_t * f, char *base, int level) /* Get fields bitmap */ bmp = get_bitmap(bs, f->sz); if (base) - *(unsigned *) base = bmp; + *(unsigned int *)base = bmp; /* Decode the root components */ for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) { @@ -518,7 +528,7 @@ int decode_seq(bitstr_t * bs, field_t * f, char *base, int level) CHECK_BOUND(bs, 2); len = get_len(bs); CHECK_BOUND(bs, len); - if (!base) { + if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); bs->cur += len; @@ -550,20 +560,11 @@ int decode_seq(bitstr_t * bs, field_t * f, char *base, int level) bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) - *(unsigned *) base = bmp; + *(unsigned int *)base = bmp; BYTE_ALIGN(bs); /* Decode the extension components */ for (opt = 0; opt < bmp2_len; opt++, i++, son++) { - if (i < f->ub && son->attr & STOP) { - PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", - son->name); - return H323_ERROR_STOP; - } - - if (!((0x80000000 >> opt) 
& bmp2)) /* Not present */ - continue; - /* Check Range */ if (i >= f->ub) { /* Newer Version? */ CHECK_BOUND(bs, 2); @@ -573,6 +574,15 @@ int decode_seq(bitstr_t * bs, field_t * f, char *base, int level) continue; } + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + return H323_ERROR_STOP; + } + + if (!((0x80000000 >> opt) & bmp2)) /* Not present */ + continue; + CHECK_BOUND(bs, 2); len = get_len(bs); CHECK_BOUND(bs, len); @@ -596,11 +606,12 @@ int decode_seq(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_seqof(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned count, effective_count = 0, i, len = 0; + unsigned int count, effective_count = 0, i, len = 0; int err; - field_t *son; + const struct field_t *son; unsigned char *beg = NULL; PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -620,7 +631,7 @@ int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level) CHECK_BOUND(bs, 2); count = *bs->cur++; count <<= 8; - count = *bs->cur++; + count += *bs->cur++; break; case SEMI: BYTE_ALIGN(bs); @@ -636,8 +647,8 @@ int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level) /* Write Count */ if (base) { effective_count = count > f->ub ? f->ub : count; - *(unsigned *) base = effective_count; - base += sizeof(unsigned); + *(unsigned int *)base = effective_count; + base += sizeof(unsigned int); } /* Decode nested field */ @@ -685,11 +696,12 @@ int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level) /****************************************************************************/ -int decode_choice(bitstr_t * bs, field_t * f, char *base, int level) +static int decode_choice(bitstr_t *bs, const struct field_t *f, + char *base, int level) { - unsigned type, ext, len = 0; + unsigned int type, ext, len = 0; int err; - field_t *son; + const struct field_t *son; unsigned char *beg = NULL; PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -704,11 +716,13 @@ int decode_choice(bitstr_t * bs, field_t * f, char *base, int level) } else { ext = 0; type = get_bits(bs, f->sz); + if (type >= f->lb) + return H323_ERROR_RANGE; } /* Write Type */ if (base) - *(unsigned *) base = type; + *(unsigned int *)base = type; /* Check Range */ if (type >= f->ub) { /* Newer version? 
*/ @@ -752,9 +766,9 @@ int decode_choice(bitstr_t * bs, field_t * f, char *base, int level) } /****************************************************************************/ -int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage * ras) +int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) { - static field_t ras_message = { + static const struct field_t ras_message = { FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT, 0, _RasMessage }; @@ -769,9 +783,9 @@ int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage * ras) /****************************************************************************/ static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, - size_t sz, H323_UserInformation * uuie) + size_t sz, H323_UserInformation *uuie) { - static field_t h323_userinformation = { + static const struct field_t h323_userinformation = { FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT, 0, _H323_UserInformation }; @@ -790,7 +804,7 @@ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz, MultimediaSystemControlMessage * mscm) { - static field_t multimediasystemcontrolmessage = { + static const struct field_t multimediasystemcontrolmessage = { FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4, DECODE | EXT, 0, _MultimediaSystemControlMessage }; @@ -805,7 +819,7 @@ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz, } /****************************************************************************/ -int DecodeQ931(unsigned char *buf, size_t sz, Q931 * q931) +int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) { unsigned char *p = buf; int len; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index b284db73ca7..3a3a60b126e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -2,6 +2,7 @@ * H.323 connection tracking helper * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * This source code is licensed under General Public License version 2. * @@ -17,6 +18,7 @@ #include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> +#include <linux/slab.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/skbuff.h> @@ -29,14 +31,9 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_h323.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - /* Parameters */ static unsigned int default_rrq_ttl __read_mostly = 300; module_param(default_rrq_ttl, uint, 0600); @@ -46,62 +43,67 @@ static int gkrouted_only __read_mostly = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); -static int callforward_filter __read_mostly = 1; +static bool callforward_filter __read_mostly = true; module_param(callforward_filter, bool, 0600); MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " "if both endpoints are on different sides " "(determined by routing information)"); /* Hooks for NAT */ -int (*set_h245_addr_hook) (struct sk_buff **pskb, +int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) __read_mostly; -int (*set_h225_addr_hook) (struct sk_buff **pskb, +int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) __read_mostly; -int (*set_sig_addr_hook) (struct sk_buff **pskb, +int (*set_sig_addr_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) __read_mostly; -int (*set_ras_addr_hook) (struct sk_buff **pskb, +int (*set_ras_addr_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) __read_mostly; -int (*nat_rtp_rtcp_hook) (struct sk_buff **pskb, +int (*nat_rtp_rtcp_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, __be16 port, __be16 rtp_port, struct nf_conntrack_expect *rtp_exp, struct nf_conntrack_expect *rtcp_exp) __read_mostly; -int (*nat_t120_hook) (struct sk_buff **pskb, +int (*nat_t120_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; -int (*nat_h245_hook) (struct sk_buff **pskb, +int (*nat_h245_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; -int (*nat_callforwarding_hook) (struct sk_buff **pskb, +int (*nat_callforwarding_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; -int (*nat_q931_hook) (struct sk_buff **pskb, +int (*nat_q931_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int idx, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; @@ -114,13 +116,14 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[]; static struct nf_conntrack_helper nf_conntrack_helper_ras[]; /****************************************************************************/ -static int get_tpkt_data(struct sk_buff **pskb, 
unsigned int protoff, +static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned char **data, int *datalen, int *dataoff) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); - struct tcphdr _tcph, *th; + const struct tcphdr *th; + struct tcphdr _tcph; int tcpdatalen; int tcpdataoff; unsigned char *tpkt; @@ -128,7 +131,7 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff, int tpktoff; /* Get TCP header */ - th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) return 0; @@ -136,13 +139,13 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff, tcpdataoff = protoff + th->doff * 4; /* Get TCP data length */ - tcpdatalen = (*pskb)->len - tcpdataoff; + tcpdatalen = skb->len - tcpdataoff; if (tcpdatalen <= 0) /* No TCP data */ goto clear_out; if (*data == NULL) { /* first TPKT */ /* Get first TPKT pointer */ - tpkt = skb_header_pointer(*pskb, tcpdataoff, tcpdatalen, + tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, h323_buffer); BUG_ON(tpkt == NULL); @@ -150,9 +153,9 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff, if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { /* Netmeeting sends TPKT header and data separately */ if (info->tpkt_len[dir] > 0) { - DEBUGP("nf_ct_h323: previous packet " - "indicated separate TPKT data of %hu " - "bytes\n", info->tpkt_len[dir]); + pr_debug("nf_ct_h323: previous packet " + "indicated separate TPKT data of %hu " + "bytes\n", info->tpkt_len[dir]); if (info->tpkt_len[dir] <= tcpdatalen) { /* Yes, there was a TPKT header * received */ @@ -163,9 +166,7 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff, } /* Fragmented TPKT */ - if (net_ratelimit()) - printk("nf_ct_h323: " - "fragmented TPKT\n"); + pr_debug("nf_ct_h323: fragmented TPKT\n"); goto clear_out; } @@ -192,15 +193,14 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff, if (tpktlen > tcpdatalen) { if (tcpdatalen == 4) { /* Separate TPKT header */ /* Netmeeting sends TPKT header and data separately */ - DEBUGP("nf_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", - tpktlen - 4); + pr_debug("nf_ct_h323: separate TPKT header indicates " + "there will be TPKT data of %hu bytes\n", + tpktlen - 4); info->tpkt_len[dir] = tpktlen - 4; return 0; } - if (net_ratelimit()) - printk("nf_ct_h323: incomplete TPKT (fragmented?)\n"); + pr_debug("nf_ct_h323: incomplete TPKT (fragmented?)\n"); goto clear_out; } @@ -220,12 +220,11 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff, } /****************************************************************************/ -static int get_h245_addr(struct nf_conn *ct, unsigned char *data, +static int get_h245_addr(struct nf_conn *ct, const unsigned char *data, H245_TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 *port) + union nf_inet_addr *addr, __be16 *port) { - unsigned char *p; - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + const unsigned char *p; int len; if (taddr->choice != eH245_TransportAddress_unicastAddress) @@ -233,13 +232,13 @@ static int get_h245_addr(struct nf_conn *ct, unsigned char *data, switch (taddr->unicastAddress.choice) { case eUnicastAddress_iPAddress: - if (family != AF_INET) + if 
(nf_ct_l3num(ct) != AF_INET) return 0; p = data + taddr->unicastAddress.iPAddress.network; len = 4; break; case eUnicastAddress_iP6Address: - if (family != AF_INET6) + if (nf_ct_l3num(ct) != AF_INET6) return 0; p = data + taddr->unicastAddress.iP6Address.network; len = 16; @@ -256,8 +255,9 @@ static int get_h245_addr(struct nf_conn *ct, unsigned char *data, } /****************************************************************************/ -static int expect_rtp_rtcp(struct sk_buff **pskb, struct nf_conn *ct, +static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr) { @@ -265,7 +265,7 @@ static int expect_rtp_rtcp(struct sk_buff **pskb, struct nf_conn *ct, int ret = 0; __be16 port; __be16 rtp_port, rtcp_port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *rtp_exp; struct nf_conntrack_expect *rtcp_exp; typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; @@ -277,68 +277,69 @@ static int expect_rtp_rtcp(struct sk_buff **pskb, struct nf_conn *ct, return 0; /* RTP port is even */ - port &= htons(~1); - rtp_port = port; - rtcp_port = htons(ntohs(port) + 1); + rtp_port = port & ~htons(1); + rtcp_port = port | htons(1); /* Create expect for RTP */ - if ((rtp_exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(rtp_exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3, - IPPROTO_UDP, NULL, &rtp_port); + nf_ct_expect_init(rtp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_UDP, NULL, &rtp_port); /* Create expect for RTCP */ - if ((rtcp_exp = nf_conntrack_expect_alloc(ct)) == NULL) { - nf_conntrack_expect_put(rtp_exp); + if ((rtcp_exp = nf_ct_expect_alloc(ct)) == NULL) { + nf_ct_expect_put(rtp_exp); return -1; } - nf_conntrack_expect_init(rtcp_exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3, - IPPROTO_UDP, NULL, &rtcp_port); + nf_ct_expect_init(rtcp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_UDP, NULL, &rtcp_port); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, + ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, rtp_port, rtp_exp, rtcp_exp); } else { /* Conntrack only */ - if (nf_conntrack_expect_related(rtp_exp) == 0) { - if (nf_conntrack_expect_related(rtcp_exp) == 0) { - DEBUGP("nf_ct_h323: expect RTP "); - NF_CT_DUMP_TUPLE(&rtp_exp->tuple); - DEBUGP("nf_ct_h323: expect RTCP "); - NF_CT_DUMP_TUPLE(&rtcp_exp->tuple); + if (nf_ct_expect_related(rtp_exp) == 0) { + if (nf_ct_expect_related(rtcp_exp) == 0) { + pr_debug("nf_ct_h323: expect RTP "); + nf_ct_dump_tuple(&rtp_exp->tuple); + pr_debug("nf_ct_h323: expect RTCP "); + nf_ct_dump_tuple(&rtcp_exp->tuple); } else { - nf_conntrack_unexpect_related(rtp_exp); + nf_ct_unexpect_related(rtp_exp); ret = -1; } } else ret = -1; } - nf_conntrack_expect_put(rtp_exp); - nf_conntrack_expect_put(rtcp_exp); + nf_ct_expect_put(rtp_exp); + 
nf_ct_expect_put(rtcp_exp); return ret; } /****************************************************************************/ -static int expect_t120(struct sk_buff **pskb, +static int expect_t120(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr) { int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_t120_hook) nat_t120; @@ -349,39 +350,41 @@ static int expect_t120(struct sk_buff **pskb, return 0; /* Create expect for T.120 connections */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */ if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_t120 = rcu_dereference(nat_t120_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_t120(pskb, ct, ctinfo, data, dataoff, taddr, + ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_h323: expect T.120 "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_h323: expect T.120 "); + nf_ct_dump_tuple(&exp->tuple); } else ret = -1; } - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); return ret; } /****************************************************************************/ -static int process_h245_channel(struct sk_buff **pskb, +static int process_h245_channel(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H2250LogicalChannelParameters *channel) { @@ -389,7 +392,7 @@ static int process_h245_channel(struct sk_buff **pskb, if (channel->options & eH2250LogicalChannelParameters_mediaChannel) { /* RTP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, &channel->mediaChannel); if (ret < 0) return -1; @@ -398,7 +401,7 @@ static int process_h245_channel(struct sk_buff **pskb, if (channel-> options & eH2250LogicalChannelParameters_mediaControlChannel) { /* RTCP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, &channel->mediaControlChannel); if (ret < 0) return -1; @@ -408,19 +411,21 @@ static int process_h245_channel(struct sk_buff **pskb, } /****************************************************************************/ -static int process_olc(struct sk_buff **pskb, struct nf_conn *ct, +static int process_olc(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, OpenLogicalChannel *olc) { int ret; - DEBUGP("nf_ct_h323: OpenLogicalChannel\n"); + pr_debug("nf_ct_h323: OpenLogicalChannel\n"); if (olc->forwardLogicalChannelParameters.multiplexParameters.choice == 
eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) { - ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff, + ret = process_h245_channel(skb, ct, ctinfo, + protoff, data, dataoff, &olc-> forwardLogicalChannelParameters. multiplexParameters. @@ -438,7 +443,8 @@ static int process_olc(struct sk_buff **pskb, struct nf_conn *ct, eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) { ret = - process_h245_channel(pskb, ct, ctinfo, data, dataoff, + process_h245_channel(skb, ct, ctinfo, + protoff, data, dataoff, &olc-> reverseLogicalChannelParameters. multiplexParameters. @@ -456,7 +462,7 @@ static int process_olc(struct sk_buff **pskb, struct nf_conn *ct, t120.choice == eDataProtocolCapability_separateLANStack && olc->separateStack.networkAddress.choice == eNetworkAccessParameters_networkAddress_localAreaAddress) { - ret = expect_t120(pskb, ct, ctinfo, data, dataoff, + ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff, &olc->separateStack.networkAddress. localAreaAddress); if (ret < 0) @@ -467,15 +473,15 @@ static int process_olc(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_olca(struct sk_buff **pskb, struct nf_conn *ct, +static int process_olca(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, OpenLogicalChannelAck *olca) { H2250LogicalChannelAckParameters *ack; int ret; - DEBUGP("nf_ct_h323: OpenLogicalChannelAck\n"); + pr_debug("nf_ct_h323: OpenLogicalChannelAck\n"); if ((olca->options & eOpenLogicalChannelAck_reverseLogicalChannelParameters) && @@ -485,7 +491,8 @@ static int process_olca(struct sk_buff **pskb, struct nf_conn *ct, choice == eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) { - ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff, + ret = process_h245_channel(skb, ct, ctinfo, + protoff, data, dataoff, &olca-> reverseLogicalChannelParameters. multiplexParameters. @@ -504,7 +511,8 @@ static int process_olca(struct sk_buff **pskb, struct nf_conn *ct, if (ack->options & eH2250LogicalChannelAckParameters_mediaChannel) { /* RTP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, + protoff, data, dataoff, &ack->mediaChannel); if (ret < 0) return -1; @@ -513,44 +521,57 @@ static int process_olca(struct sk_buff **pskb, struct nf_conn *ct, if (ack->options & eH2250LogicalChannelAckParameters_mediaControlChannel) { /* RTCP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, + protoff, data, dataoff, &ack->mediaControlChannel); if (ret < 0) return -1; } } + if ((olca->options & eOpenLogicalChannelAck_separateStack) && + olca->separateStack.networkAddress.choice == + eNetworkAccessParameters_networkAddress_localAreaAddress) { + ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff, + &olca->separateStack.networkAddress. 
+ localAreaAddress); + if (ret < 0) + return -1; + } + return 0; } /****************************************************************************/ -static int process_h245(struct sk_buff **pskb, struct nf_conn *ct, +static int process_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, MultimediaSystemControlMessage *mscm) { switch (mscm->choice) { case eMultimediaSystemControlMessage_request: if (mscm->request.choice == eRequestMessage_openLogicalChannel) { - return process_olc(pskb, ct, ctinfo, data, dataoff, + return process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &mscm->request.openLogicalChannel); } - DEBUGP("nf_ct_h323: H.245 Request %d\n", - mscm->request.choice); + pr_debug("nf_ct_h323: H.245 Request %d\n", + mscm->request.choice); break; case eMultimediaSystemControlMessage_response: if (mscm->response.choice == eResponseMessage_openLogicalChannelAck) { - return process_olca(pskb, ct, ctinfo, data, dataoff, + return process_olca(skb, ct, ctinfo, + protoff, data, dataoff, &mscm->response. openLogicalChannelAck); } - DEBUGP("nf_ct_h323: H.245 Response %d\n", - mscm->response.choice); + pr_debug("nf_ct_h323: H.245 Response %d\n", + mscm->response.choice); break; default: - DEBUGP("nf_ct_h323: H.245 signal %d\n", mscm->choice); + pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice); break; } @@ -558,7 +579,7 @@ static int process_h245(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int h245_help(struct sk_buff **pskb, unsigned int protoff, +static int h245_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { static MultimediaSystemControlMessage mscm; @@ -568,34 +589,33 @@ static int h245_help(struct sk_buff **pskb, unsigned int protoff, int ret; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; - } - DEBUGP("nf_ct_h245: skblen = %u\n", (*pskb)->len); + + pr_debug("nf_ct_h245: skblen = %u\n", skb->len); spin_lock_bh(&nf_h323_lock); /* Process each TPKT */ - while (get_tpkt_data(pskb, protoff, ct, ctinfo, + while (get_tpkt_data(skb, protoff, ct, ctinfo, &data, &datalen, &dataoff)) { - DEBUGP("nf_ct_h245: TPKT len=%d ", datalen); - NF_CT_DUMP_TUPLE(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + pr_debug("nf_ct_h245: TPKT len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); /* Decode H.245 signal */ ret = DecodeMultimediaSystemControlMessage(data, datalen, &mscm); if (ret < 0) { - if (net_ratelimit()) - printk("nf_ct_h245: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); + pr_debug("nf_ct_h245: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); /* We don't drop when decoding error */ break; } /* Process H.245 signal */ - if (process_h245(pskb, ct, ctinfo, &data, dataoff, &mscm) < 0) + if (process_h245(skb, ct, ctinfo, protoff, + &data, dataoff, &mscm) < 0) goto drop; } @@ -604,43 +624,45 @@ static int h245_help(struct sk_buff **pskb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - if (net_ratelimit()) - printk("nf_ct_h245: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process H.245 message"); return NF_DROP; } /****************************************************************************/ +static const struct nf_conntrack_expect_policy h245_exp_policy = { + .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */, + .timeout = 240, +}; + static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { .name = "H.245", .me = THIS_MODULE, - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */, - .timeout = 240, + .data_len = sizeof(struct nf_ct_h323_master), + .tuple.src.l3num = AF_UNSPEC, .tuple.dst.protonum = IPPROTO_UDP, - .mask.src.u.udp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, - .help = h245_help + .help = h245_help, + .expect_policy = &h245_exp_policy, }; /****************************************************************************/ int get_h225_addr(struct nf_conn *ct, unsigned char *data, TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 *port) + union nf_inet_addr *addr, __be16 *port) { - unsigned char *p; - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + const unsigned char *p; int len; switch (taddr->choice) { case eTransportAddress_ipAddress: - if (family != AF_INET) + if (nf_ct_l3num(ct) != AF_INET) return 0; p = data + taddr->ipAddress.ip; len = 4; break; case eTransportAddress_ip6Address: - if (family != AF_INET6) + if (nf_ct_l3num(ct) != AF_INET6) return 0; - p = data + taddr->ip6Address.ip6; + p = data + taddr->ip6Address.ip; len = 16; break; default: @@ -655,15 +677,15 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data, } /****************************************************************************/ -static int expect_h245(struct sk_buff **pskb, struct nf_conn *ct, +static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr) { int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_h245_hook) nat_h245; @@ -674,81 +696,95 @@ static int expect_h245(struct sk_buff **pskb, struct nf_conn *ct, return 0; /* Create expect for h245 connection */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); exp->helper = &nf_conntrack_helper_h245; if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_h245 = rcu_dereference(nat_h245_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_h245(pskb, ct, ctinfo, data, 
dataoff, taddr, + ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_q931: expect H.245 "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_q931: expect H.245 "); + nf_ct_dump_tuple(&exp->tuple); } else ret = -1; } - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); return ret; } /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ -static int callforward_do_filter(union nf_conntrack_address *src, - union nf_conntrack_address *dst, - int family) +static int callforward_do_filter(const union nf_inet_addr *src, + const union nf_inet_addr *dst, + u_int8_t family) { - struct flowi fl1, fl2; + const struct nf_afinfo *afinfo; int ret = 0; - memset(&fl1, 0, sizeof(fl1)); - memset(&fl2, 0, sizeof(fl2)); + /* rcu_read_lock()ed by nf_hook_slow() */ + afinfo = nf_get_afinfo(family); + if (!afinfo) + return 0; switch (family) { case AF_INET: { + struct flowi4 fl1, fl2; struct rtable *rt1, *rt2; - fl1.fl4_dst = src->ip; - fl2.fl4_dst = dst->ip; - if (ip_route_output_key(&rt1, &fl1) == 0) { - if (ip_route_output_key(&rt2, &fl2) == 0) { - if (rt1->rt_gateway == rt2->rt_gateway && - rt1->u.dst.dev == rt2->u.dst.dev) + memset(&fl1, 0, sizeof(fl1)); + fl1.daddr = src->ip; + + memset(&fl2, 0, sizeof(fl2)); + fl2.daddr = dst->ip; + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, + flowi4_to_flowi(&fl1), false)) { + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, + flowi4_to_flowi(&fl2), false)) { + if (rt_nexthop(rt1, fl1.daddr) == + rt_nexthop(rt2, fl2.daddr) && + rt1->dst.dev == rt2->dst.dev) ret = 1; - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); } - dst_release(&rt1->u.dst); + dst_release(&rt1->dst); } break; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) case AF_INET6: { + struct flowi6 fl1, fl2; struct rt6_info *rt1, *rt2; - memcpy(&fl1.fl6_dst, src, sizeof(fl1.fl6_dst)); - memcpy(&fl2.fl6_dst, dst, sizeof(fl2.fl6_dst)); - rt1 = (struct rt6_info *)ip6_route_output(NULL, &fl1); - if (rt1) { - rt2 = (struct rt6_info *)ip6_route_output(NULL, &fl2); - if (rt2) { - if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, - sizeof(rt1->rt6i_gateway)) && - rt1->u.dst.dev == rt2->u.dst.dev) + memset(&fl1, 0, sizeof(fl1)); + fl1.daddr = src->in6; + + memset(&fl2, 0, sizeof(fl2)); + fl2.daddr = dst->in6; + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, + flowi6_to_flowi(&fl1), false)) { + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, + flowi6_to_flowi(&fl2), false)) { + if (ipv6_addr_equal(rt6_nexthop(rt1), + rt6_nexthop(rt2)) && + rt1->dst.dev == rt2->dst.dev) ret = 1; - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); } - dst_release(&rt1->u.dst); + dst_release(&rt1->dst); } break; } @@ -759,16 +795,17 @@ static int callforward_do_filter(union nf_conntrack_address *src, } /****************************************************************************/ -static int expect_callforwarding(struct sk_buff **pskb, +static int expect_callforwarding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr) { int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_callforwarding_hook) nat_callforwarding; 
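The rewritten callforward_do_filter() above replaces the direct ip_route_output_key()/ip6_route_output() calls with the family-dispatched afinfo->route() lookup and then compares the two destinations' next hops (rt_nexthop()/rt6_nexthop()) and egress devices: if the calling party and the forward-to address leave through the same next hop on the same device, both endpoints are on the same side and the second call leg gets no expectation. The following is a minimal user-space sketch of that comparison against a toy routing table, not the kernel implementation; struct toy_route, lookup() and the table contents are illustrative stand-ins for the real FIB lookup.

#include <stdio.h>

/* Toy routing entry; the kernel consults the real FIB via
 * nf_get_afinfo(family)->route() as in the hunk above. */
struct toy_route {
	unsigned int prefix, mask;	/* destination match */
	unsigned int nexthop;		/* next-hop address */
	int ifindex;			/* egress device */
};

static const struct toy_route table[] = {
	{ 0x0a000000, 0xff000000, 0x0a000001, 1 },	/* 10/8 via 10.0.0.1 dev 1 */
	{ 0xc0a80000, 0xffff0000, 0xc0a80001, 2 },	/* 192.168/16 via 192.168.0.1 dev 2 */
};

static const struct toy_route *lookup(unsigned int daddr)
{
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if ((daddr & table[i].mask) == table[i].prefix)
			return &table[i];
	return NULL;
}

/* Mirrors the callforward_do_filter() idea: returns 1 when both
 * addresses exit via the same next hop and device, in which case the
 * forwarded leg needs no expectation. */
static int same_side(unsigned int src, unsigned int dst)
{
	const struct toy_route *r1 = lookup(src), *r2 = lookup(dst);

	return r1 && r2 && r1->nexthop == r2->nexthop &&
	       r1->ifindex == r2->ifindex;
}

int main(void)
{
	printf("%d\n", same_side(0x0a000005, 0x0a010203));	/* 1: same side */
	printf("%d\n", same_side(0x0a000005, 0xc0a80105));	/* 0: different sides */
	return 0;
}

This also shows why the module parameter is called callforward_filter: the check is purely an optimization gate, and when it cannot decide (no route found here; lookup failure in the kernel), tracking the second leg is the safe default.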
@@ -780,43 +817,46 @@ static int expect_callforwarding(struct sk_buff **pskb, * we don't need to track the second call */ if (callforward_filter && callforward_do_filter(&addr, &ct->tuplehash[!dir].tuple.src.u3, - ct->tuplehash[!dir].tuple.src.l3num)) { - DEBUGP("nf_ct_q931: Call Forwarding not tracked\n"); + nf_ct_l3num(ct))) { + pr_debug("nf_ct_q931: Call Forwarding not tracked\n"); return 0; } /* Create expect for the second call leg */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, &addr, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); exp->helper = nf_conntrack_helper_q931; if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* Need NAT */ - ret = nat_callforwarding(pskb, ct, ctinfo, data, dataoff, + ret = nat_callforwarding(skb, ct, ctinfo, + protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_q931: expect Call Forwarding "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_q931: expect Call Forwarding "); + nf_ct_dump_tuple(&exp->tuple); } else ret = -1; } - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); return ret; } /****************************************************************************/ -static int process_setup(struct sk_buff **pskb, struct nf_conn *ct, +static int process_setup(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Setup_UUIE *setup) { @@ -824,13 +864,13 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct, int ret; int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; typeof(set_h225_addr_hook) set_h225_addr; - DEBUGP("nf_ct_q931: Setup\n"); + pr_debug("nf_ct_q931: Setup\n"); if (setup->options & eSetup_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &setup->h245Address); if (ret < 0) return -1; @@ -838,16 +878,15 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct, set_h225_addr = rcu_dereference(set_h225_addr_hook); if ((setup->options & eSetup_UUIE_destCallSignalAddress) && - (set_h225_addr) && ct->status && IPS_NAT_MASK && + (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->destCallSignalAddress, &addr, &port) && memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) { - DEBUGP("nf_ct_q931: set destCallSignalAddress " - NIP6_FMT ":%hu->" NIP6_FMT ":%hu\n", - NIP6(*(struct in6_addr *)&addr), ntohs(port), - NIP6(*(struct in6_addr *)&ct->tuplehash[!dir].tuple.src.u3), - ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); - ret = set_h225_addr(pskb, data, dataoff, + pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n", + &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3, + ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); + ret = set_h225_addr(skb, protoff, data, dataoff, &setup->destCallSignalAddress, 
&ct->tuplehash[!dir].tuple.src.u3, ct->tuplehash[!dir].tuple.src.u.tcp.port); @@ -856,16 +895,15 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct, } if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && - (set_h225_addr) && ct->status & IPS_NAT_MASK && + (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->sourceCallSignalAddress, &addr, &port) && memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) { - DEBUGP("nf_ct_q931: set sourceCallSignalAddress " - NIP6_FMT ":%hu->" NIP6_FMT ":%hu\n", - NIP6(*(struct in6_addr *)&addr), ntohs(port), - NIP6(*(struct in6_addr *)&ct->tuplehash[!dir].tuple.dst.u3), - ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); - ret = set_h225_addr(pskb, data, dataoff, + pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n", + &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3, + ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); + ret = set_h225_addr(skb, protoff, data, dataoff, &setup->sourceCallSignalAddress, &ct->tuplehash[!dir].tuple.dst.u3, ct->tuplehash[!dir].tuple.dst.u.tcp.port); @@ -875,7 +913,8 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct, if (setup->options & eSetup_UUIE_fastStart) { for (i = 0; i < setup->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &setup->fastStart.item[i]); if (ret < 0) return -1; @@ -886,19 +925,20 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_callproceeding(struct sk_buff **pskb, +static int process_callproceeding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, CallProceeding_UUIE *callproc) { int ret; int i; - DEBUGP("nf_ct_q931: CallProceeding\n"); + pr_debug("nf_ct_q931: CallProceeding\n"); if (callproc->options & eCallProceeding_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &callproc->h245Address); if (ret < 0) return -1; @@ -906,7 +946,8 @@ static int process_callproceeding(struct sk_buff **pskb, if (callproc->options & eCallProceeding_UUIE_fastStart) { for (i = 0; i < callproc->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &callproc->fastStart.item[i]); if (ret < 0) return -1; @@ -917,18 +958,19 @@ static int process_callproceeding(struct sk_buff **pskb, } /****************************************************************************/ -static int process_connect(struct sk_buff **pskb, struct nf_conn *ct, +static int process_connect(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Connect_UUIE *connect) { int ret; int i; - DEBUGP("nf_ct_q931: Connect\n"); + pr_debug("nf_ct_q931: Connect\n"); if (connect->options & eConnect_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &connect->h245Address); if (ret < 0) return -1; @@ -936,7 +978,8 @@ static int process_connect(struct sk_buff **pskb, struct nf_conn *ct, if (connect->options & eConnect_UUIE_fastStart) { for (i = 0; i < connect->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, 
data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &connect->fastStart.item[i]); if (ret < 0) return -1; @@ -947,18 +990,19 @@ static int process_connect(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_alerting(struct sk_buff **pskb, struct nf_conn *ct, +static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Alerting_UUIE *alert) { int ret; int i; - DEBUGP("nf_ct_q931: Alerting\n"); + pr_debug("nf_ct_q931: Alerting\n"); if (alert->options & eAlerting_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &alert->h245Address); if (ret < 0) return -1; @@ -966,7 +1010,8 @@ static int process_alerting(struct sk_buff **pskb, struct nf_conn *ct, if (alert->options & eAlerting_UUIE_fastStart) { for (i = 0; i < alert->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &alert->fastStart.item[i]); if (ret < 0) return -1; @@ -977,51 +1022,28 @@ static int process_alerting(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_information(struct sk_buff **pskb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Information_UUIE *info) -{ - int ret; - int i; - - DEBUGP("nf_ct_q931: Information\n"); - - if (info->options & eInformation_UUIE_fastStart) { - for (i = 0; i < info->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &info->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_facility(struct sk_buff **pskb, struct nf_conn *ct, +static int process_facility(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Facility_UUIE *facility) { int ret; int i; - DEBUGP("nf_ct_q931: Facility\n"); + pr_debug("nf_ct_q931: Facility\n"); if (facility->reason.choice == eFacilityReason_callForwarded) { if (facility->options & eFacility_UUIE_alternativeAddress) - return expect_callforwarding(pskb, ct, ctinfo, data, - dataoff, + return expect_callforwarding(skb, ct, ctinfo, + protoff, data, dataoff, &facility-> alternativeAddress); return 0; } if (facility->options & eFacility_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &facility->h245Address); if (ret < 0) return -1; @@ -1029,7 +1051,8 @@ static int process_facility(struct sk_buff **pskb, struct nf_conn *ct, if (facility->options & eFacility_UUIE_fastStart) { for (i = 0; i < facility->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &facility->fastStart.item[i]); if (ret < 0) return -1; @@ -1040,18 +1063,19 @@ static int process_facility(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_progress(struct sk_buff **pskb, struct nf_conn *ct, +static int process_progress(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int 
protoff, unsigned char **data, int dataoff, Progress_UUIE *progress) { int ret; int i; - DEBUGP("nf_ct_q931: Progress\n"); + pr_debug("nf_ct_q931: Progress\n"); if (progress->options & eProgress_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &progress->h245Address); if (ret < 0) return -1; @@ -1059,7 +1083,8 @@ static int process_progress(struct sk_buff **pskb, struct nf_conn *ct, if (progress->options & eProgress_UUIE_fastStart) { for (i = 0; i < progress->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &progress->fastStart.item[i]); if (ret < 0) return -1; @@ -1070,9 +1095,10 @@ static int process_progress(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_q931(struct sk_buff **pskb, struct nf_conn *ct, +static int process_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, Q931 *q931) + unsigned int protoff, unsigned char **data, int dataoff, + Q931 *q931) { H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu; int i; @@ -1080,38 +1106,34 @@ static int process_q931(struct sk_buff **pskb, struct nf_conn *ct, switch (pdu->h323_message_body.choice) { case eH323_UU_PDU_h323_message_body_setup: - ret = process_setup(pskb, ct, ctinfo, data, dataoff, + ret = process_setup(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.setup); break; case eH323_UU_PDU_h323_message_body_callProceeding: - ret = process_callproceeding(pskb, ct, ctinfo, data, dataoff, + ret = process_callproceeding(skb, ct, ctinfo, + protoff, data, dataoff, &pdu->h323_message_body. callProceeding); break; case eH323_UU_PDU_h323_message_body_connect: - ret = process_connect(pskb, ct, ctinfo, data, dataoff, + ret = process_connect(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.connect); break; case eH323_UU_PDU_h323_message_body_alerting: - ret = process_alerting(pskb, ct, ctinfo, data, dataoff, + ret = process_alerting(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.alerting); break; - case eH323_UU_PDU_h323_message_body_information: - ret = process_information(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body. 
- information); - break; case eH323_UU_PDU_h323_message_body_facility: - ret = process_facility(pskb, ct, ctinfo, data, dataoff, + ret = process_facility(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.facility); break; case eH323_UU_PDU_h323_message_body_progress: - ret = process_progress(pskb, ct, ctinfo, data, dataoff, + ret = process_progress(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.progress); break; default: - DEBUGP("nf_ct_q931: Q.931 signal %d\n", - pdu->h323_message_body.choice); + pr_debug("nf_ct_q931: Q.931 signal %d\n", + pdu->h323_message_body.choice); break; } @@ -1120,7 +1142,8 @@ static int process_q931(struct sk_buff **pskb, struct nf_conn *ct, if (pdu->options & eH323_UU_PDU_h245Control) { for (i = 0; i < pdu->h245Control.count; i++) { - ret = process_h245(pskb, ct, ctinfo, data, dataoff, + ret = process_h245(skb, ct, ctinfo, + protoff, data, dataoff, &pdu->h245Control.item[i]); if (ret < 0) return -1; @@ -1131,7 +1154,7 @@ static int process_q931(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int q931_help(struct sk_buff **pskb, unsigned int protoff, +static int q931_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { static Q931 q931; @@ -1141,33 +1164,32 @@ static int q931_help(struct sk_buff **pskb, unsigned int protoff, int ret; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; - } - DEBUGP("nf_ct_q931: skblen = %u\n", (*pskb)->len); + + pr_debug("nf_ct_q931: skblen = %u\n", skb->len); spin_lock_bh(&nf_h323_lock); /* Process each TPKT */ - while (get_tpkt_data(pskb, protoff, ct, ctinfo, + while (get_tpkt_data(skb, protoff, ct, ctinfo, &data, &datalen, &dataoff)) { - DEBUGP("nf_ct_q931: TPKT len=%d ", datalen); - NF_CT_DUMP_TUPLE(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + pr_debug("nf_ct_q931: TPKT len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); /* Decode Q.931 signal */ ret = DecodeQ931(data, datalen, &q931); if (ret < 0) { - if (net_ratelimit()) - printk("nf_ct_q931: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); + pr_debug("nf_ct_q931: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); /* We don't drop when decoding error */ break; } /* Process Q.931 signal */ - if (process_q931(pskb, ct, ctinfo, &data, dataoff, &q931) < 0) + if (process_q931(skb, ct, ctinfo, protoff, + &data, dataoff, &q931) < 0) goto drop; } @@ -1176,65 +1198,63 @@ static int q931_help(struct sk_buff **pskb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - if (net_ratelimit()) - printk("nf_ct_q931: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process Q.931 message"); return NF_DROP; } /****************************************************************************/ +static const struct nf_conntrack_expect_policy q931_exp_policy = { + /* T.120 and H.245 */ + .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4, + .timeout = 240, +}; + static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { { .name = "Q.931", .me = THIS_MODULE, - /* T.120 and H.245 */ - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4, - .timeout = 240, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET, - .tuple.src.u.tcp.port = __constant_htons(Q931_PORT), + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.tcp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, - .help = q931_help + .help = q931_help, + .expect_policy = &q931_exp_policy, }, { .name = "Q.931", .me = THIS_MODULE, - /* T.120 and H.245 */ - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4, - .timeout = 240, .tuple.src.l3num = AF_INET6, - .tuple.src.u.tcp.port = __constant_htons(Q931_PORT), + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.tcp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, - .help = q931_help + .help = q931_help, + .expect_policy = &q931_exp_policy, }, }; /****************************************************************************/ -static unsigned char *get_udp_data(struct sk_buff **pskb, unsigned int protoff, +static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, int *datalen) { - struct udphdr _uh, *uh; + const struct udphdr *uh; + struct udphdr _uh; int dataoff; - uh = skb_header_pointer(*pskb, protoff, sizeof(_uh), &_uh); + uh = skb_header_pointer(skb, protoff, sizeof(_uh), &_uh); if (uh == NULL) return NULL; dataoff = protoff + sizeof(_uh); - if (dataoff >= (*pskb)->len) + if (dataoff >= skb->len) return NULL; - *datalen = (*pskb)->len - dataoff; - return skb_header_pointer(*pskb, dataoff, *datalen, h323_buffer); + *datalen = skb->len - dataoff; + return skb_header_pointer(skb, dataoff, *datalen, h323_buffer); } /****************************************************************************/ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, - union nf_conntrack_address *addr, + union nf_inet_addr *addr, __be16 port) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; @@ -1244,7 +1264,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, tuple.dst.u.tcp.port = port; tuple.dst.protonum = IPPROTO_TCP; - exp = __nf_conntrack_expect_find(&tuple); + exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); if (exp && exp->master == ct) return exp; return NULL; @@ -1252,7 +1272,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, /****************************************************************************/ static int set_expect_timeout(struct nf_conntrack_expect *exp, - 
unsigned timeout) + unsigned int timeout) { if (!exp || !del_timer(&exp->timeout)) return 0; @@ -1264,17 +1284,17 @@ static int set_expect_timeout(struct nf_conntrack_expect *exp, } /****************************************************************************/ -static int expect_q931(struct sk_buff **pskb, struct nf_conn *ct, +static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int ret = 0; int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_q931_hook) nat_q931; @@ -1290,24 +1310,25 @@ static int expect_q931(struct sk_buff **pskb, struct nf_conn *ct, return 0; /* Create expect for Q.931 */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - gkrouted_only ? /* only accept calls from GK? */ - &ct->tuplehash[!dir].tuple.src.u3 : - NULL, - &ct->tuplehash[!dir].tuple.dst.u3, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + gkrouted_only ? /* only accept calls from GK? */ + &ct->tuplehash[!dir].tuple.src.u3 : NULL, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); exp->helper = nf_conntrack_helper_q931; exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nat_q931 = rcu_dereference(nat_q931_hook); - if (nat_q931 && ct->status & IPS_NAT_MASK) { /* Need NAT */ - ret = nat_q931(pskb, ct, ctinfo, data, taddr, i, port, exp); + if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { /* Need NAT */ + ret = nat_q931(skb, ct, ctinfo, protoff, data, + taddr, i, port, exp); } else { /* Conntrack only */ - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_ras: expect Q.931 "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); /* Save port for looking up expect in processing RCF */ info->sig_port[dir] = port; @@ -1315,39 +1336,42 @@ static int expect_q931(struct sk_buff **pskb, struct nf_conn *ct, ret = -1; } - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); return ret; } /****************************************************************************/ -static int process_grq(struct sk_buff **pskb, struct nf_conn *ct, +static int process_grq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, GatekeeperRequest *grq) { typeof(set_ras_addr_hook) set_ras_addr; - DEBUGP("nf_ct_ras: GRQ\n"); + pr_debug("nf_ct_ras: GRQ\n"); set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) /* NATed */ - return set_ras_addr(pskb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) /* NATed */ + return set_ras_addr(skb, ct, ctinfo, protoff, data, &grq->rasAddress, 1); return 0; } /****************************************************************************/ -static int process_gcf(struct sk_buff **pskb, struct nf_conn *ct, +static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char 
**data, GatekeeperConfirm *gcf) { int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; - DEBUGP("nf_ct_ras: GCF\n"); + pr_debug("nf_ct_ras: GCF\n"); if (!get_h225_addr(ct, *data, &gcf->rasAddress, &addr, &port)) return 0; @@ -1362,44 +1386,46 @@ static int process_gcf(struct sk_buff **pskb, struct nf_conn *ct, return 0; /* Need new expect */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, &addr, - IPPROTO_UDP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_UDP, NULL, &port); exp->helper = nf_conntrack_helper_ras; - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_ras: expect RAS "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect RAS "); + nf_ct_dump_tuple(&exp->tuple); } else ret = -1; - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); return ret; } /****************************************************************************/ -static int process_rrq(struct sk_buff **pskb, struct nf_conn *ct, +static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, RegistrationRequest *rrq) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int ret; typeof(set_ras_addr_hook) set_ras_addr; - DEBUGP("nf_ct_ras: RRQ\n"); + pr_debug("nf_ct_ras: RRQ\n"); - ret = expect_q931(pskb, ct, ctinfo, data, + ret = expect_q931(skb, ct, ctinfo, protoff, data, rrq->callSignalAddress.item, rrq->callSignalAddress.count); if (ret < 0) return -1; set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) { - ret = set_ras_addr(pskb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, protoff, data, rrq->rasAddress.item, rrq->rasAddress.count); if (ret < 0) @@ -1407,7 +1433,7 @@ static int process_rrq(struct sk_buff **pskb, struct nf_conn *ct, } if (rrq->options & eRegistrationRequest_timeToLive) { - DEBUGP("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive); + pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive); info->timeout = rrq->timeToLive; } else info->timeout = default_rrq_ttl; @@ -1416,21 +1442,23 @@ static int process_rrq(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_rcf(struct sk_buff **pskb, struct nf_conn *ct, +static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, RegistrationConfirm *rcf) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int ret; struct nf_conntrack_expect *exp; typeof(set_sig_addr_hook) set_sig_addr; - DEBUGP("nf_ct_ras: RCF\n"); + pr_debug("nf_ct_ras: RCF\n"); set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(pskb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & 
IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, protoff, data, rcf->callSignalAddress.item, rcf->callSignalAddress.count); if (ret < 0) @@ -1438,48 +1466,49 @@ static int process_rcf(struct sk_buff **pskb, struct nf_conn *ct, } if (rcf->options & eRegistrationConfirm_timeToLive) { - DEBUGP("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive); + pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive); info->timeout = rcf->timeToLive; } if (info->timeout > 0) { - DEBUGP - ("nf_ct_ras: set RAS connection timeout to %u seconds\n", - info->timeout); - nf_ct_refresh(ct, *pskb, info->timeout * HZ); + pr_debug("nf_ct_ras: set RAS connection timeout to " + "%u seconds\n", info->timeout); + nf_ct_refresh(ct, skb, info->timeout * HZ); /* Set expect timeout */ - read_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, info->sig_port[!dir]); if (exp) { - DEBUGP("nf_ct_ras: set Q.931 expect " - "timeout to %u seconds for", - info->timeout); - NF_CT_DUMP_TUPLE(&exp->tuple); + pr_debug("nf_ct_ras: set Q.931 expect " + "timeout to %u seconds for", + info->timeout); + nf_ct_dump_tuple(&exp->tuple); set_expect_timeout(exp, info->timeout); } - read_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; } /****************************************************************************/ -static int process_urq(struct sk_buff **pskb, struct nf_conn *ct, +static int process_urq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, UnregistrationRequest *urq) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int ret; typeof(set_sig_addr_hook) set_sig_addr; - DEBUGP("nf_ct_ras: URQ\n"); + pr_debug("nf_ct_ras: URQ\n"); set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(pskb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, protoff, data, urq->callSignalAddress.item, urq->callSignalAddress.count); if (ret < 0) @@ -1492,23 +1521,24 @@ static int process_urq(struct sk_buff **pskb, struct nf_conn *ct, info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - nf_ct_refresh(ct, *pskb, 30 * HZ); + nf_ct_refresh(ct, skb, 30 * HZ); return 0; } /****************************************************************************/ -static int process_arq(struct sk_buff **pskb, struct nf_conn *ct, +static int process_arq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, AdmissionRequest *arq) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + const struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; typeof(set_h225_addr_hook) set_h225_addr; - DEBUGP("nf_ct_ras: ARQ\n"); + pr_debug("nf_ct_ras: ARQ\n"); set_h225_addr = rcu_dereference(set_h225_addr_hook); if ((arq->options & eAdmissionRequest_destCallSignalAddress) && @@ -1516,9 +1546,10 @@ static int process_arq(struct sk_buff **pskb, struct nf_conn *ct, &addr, &port) && !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && port == info->sig_port[dir] && + nf_ct_l3num(ct) == NFPROTO_IPV4 && set_h225_addr && ct->status & 
IPS_NAT_MASK) { /* Answering ARQ */ - return set_h225_addr(pskb, data, 0, + return set_h225_addr(skb, protoff, data, 0, &arq->destCallSignalAddress, &ct->tuplehash[!dir].tuple.dst.u3, info->sig_port[!dir]); @@ -1528,9 +1559,10 @@ static int process_arq(struct sk_buff **pskb, struct nf_conn *ct, get_h225_addr(ct, *data, &arq->srcCallSignalAddress, &addr, &port) && !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && - set_h225_addr && ct->status & IPS_NAT_MASK) { + set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { /* Calling ARQ */ - return set_h225_addr(pskb, data, 0, + return set_h225_addr(skb, protoff, data, 0, &arq->srcCallSignalAddress, &ct->tuplehash[!dir].tuple.dst.u3, port); @@ -1540,18 +1572,19 @@ static int process_arq(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_acf(struct sk_buff **pskb, struct nf_conn *ct, +static int process_acf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, AdmissionConfirm *acf) { int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(set_sig_addr_hook) set_sig_addr; - DEBUGP("nf_ct_ras: ACF\n"); + pr_debug("nf_ct_ras: ACF\n"); if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress, &addr, &port)) @@ -1560,81 +1593,85 @@ static int process_acf(struct sk_buff **pskb, struct nf_conn *ct, if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) { /* Answering ACF */ set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) - return set_sig_addr(pskb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) + return set_sig_addr(skb, ct, ctinfo, protoff, data, &acf->destCallSignalAddress, 1); return 0; } /* Need new expect */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, &addr, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; exp->helper = nf_conntrack_helper_q931; - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_ras: expect Q.931 "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); } else ret = -1; - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); return ret; } /****************************************************************************/ -static int process_lrq(struct sk_buff **pskb, struct nf_conn *ct, +static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, LocationRequest *lrq) { typeof(set_ras_addr_hook) set_ras_addr; - DEBUGP("nf_ct_ras: LRQ\n"); + pr_debug("nf_ct_ras: LRQ\n"); set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) - return set_ras_addr(pskb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) + return set_ras_addr(skb, ct, ctinfo, protoff, data, &lrq->replyAddress, 1); return 0; } 
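/*
 * Editorial sketch, not part of the patch: process_gcf(), process_acf() and
 * process_lcf() all repeat one idiom that this series converts from the old
 * nf_conntrack_expect_*() names to the nf_ct_expect_*() API — allocate an
 * expectation, initialise it from the master connection's tuple plus the
 * address decoded out of the RAS payload, try to register it, then drop the
 * allocation reference. A minimal, hedged distillation of that idiom follows;
 * "expect_signal_channel" and its argument list are illustrative only (the
 * real handlers also point exp->helper at the follow-up Q.931/RAS helper).
 */
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_expect.h>

static int expect_signal_channel(struct nf_conn *ct, int dir,
				 union nf_inet_addr *addr, __be16 port)
{
	struct nf_conntrack_expect *exp;
	int ret = 0;

	exp = nf_ct_expect_alloc(ct);
	if (exp == NULL)
		return -1;
	/* Source side comes from the master connection's reply direction,
	 * destination side from the decoded signalling address. */
	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
			  &ct->tuplehash[!dir].tuple.src.u3, addr,
			  IPPROTO_TCP, NULL, &port);
	exp->flags = NF_CT_EXPECT_PERMANENT;	/* survives the first match */

	if (nf_ct_expect_related(exp) != 0)	/* insert into expect table */
		ret = -1;
	nf_ct_expect_put(exp);			/* drop the alloc reference */
	return ret;
}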
/****************************************************************************/ -static int process_lcf(struct sk_buff **pskb, struct nf_conn *ct, +static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, LocationConfirm *lcf) { int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; - DEBUGP("nf_ct_ras: LCF\n"); + pr_debug("nf_ct_ras: LCF\n"); if (!get_h225_addr(ct, *data, &lcf->callSignalAddress, &addr, &port)) return 0; /* Need new expect for call signal */ - if ((exp = nf_conntrack_expect_alloc(ct)) == NULL) + if ((exp = nf_ct_expect_alloc(ct)) == NULL) return -1; - nf_conntrack_expect_init(exp, ct->tuplehash[!dir].tuple.src.l3num, - &ct->tuplehash[!dir].tuple.src.u3, &addr, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; exp->helper = nf_conntrack_helper_q931; - if (nf_conntrack_expect_related(exp) == 0) { - DEBUGP("nf_ct_ras: expect Q.931 "); - NF_CT_DUMP_TUPLE(&exp->tuple); + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); } else ret = -1; - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); /* Ignore rasAddress */ @@ -1642,27 +1679,30 @@ static int process_lcf(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_irr(struct sk_buff **pskb, struct nf_conn *ct, +static int process_irr(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, InfoRequestResponse *irr) { int ret; typeof(set_ras_addr_hook) set_ras_addr; typeof(set_sig_addr_hook) set_sig_addr; - DEBUGP("nf_ct_ras: IRR\n"); + pr_debug("nf_ct_ras: IRR\n"); set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) { - ret = set_ras_addr(pskb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, protoff, data, &irr->rasAddress, 1); if (ret < 0) return -1; } set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(pskb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, protoff, data, irr->callSignalAddress.item, irr->callSignalAddress.count); if (ret < 0) @@ -1673,43 +1713,44 @@ static int process_irr(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int process_ras(struct sk_buff **pskb, struct nf_conn *ct, +static int process_ras(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, RasMessage *ras) { switch (ras->choice) { case eRasMessage_gatekeeperRequest: - return process_grq(pskb, ct, ctinfo, data, + return process_grq(skb, ct, ctinfo, protoff, data, &ras->gatekeeperRequest); case eRasMessage_gatekeeperConfirm: - return process_gcf(pskb, ct, ctinfo, data, + return process_gcf(skb, ct, ctinfo, protoff, data, &ras->gatekeeperConfirm); case eRasMessage_registrationRequest: - return process_rrq(pskb, ct, ctinfo, data, + return process_rrq(skb, ct, 
ctinfo, protoff, data, &ras->registrationRequest); case eRasMessage_registrationConfirm: - return process_rcf(pskb, ct, ctinfo, data, + return process_rcf(skb, ct, ctinfo, protoff, data, &ras->registrationConfirm); case eRasMessage_unregistrationRequest: - return process_urq(pskb, ct, ctinfo, data, + return process_urq(skb, ct, ctinfo, protoff, data, &ras->unregistrationRequest); case eRasMessage_admissionRequest: - return process_arq(pskb, ct, ctinfo, data, + return process_arq(skb, ct, ctinfo, protoff, data, &ras->admissionRequest); case eRasMessage_admissionConfirm: - return process_acf(pskb, ct, ctinfo, data, + return process_acf(skb, ct, ctinfo, protoff, data, &ras->admissionConfirm); case eRasMessage_locationRequest: - return process_lrq(pskb, ct, ctinfo, data, + return process_lrq(skb, ct, ctinfo, protoff, data, &ras->locationRequest); case eRasMessage_locationConfirm: - return process_lcf(pskb, ct, ctinfo, data, + return process_lcf(skb, ct, ctinfo, protoff, data, &ras->locationConfirm); case eRasMessage_infoRequestResponse: - return process_irr(pskb, ct, ctinfo, data, + return process_irr(skb, ct, ctinfo, protoff, data, &ras->infoRequestResponse); default: - DEBUGP("nf_ct_ras: RAS message %d\n", ras->choice); + pr_debug("nf_ct_ras: RAS message %d\n", ras->choice); break; } @@ -1717,7 +1758,7 @@ static int process_ras(struct sk_buff **pskb, struct nf_conn *ct, } /****************************************************************************/ -static int ras_help(struct sk_buff **pskb, unsigned int protoff, +static int ras_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { static RasMessage ras; @@ -1725,29 +1766,28 @@ static int ras_help(struct sk_buff **pskb, unsigned int protoff, int datalen = 0; int ret; - DEBUGP("nf_ct_ras: skblen = %u\n", (*pskb)->len); + pr_debug("nf_ct_ras: skblen = %u\n", skb->len); spin_lock_bh(&nf_h323_lock); /* Get UDP data */ - data = get_udp_data(pskb, protoff, &datalen); + data = get_udp_data(skb, protoff, &datalen); if (data == NULL) goto accept; - DEBUGP("nf_ct_ras: RAS message len=%d ", datalen); - NF_CT_DUMP_TUPLE(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + pr_debug("nf_ct_ras: RAS message len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); /* Decode RAS message */ ret = DecodeRasMessage(data, datalen, &ras); if (ret < 0) { - if (net_ratelimit()) - printk("nf_ct_ras: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); + pr_debug("nf_ct_ras: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); goto accept; } /* Process RAS message */ - if (process_ras(pskb, ct, ctinfo, &data, &ras) < 0) + if (process_ras(skb, ct, ctinfo, protoff, &data, &ras) < 0) goto drop; accept: @@ -1756,38 +1796,36 @@ static int ras_help(struct sk_buff **pskb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - if (net_ratelimit()) - printk("nf_ct_ras: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process RAS message"); return NF_DROP; } /****************************************************************************/ +static const struct nf_conntrack_expect_policy ras_exp_policy = { + .max_expected = 32, + .timeout = 240, +}; + static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { { .name = "RAS", .me = THIS_MODULE, - .max_expected = 32, - .timeout = 240, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(RAS_PORT), + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.udp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, .help = ras_help, + .expect_policy = &ras_exp_policy, }, { .name = "RAS", .me = THIS_MODULE, - .max_expected = 32, - .timeout = 240, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET6, - .tuple.src.u.udp.port = __constant_htons(RAS_PORT), + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.udp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, .help = ras_help, + .expect_policy = &ras_exp_policy, }, }; @@ -1798,8 +1836,9 @@ static void __exit nf_conntrack_h323_fini(void) nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); kfree(h323_buffer); - DEBUGP("nf_ct_h323: fini\n"); + pr_debug("nf_ct_h323: fini\n"); } /****************************************************************************/ @@ -1810,28 +1849,34 @@ static int __init nf_conntrack_h323_init(void) h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) return -ENOMEM; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]); + ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245); if (ret < 0) goto err1; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]); + ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]); if (ret < 0) goto err2; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]); + ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]); if (ret < 0) goto err3; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]); + ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]); if (ret < 0) goto err4; - DEBUGP("nf_ct_h323: init success\n"); + ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]); + if (ret < 0) + goto err5; + pr_debug("nf_ct_h323: init success\n"); return 0; -err4: +err5: nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); -err3: +err4: nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); -err2: +err3: nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); +err2: + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); err1: + kfree(h323_buffer); return ret; } @@ -1854,3 +1899,6 @@ MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); 
MODULE_DESCRIPTION("H.323 connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_h323"); +MODULE_ALIAS_NFCT_HELPER("RAS"); +MODULE_ALIAS_NFCT_HELPER("Q.931"); +MODULE_ALIAS_NFCT_HELPER("H.245"); diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c index 4c6f8b3b120..d880f3523c1 100644 --- a/net/netfilter/nf_conntrack_h323_types.c +++ b/net/netfilter/nf_conntrack_h323_types.c @@ -1,26 +1,26 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007 * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> * * This source code is licensed under General Public License version 2. */ -static field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */ +static const struct field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */ {FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE, offsetof(TransportAddress_ipAddress, ip), NULL}, {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _TransportAddress_ipSourceRoute_route[] = { /* SEQUENCE OF */ +static const struct field_t _TransportAddress_ipSourceRoute_route[] = { /* SEQUENCE OF */ {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, }; -static field_t _TransportAddress_ipSourceRoute_routing[] = { /* CHOICE */ +static const struct field_t _TransportAddress_ipSourceRoute_routing[] = { /* CHOICE */ {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */ +static const struct field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */ {FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0, @@ -29,37 +29,37 @@ static field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */ _TransportAddress_ipSourceRoute_routing}, }; -static field_t _TransportAddress_ipxAddress[] = { /* SEQUENCE */ +static const struct field_t _TransportAddress_ipxAddress[] = { /* SEQUENCE */ {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL}, {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, {FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL}, }; -static field_t _TransportAddress_ip6Address[] = { /* SEQUENCE */ +static const struct field_t _TransportAddress_ip6Address[] = { /* SEQUENCE */ {FNAME("ip") OCTSTR, FIXD, 16, 0, DECODE, - offsetof(TransportAddress_ip6Address, ip6), NULL}, + offsetof(TransportAddress_ip6Address, ip), NULL}, {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _H221NonStandard[] = { /* SEQUENCE */ +static const struct field_t _H221NonStandard[] = { /* SEQUENCE */ {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _NonStandardIdentifier[] = { /* CHOICE */ +static const struct field_t _NonStandardIdentifier[] = { /* CHOICE */ {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard}, }; -static field_t _NonStandardParameter[] = { /* SEQUENCE */ +static const struct field_t _NonStandardParameter[] = { /* SEQUENCE */ {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0, _NonStandardIdentifier}, {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _TransportAddress[] = { /* CHOICE */ +static const struct field_t _TransportAddress[] = { /* CHOICE */ {FNAME("ipAddress") SEQ, 0, 2, 2, 
DECODE, offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress}, {FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0, @@ -67,14 +67,15 @@ static field_t _TransportAddress[] = { /* CHOICE */ {FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0, _TransportAddress_ipxAddress}, {FNAME("ip6Address") SEQ, 0, 2, 2, DECODE | EXT, - offsetof(TransportAddress, ip6Address), _TransportAddress_ip6Address}, + offsetof(TransportAddress, ip6Address), + _TransportAddress_ip6Address}, {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, }; -static field_t _AliasAddress[] = { /* CHOICE */ +static const struct field_t _AliasAddress[] = { /* CHOICE */ {FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, NULL}, {FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL}, {FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL}, @@ -84,78 +85,78 @@ static field_t _AliasAddress[] = { /* CHOICE */ {FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL}, }; -static field_t _Setup_UUIE_sourceAddress[] = { /* SEQUENCE OF */ +static const struct field_t _Setup_UUIE_sourceAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _VendorIdentifier[] = { /* SEQUENCE */ +static const struct field_t _VendorIdentifier[] = { /* SEQUENCE */ {FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard}, {FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL}, }; -static field_t _GatekeeperInfo[] = { /* SEQUENCE */ +static const struct field_t _GatekeeperInfo[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, }; -static field_t _H310Caps[] = { /* SEQUENCE */ +static const struct field_t _H310Caps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H320Caps[] = { /* SEQUENCE */ +static const struct field_t _H320Caps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H321Caps[] = { /* SEQUENCE */ +static const struct field_t _H321Caps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H322Caps[] = { /* SEQUENCE */ +static const struct field_t _H322Caps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H323Caps[] = { /* SEQUENCE */ +static const struct field_t _H323Caps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H324Caps[] = { /* SEQUENCE */ +static const struct field_t _H324Caps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | 
OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _VoiceCaps[] = { /* SEQUENCE */ +static const struct field_t _VoiceCaps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _T120OnlyCaps[] = { /* SEQUENCE */ +static const struct field_t _T120OnlyCaps[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _SupportedProtocols[] = { /* CHOICE */ +static const struct field_t _SupportedProtocols[] = { /* CHOICE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, {FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps}, @@ -170,29 +171,29 @@ static field_t _SupportedProtocols[] = { /* CHOICE */ {FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL}, }; -static field_t _GatewayInfo_protocol[] = { /* SEQUENCE OF */ +static const struct field_t _GatewayInfo_protocol[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols}, }; -static field_t _GatewayInfo[] = { /* SEQUENCE */ +static const struct field_t _GatewayInfo[] = { /* SEQUENCE */ {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _GatewayInfo_protocol}, {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, }; -static field_t _McuInfo[] = { /* SEQUENCE */ +static const struct field_t _McuInfo[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, }; -static field_t _TerminalInfo[] = { /* SEQUENCE */ +static const struct field_t _TerminalInfo[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, }; -static field_t _EndpointType[] = { /* SEQUENCE */ +static const struct field_t _EndpointType[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0, @@ -209,19 +210,19 @@ static field_t _EndpointType[] = { /* SEQUENCE */ 0, NULL}, }; -static field_t _Setup_UUIE_destinationAddress[] = { /* SEQUENCE OF */ +static const struct field_t _Setup_UUIE_destinationAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _Setup_UUIE_destExtraCallInfo[] = { /* SEQUENCE OF */ +static const struct field_t _Setup_UUIE_destExtraCallInfo[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _Setup_UUIE_destExtraCRV[] = { /* SEQUENCE OF */ +static const struct field_t _Setup_UUIE_destExtraCRV[] = { /* SEQUENCE OF */ {FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */ +static const struct field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */ {FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -230,12 +231,12 @@ static field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */ 0, NULL}, }; -static field_t _Q954Details[] = { /* SEQUENCE */ +static const 
struct field_t _Q954Details[] = { /* SEQUENCE */ {FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _QseriesOptions[] = { /* SEQUENCE */ +static const struct field_t _QseriesOptions[] = { /* SEQUENCE */ {FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -246,32 +247,32 @@ static field_t _QseriesOptions[] = { /* SEQUENCE */ {FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details}, }; -static field_t _CallType[] = { /* CHOICE */ +static const struct field_t _CallType[] = { /* CHOICE */ {FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _H245_NonStandardIdentifier_h221NonStandard[] = { /* SEQUENCE */ +static const struct field_t _H245_NonStandardIdentifier_h221NonStandard[] = { /* SEQUENCE */ {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _H245_NonStandardIdentifier[] = { /* CHOICE */ +static const struct field_t _H245_NonStandardIdentifier[] = { /* CHOICE */ {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0, _H245_NonStandardIdentifier_h221NonStandard}, }; -static field_t _H245_NonStandardParameter[] = { /* SEQUENCE */ +static const struct field_t _H245_NonStandardParameter[] = { /* SEQUENCE */ {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0, _H245_NonStandardIdentifier}, {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H261VideoCapability[] = { /* SEQUENCE */ +static const struct field_t _H261VideoCapability[] = { /* SEQUENCE */ {FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0, @@ -281,7 +282,7 @@ static field_t _H261VideoCapability[] = { /* SEQUENCE */ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _H262VideoCapability[] = { /* SEQUENCE */ +static const struct field_t _H262VideoCapability[] = { /* SEQUENCE */ {FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -303,7 +304,7 @@ static field_t _H262VideoCapability[] = { /* SEQUENCE */ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _H263VideoCapability[] = { /* SEQUENCE */ +static const struct field_t _H263VideoCapability[] = { /* SEQUENCE */ {FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, @@ -329,7 +330,7 @@ static field_t _H263VideoCapability[] = { /* SEQUENCE */ {FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _IS11172VideoCapability[] = { /* SEQUENCE */ +static const struct field_t _IS11172VideoCapability[] = { /* SEQUENCE */ {FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, @@ -340,7 +341,7 @@ static field_t 
_IS11172VideoCapability[] = { /* SEQUENCE */ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _VideoCapability[] = { /* CHOICE */ +static const struct field_t _VideoCapability[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0, @@ -354,12 +355,12 @@ static field_t _VideoCapability[] = { /* CHOICE */ {FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, }; -static field_t _AudioCapability_g7231[] = { /* SEQUENCE */ +static const struct field_t _AudioCapability_g7231[] = { /* SEQUENCE */ {FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL}, {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _IS11172AudioCapability[] = { /* SEQUENCE */ +static const struct field_t _IS11172AudioCapability[] = { /* SEQUENCE */ {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -371,7 +372,7 @@ static field_t _IS11172AudioCapability[] = { /* SEQUENCE */ {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, }; -static field_t _IS13818AudioCapability[] = { /* SEQUENCE */ +static const struct field_t _IS13818AudioCapability[] = { /* SEQUENCE */ {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -395,7 +396,7 @@ static field_t _IS13818AudioCapability[] = { /* SEQUENCE */ {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, }; -static field_t _AudioCapability[] = { /* CHOICE */ +static const struct field_t _AudioCapability[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, @@ -423,7 +424,7 @@ static field_t _AudioCapability[] = { /* CHOICE */ {FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL}, }; -static field_t _DataProtocolCapability[] = { /* CHOICE */ +static const struct field_t _DataProtocolCapability[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -441,7 +442,7 @@ static field_t _DataProtocolCapability[] = { /* CHOICE */ {FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */ +static const struct field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */ {FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -463,25 +464,25 @@ static field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */ {FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _T84Profile[] = { /* CHOICE */ +static const struct field_t _T84Profile[] = { /* CHOICE */ {FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0, _T84Profile_t84Restricted}, }; -static field_t _DataApplicationCapability_application_t84[] = { /* SEQUENCE */ +static const struct field_t _DataApplicationCapability_application_t84[] = { /* SEQUENCE */ {FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0, _DataProtocolCapability}, {FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile}, }; -static field_t _DataApplicationCapability_application_nlpid[] = { /* SEQUENCE */ +static const struct field_t 
_DataApplicationCapability_application_nlpid[] = { /* SEQUENCE */ {FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0, _DataProtocolCapability}, {FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _DataApplicationCapability_application[] = { /* CHOICE */ +static const struct field_t _DataApplicationCapability_application[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT, @@ -508,20 +509,20 @@ static field_t _DataApplicationCapability_application[] = { /* CHOICE */ {FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, }; -static field_t _DataApplicationCapability[] = { /* SEQUENCE */ +static const struct field_t _DataApplicationCapability[] = { /* SEQUENCE */ {FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT, offsetof(DataApplicationCapability, application), _DataApplicationCapability_application}, {FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL}, }; -static field_t _EncryptionMode[] = { /* CHOICE */ +static const struct field_t _EncryptionMode[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _DataType[] = { /* CHOICE */ +static const struct field_t _DataType[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -537,7 +538,7 @@ static field_t _DataType[] = { /* CHOICE */ {FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, }; -static field_t _H222LogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _H222LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL}, {FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL}, {FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, @@ -545,12 +546,12 @@ static field_t _H222LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL}, }; -static field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = { /* SEQUENCE */ +static const struct field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = { /* SEQUENCE */ {FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL}, {FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL}, }; -static field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE */ +static const struct field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, {FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -564,53 +565,53 @@ static field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE {FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL}, }; -static field_t _H223LogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _H223LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _H223LogicalChannelParameters_adaptationLayerType}, {FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _CRCLength[] = { /* CHOICE */ +static const struct field_t _CRCLength[] = { /* CHOICE */ {FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _V76HDLCParameters[] = { /* SEQUENCE */ +static const struct field_t 
_V76HDLCParameters[] = { /* SEQUENCE */ {FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength}, {FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _V76LogicalChannelParameters_suspendResume[] = { /* CHOICE */ +static const struct field_t _V76LogicalChannelParameters_suspendResume[] = { /* CHOICE */ {FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = { /* CHOICE */ +static const struct field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = { /* CHOICE */ {FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _V76LogicalChannelParameters_mode_eRM[] = { /* SEQUENCE */ +static const struct field_t _V76LogicalChannelParameters_mode_eRM[] = { /* SEQUENCE */ {FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL}, {FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0, _V76LogicalChannelParameters_mode_eRM_recovery}, }; -static field_t _V76LogicalChannelParameters_mode[] = { /* CHOICE */ +static const struct field_t _V76LogicalChannelParameters_mode[] = { /* CHOICE */ {FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0, _V76LogicalChannelParameters_mode_eRM}, {FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _V75Parameters[] = { /* SEQUENCE */ +static const struct field_t _V75Parameters[] = { /* SEQUENCE */ {FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _V76LogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _V76LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0, _V76HDLCParameters}, {FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0, @@ -621,37 +622,38 @@ static field_t _V76LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters}, }; -static field_t _H2250LogicalChannelParameters_nonStandard[] = { /* SEQUENCE OF */ +static const struct field_t _H2250LogicalChannelParameters_nonStandard[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, }; -static field_t _UnicastAddress_iPAddress[] = { /* SEQUENCE */ +static const struct field_t _UnicastAddress_iPAddress[] = { /* SEQUENCE */ {FNAME("network") OCTSTR, FIXD, 4, 0, DECODE, offsetof(UnicastAddress_iPAddress, network), NULL}, {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _UnicastAddress_iPXAddress[] = { /* SEQUENCE */ +static const struct field_t _UnicastAddress_iPXAddress[] = { /* SEQUENCE */ {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL}, {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, {FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL}, }; -static field_t _UnicastAddress_iP6Address[] = { /* SEQUENCE */ - {FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, +static const struct field_t _UnicastAddress_iP6Address[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 16, 0, DECODE, + offsetof(UnicastAddress_iP6Address, network), NULL}, {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _UnicastAddress_iPSourceRouteAddress_routing[] = { /* CHOICE */ +static const struct field_t _UnicastAddress_iPSourceRouteAddress_routing[] = { /* CHOICE */ {FNAME("strict") NUL, FIXD, 0, 
0, SKIP, 0, NULL}, {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _UnicastAddress_iPSourceRouteAddress_route[] = { /* SEQUENCE OF */ +static const struct field_t _UnicastAddress_iPSourceRouteAddress_route[] = { /* SEQUENCE OF */ {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, }; -static field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE */ +static const struct field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE */ {FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0, _UnicastAddress_iPSourceRouteAddress_routing}, {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, @@ -660,13 +662,13 @@ static field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE */ _UnicastAddress_iPSourceRouteAddress_route}, }; -static field_t _UnicastAddress[] = { /* CHOICE */ +static const struct field_t _UnicastAddress[] = { /* CHOICE */ {FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT, offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress}, {FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0, _UnicastAddress_iPXAddress}, - {FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0, - _UnicastAddress_iP6Address}, + {FNAME("iP6Address") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(UnicastAddress, iP6Address), _UnicastAddress_iP6Address}, {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, {FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0, _UnicastAddress_iPSourceRouteAddress}, @@ -674,17 +676,17 @@ static field_t _UnicastAddress[] = { /* CHOICE */ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL}, }; -static field_t _MulticastAddress_iPAddress[] = { /* SEQUENCE */ +static const struct field_t _MulticastAddress_iPAddress[] = { /* SEQUENCE */ {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _MulticastAddress_iP6Address[] = { /* SEQUENCE */ +static const struct field_t _MulticastAddress_iP6Address[] = { /* SEQUENCE */ {FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _MulticastAddress[] = { /* CHOICE */ +static const struct field_t _MulticastAddress[] = { /* CHOICE */ {FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0, _MulticastAddress_iPAddress}, {FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0, @@ -693,14 +695,14 @@ static field_t _MulticastAddress[] = { /* CHOICE */ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL}, }; -static field_t _H245_TransportAddress[] = { /* CHOICE */ +static const struct field_t _H245_TransportAddress[] = { /* CHOICE */ {FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT, offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress}, {FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0, _MulticastAddress}, }; -static field_t _H2250LogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _H2250LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _H2250LogicalChannelParameters_nonStandard}, {FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL}, @@ -726,7 +728,7 @@ static field_t _H2250LogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ +static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0, 
_H222LogicalChannelParameters}, {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0, @@ -740,7 +742,7 @@ static field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexPara {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT, offsetof(OpenLogicalChannel_forwardLogicalChannelParameters, @@ -754,7 +756,7 @@ static field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQU {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, }; -static field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ +static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0, _H223LogicalChannelParameters}, {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0, @@ -765,7 +767,7 @@ static field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexPara h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, }; -static field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType}, {FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT, offsetof(OpenLogicalChannel_reverseLogicalChannelParameters, @@ -776,23 +778,23 @@ static field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQU {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, }; -static field_t _NetworkAccessParameters_distribution[] = { /* CHOICE */ +static const struct field_t _NetworkAccessParameters_distribution[] = { /* CHOICE */ {FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _Q2931Address_address[] = { /* CHOICE */ +static const struct field_t _Q2931Address_address[] = { /* CHOICE */ {FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL}, {FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, }; -static field_t _Q2931Address[] = { /* SEQUENCE */ +static const struct field_t _Q2931Address[] = { /* SEQUENCE */ {FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0, _Q2931Address_address}, {FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL}, }; -static field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE */ +static const struct field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE */ {FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address}, {FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL}, {FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT, @@ -800,7 +802,7 @@ static field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE */ _H245_TransportAddress}, }; -static field_t _NetworkAccessParameters[] = { /* SEQUENCE */ +static const struct field_t _NetworkAccessParameters[] = { /* SEQUENCE */ {FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _NetworkAccessParameters_distribution}, {FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT, @@ -812,7 +814,7 @@ static field_t _NetworkAccessParameters[] = { /* SEQUENCE */ NULL}, }; -static field_t _OpenLogicalChannel[] = 
{ /* SEQUENCE */ +static const struct field_t _OpenLogicalChannel[] = { /* SEQUENCE */ {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT, offsetof(OpenLogicalChannel, forwardLogicalChannelParameters), @@ -827,13 +829,13 @@ static field_t _OpenLogicalChannel[] = { /* SEQUENCE */ {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, }; -static field_t _Setup_UUIE_fastStart[] = { /* SEQUENCE OF */ +static const struct field_t _Setup_UUIE_fastStart[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, sizeof(OpenLogicalChannel), _OpenLogicalChannel} , }; -static field_t _Setup_UUIE[] = { /* SEQUENCE */ +static const struct field_t _Setup_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, offsetof(Setup_UUIE, h245Address), _TransportAddress}, @@ -892,13 +894,13 @@ static field_t _Setup_UUIE[] = { /* SEQUENCE */ NULL}, }; -static field_t _CallProceeding_UUIE_fastStart[] = { /* SEQUENCE OF */ +static const struct field_t _CallProceeding_UUIE_fastStart[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, sizeof(OpenLogicalChannel), _OpenLogicalChannel} , }; -static field_t _CallProceeding_UUIE[] = { /* SEQUENCE */ +static const struct field_t _CallProceeding_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, @@ -918,13 +920,13 @@ static field_t _CallProceeding_UUIE[] = { /* SEQUENCE */ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _Connect_UUIE_fastStart[] = { /* SEQUENCE OF */ +static const struct field_t _Connect_UUIE_fastStart[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, sizeof(OpenLogicalChannel), _OpenLogicalChannel} , }; -static field_t _Connect_UUIE[] = { /* SEQUENCE */ +static const struct field_t _Connect_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, offsetof(Connect_UUIE, h245Address), _TransportAddress}, @@ -952,13 +954,13 @@ static field_t _Connect_UUIE[] = { /* SEQUENCE */ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _Alerting_UUIE_fastStart[] = { /* SEQUENCE OF */ +static const struct field_t _Alerting_UUIE_fastStart[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, sizeof(OpenLogicalChannel), _OpenLogicalChannel} , }; -static field_t _Alerting_UUIE[] = { /* SEQUENCE */ +static const struct field_t _Alerting_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, @@ -984,24 +986,17 @@ static field_t _Alerting_UUIE[] = { /* SEQUENCE */ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _Information_UUIE_fastStart[] = { /* SEQUENCE OF */ - {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, - sizeof(OpenLogicalChannel), _OpenLogicalChannel} - , -}; - -static field_t _Information_UUIE[] = { /* SEQUENCE */ +static const struct field_t _Information_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, 
SKIP | OPT, 0, NULL}, - {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, - offsetof(Information_UUIE, fastStart), _Information_UUIE_fastStart}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, SKIP | OPT, 0, NULL}, {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _ReleaseCompleteReason[] = { /* CHOICE */ +static const struct field_t _ReleaseCompleteReason[] = { /* CHOICE */ {FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -1027,7 +1022,7 @@ static field_t _ReleaseCompleteReason[] = { /* CHOICE */ {FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE */ +static const struct field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0, _ReleaseCompleteReason}, @@ -1044,11 +1039,11 @@ static field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE */ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, }; -static field_t _Facility_UUIE_alternativeAliasAddress[] = { /* SEQUENCE OF */ +static const struct field_t _Facility_UUIE_alternativeAliasAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _FacilityReason[] = { /* CHOICE */ +static const struct field_t _FacilityReason[] = { /* CHOICE */ {FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL}, @@ -1062,13 +1057,13 @@ static field_t _FacilityReason[] = { /* CHOICE */ {FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ +static const struct field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, sizeof(OpenLogicalChannel), _OpenLogicalChannel} , }; -static field_t _Facility_UUIE[] = { /* SEQUENCE */ +static const struct field_t _Facility_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, @@ -1099,17 +1094,17 @@ static field_t _Facility_UUIE[] = { /* SEQUENCE */ NULL}, }; -static field_t _CallIdentifier[] = { /* SEQUENCE */ +static const struct field_t _CallIdentifier[] = { /* SEQUENCE */ {FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, }; -static field_t _SecurityServiceMode[] = { /* CHOICE */ +static const struct field_t _SecurityServiceMode[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _SecurityCapabilities[] = { /* SEQUENCE */ +static const struct field_t _SecurityCapabilities[] = { /* SEQUENCE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0, @@ -1120,30 +1115,30 @@ static field_t _SecurityCapabilities[] = { /* SEQUENCE */ _SecurityServiceMode}, }; -static field_t _H245Security[] = { /* CHOICE */ +static const struct field_t _H245Security[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, 
_NonStandardParameter}, {FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities}, {FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities}, }; -static field_t _DHset[] = { /* SEQUENCE */ +static const struct field_t _DHset[] = { /* SEQUENCE */ {FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, {FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, {FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, }; -static field_t _TypedCertificate[] = { /* SEQUENCE */ +static const struct field_t _TypedCertificate[] = { /* SEQUENCE */ {FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _H235_NonStandardParameter[] = { /* SEQUENCE */ +static const struct field_t _H235_NonStandardParameter[] = { /* SEQUENCE */ {FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _ClearToken[] = { /* SEQUENCE */ +static const struct field_t _ClearToken[] = { /* SEQUENCE */ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("timeStamp") INT, CONS, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("password") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, @@ -1159,120 +1154,120 @@ static field_t _ClearToken[] = { /* SEQUENCE */ {FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, }; -static field_t _Progress_UUIE_tokens[] = { /* SEQUENCE OF */ +static const struct field_t _Progress_UUIE_tokens[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken}, }; -static field_t _Params[] = { /* SEQUENCE */ +static const struct field_t _Params[] = { /* SEQUENCE */ {FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL}, {FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, }; -static field_t _CryptoH323Token_cryptoEPPwdHash_token[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoEPPwdHash_token[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoH323Token_cryptoEPPwdHash[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoEPPwdHash[] = { /* SEQUENCE */ {FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL}, {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, _CryptoH323Token_cryptoEPPwdHash_token}, }; -static field_t _CryptoH323Token_cryptoGKPwdHash_token[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoGKPwdHash_token[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoH323Token_cryptoGKPwdHash[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoGKPwdHash[] = { /* SEQUENCE */ {FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL}, {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, _CryptoH323Token_cryptoGKPwdHash_token}, }; -static field_t _CryptoH323Token_cryptoEPPwdEncr[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoEPPwdEncr[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, 
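The field_t tables in these hunks feed a generic, table-driven ASN.1 PER decoder: each row carries a field name, its type, size bounds, flags (SKIP, DECODE, OPT, EXT, STOP) and, for compound fields, a pointer to the nested descriptor table, so a single recursive walker can parse every H.225/H.245 message. The following is a rough sketch of that technique only; the toy one-byte encoding, type names and flags are invented for illustration and are not the kernel's nf_conntrack_h323_asn1.c API.

#include <stddef.h>
#include <stdint.h>

#define TOY_SKIP	0x01	/* decode the field but discard the value */
#define TOY_DECODE	0x02	/* store the value at 'offset' in the output */

struct toy_field {
	const char *name;
	unsigned int flags;
	size_t offset;			/* destination within the out buffer */
	const struct toy_field *sub;	/* nested table for SEQUENCE fields */
	size_t nsub;
};

/* Walk one descriptor table; every primitive field is one byte here. */
static int toy_decode(const uint8_t **bs, const uint8_t *end,
		      const struct toy_field *f, size_t n, uint8_t *out)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (f[i].sub) {		/* recurse into a nested SEQUENCE */
			if (toy_decode(bs, end, f[i].sub, f[i].nsub,
				       out + f[i].offset) < 0)
				return -1;
			continue;
		}
		if (*bs >= end)
			return -1;	/* truncated message */
		if (f[i].flags & TOY_DECODE)
			out[f[i].offset] = **bs;
		(*bs)++;		/* TOY_SKIP: just advance */
	}
	return 0;
}

/* A table mirroring the shape of _DHset above: only modSize is kept. */
static const struct toy_field toy_dhset[] = {
	{"halfkey", TOY_SKIP, 0, NULL, 0},
	{"modSize", TOY_DECODE, 0, NULL, 0},
	{"generator", TOY_SKIP, 0, NULL, 0},
};

Constifying these arrays (the repeated static field_t -> static const struct field_t change) is safe precisely because the decoder only ever reads its descriptor tables; the data simply moves to read-only memory.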
}; -static field_t _CryptoH323Token_cryptoGKPwdEncr[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoGKPwdEncr[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoH323Token_cryptoEPCert[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoEPCert[] = { /* SEQUENCE */ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoH323Token_cryptoGKCert[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoGKCert[] = { /* SEQUENCE */ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoH323Token_cryptoFastStart[] = { /* SEQUENCE */ +static const struct field_t _CryptoH323Token_cryptoFastStart[] = { /* SEQUENCE */ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoToken_cryptoEncryptedToken_token[] = { /* SEQUENCE */ +static const struct field_t _CryptoToken_cryptoEncryptedToken_token[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoToken_cryptoEncryptedToken[] = { /* SEQUENCE */ +static const struct field_t _CryptoToken_cryptoEncryptedToken[] = { /* SEQUENCE */ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, _CryptoToken_cryptoEncryptedToken_token}, }; -static field_t _CryptoToken_cryptoSignedToken_token[] = { /* SEQUENCE */ +static const struct field_t _CryptoToken_cryptoSignedToken_token[] = { /* SEQUENCE */ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoToken_cryptoSignedToken[] = { /* SEQUENCE */ +static const struct field_t _CryptoToken_cryptoSignedToken[] = { /* SEQUENCE */ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("token") SEQ, 0, 4, 4, SKIP, 0, _CryptoToken_cryptoSignedToken_token}, }; -static field_t _CryptoToken_cryptoHashedToken_token[] = { /* SEQUENCE */ +static const struct field_t _CryptoToken_cryptoHashedToken_token[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoToken_cryptoHashedToken[] = { /* SEQUENCE */ +static const struct field_t _CryptoToken_cryptoHashedToken[] = { /* SEQUENCE */ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken}, {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, _CryptoToken_cryptoHashedToken_token}, }; -static field_t _CryptoToken_cryptoPwdEncr[] = { /* SEQUENCE */ +static const 
struct field_t _CryptoToken_cryptoPwdEncr[] = { /* SEQUENCE */ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, }; -static field_t _CryptoToken[] = { /* CHOICE */ +static const struct field_t _CryptoToken[] = { /* CHOICE */ {FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0, _CryptoToken_cryptoEncryptedToken}, {FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0, @@ -1283,7 +1278,7 @@ static field_t _CryptoToken[] = { /* CHOICE */ _CryptoToken_cryptoPwdEncr}, }; -static field_t _CryptoH323Token[] = { /* CHOICE */ +static const struct field_t _CryptoH323Token[] = { /* CHOICE */ {FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0, _CryptoH323Token_cryptoEPPwdHash}, {FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0, @@ -1302,17 +1297,17 @@ static field_t _CryptoH323Token[] = { /* CHOICE */ _CryptoToken}, }; -static field_t _Progress_UUIE_cryptoTokens[] = { /* SEQUENCE OF */ +static const struct field_t _Progress_UUIE_cryptoTokens[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token}, }; -static field_t _Progress_UUIE_fastStart[] = { /* SEQUENCE OF */ +static const struct field_t _Progress_UUIE_fastStart[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, sizeof(OpenLogicalChannel), _OpenLogicalChannel} , }; -static field_t _Progress_UUIE[] = { /* SEQUENCE */ +static const struct field_t _Progress_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, @@ -1333,7 +1328,7 @@ static field_t _Progress_UUIE[] = { /* SEQUENCE */ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, }; -static field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */ +static const struct field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */ {FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT, offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE}, {FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT, @@ -1343,9 +1338,7 @@ static field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */ offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE}, {FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT, offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE}, - {FNAME("information") SEQ, 0, 1, 7, DECODE | EXT, - offsetof(H323_UU_PDU_h323_message_body, information), - _Information_UUIE}, + {FNAME("information") SEQ, 0, 1, 7, SKIP | EXT, 0, _Information_UUIE}, {FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0, _ReleaseComplete_UUIE}, {FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT, @@ -1359,7 +1352,7 @@ static field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */ {FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, }; -static field_t _RequestMessage[] = { /* CHOICE */ +static const struct field_t _RequestMessage[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, {FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, {FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL}, @@ -1379,7 +1372,7 @@ static field_t _RequestMessage[] = { /* CHOICE */ NULL}, }; -static field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ +static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0, 
_H222LogicalChannelParameters}, {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT, @@ -1388,7 +1381,7 @@ static field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexP h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, }; -static field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* SEQUENCE */ +static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* SEQUENCE */ {FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, {FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT, @@ -1398,11 +1391,11 @@ static field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* S {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, }; -static field_t _H2250LogicalChannelAckParameters_nonStandard[] = { /* SEQUENCE OF */ +static const struct field_t _H2250LogicalChannelAckParameters_nonStandard[] = { /* SEQUENCE OF */ {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, }; -static field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE */ +static const struct field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE */ {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _H2250LogicalChannelAckParameters_nonStandard}, {FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL}, @@ -1417,20 +1410,22 @@ static field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE */ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, }; -static field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = { /* CHOICE */ +static const struct field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = { /* CHOICE */ {FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT, offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters, h2250LogicalChannelAckParameters), _H2250LogicalChannelAckParameters}, }; -static field_t _OpenLogicalChannelAck[] = { /* SEQUENCE */ +static const struct field_t _OpenLogicalChannelAck[] = { /* SEQUENCE */ {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4, DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck, reverseLogicalChannelParameters), _OpenLogicalChannelAck_reverseLogicalChannelParameters}, - {FNAME("separateStack") SEQ, 2, 4, 5, SKIP | EXT | OPT, 0, NULL}, + {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT, + offsetof(OpenLogicalChannelAck, separateStack), + _NetworkAccessParameters}, {FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1, DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck, forwardMultiplexAckParameters), @@ -1438,7 +1433,7 @@ static field_t _OpenLogicalChannelAck[] = { /* SEQUENCE */ {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, }; -static field_t _ResponseMessage[] = { /* CHOICE */ +static const struct field_t _ResponseMessage[] = { /* CHOICE */ {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, {FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, @@ -1474,7 +1469,7 @@ static field_t _ResponseMessage[] = { /* CHOICE */ {FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL}, }; -static field_t _MultimediaSystemControlMessage[] = { /* CHOICE */ +static const struct field_t _MultimediaSystemControlMessage[] = { /* CHOICE */ {FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT, offsetof(MultimediaSystemControlMessage, request), _RequestMessage}, {FNAME("response") 
CHOICE, 5, 19, 24, DECODE | EXT, @@ -1484,14 +1479,14 @@ static field_t _MultimediaSystemControlMessage[] = { /* CHOICE */ {FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL}, }; -static field_t _H323_UU_PDU_h245Control[] = { /* SEQUENCE OF */ +static const struct field_t _H323_UU_PDU_h245Control[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT, sizeof(MultimediaSystemControlMessage), _MultimediaSystemControlMessage} , }; -static field_t _H323_UU_PDU[] = { /* SEQUENCE */ +static const struct field_t _H323_UU_PDU[] = { /* SEQUENCE */ {FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT, offsetof(H323_UU_PDU, h323_message_body), _H323_UU_PDU_h323_message_body}, @@ -1512,13 +1507,13 @@ static field_t _H323_UU_PDU[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _H323_UserInformation[] = { /* SEQUENCE */ +static const struct field_t _H323_UserInformation[] = { /* SEQUENCE */ {FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT, offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU}, {FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL}, }; -static field_t _GatekeeperRequest[] = { /* SEQUENCE */ +static const struct field_t _GatekeeperRequest[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, @@ -1542,7 +1537,7 @@ static field_t _GatekeeperRequest[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _GatekeeperConfirm[] = { /* SEQUENCE */ +static const struct field_t _GatekeeperConfirm[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, @@ -1562,23 +1557,23 @@ static field_t _GatekeeperConfirm[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _RegistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ +static const struct field_t _RegistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, sizeof(TransportAddress), _TransportAddress} , }; -static field_t _RegistrationRequest_rasAddress[] = { /* SEQUENCE OF */ +static const struct field_t _RegistrationRequest_rasAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, sizeof(TransportAddress), _TransportAddress} , }; -static field_t _RegistrationRequest_terminalAlias[] = { /* SEQUENCE OF */ +static const struct field_t _RegistrationRequest_terminalAlias[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _RegistrationRequest[] = { /* SEQUENCE */ +static const struct field_t _RegistrationRequest[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, @@ -1626,17 +1621,17 @@ static field_t _RegistrationRequest[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _RegistrationConfirm_callSignalAddress[] = { /* SEQUENCE OF */ +static const struct field_t _RegistrationConfirm_callSignalAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, sizeof(TransportAddress), _TransportAddress} , }; -static field_t 
_RegistrationConfirm_terminalAlias[] = { /* SEQUENCE OF */ +static const struct field_t _RegistrationConfirm_terminalAlias[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _RegistrationConfirm[] = { /* SEQUENCE */ +static const struct field_t _RegistrationConfirm[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, @@ -1672,13 +1667,13 @@ static field_t _RegistrationConfirm[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _UnregistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ +static const struct field_t _UnregistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, sizeof(TransportAddress), _TransportAddress} , }; -static field_t _UnregistrationRequest[] = { /* SEQUENCE */ +static const struct field_t _UnregistrationRequest[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, offsetof(UnregistrationRequest, callSignalAddress), @@ -1699,24 +1694,24 @@ static field_t _UnregistrationRequest[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _CallModel[] = { /* CHOICE */ +static const struct field_t _CallModel[] = { /* CHOICE */ {FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL}, {FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL}, }; -static field_t _AdmissionRequest_destinationInfo[] = { /* SEQUENCE OF */ +static const struct field_t _AdmissionRequest_destinationInfo[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _AdmissionRequest_destExtraCallInfo[] = { /* SEQUENCE OF */ +static const struct field_t _AdmissionRequest_destExtraCallInfo[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _AdmissionRequest_srcInfo[] = { /* SEQUENCE OF */ +static const struct field_t _AdmissionRequest_srcInfo[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _AdmissionRequest[] = { /* SEQUENCE */ +static const struct field_t _AdmissionRequest[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType}, {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel}, @@ -1760,7 +1755,7 @@ static field_t _AdmissionRequest[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _AdmissionConfirm[] = { /* SEQUENCE */ +static const struct field_t _AdmissionConfirm[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL}, {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel}, @@ -1795,11 +1790,11 @@ static field_t _AdmissionConfirm[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _LocationRequest_destinationInfo[] = { /* SEQUENCE OF */ +static const struct field_t _LocationRequest_destinationInfo[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, }; -static field_t _LocationRequest[] = { /* SEQUENCE */ +static const struct field_t _LocationRequest[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, 
SKIP, 0, NULL}, {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0, @@ -1823,7 +1818,7 @@ static field_t _LocationRequest[] = { /* SEQUENCE */ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, }; -static field_t _LocationConfirm[] = { /* SEQUENCE */ +static const struct field_t _LocationConfirm[] = { /* SEQUENCE */ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, {FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT, offsetof(LocationConfirm, callSignalAddress), _TransportAddress}, @@ -1849,13 +1844,13 @@ static field_t _LocationConfirm[] = { /* SEQUENCE */ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _InfoRequestResponse_callSignalAddress[] = { /* SEQUENCE OF */ +static const struct field_t _InfoRequestResponse_callSignalAddress[] = { /* SEQUENCE OF */ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, sizeof(TransportAddress), _TransportAddress} , }; -static field_t _InfoRequestResponse[] = { /* SEQUENCE */ +static const struct field_t _InfoRequestResponse[] = { /* SEQUENCE */ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, _NonStandardParameter}, {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, @@ -1878,7 +1873,7 @@ static field_t _InfoRequestResponse[] = { /* SEQUENCE */ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, }; -static field_t _RasMessage[] = { /* CHOICE */ +static const struct field_t _RasMessage[] = { /* CHOICE */ {FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT, offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest}, {FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT, diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 0743be4434b..5b3eae7d4c9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -15,141 +16,487 @@ #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_log.h> -static __read_mostly LIST_HEAD(helpers); +static DEFINE_MUTEX(nf_ct_helper_mutex); +struct hlist_head *nf_ct_helper_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_helper_hash); +unsigned int nf_ct_helper_hsize __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); +static unsigned int nf_ct_helper_count __read_mostly; -struct nf_conntrack_helper * +static bool nf_ct_auto_assign_helper __read_mostly = true; +module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); +MODULE_PARM_DESC(nf_conntrack_helper, + "Enable automatic conntrack helper assignment (default 1)"); + +#ifdef CONFIG_SYSCTL +static struct ctl_table helper_sysctl_table[] = { + { + .procname = 
"nf_conntrack_helper", + .data = &init_net.ct.sysctl_auto_assign_helper, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_auto_assign_helper; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.helper_sysctl_header = + register_net_sysctl(net, "net/netfilter", table); + + if (!net->ct.helper_sysctl_header) { + pr_err("nf_conntrack_helper: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.helper_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.helper_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +/* Stupid hash, but collision free for the default registrations of the + * helpers currently in the kernel. */ +static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) +{ + return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^ + (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; +} + +static struct nf_conntrack_helper * __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_helper *h; + struct nf_conntrack_helper *helper; + struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; + unsigned int h; + + if (!nf_ct_helper_count) + return NULL; - list_for_each_entry(h, &helpers, list) { - if (nf_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask)) - return h; + h = helper_hash(tuple); + hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) + return helper; } return NULL; } struct nf_conntrack_helper * -nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple) +__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { - struct nf_conntrack_helper *helper; - - /* need nf_conntrack_lock to assure that helper exists until - * try_module_get() is called */ - read_lock_bh(&nf_conntrack_lock); + struct nf_conntrack_helper *h; + unsigned int i; - helper = __nf_ct_helper_find(tuple); - if (helper) { - /* need to increase module usage count to assure helper will - * not go away while the caller is e.g. 
busy putting a - * conntrack in the hash that uses the helper */ - if (!try_module_get(helper->me)) - helper = NULL; + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(h->name, name) && + h->tuple.src.l3num == l3num && + h->tuple.dst.protonum == protonum) + return h; + } } + return NULL; +} +EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); + +struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; - read_unlock_bh(&nf_conntrack_lock); + h = __nf_conntrack_helper_find(name, l3num, protonum); +#ifdef CONFIG_MODULES + if (h == NULL) { + if (request_module("nfct-helper-%s", name) == 0) + h = __nf_conntrack_helper_find(name, l3num, protonum); + } +#endif + if (h != NULL && !try_module_get(h->me)) + h = NULL; - return helper; + return h; } -EXPORT_SYMBOL_GPL(nf_ct_helper_find_get); +EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); -void nf_ct_helper_put(struct nf_conntrack_helper *helper) +struct nf_conn_help * +nf_ct_helper_ext_add(struct nf_conn *ct, + struct nf_conntrack_helper *helper, gfp_t gfp) { - module_put(helper->me); + struct nf_conn_help *help; + + help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER, + helper->data_len, gfp); + if (help) + INIT_HLIST_HEAD(&help->expectations); + else + pr_debug("failed to add helper extension area"); + return help; } -EXPORT_SYMBOL_GPL(nf_ct_helper_put); +EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -struct nf_conntrack_helper * -__nf_conntrack_helper_find_byname(const char *name) +int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags) { - struct nf_conntrack_helper *h; + struct nf_conntrack_helper *helper = NULL; + struct nf_conn_help *help; + struct net *net = nf_ct_net(ct); + int ret = 0; + + /* We already got a helper explicitly attached. The function + * nf_conntrack_alter_reply - in case NAT is in use - asks for looking + * the helper up again. Since now the user is in full control of + * making consistent helper configurations, skip this automatic + * re-lookup, otherwise we'll lose the helper. + */ + if (test_bit(IPS_HELPER_BIT, &ct->status)) + return 0; - list_for_each_entry(h, &helpers, list) { - if (!strcmp(h->name, name)) - return h; + if (tmpl != NULL) { + help = nfct_help(tmpl); + if (help != NULL) { + helper = help->helper; + set_bit(IPS_HELPER_BIT, &ct->status); + } } - return NULL; + help = nfct_help(ct); + if (net->ct.sysctl_auto_assign_helper && helper == NULL) { + helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (unlikely(!net->ct.auto_assign_helper_warned && helper)) { + pr_info("nf_conntrack: automatic helper " + "assignment is deprecated and it will " + "be removed soon. Use the iptables CT target " + "to attach helpers instead.\n"); + net->ct.auto_assign_helper_warned = true; + } + } + + if (helper == NULL) { + if (help) + RCU_INIT_POINTER(help->helper, NULL); + goto out; + } + + if (help == NULL) { + help = nf_ct_helper_ext_add(ct, helper, flags); + if (help == NULL) { + ret = -ENOMEM; + goto out; + } + } else { + /* We only allow helper re-assignment of the same sort since + * we cannot reallocate the helper extension area. 
+ */ + struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); + + if (tmp && tmp->help != helper->help) { + RCU_INIT_POINTER(help->helper, NULL); + goto out; + } + } + + rcu_assign_pointer(help->helper, helper); +out: + return ret; } -EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find_byname); +EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* appropriate ct lock protection must be taken by caller */ static inline int unhelp(struct nf_conntrack_tuple_hash *i, const struct nf_conntrack_helper *me) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); - if (help && help->helper == me) { + if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); - help->helper = NULL; + RCU_INIT_POINTER(help->helper, NULL); } return 0; } -int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +void nf_ct_helper_destroy(struct nf_conn *ct) { - int size, ret; + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; - BUG_ON(me->timeout == 0); + if (help) { + rcu_read_lock(); + helper = rcu_dereference(help->helper); + if (helper && helper->destroy) + helper->destroy(ct); + rcu_read_unlock(); + } +} - size = ALIGN(sizeof(struct nf_conn), __alignof__(struct nf_conn_help)) + - sizeof(struct nf_conn_help); - ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help", - size); - if (ret < 0) { - printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n"); - return ret; +static LIST_HEAD(nf_ct_helper_expectfn_list); + +void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) +{ + spin_lock_bh(&nf_conntrack_expect_lock); + list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); + +void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) +{ + spin_lock_bh(&nf_conntrack_expect_lock); + list_del_rcu(&n->head); + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_name(const char *name) +{ + struct nf_ct_helper_expectfn *cur; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { + if (!strcmp(cur->name, name)) { + found = true; + break; + } } - write_lock_bh(&nf_conntrack_lock); - list_add(&me->list, &helpers); - write_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); + return found ? cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); - return 0; +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_symbol(const void *symbol) +{ + struct nf_ct_helper_expectfn *cur; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { + if (cur->expectfn == symbol) { + found = true; + break; + } + } + rcu_read_unlock(); + return found ? cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); + +__printf(3, 4) +void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, + const char *fmt, ...)
+{ + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + /* Called from the helper function, this call never fails */ + help = nfct_help(ct); + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + + nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, + "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_log); + +int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +{ + int ret = 0; + struct nf_conntrack_helper *cur; + unsigned int h = helper_hash(&me->tuple); + + BUG_ON(me->expect_policy == NULL); + BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); + BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + + mutex_lock(&nf_ct_helper_mutex); + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 && + cur->tuple.src.l3num == me->tuple.src.l3num && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } + hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); + nf_ct_helper_count++; +out: + mutex_unlock(&nf_ct_helper_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); -void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, + struct net *net) { - unsigned int i; struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_expect *exp, *tmp; - - /* Need write lock here, to delete helper. */ - write_lock_bh(&nf_conntrack_lock); - list_del(&me->list); + struct nf_conntrack_expect *exp; + const struct hlist_node *next; + const struct hlist_nulls_node *nn; + unsigned int i; + int cpu; /* Get rid of expectations */ - list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) { - struct nf_conn_help *help = nfct_help(exp->master); - if ((help->helper == me || exp->helper == me) && - del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); - nf_conntrack_expect_put(exp); + spin_lock_bh(&nf_conntrack_expect_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &net->ct.expect_hash[i], hnode) { + struct nf_conn_help *help = nfct_help(exp->master); + if ((rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_expect_lock) + ) == me || exp->helper == me) && + del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } } } + spin_unlock_bh(&nf_conntrack_expect_lock); /* Get rid of expecteds, set helpers to NULL. 
*/ - list_for_each_entry(h, &unconfirmed, list) - unhelp(h, me); - for (i = 0; i < nf_conntrack_htable_size; i++) { - list_for_each_entry(h, &nf_conntrack_hash[i], list) + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) unhelp(h, me); + spin_unlock_bh(&pcpu->lock); + } + local_bh_disable(); + for (i = 0; i < net->ct.htable_size; i++) { + spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + if (i < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + unhelp(h, me); + } + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); } - write_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + struct net *net; - /* Someone could be still looking at the helper in a bh. */ - synchronize_net(); + mutex_lock(&nf_ct_helper_mutex); + hlist_del_rcu(&me->hnode); + nf_ct_helper_count--; + mutex_unlock(&nf_ct_helper_mutex); + + /* Make sure nothing is still using the helper unless it's a + * connection in the hash. + */ + synchronize_rcu(); + + rtnl_lock(); + for_each_net(net) + __nf_conntrack_helper_unregister(me, net); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); + +static struct nf_ct_ext_type helper_extend __read_mostly = { + .len = sizeof(struct nf_conn_help), + .align = __alignof__(struct nf_conn_help), + .id = NF_CT_EXT_HELPER, +}; + +int nf_conntrack_helper_pernet_init(struct net *net) +{ + net->ct.auto_assign_helper_warned = false; + net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; + return nf_conntrack_helper_init_sysctl(net); +} + +void nf_conntrack_helper_pernet_fini(struct net *net) +{ + nf_conntrack_helper_fini_sysctl(net); +} + +int nf_conntrack_helper_init(void) +{ + int ret; + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ + nf_ct_helper_hash = + nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); + if (!nf_ct_helper_hash) + return -ENOMEM; + + ret = nf_ct_extend_register(&helper_extend); + if (ret < 0) { + pr_err("nf_ct_helper: Unable to register helper extension.\n"); + goto out_extend; + } + + return 0; +out_extend: + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + return ret; +} + +void nf_conntrack_helper_fini(void) +{ + nf_ct_extend_unregister(&helper_extend); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); +} diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 43ccd0e2e8a..0fd2976db7e 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -1,6 +1,7 @@ /* IRC extension for IP connection tracking, Version 1.21 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> * based on RR's ip_conntrack_ftp.c + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -12,8 +13,10 @@ #include <linux/moduleparam.h> #include <linux/skbuff.h> #include <linux/in.h> +#include <linux/ip.h> #include <linux/tcp.h> #include <linux/netfilter.h> +#include <linux/slab.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_expect.h> @@ -22,15 +25,16 @@ #define MAX_PORTS 8 static unsigned short ports[MAX_PORTS]; -static int ports_c; +static unsigned int ports_c; static unsigned int max_dcc_channels = 8; static unsigned int dcc_timeout __read_mostly = 300; /* This is
slow, but it's simple. --RR */ static char *irc_buffer; static DEFINE_SPINLOCK(irc_buffer_lock); -unsigned int (*nf_nat_irc_hook)(struct sk_buff **pskb, +unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned int matchoff, unsigned int matchlen, struct nf_conntrack_expect *exp) __read_mostly; @@ -40,6 +44,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_irc"); +MODULE_ALIAS_NFCT_HELPER("irc"); module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); @@ -49,19 +54,12 @@ MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per " module_param(dcc_timeout, uint, 0400); MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); -static const char *dccprotos[] = { +static const char *const dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; #define MINMATCHLEN 5 -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \ - __FILE__, __FUNCTION__ , ## args) -#else -#define DEBUGP(format, args...) -#endif - /* tries to get the ip_addr and port out of a dcc command * return value: -1 on failure, 0 on success * data pointer to first byte of DCC command data @@ -71,16 +69,26 @@ static const char *dccprotos[] = { * ad_beg_p returns pointer to first byte of addr data * ad_end_p returns pointer to last byte of addr data */ -static int parse_dcc(char *data, char *data_end, u_int32_t *ip, +static int parse_dcc(char *data, const char *data_end, __be32 *ip, u_int16_t *port, char **ad_beg_p, char **ad_end_p) { + char *tmp; + /* at least 12: "AAAAAAAA P\1\n" */ while (*data++ != ' ') if (data > data_end - 12) return -1; + /* Make sure we have a newline character within the packet boundaries + * because simple_strtoul parses until the first invalid character. */ + for (tmp = data; tmp <= data_end; tmp++) + if (*tmp == '\n') + break; + if (tmp > data_end || *tmp != '\n') + return -1; + *ad_beg_p = data; - *ip = simple_strtoul(data, &data, 10); + *ip = cpu_to_be32(simple_strtoul(data, &data, 10)); /* skip blanks between ip and port */ while (*data == ' ') { @@ -95,16 +103,19 @@ static int parse_dcc(char *data, char *data_end, u_int32_t *ip, return 0; } -static int help(struct sk_buff **pskb, unsigned int protoff, +static int help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { unsigned int dataoff; - struct tcphdr _tcph, *th; - char *data, *data_limit, *ib_ptr; + const struct iphdr *iph; + const struct tcphdr *th; + struct tcphdr _tcph; + const char *data_limit; + char *data, *ib_ptr; int dir = CTINFO2DIR(ctinfo); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple *tuple; - u_int32_t dcc_ip; + __be32 dcc_ip; u_int16_t dcc_port; __be16 port; int i, ret = NF_ACCEPT; @@ -116,27 +127,26 @@ static int help(struct sk_buff **pskb, unsigned int protoff, return NF_ACCEPT; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; /* Not a full tcp header? */ - th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) return NF_ACCEPT; /* No data? 
*/ dataoff = protoff + th->doff*4; - if (dataoff >= (*pskb)->len) + if (dataoff >= skb->len) return NF_ACCEPT; spin_lock_bh(&irc_buffer_lock); - ib_ptr = skb_header_pointer(*pskb, dataoff, (*pskb)->len - dataoff, + ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, irc_buffer); BUG_ON(ib_ptr == NULL); data = ib_ptr; - data_limit = ib_ptr + (*pskb)->len - dataoff; + data_limit = ib_ptr + skb->len - dataoff; /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ @@ -148,9 +158,10 @@ static int help(struct sk_buff **pskb, unsigned int protoff, data += 5; /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ - DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest)); + iph = ip_hdr(skb); + pr_debug("DCC found in master %pI4:%u %pI4:%u\n", + &iph->saddr, ntohs(th->source), + &iph->daddr, ntohs(th->dest)); for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { @@ -158,52 +169,56 @@ static int help(struct sk_buff **pskb, unsigned int protoff, continue; } data += strlen(dccprotos[i]); - DEBUGP("DCC %s detected\n", dccprotos[i]); + pr_debug("DCC %s detected\n", dccprotos[i]); /* we have at least * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid * data left (== 14/13 bytes) */ - if (parse_dcc((char *)data, data_limit, &dcc_ip, + if (parse_dcc(data, data_limit, &dcc_ip, &dcc_port, &addr_beg_p, &addr_end_p)) { - DEBUGP("unable to parse dcc command\n"); + pr_debug("unable to parse dcc command\n"); continue; } - DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n", - HIPQUAD(dcc_ip), dcc_port); + + pr_debug("DCC bound ip/port: %pI4:%u\n", + &dcc_ip, dcc_port); /* dcc_ip can be the internal OR external (NAT'ed) IP */ tuple = &ct->tuplehash[dir].tuple; - if (tuple->src.u3.ip != htonl(dcc_ip) && - tuple->dst.u3.ip != htonl(dcc_ip)) { - if (net_ratelimit()) - printk(KERN_WARNING - "Forged DCC command from " - "%u.%u.%u.%u: %u.%u.%u.%u:%u\n", - NIPQUAD(tuple->src.u3.ip), - HIPQUAD(dcc_ip), dcc_port); + if (tuple->src.u3.ip != dcc_ip && + tuple->dst.u3.ip != dcc_ip) { + net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", + &tuple->src.u3.ip, + &dcc_ip, dcc_port); continue; } - exp = nf_conntrack_expect_alloc(ct); + exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, + "cannot alloc expectation"); ret = NF_DROP; goto out; } tuple = &ct->tuplehash[!dir].tuple; port = htons(dcc_port); - nf_conntrack_expect_init(exp, tuple->src.l3num, - NULL, &tuple->dst.u3, - IPPROTO_TCP, NULL, &port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + tuple->src.l3num, + NULL, &tuple->dst.u3, + IPPROTO_TCP, NULL, &port); nf_nat_irc = rcu_dereference(nf_nat_irc_hook); if (nf_nat_irc && ct->status & IPS_NAT_MASK) - ret = nf_nat_irc(pskb, ctinfo, + ret = nf_nat_irc(skb, ctinfo, protoff, addr_beg_p - ib_ptr, addr_end_p - addr_beg_p, exp); - else if (nf_conntrack_expect_related(exp) != 0) + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, + "cannot add expectation"); ret = NF_DROP; - nf_conntrack_expect_put(exp); + } + nf_ct_expect_put(exp); goto out; } } @@ -213,20 +228,22 @@ static int help(struct sk_buff **pskb, unsigned int protoff, } static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; -static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly; +static struct nf_conntrack_expect_policy irc_exp_policy; static void nf_conntrack_irc_fini(void); static int 
__init nf_conntrack_irc_init(void) { int i, ret; - char *tmpname; if (max_dcc_channels < 1) { - printk("nf_ct_irc: max_dcc_channels must not be zero\n"); + printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n"); return -EINVAL; } + irc_exp_policy.max_expected = max_dcc_channels; + irc_exp_policy.timeout = dcc_timeout; + irc_buffer = kmalloc(65536, GFP_KERNEL); if (!irc_buffer) return -ENOMEM; @@ -239,24 +256,18 @@ static int __init nf_conntrack_irc_init(void) irc[i].tuple.src.l3num = AF_INET; irc[i].tuple.src.u.tcp.port = htons(ports[i]); irc[i].tuple.dst.protonum = IPPROTO_TCP; - irc[i].mask.src.l3num = 0xFFFF; - irc[i].mask.src.u.tcp.port = htons(0xFFFF); - irc[i].mask.dst.protonum = 0xFF; - irc[i].max_expected = max_dcc_channels; - irc[i].timeout = dcc_timeout; + irc[i].expect_policy = &irc_exp_policy; irc[i].me = THIS_MODULE; irc[i].help = help; - tmpname = &irc_names[i][0]; if (ports[i] == IRC_PORT) - sprintf(tmpname, "irc"); + sprintf(irc[i].name, "irc"); else - sprintf(tmpname, "irc-%u", i); - irc[i].name = tmpname; + sprintf(irc[i].name, "irc-%u", i); ret = nf_conntrack_helper_register(&irc[i]); if (ret) { - printk("nf_ct_irc: failed to register helper " + printk(KERN_ERR "nf_ct_irc: failed to register helper " "for pf: %u port: %u\n", irc[i].tuple.src.l3num, ports[i]); nf_conntrack_irc_fini(); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index cbd96f3c1b8..e7eb807fe07 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -31,28 +31,22 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) +static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) { memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - return 1; + return true; } -static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - return 1; + return true; } static int generic_print_tuple(struct seq_file *s, @@ -61,35 +55,20 @@ static int generic_print_tuple(struct seq_file *s, return 0; } -static int generic_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - -static int -generic_prepare(struct sk_buff **pskb, unsigned int hooknum, - unsigned int *dataoff, u_int8_t *protonum) +static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + unsigned int *dataoff, u_int8_t *protonum) { /* Never track !!! 
*/ return -NF_ACCEPT; } -static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple) - -{ - return NF_CT_F_BASIC; -} - -struct nf_conntrack_l3proto nf_conntrack_l3proto_generic = { +struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, - .print_conntrack = generic_print_conntrack, - .prepare = generic_prepare, - .get_features = generic_get_features, + .get_l4proto = generic_get_l4proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c new file mode 100644 index 00000000000..bb53f120e79 --- /dev/null +++ b/net/netfilter/nf_conntrack_labels.c @@ -0,0 +1,108 @@ +/* + * test/set flag bits stored in conntrack extension area. + * + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/export.h> +#include <linux/types.h> + +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +static unsigned int label_bits(const struct nf_conn_labels *l) +{ + unsigned int longs = l->words; + return longs * BITS_PER_LONG; +} + +bool nf_connlabel_match(const struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return false; + + return bit < label_bits(labels) && test_bit(bit, labels->bits); +} +EXPORT_SYMBOL_GPL(nf_connlabel_match); + +int nf_connlabel_set(struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels || bit >= label_bits(labels)) + return -ENOSPC; + + if (test_bit(bit, labels->bits)) + return 0; + + if (!test_and_set_bit(bit, labels->bits)) + nf_conntrack_event_cache(IPCT_LABEL, ct); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabel_set); + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) +static void replace_u32(u32 *address, u32 mask, u32 new) +{ + u32 old, tmp; + + do { + old = *address; + tmp = (old & mask) ^ new; + } while (cmpxchg(address, old, tmp) != old); +} + +int nf_connlabels_replace(struct nf_conn *ct, + const u32 *data, + const u32 *mask, unsigned int words32) +{ + struct nf_conn_labels *labels; + unsigned int size, i; + u32 *dst; + + labels = nf_ct_labels_find(ct); + if (!labels) + return -ENOSPC; + + size = labels->words * sizeof(long); + if (size < (words32 * sizeof(u32))) + words32 = size / sizeof(u32); + + dst = (u32 *) labels->bits; + if (words32) { + for (i = 0; i < words32; i++) + replace_u32(&dst[i], mask ? 
~mask[i] : 0, data[i]); + } + + size /= sizeof(u32); + for (i = words32; i < size; i++) /* pad */ + replace_u32(&dst[i], 0, 0); + + nf_conntrack_event_cache(IPCT_LABEL, ct); + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_replace); +#endif + +static struct nf_ct_ext_type labels_extend __read_mostly = { + .len = sizeof(struct nf_conn_labels), + .align = __alignof__(struct nf_conn_labels), + .id = NF_CT_EXT_LABELS, +}; + +int nf_conntrack_labels_init(void) +{ + return nf_ct_extend_register(&labels_extend); +} + +void nf_conntrack_labels_fini(void) +{ + nf_ct_extend_unregister(&labels_extend); +} diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 1093478cc00..4c8f30a3d6d 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -18,14 +18,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_addr.h> #include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/route.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> @@ -37,84 +30,35 @@ MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_netbios_ns"); +MODULE_ALIAS_NFCT_HELPER("netbios_ns"); static unsigned int timeout __read_mostly = 3; -module_param(timeout, uint, 0400); +module_param(timeout, uint, S_IRUSR); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); -static int help(struct sk_buff **pskb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) -{ - struct nf_conntrack_expect *exp; - struct iphdr *iph = ip_hdr(*pskb); - struct rtable *rt = (struct rtable *)(*pskb)->dst; - struct in_device *in_dev; - __be32 mask = 0; - - /* we're only interested in locally generated packets */ - if ((*pskb)->sk == NULL) - goto out; - if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) - goto out; - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - goto out; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); - if (in_dev != NULL) { - for_primary_ifa(in_dev) { - if (ifa->ifa_broadcast == iph->daddr) { - mask = ifa->ifa_mask; - break; - } - } endfor_ifa(in_dev); - } - rcu_read_unlock(); - - if (mask == 0) - goto out; - - exp = nf_conntrack_expect_alloc(ct); - if (exp == NULL) - goto out; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->tuple.src.u.udp.port = htons(NMBD_PORT); - - exp->mask.src.u3.ip = mask; - exp->mask.src.u.udp.port = htons(0xFFFF); - exp->mask.dst.u3.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.udp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - - exp->expectfn = NULL; - exp->flags = NF_CT_EXPECT_PERMANENT; - exp->helper = NULL; - - nf_conntrack_expect_related(exp); - nf_conntrack_expect_put(exp); +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; - nf_ct_refresh(ct, *pskb, timeout * HZ); -out: - return NF_ACCEPT; +static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); } static struct nf_conntrack_helper helper __read_mostly = { .name = "netbios-ns", - .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(NMBD_PORT), + .tuple.src.l3num = NFPROTO_IPV4, 
+ .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), .tuple.dst.protonum = IPPROTO_UDP, - .mask.src.l3num = 0xFFFF, - .mask.src.u.udp.port = __constant_htons(0xFFFF), - .mask.dst.protonum = 0xFF, - .max_expected = 1, .me = THIS_MODULE, - .help = help, + .help = netbios_ns_help, + .expect_policy = &exp_policy, }; static int __init nf_conntrack_netbios_ns_init(void) { - helper.timeout = timeout; + exp_policy.timeout = timeout; return nf_conntrack_helper_register(&helper); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d6d39e24132..300ed1eec72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -4,7 +4,7 @@ * (C) 2001 by Jay Schulist <jschlst@samba.org> * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org> * (C) 2003 by Patrick Mchardy <kaber@trash.net> - * (C) 2005-2006 by Pablo Neira Ayuso <pablo@eurodev.net> + * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial connection tracking via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) @@ -18,27 +18,37 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/rculist.h> +#include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> +#include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> -#include <linux/notifier.h> +#include <linux/slab.h> #include <linux/netfilter.h> #include <net/netlink.h> +#include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_labels.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_helper.h> #endif #include <linux/netfilter/nfnetlink.h> @@ -54,18 +64,22 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, struct nf_conntrack_l4proto *l4proto) { int ret = 0; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + struct nlattr *nest_parms; - NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) + goto nla_put_failure; - if (likely(l4proto->tuple_to_nfattr)) - ret = l4proto->tuple_to_nfattr(skb, tuple); + if (likely(l4proto->tuple_to_nlattr)) + ret = l4proto->tuple_to_nlattr(skb, tuple); - NFA_NEST_END(skb, nest_parms); + nla_nest_end(skb, nest_parms); return ret; -nfattr_failure: +nla_put_failure: return -1; } @@ -75,20 +89,24 @@ ctnetlink_dump_tuples_ip(struct sk_buff *skb, struct nf_conntrack_l3proto *l3proto) { int ret = 0; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + struct nlattr *nest_parms; - if (likely(l3proto->tuple_to_nfattr)) - ret = l3proto->tuple_to_nfattr(skb, tuple); + nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED); 
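The ctnetlink conversion from the old nfattr macros (NFA_NEST, NFA_PUT, NFA_NEST_END) to the nlattr API repeats one fixed idiom across this file: open a nested attribute, emit its members, close the nest, and bail out through an nla_put_failure label when the skb runs out of tailroom. A minimal sketch of that idiom follows; the EXAMPLE_ATTR_* numbers are placeholders, not real CTA_* attributes.

#include <net/netlink.h>

enum {
	EXAMPLE_ATTR_UNSPEC,
	EXAMPLE_ATTR_NESTED,	/* placeholder nest type */
	EXAMPLE_ATTR_PROTO,	/* placeholder member type */
};

static int dump_nested_example(struct sk_buff *skb, u8 protonum)
{
	struct nlattr *nest_parms;

	/* reserves the nest header; fails when the skb has no tailroom */
	nest_parms = nla_nest_start(skb, EXAMPLE_ATTR_NESTED | NLA_F_NESTED);
	if (!nest_parms)
		goto nla_put_failure;

	if (nla_put_u8(skb, EXAMPLE_ATTR_PROTO, protonum))
		goto nla_put_failure;

	/* patch the nest's length now that its payload is complete */
	nla_nest_end(skb, nest_parms);
	return 0;

nla_put_failure:
	return -1;
}

In this kernel generation nla_nest_start() does not set NLA_F_NESTED by itself, which is why the flag is ORed in explicitly here and in every converted dumper above.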
+ if (!nest_parms) + goto nla_put_failure; - NFA_NEST_END(skb, nest_parms); + if (likely(l3proto->tuple_to_nlattr)) + ret = l3proto->tuple_to_nlattr(skb, tuple); + + nla_nest_end(skb, nest_parms); return ret; -nfattr_failure: +nla_put_failure: return -1; } -static inline int +static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { @@ -96,313 +114,630 @@ ctnetlink_dump_tuples(struct sk_buff *skb, struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); - nf_ct_l3proto_put(l3proto); - - if (unlikely(ret < 0)) - return ret; - - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); - nf_ct_l4proto_put(l4proto); + if (ret >= 0) { + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, + tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); + } + rcu_read_unlock(); return ret; } static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 status = htonl((u_int32_t) ct->status); - NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) + goto nla_put_failure; return 0; -nfattr_failure: +nla_put_failure: return -1; } static inline int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) { - long timeout_l = ct->timeout.expires - jiffies; - __be32 timeout; + long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; - if (timeout_l < 0) + if (timeout < 0) timeout = 0; - else - timeout = htonl(timeout_l / HZ); - NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) + goto nla_put_failure; return 0; -nfattr_failure: +nla_put_failure: return -1; } static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { - struct nf_conntrack_l4proto *l4proto = nf_ct_l4proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - struct nfattr *nest_proto; + struct nf_conntrack_l4proto *l4proto; + struct nlattr *nest_proto; int ret; - if (!l4proto->to_nfattr) { - nf_ct_l4proto_put(l4proto); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (!l4proto->to_nlattr) return 0; - } - - nest_proto = NFA_NEST(skb, CTA_PROTOINFO); - ret = l4proto->to_nfattr(skb, nest_proto, ct); + nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED); + if (!nest_proto) + goto nla_put_failure; - nf_ct_l4proto_put(l4proto); + ret = l4proto->to_nlattr(skb, nest_proto, ct); - NFA_NEST_END(skb, nest_proto); + nla_nest_end(skb, nest_proto); return ret; -nfattr_failure: - nf_ct_l4proto_put(l4proto); +nla_put_failure: return -1; } static inline int ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) { - struct nfattr *nest_helper; + struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; - if (!help || !help->helper) + if (!help) return 0; - nest_helper = NFA_NEST(skb, CTA_HELP); - NFA_PUT(skb, CTA_HELP_NAME, strlen(help->helper->name), help->helper->name); + helper = rcu_dereference(help->helper); + if (!helper) + goto out; - if (help->helper->to_nfattr) - help->helper->to_nfattr(skb, ct); 
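/*
 * Editor's note: ctnetlink_dump_tuples() above also drops the per-call
 * nf_ct_l3proto_find_get()/nf_ct_l3proto_put() reference dance in favour
 * of a single RCU read-side section.  A self-contained sketch of that
 * conversion; struct example_proto and the *_find_rcu() lookups are
 * hypothetical stand-ins for the conntrack l3/l4 protocol tables:
 */
#include <linux/rcupdate.h>
#include <linux/skbuff.h>

struct example_proto {
	int (*dump)(struct sk_buff *skb);
};

const struct example_proto *example_l3proto_find_rcu(u16 l3num);
const struct example_proto *example_l4proto_find_rcu(u16 l3num, u8 protonum);

static int example_dump_tuple(struct sk_buff *skb, u16 l3num, u8 protonum)
{
	const struct example_proto *l3, *l4;
	int ret;

	/* One read-side critical section covers both lookups; the
	 * pointers must not be used after rcu_read_unlock(). */
	rcu_read_lock();
	l3 = example_l3proto_find_rcu(l3num);
	ret = l3->dump(skb);
	if (ret >= 0) {
		l4 = example_l4proto_find_rcu(l3num, protonum);
		ret = l4->dump(skb);
	}
	rcu_read_unlock();

	return ret;
}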
+ nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED); + if (!nest_helper) + goto nla_put_failure; + if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) + goto nla_put_failure; - NFA_NEST_END(skb, nest_helper); + if (helper->to_nlattr) + helper->to_nlattr(skb, ct); + nla_nest_end(skb, nest_helper); +out: return 0; -nfattr_failure: +nla_put_failure: return -1; } -#ifdef CONFIG_NF_CT_ACCT -static inline int -ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, - enum ip_conntrack_dir dir) +static int +dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, + enum ip_conntrack_dir dir, int type) { - enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; - struct nfattr *nest_count = NFA_NEST(skb, type); - __be32 tmp; + enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nf_conn_counter *counter = acct->counter; + struct nlattr *nest_count; + u64 pkts, bytes; + + if (type == IPCTNL_MSG_CT_GET_CTRZERO) { + pkts = atomic64_xchg(&counter[dir].packets, 0); + bytes = atomic64_xchg(&counter[dir].bytes, 0); + } else { + pkts = atomic64_read(&counter[dir].packets); + bytes = atomic64_read(&counter[dir].bytes); + } - tmp = htonl(ct->counters[dir].packets); - NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); + nest_count = nla_nest_start(skb, attr | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; - tmp = htonl(ct->counters[dir].bytes); - NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp); + if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) || + nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes))) + goto nla_put_failure; - NFA_NEST_END(skb, nest_count); + nla_nest_end(skb, nest_count); return 0; -nfattr_failure: +nla_put_failure: + return -1; +} + +static int +ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type) +{ + struct nf_conn_acct *acct = nf_conn_acct_find(ct); + + if (!acct) + return 0; + + if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) + return -1; + if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) + return -1; + + return 0; +} + +static int +ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_count; + const struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (!tstamp) + return 0; + + nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) || + (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, + cpu_to_be64(tstamp->stop)))) + goto nla_put_failure; + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: return -1; } -#else -#define ctnetlink_dump_counters(a, b, c) (0) -#endif #ifdef CONFIG_NF_CONNTRACK_MARK static inline int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 mark = htonl(ct->mark); - - NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) + goto nla_put_failure; return 0; -nfattr_failure: +nla_put_failure: return -1; } #else #define ctnetlink_dump_mark(a, b) (0) #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK static inline int -ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 id = htonl(ct->id); - NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + struct nlattr *nest_secctx; + int len, ret; + char *secctx; + + ret = 
security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = -1; + nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED); + if (!nest_secctx) + goto nla_put_failure; + + if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + goto nla_put_failure; + nla_nest_end(skb, nest_secctx); + + ret = 0; +nla_put_failure: + security_release_secctx(secctx, len); + return ret; +} +#else +#define ctnetlink_dump_secctx(a, b) (0) +#endif + +#ifdef CONFIG_NF_CONNTRACK_LABELS +static int ctnetlink_label_size(const struct nf_conn *ct) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return 0; + return nla_total_size(labels->words * sizeof(long)); +} + +static int +ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int len, i; + + if (!labels) + return 0; + + len = labels->words * sizeof(long); + i = 0; + do { + if (labels->bits[i] != 0) + return nla_put(skb, CTA_LABELS, len, labels->bits); + i++; + } while (i < labels->words); + return 0; +} +#else +#define ctnetlink_dump_labels(a, b) (0) +#define ctnetlink_label_size(a) (0) +#endif -nfattr_failure: +#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) + +static inline int +ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + if (!(ct->status & IPS_EXPECTED)) + return 0; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static int +dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, + htonl(seq->correction_pos)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, + htonl(seq->offset_before)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, + htonl(seq->offset_after))) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: return -1; } static inline int -ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 use = htonl(atomic_read(&ct->ct_general.use)); + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *seq; + + if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) + return 0; + + seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) + return -1; + + seq = &seqadj->seq[IP_CT_DIR_REPLY]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) + return -1; + + return 0; +} - NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +{ + if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) + goto nla_put_failure; return 0; -nfattr_failure: +nla_put_failure: return -1; } -#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +{ + if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} static int -ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, - int event, int 
nowait, - const struct nf_conn *ct) +ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + struct nf_conn *ct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - struct nfattr *nest_parms; - unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest_parms; + unsigned int flags = portid ? NLM_F_MULTI : 0, event; - event |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; - nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; - nfmsg->nfgen_family = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = nf_ct_l3num(ct); nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); - nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); + if (nf_ct_zone(ct) && + nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0) - goto nfattr_failure; + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0 || + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + goto nla_put_failure; - nlh->nlmsg_len = skb_tail_pointer(skb) - b; + nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: -nfattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nlmsg_cancel(skb, nlh); return -1; } +static inline size_t +ctnetlink_proto_size(const struct nf_conn *ct) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + size_t len = 0; + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + len += l3proto->nla_size; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + len += l4proto->nla_size; + rcu_read_unlock(); + + return len; +} + +static inline size_t +ctnetlink_acct_size(const struct nf_conn *ct) +{ + if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) + return 0; + return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ + + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ + ; +} + +static inline int +ctnetlink_secctx_size(const struct nf_conn *ct) +{ 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK + int len, ret; + + ret = security_secid_to_secctx(ct->secmark, NULL, &len); + if (ret) + return 0; + + return nla_total_size(0) /* CTA_SECCTX */ + + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ +#else + return 0; +#endif +} + +static inline size_t +ctnetlink_timestamp_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + return 0; + return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); +#else + return 0; +#endif +} + +static inline size_t +ctnetlink_nlmsg_size(const struct nf_conn *ct) +{ + return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + + ctnetlink_acct_size(ct) + + ctnetlink_timestamp_size(ct) + + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + + nla_total_size(0) /* CTA_PROTOINFO */ + + nla_total_size(0) /* CTA_HELP */ + + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + + ctnetlink_secctx_size(ct) +#ifdef CONFIG_NF_NAT_NEEDED + + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + + ctnetlink_proto_size(ct) + + ctnetlink_label_size(ct) + ; +} + #ifdef CONFIG_NF_CONNTRACK_EVENTS -static int ctnetlink_conntrack_event(struct notifier_block *this, - unsigned long events, void *ptr) +static int +ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { + struct net *net; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - struct nfattr *nest_parms; - struct nf_conn *ct = (struct nf_conn *)ptr; + struct nlattr *nest_parms; + struct nf_conn *ct = item->ct; struct sk_buff *skb; unsigned int type; - sk_buff_data_t b; unsigned int flags = 0, group; + int err; /* ignore our fake conntrack entry */ - if (ct == &nf_conntrack_untracked) - return NOTIFY_DONE; + if (nf_ct_is_untracked(ct)) + return 0; - if (events & IPCT_DESTROY) { + if (events & (1 << IPCT_DESTROY)) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; - } else if (events & (IPCT_NEW | IPCT_RELATED)) { + } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { type = IPCTNL_MSG_CT_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_NEW; - } else if (events & (IPCT_STATUS | IPCT_PROTOINFO)) { + } else if (events) { type = IPCTNL_MSG_CT_NEW; group = NFNLGRP_CONNTRACK_UPDATE; } else - return NOTIFY_DONE; - - if (!nfnetlink_has_listeners(group)) - return NOTIFY_DONE; + return 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb) - return NOTIFY_DONE; + net = nf_ct_net(ct); + if (!item->report && !nfnetlink_has_listeners(net, group)) + return 0; - b = skb->tail; + skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); + if (skb == NULL) + goto errout; type |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; - nlh->nlmsg_flags = flags; - nfmsg->nfgen_family = 
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = nf_ct_l3num(ct); nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - if (events & IPCT_DESTROY) { - if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) - goto nfattr_failure; + rcu_read_lock(); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct) && + nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + goto nla_put_failure; + + if (ctnetlink_dump_id(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_status(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_DESTROY)) { + if (ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) + goto nla_put_failure; } else { - if (ctnetlink_dump_status(skb, ct) < 0) - goto nfattr_failure; - if (ctnetlink_dump_timeout(skb, ct) < 0) - goto nfattr_failure; + goto nla_put_failure; - if (events & IPCT_PROTOINFO + if (events & (1 << IPCT_PROTOINFO) && ctnetlink_dump_protoinfo(skb, ct) < 0) - goto nfattr_failure; + goto nla_put_failure; - if ((events & IPCT_HELPER || nfct_help(ct)) + if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) && ctnetlink_dump_helpinfo(skb, ct) < 0) - goto nfattr_failure; + goto nla_put_failure; -#ifdef CONFIG_NF_CONNTRACK_MARK - if ((events & IPCT_MARK || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) - goto nfattr_failure; +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if ((events & (1 << IPCT_SECMARK) || ct->secmark) + && ctnetlink_dump_secctx(skb, ct) < 0) + goto nla_put_failure; #endif + if (events & (1 << IPCT_LABEL) && + ctnetlink_dump_labels(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_RELATED) && + ctnetlink_dump_master(skb, ct) < 0) + goto nla_put_failure; - if (events & IPCT_COUNTER_FILLING && - (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)) - goto nfattr_failure; + if (events & (1 << IPCT_SEQADJ) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + goto nla_put_failure; } - nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, group, 0); - return NOTIFY_DONE; +#ifdef CONFIG_NF_CONNTRACK_MARK + if ((events & (1 << IPCT_MARK) || ct->mark) + && ctnetlink_dump_mark(skb, ct) < 0) + goto nla_put_failure; +#endif + rcu_read_unlock(); + + nlmsg_end(skb, nlh); + err = nfnetlink_send(skb, net, item->portid, group, item->report, + GFP_ATOMIC); + if (err == -ENOBUFS || err == -EAGAIN) + return -ENOBUFS; + return 0; + +nla_put_failure: + rcu_read_unlock(); + nlmsg_cancel(skb, nlh); nlmsg_failure: -nfattr_failure: kfree_skb(skb); - return NOTIFY_DONE; +errout: + if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0) + return -ENOBUFS; + + return 0; } 
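/*
 * Editor's note: the event path above no longer allocates a fixed
 * NLMSG_GOODSIZE skb; it pre-computes an upper bound via nla_total_size()
 * sums (ctnetlink_nlmsg_size()) so a single allocation always fits the
 * message.  A sketch of how such an estimate is composed; the EXAMPLE_*
 * attributes are hypothetical:
 */
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>

static inline size_t example_msg_size(void)
{
	/* nla_total_size() accounts for each attribute's header plus
	 * alignment padding; nla_total_size(0) is a bare nest header. */
	return NLMSG_ALIGN(sizeof(struct nfgenmsg))
	       + nla_total_size(0)			/* nest header */
	       + nla_total_size(sizeof(u_int8_t))	/* EXAMPLE_U8  */
	       + nla_total_size(sizeof(u_int32_t));	/* EXAMPLE_U32 */
}

/* Used as: skb = nlmsg_new(example_msg_size(), GFP_ATOMIC); */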
#endif /* CONFIG_NF_CONNTRACK_EVENTS */ @@ -410,60 +745,88 @@ static int ctnetlink_done(struct netlink_callback *cb) { if (cb->args[1]) nf_ct_put((struct nf_conn *)cb->args[1]); + if (cb->data) + kfree(cb->data); return 0; } -#define L3PROTO(ct) ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num +struct ctnetlink_dump_filter { + struct { + u_int32_t val; + u_int32_t mask; + } mark; +}; static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; - struct list_head *i; - struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); + struct hlist_nulls_node *n; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; + int res; + spinlock_t *lockp; + +#ifdef CONFIG_NF_CONNTRACK_MARK + const struct ctnetlink_dump_filter *filter = cb->data; +#endif - read_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; - for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { + + local_bh_disable(); + for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: - list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { - h = (struct nf_conntrack_tuple_hash *) i; + lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (cb->args[0] >= net->ct.htable_size) { + spin_unlock(lockp); + goto out; + } + hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], + hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ - if (l3proto && L3PROTO(ct) != l3proto) + if (l3proto && nf_ct_l3num(ct) != l3proto) continue; if (cb->args[1]) { if (ct != last) continue; cb->args[1] = 0; } - if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - IPCTNL_MSG_CT_NEW, - 1, ct) < 0) { +#ifdef CONFIG_NF_CONNTRACK_MARK + if (filter && !((ct->mark & filter->mark.mask) == + filter->mark.val)) { + continue; + } +#endif + rcu_read_lock(); + res = + ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; + spin_unlock(lockp); goto out; } -#ifdef CONFIG_NF_CT_ACCT - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == - IPCTNL_MSG_CT_GET_CTRZERO) - memset(&ct->counters, 0, sizeof(ct->counters)); -#endif } + spin_unlock(lockp); if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: - read_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (last) nf_ct_put(last); @@ -471,79 +834,98 @@ out: } static inline int -ctnetlink_parse_tuple_ip(struct nfattr *attr, struct nf_conntrack_tuple *tuple) +ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { - struct nfattr *tb[CTA_IP_MAX]; + struct nlattr *tb[CTA_IP_MAX+1]; struct nf_conntrack_l3proto *l3proto; int ret = 0; - nfattr_parse_nested(tb, CTA_IP_MAX, attr); + ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); + if (ret < 0) + return ret; - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - if (likely(l3proto->nfattr_to_tuple)) - ret = l3proto->nfattr_to_tuple(tb, tuple); + if (likely(l3proto->nlattr_to_tuple)) { + ret = nla_validate_nested(attr, CTA_IP_MAX, + l3proto->nla_policy); + if (ret == 0) + ret = l3proto->nlattr_to_tuple(tb, tuple); + } - 
nf_ct_l3proto_put(l3proto); + rcu_read_unlock(); return ret; } -static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int8_t), +static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_NUM] = { .type = NLA_U8 }, }; static inline int -ctnetlink_parse_tuple_proto(struct nfattr *attr, +ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { - struct nfattr *tb[CTA_PROTO_MAX]; + struct nlattr *tb[CTA_PROTO_MAX+1]; struct nf_conntrack_l4proto *l4proto; int ret = 0; - nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) - return -EINVAL; + ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy); + if (ret < 0) + return ret; - if (!tb[CTA_PROTO_NUM-1]) + if (!tb[CTA_PROTO_NUM]) return -EINVAL; - tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); - if (likely(l4proto->nfattr_to_tuple)) - ret = l4proto->nfattr_to_tuple(tb, tuple); + if (likely(l4proto->nlattr_to_tuple)) { + ret = nla_validate_nested(attr, CTA_PROTO_MAX, + l4proto->nla_policy); + if (ret == 0) + ret = l4proto->nlattr_to_tuple(tb, tuple); + } - nf_ct_l4proto_put(l4proto); + rcu_read_unlock(); return ret; } -static inline int -ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple, - enum ctattr_tuple type, u_int8_t l3num) +static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { + [CTA_TUPLE_IP] = { .type = NLA_NESTED }, + [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, +}; + +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + enum ctattr_type type, u_int8_t l3num) { - struct nfattr *tb[CTA_TUPLE_MAX]; + struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; memset(tuple, 0, sizeof(*tuple)); - nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); + err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); + if (err < 0) + return err; - if (!tb[CTA_TUPLE_IP-1]) + if (!tb[CTA_TUPLE_IP]) return -EINVAL; tuple->src.l3num = l3num; - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); if (err < 0) return err; - if (!tb[CTA_TUPLE_PROTO-1]) + if (!tb[CTA_TUPLE_PROTO]) return -EINVAL; - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); if (err < 0) return err; @@ -556,146 +938,119 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple, return 0; } -#ifdef CONFIG_NF_NAT_NEEDED -static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { - [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), - [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), -}; - -static int nfnetlink_parse_nat_proto(struct nfattr *attr, - const struct nf_conn *ct, - struct nf_nat_range *range) +static int +ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) { - struct nfattr *tb[CTA_PROTONAT_MAX]; - struct nf_nat_protocol *npt; - - nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - return -EINVAL; - - npt = nf_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - - if (!npt->nfattr_to_range) { - nf_nat_proto_put(npt); - return 0; - } - - /* nfattr_to_range 
returns 1 if it parsed, 0 if not, neg. on error */ - if (npt->nfattr_to_range(tb, range) > 0) - range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - - nf_nat_proto_put(npt); + if (attr) +#ifdef CONFIG_NF_CONNTRACK_ZONES + *zone = ntohs(nla_get_be16(attr)); +#else + return -EOPNOTSUPP; +#endif + else + *zone = 0; return 0; } -static const size_t cta_min_nat[CTA_NAT_MAX] = { - [CTA_NAT_MINIP-1] = sizeof(u_int32_t), - [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { + [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, }; static inline int -nfnetlink_parse_nat(struct nfattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range) +ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, + struct nlattr **helpinfo) { - struct nfattr *tb[CTA_NAT_MAX]; int err; + struct nlattr *tb[CTA_HELP_MAX+1]; - memset(range, 0, sizeof(*range)); - - nfattr_parse_nested(tb, CTA_NAT_MAX, nat); - - if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) - return -EINVAL; - - if (tb[CTA_NAT_MINIP-1]) - range->min_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MINIP-1]); - - if (!tb[CTA_NAT_MAXIP-1]) - range->max_ip = range->min_ip; - else - range->max_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); - - if (range->min_ip) - range->flags |= IP_NAT_RANGE_MAP_IPS; - - if (!tb[CTA_NAT_PROTO-1]) - return 0; - - err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); if (err < 0) return err; - return 0; -} -#endif - -static inline int -ctnetlink_parse_help(struct nfattr *attr, char **helper_name) -{ - struct nfattr *tb[CTA_HELP_MAX]; - - nfattr_parse_nested(tb, CTA_HELP_MAX, attr); - - if (!tb[CTA_HELP_NAME-1]) + if (!tb[CTA_HELP_NAME]) return -EINVAL; - *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + *helper_name = nla_data(tb[CTA_HELP_NAME]); + + if (tb[CTA_HELP_INFO]) + *helpinfo = tb[CTA_HELP_INFO]; return 0; } -static const size_t cta_min[CTA_MAX] = { - [CTA_STATUS-1] = sizeof(u_int32_t), - [CTA_TIMEOUT-1] = sizeof(u_int32_t), - [CTA_MARK-1] = sizeof(u_int32_t), - [CTA_USE-1] = sizeof(u_int32_t), - [CTA_ID-1] = sizeof(u_int32_t) +static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { + [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, + [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, + [CTA_STATUS] = { .type = NLA_U32 }, + [CTA_PROTOINFO] = { .type = NLA_NESTED }, + [CTA_HELP] = { .type = NLA_NESTED }, + [CTA_NAT_SRC] = { .type = NLA_NESTED }, + [CTA_TIMEOUT] = { .type = NLA_U32 }, + [CTA_MARK] = { .type = NLA_U32 }, + [CTA_ID] = { .type = NLA_U32 }, + [CTA_NAT_DST] = { .type = NLA_NESTED }, + [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, + [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED }, + [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED }, + [CTA_ZONE] = { .type = NLA_U16 }, + [CTA_MARK_MASK] = { .type = NLA_U32 }, + [CTA_LABELS] = { .type = NLA_BINARY, + .len = NF_CT_LABELS_MAX_SIZE }, + [CTA_LABELS_MASK] = { .type = NLA_BINARY, + .len = NF_CT_LABELS_MAX_SIZE }, }; static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[]) + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { + struct net *net = sock_net(ctnl); struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conn *ct; - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - int err = 0; + u16 zone; + int err; - if (nfattr_bad_size(cda, 
CTA_MAX, cta_min)) - return -EINVAL; + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; - if (cda[CTA_TUPLE_ORIG-1]) + if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); - else if (cda[CTA_TUPLE_REPLY-1]) + else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); else { /* Flush the whole table */ - nf_conntrack_flush(); + nf_conntrack_flush_report(net, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); return 0; } if (err < 0) return err; - h = nf_conntrack_find_get(&tuple, NULL); + h = nf_conntrack_find_get(net, zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); - if (cda[CTA_ID-1]) { - u_int32_t id = ntohl(*(__be32 *)NFA_DATA(cda[CTA_ID-1])); - if (ct->id != id) { + if (cda[CTA_ID]) { + u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); + if (id != (u32)(unsigned long)ct) { nf_ct_put(ct); return -ENOENT; } } + if (del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); @@ -704,31 +1059,49 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, static int ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[]) + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { + struct net *net = sock_net(ctnl); struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conn *ct; struct sk_buff *skb2 = NULL; - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - int err = 0; + u16 zone; + int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { -#ifndef CONFIG_NF_CT_ACCT - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO) - return -ENOTSUPP; + struct netlink_dump_control c = { + .dump = ctnetlink_dump_table, + .done = ctnetlink_done, + }; +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + struct ctnetlink_dump_filter *filter; + + filter = kzalloc(sizeof(struct ctnetlink_dump_filter), + GFP_ATOMIC); + if (filter == NULL) + return -ENOMEM; + + filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); + filter->mark.mask = + ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + c.data = filter; + } #endif - return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, - ctnetlink_done); + return netlink_dump_start(ctnl, skb, nlh, &c); } - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; - if (cda[CTA_TUPLE_ORIG-1]) + if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); - else if (cda[CTA_TUPLE_REPLY-1]) + else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); else return -EINVAL; @@ -736,26 +1109,28 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(&tuple, NULL); + h = nf_conntrack_find_get(net, zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb2) { + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { nf_ct_put(ct); return -ENOMEM; } - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, - IPCTNL_MSG_CT_NEW, 1, ct); + rcu_read_lock(); + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), ct); + 
rcu_read_unlock(); nf_ct_put(ct); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (err < 0) goto out; @@ -764,55 +1139,192 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, free: kfree_skb(skb2); out: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static int ctnetlink_done_list(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); + return 0; +} + +static int +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) +{ + struct nf_conn *ct, *last; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + int res; + int cpu; + struct hlist_nulls_head *list; + struct net *net = sock_net(skb->sk); + + if (cb->args[2]) + return 0; + + last = (struct nf_conn *)cb->args[1]; + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + struct ct_pcpu *pcpu; + + if (!cpu_possible(cpu)) + continue; + + pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + spin_lock_bh(&pcpu->lock); + list = dying ? &pcpu->dying : &pcpu->unconfirmed; +restart: + hlist_nulls_for_each_entry(h, n, list, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (l3proto && nf_ct_l3num(ct) != l3proto) + continue; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } + rcu_read_lock(); + res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { + if (!atomic_inc_not_zero(&ct->ct_general.use)) + continue; + cb->args[0] = cpu; + cb->args[1] = (unsigned long)ct; + spin_unlock_bh(&pcpu->lock); + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + spin_unlock_bh(&pcpu->lock); + } + cb->args[2] = 1; +out: + if (last) + nf_ct_put(last); + + return skb->len; +} + +static int +ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ctnetlink_dump_list(skb, cb, true); +} + +static int +ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_dying, + .done = ctnetlink_done_list, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return -EOPNOTSUPP; +} + +static int +ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ctnetlink_dump_list(skb, cb, false); +} + +static int +ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_unconfirmed, + .done = ctnetlink_done_list, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return -EOPNOTSUPP; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static int +ctnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; + int err; + + parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); + if (!parse_nat_setup) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + if (request_module("nf-nat") < 0) { + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + 
rcu_read_lock(); + return -EOPNOTSUPP; + } + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + rcu_read_lock(); + if (nfnetlink_parse_nat_setup_hook) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + + err = parse_nat_setup(ct, manip, attr); + if (err == -EAGAIN) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) { + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + rcu_read_lock(); + return -EOPNOTSUPP; + } + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + rcu_read_lock(); +#else + err = -EOPNOTSUPP; +#endif + } return err; } +#endif -static inline int -ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) +static int +ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { unsigned long d; - unsigned int status = ntohl(*(__be32 *)NFA_DATA(cda[CTA_STATUS-1])); + unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); d = ct->status ^ status; if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) /* unchangeable */ - return -EINVAL; + return -EBUSY; if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) /* SEEN_REPLY bit can only be set */ - return -EINVAL; - + return -EBUSY; if (d & IPS_ASSURED && !(status & IPS_ASSURED)) /* ASSURED bit can only be set */ - return -EINVAL; - - if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { -#ifndef CONFIG_NF_NAT_NEEDED - return -EINVAL; -#else - struct nf_nat_range range; - - if (cda[CTA_NAT_DST-1]) { - if (nfnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, - &range) < 0) - return -EINVAL; - if (nf_nat_initialized(ct, - HOOK2MANIP(NF_IP_PRE_ROUTING))) - return -EEXIST; - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); - } - if (cda[CTA_NAT_SRC-1]) { - if (nfnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, - &range) < 0) - return -EINVAL; - if (nf_nat_initialized(ct, - HOOK2MANIP(NF_IP_POST_ROUTING))) - return -EEXIST; - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - } -#endif - } + return -EBUSY; /* Be careful here, modifying NAT bits can screw up things, * so don't let users modify them directly if they don't pass @@ -821,20 +1333,44 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) return 0; } +static int +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_NAT_NEEDED + int ret; + + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + return ret; +#else + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; + return -EOPNOTSUPP; +#endif +} static inline int -ctnetlink_change_helper(struct nf_conn *ct, struct nfattr *cda[]) +ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) { struct nf_conntrack_helper *helper; struct nf_conn_help *help = nfct_help(ct); - char *helpname; + char *helpname = NULL; + struct nlattr *helpinfo = NULL; int err; /* don't change helper of sibling connections */ if (ct->master) - return -EINVAL; + return -EBUSY; - err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) return err; @@ -842,39 +1378,50 @@ ctnetlink_change_helper(struct nf_conn *ct, struct nfattr *cda[]) if (help && help->helper) { /* we had a helper before ... 
*/ nf_ct_remove_expectations(ct); - help->helper = NULL; + RCU_INIT_POINTER(help->helper, NULL); } return 0; } - if (!help) { - /* FIXME: we need to reallocate and rehash */ - return -EBUSY; - } + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + spin_unlock_bh(&nf_conntrack_expect_lock); - helper = __nf_conntrack_helper_find_byname(helpname); - if (helper == NULL) - return -EINVAL; - - if (help->helper == helper) - return 0; + if (request_module("nfct-helper-%s", helpname) < 0) { + spin_lock_bh(&nf_conntrack_expect_lock); + return -EOPNOTSUPP; + } - if (help->helper) - /* we had a helper before ... */ - nf_ct_remove_expectations(ct); + spin_lock_bh(&nf_conntrack_expect_lock); + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } - /* need to zero data of old helper */ - memset(&help->help, 0, sizeof(help->help)); - help->helper = helper; + if (help) { + if (help->helper == helper) { + /* update private helper data if allowed. */ + if (helper->from_nlattr) + helper->from_nlattr(helpinfo, ct); + return 0; + } else + return -EBUSY; + } - return 0; + /* we cannot set a helper for an existing conntrack */ + return -EOPNOTSUPP; } static inline int -ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[]) +ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { - u_int32_t timeout = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1])); + u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); if (!del_timer(&ct->timeout)) return -ETIME; @@ -885,174 +1432,813 @@ ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[]) return 0; } +static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { + [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, + [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, + [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, +}; + static inline int -ctnetlink_change_protoinfo(struct nf_conn *ct, struct nfattr *cda[]) +ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) { - struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1]; + const struct nlattr *attr = cda[CTA_PROTOINFO]; + struct nlattr *tb[CTA_PROTOINFO_MAX+1]; struct nf_conntrack_l4proto *l4proto; - u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - u_int16_t l3num = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; int err = 0; - nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); - - l4proto = nf_ct_l4proto_find_get(l3num, npt); + err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); + if (err < 0) + return err; - if (l4proto->from_nfattr) - err = l4proto->from_nfattr(tb, ct); - nf_ct_l4proto_put(l4proto); + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->from_nlattr) + err = l4proto->from_nlattr(tb, ct); + rcu_read_unlock(); return err; } +static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { + [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, +}; + +static inline int +change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) +{ + int err; + struct nlattr *cda[CTA_SEQADJ_MAX+1]; + + err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); + if (err < 0) + return err; + + if (!cda[CTA_SEQADJ_CORRECTION_POS]) + return 
-EINVAL; + + seq->correction_pos = + ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); + + if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) + return -EINVAL; + + seq->offset_before = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); + + if (!cda[CTA_SEQADJ_OFFSET_AFTER]) + return -EINVAL; + + seq->offset_after = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); + + return 0; +} + +static int +ctnetlink_change_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + int ret = 0; + + if (!seqadj) + return 0; + + if (cda[CTA_SEQ_ADJ_ORIG]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_SEQ_ADJ_ORIG]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + if (cda[CTA_SEQ_ADJ_REPLY]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], + cda[CTA_SEQ_ADJ_REPLY]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + return 0; +} + static int -ctnetlink_change_conntrack(struct nf_conn *ct, struct nfattr *cda[]) +ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + size_t len = nla_len(cda[CTA_LABELS]); + const void *mask = cda[CTA_LABELS_MASK]; + + if (len & (sizeof(u32)-1)) /* must be multiple of u32 */ + return -EINVAL; + + if (mask) { + if (nla_len(cda[CTA_LABELS_MASK]) == 0 || + nla_len(cda[CTA_LABELS_MASK]) != len) + return -EINVAL; + mask = nla_data(cda[CTA_LABELS_MASK]); + } + + len /= sizeof(u32); + + return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len); +#else + return -EOPNOTSUPP; +#endif +} + +static int +ctnetlink_change_conntrack(struct nf_conn *ct, + const struct nlattr * const cda[]) { int err; - if (cda[CTA_HELP-1]) { + /* only allow NAT changes and master assignation for new conntracks */ + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER]) + return -EOPNOTSUPP; + + if (cda[CTA_HELP]) { err = ctnetlink_change_helper(ct, cda); if (err < 0) return err; } - if (cda[CTA_TIMEOUT-1]) { + if (cda[CTA_TIMEOUT]) { err = ctnetlink_change_timeout(ct, cda); if (err < 0) return err; } - if (cda[CTA_STATUS-1]) { + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) return err; } - if (cda[CTA_PROTOINFO-1]) { + if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) return err; } #if defined(CONFIG_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1])); + if (cda[CTA_MARK]) + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_LABELS]) { + err = ctnetlink_attach_labels(ct, cda); + if (err < 0) + return err; + } + return 0; } -static int -ctnetlink_create_conntrack(struct nfattr *cda[], +static struct nf_conn * +ctnetlink_create_conntrack(struct net *net, u16 zone, + const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, - struct nf_conntrack_tuple *rtuple) + struct nf_conntrack_tuple *rtuple, + u8 u3) { struct nf_conn *ct; int err = -EINVAL; - struct nf_conn_help *help; + struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; - ct = nf_conntrack_alloc(otuple, rtuple); - if (ct == NULL || IS_ERR(ct)) - return -ENOMEM; + ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); + if (IS_ERR(ct)) + return ERR_PTR(-ENOMEM); - if (!cda[CTA_TIMEOUT-1]) - goto err; - ct->timeout.expires = ntohl(*(__be32 
*)NFA_DATA(cda[CTA_TIMEOUT-1])); + if (!cda[CTA_TIMEOUT]) + goto err1; + ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + + rcu_read_lock(); + if (cda[CTA_HELP]) { + char *helpname = NULL; + struct nlattr *helpinfo = NULL; + + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); + if (err < 0) + goto err2; + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { + rcu_read_unlock(); +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto err1; + } + + rcu_read_lock(); + helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper) { + err = -EAGAIN; + goto err2; + } + rcu_read_unlock(); +#endif + err = -EOPNOTSUPP; + goto err1; + } else { + struct nf_conn_help *help; + + help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC); + if (help == NULL) { + err = -ENOMEM; + goto err2; + } + /* set private helper data if allowed. */ + if (helper->from_nlattr) + helper->from_nlattr(helpinfo, ct); + + /* not in hash table yet so not strictly necessary */ + RCU_INIT_POINTER(help->helper, helper); + } + } else { + /* try an implicit helper assignation */ + err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + if (err < 0) + goto err2; + } + + err = ctnetlink_setup_nat(ct, cda); + if (err < 0) + goto err2; + + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + nf_ct_labels_ext_add(ct); + + /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; - if (cda[CTA_STATUS-1]) { + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) - goto err; + goto err2; } - if (cda[CTA_PROTOINFO-1]) { + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); + if (err < 0) + goto err2; + } + + memset(&ct->proto, 0, sizeof(ct->proto)); + if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) - goto err; + goto err2; } #if defined(CONFIG_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1])); + if (cda[CTA_MARK]) + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif - help = nfct_help(ct); - if (help) - help->helper = nf_ct_helper_find_get(rtuple); + /* setup master conntrack: this is a confirmed expectation */ + if (cda[CTA_TUPLE_MASTER]) { + struct nf_conntrack_tuple master; + struct nf_conntrack_tuple_hash *master_h; + struct nf_conn *master_ct; - add_timer(&ct->timeout); - nf_conntrack_hash_insert(ct); + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + if (err < 0) + goto err2; - if (help && help->helper) - nf_ct_helper_put(help->helper); + master_h = nf_conntrack_find_get(net, zone, &master); + if (master_h == NULL) { + err = -ENOENT; + goto err2; + } + master_ct = nf_ct_tuplehash_to_ctrack(master_h); + __set_bit(IPS_EXPECTED_BIT, &ct->status); + ct->master = master_ct; + } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); - return 0; + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + + rcu_read_unlock(); -err: + return ct; + +err2: + rcu_read_unlock(); +err1: nf_conntrack_free(ct); - return err; + return ERR_PTR(err); } static int ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[]) + const struct nlmsghdr 
*nlh, + const struct nlattr * const cda[]) { + struct net *net = sock_net(ctnl); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; - int err = 0; + u16 zone; + int err; - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; - if (cda[CTA_TUPLE_ORIG-1]) { + if (cda[CTA_TUPLE_ORIG]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); if (err < 0) return err; } - if (cda[CTA_TUPLE_REPLY-1]) { + if (cda[CTA_TUPLE_REPLY]) { err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); if (err < 0) return err; } - write_lock_bh(&nf_conntrack_lock); - if (cda[CTA_TUPLE_ORIG-1]) - h = __nf_conntrack_find(&otuple, NULL); - else if (cda[CTA_TUPLE_REPLY-1]) - h = __nf_conntrack_find(&rtuple, NULL); + if (cda[CTA_TUPLE_ORIG]) + h = nf_conntrack_find_get(net, zone, &otuple); + else if (cda[CTA_TUPLE_REPLY]) + h = nf_conntrack_find_get(net, zone, &rtuple); if (h == NULL) { - write_unlock_bh(&nf_conntrack_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + if (nlh->nlmsg_flags & NLM_F_CREATE) { + enum ip_conntrack_events events; + + if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) + return -EINVAL; + + ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, + &rtuple, u3); + if (IS_ERR(ct)) + return PTR_ERR(ct); + + err = 0; + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + events = IPCT_RELATED; + else + events = IPCT_NEW; + + if (cda[CTA_LABELS] && + ctnetlink_attach_labels(ct, cda) == 0) + events |= (1 << IPCT_LABEL); + + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | + (1 << IPCT_ASSURED) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_SEQADJ) | + (1 << IPCT_MARK) | events, + ct, NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + nf_ct_put(ct); + } + return err; } /* implicit 'else' */ - /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { - err = -EINVAL; - goto out_unlock; - } - - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) - err = ctnetlink_change_conntrack(nf_ct_tuplehash_to_ctrack(h), cda); + ct = nf_ct_tuplehash_to_ctrack(h); + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { + spin_lock_bh(&nf_conntrack_expect_lock); + err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_expect_lock); + if (err == 0) { + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | + (1 << IPCT_ASSURED) | + (1 << IPCT_HELPER) | + (1 << IPCT_LABEL) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_SEQADJ) | + (1 << IPCT_MARK), + ct, NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } + } -out_unlock: - write_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } +static int +ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, + __u16 cpu, const struct ip_conntrack_stat *st) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(cpu); + + if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || + nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || + nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) || + nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || + nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || + nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) || + nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) || + nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || + nla_put_be32(skb, CTA_STATS_INSERT_FAILED, + htonl(st->insert_failed)) || + nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) || + nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || + nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || + nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, + htonl(st->search_restart))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int cpu; + struct net *net = sock_net(skb->sk); + + if (cb->args[0] == nr_cpu_ids) + return 0; + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + const struct ip_conntrack_stat *st; + + if (!cpu_possible(cpu)) + continue; + + st = per_cpu_ptr(net->ct.stat, cpu); + if (ctnetlink_ct_stat_cpu_fill_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + cpu, st) < 0) + break; + } + cb->args[0] = cpu; + + return skb->len; +} + +static int +ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_ct_stat_cpu_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return 0; +} + +static int +ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + struct net *net) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; + unsigned int nr_conntracks = atomic_read(&net->ct.count); + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct sk_buff *skb2; + int err; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + sock_net(skb->sk)); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (err < 0) + goto out; + + return 0; + +free: + kfree_skb(skb2); +out: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, + [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, +}; + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask); + +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +static size_t +ctnetlink_nfqueue_build_size(const struct nf_conn *ct) +{ + return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + + nla_total_size(0) /* CTA_PROTOINFO */ + + nla_total_size(0) /* CTA_HELP */ + + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + + ctnetlink_secctx_size(ct) +#ifdef CONFIG_NF_NAT_NEEDED + + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + + ctnetlink_proto_size(ct) + ; +} + +static int +ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + rcu_read_lock(); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); 
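
The ctnetlink_nfqueue_build_size() estimate above is a plain sum of nla_total_size() terms, one per attribute the build routine may emit. The alignment arithmetic behind those terms is small enough to reproduce outside the kernel; the sketch below hard-codes the standard netlink constants (a 4-byte attribute header, NLA_ALIGNTO of 4) and is an illustration of the accounting, not kernel code:

        /* Userspace sketch of the netlink attribute size arithmetic that
         * ctnetlink_nfqueue_build_size() relies on.  Illustrative only. */
        #include <stdio.h>

        #define NLA_ALIGNTO     4
        #define NLA_ALIGN(len)  (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
        #define NLA_HDRLEN      ((int)NLA_ALIGN(4))     /* sizeof(struct nlattr) == 4 */

        static int nla_total_size(int payload)
        {
                return NLA_ALIGN(NLA_HDRLEN + payload);
        }

        int main(void)
        {
                printf("nest: %d\n", nla_total_size(0));        /* 4: the 3 * nla_total_size(0) terms */
                printf("u8:   %d\n", nla_total_size(1));        /* 8, padded to alignment */
                printf("u16:  %d\n", nla_total_size(2));        /* 8, padded to alignment */
                printf("u32:  %d\n", nla_total_size(4));        /* 8 */
                return 0;
        }

Because every attribute is padded up to the 4-byte boundary, the per-attribute term can only round up, so the sum is a safe upper bound for the skb space the build step needs.
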
+ + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct)) { + if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + goto nla_put_failure; + } + + if (ctnetlink_dump_id(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_status(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_timeout(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0) + goto nla_put_failure; +#endif + if (ct->master && ctnetlink_dump_master(skb, ct) < 0) + goto nla_put_failure; + + if ((ct->status & IPS_SEQ_ADJUST) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_MARK + if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) + goto nla_put_failure; +#endif + if (ctnetlink_dump_labels(skb, ct) < 0) + goto nla_put_failure; + rcu_read_unlock(); + return 0; + +nla_put_failure: + rcu_read_unlock(); + return -ENOSPC; +} + +static int +ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) +{ + int err; + + if (cda[CTA_TIMEOUT]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_STATUS]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_HELP]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_LABELS]) { + err = ctnetlink_attach_labels(ct, cda); + if (err < 0) + return err; + } +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (cda[CTA_MARK]) { + u32 mask = 0, mark, newmark; + if (cda[CTA_MARK_MASK]) + mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + + mark = ntohl(nla_get_be32(cda[CTA_MARK])); + newmark = (ct->mark & mask) ^ mark; + if (newmark != ct->mark) + ct->mark = newmark; + } +#endif + return 0; +} + +static int +ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) +{ + struct nlattr *cda[CTA_MAX+1]; + int ret; + + ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); + if (ret < 0) + return ret; + + spin_lock_bh(&nf_conntrack_expect_lock); + ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_unlock_bh(&nf_conntrack_expect_lock); + + return ret; +} + +static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + int err; + + err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, + nf_ct_l3num(ct)); + if (err < 0) + return err; + + return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, + nf_ct_l3num(ct)); +} + +static int +ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) +{ + struct nlattr *cda[CTA_EXPECT_MAX+1]; + struct nf_conntrack_tuple tuple, mask; + struct nf_conntrack_helper *helper = NULL; + struct nf_conntrack_expect *exp; + int err; + + err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy); + if (err < 0) + return err; + + err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = 
__nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) + return -EOPNOTSUPP; + } + + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, + helper, &tuple, &mask); + if (IS_ERR(exp)) + return PTR_ERR(exp); + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) { + nf_ct_expect_put(exp); + return err; + } + + return 0; +} + +static struct nfq_ct_hook ctnetlink_nfqueue_hook = { + .build_size = ctnetlink_nfqueue_build_size, + .build = ctnetlink_nfqueue_build, + .parse = ctnetlink_nfqueue_parse, + .attach_expect = ctnetlink_nfqueue_attach_expect, + .seq_adjust = nf_ct_tcp_seqadj_set, +}; +#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ + /*********************************************************************** * EXPECT ***********************************************************************/ @@ -1062,212 +2248,394 @@ ctnetlink_exp_dump_tuple(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, enum ctattr_expect type) { - struct nfattr *nest_parms = NFA_NEST(skb, type); + struct nlattr *nest_parms; + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; if (ctnetlink_dump_tuples(skb, tuple) < 0) - goto nfattr_failure; - - NFA_NEST_END(skb, nest_parms); + goto nla_put_failure; + nla_nest_end(skb, nest_parms); return 0; -nfattr_failure: +nla_put_failure: return -1; } static inline int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *mask) + const struct nf_conntrack_tuple_mask *mask) { int ret; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT_MASK); - - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, mask, l3proto); - nf_ct_l3proto_put(l3proto); - - if (unlikely(ret < 0)) - goto nfattr_failure; + struct nf_conntrack_tuple m; + struct nlattr *nest_parms; + + memset(&m, 0xFF, sizeof(m)); + memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); + m.src.u.all = mask->src.u.all; + m.dst.protonum = tuple->dst.protonum; + + nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); + if (ret >= 0) { + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, + tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); + } + rcu_read_unlock(); - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, mask, l4proto); - nf_ct_l4proto_put(l4proto); if (unlikely(ret < 0)) - goto nfattr_failure; + goto nla_put_failure; - NFA_NEST_END(skb, nest_parms); + nla_nest_end(skb, nest_parms); return 0; -nfattr_failure: +nla_put_failure: return -1; } -static inline int +static const union nf_inet_addr any_addr; + +static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; - __be32 timeout = htonl((exp->timeout.expires - jiffies) / HZ); - __be32 id = htonl(exp->id); + long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; + struct nf_conn_help *help; +#ifdef CONFIG_NF_NAT_NEEDED + struct nlattr *nest_parms; + struct nf_conntrack_tuple nat_tuple = {}; +#endif + struct nf_ct_helper_expectfn *expfn; + + if (timeout < 0) + timeout = 0; if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, 
CTA_EXPECT_TUPLE) < 0) - goto nfattr_failure; + goto nla_put_failure; if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) - goto nfattr_failure; + goto nla_put_failure; if (ctnetlink_exp_dump_tuple(skb, &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, CTA_EXPECT_MASTER) < 0) - goto nfattr_failure; + goto nla_put_failure; - NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); - NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); +#ifdef CONFIG_NF_NAT_NEEDED + if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || + exp->saved_proto.all) { + nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir))) + goto nla_put_failure; + + nat_tuple.src.l3num = nf_ct_l3num(master); + nat_tuple.src.u3 = exp->saved_addr; + nat_tuple.dst.protonum = nf_ct_protonum(master); + nat_tuple.src.u = exp->saved_proto; + + if (ctnetlink_exp_dump_tuple(skb, &nat_tuple, + CTA_EXPECT_NAT_TUPLE) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + } +#endif + if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || + nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || + nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || + nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) + goto nla_put_failure; + help = nfct_help(master); + if (help) { + struct nf_conntrack_helper *helper; + + helper = rcu_dereference(help->helper); + if (helper && + nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) + goto nla_put_failure; + } + expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); + if (expfn != NULL && + nla_put_string(skb, CTA_EXPECT_FN, expfn->name)) + goto nla_put_failure; return 0; -nfattr_failure: +nla_put_failure: return -1; } static int -ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, - int event, - int nowait, - const struct nf_conntrack_expect *exp) +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - unsigned char *b = skb_tail_pointer(skb); + unsigned int flags = portid ? NLM_F_MULTI : 0; event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; - nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; + nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = exp->tuple.src.l3num; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; if (ctnetlink_exp_dump_expect(skb, exp) < 0) - goto nfattr_failure; + goto nla_put_failure; - nlh->nlmsg_len = skb_tail_pointer(skb) - b; + nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: -nfattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nlmsg_cancel(skb, nlh); return -1; } #ifdef CONFIG_NF_CONNTRACK_EVENTS -static int ctnetlink_expect_event(struct notifier_block *this, - unsigned long events, void *ptr) +static int +ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) { + struct nf_conntrack_expect *exp = item->exp; + struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - struct nf_conntrack_expect *exp = (struct nf_conntrack_expect *)ptr; struct sk_buff *skb; - unsigned int type; - sk_buff_data_t b; + unsigned int type, group; int flags = 0; - if (events & IPEXP_NEW) { + if (events & (1 << IPEXP_DESTROY)) { + type = IPCTNL_MSG_EXP_DELETE; + group = NFNLGRP_CONNTRACK_EXP_DESTROY; + } else if (events & (1 << IPEXP_NEW)) { type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_EXP_NEW; } else - return NOTIFY_DONE; - - if (!nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) - return NOTIFY_DONE; + return 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb) - return NOTIFY_DONE; + if (!item->report && !nfnetlink_has_listeners(net, group)) + return 0; - b = skb->tail; + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + goto errout; type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; - nlh->nlmsg_flags = flags; + nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = exp->tuple.src.l3num; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; + rcu_read_lock(); if (ctnetlink_exp_dump_expect(skb, exp) < 0) - goto nfattr_failure; + goto nla_put_failure; + rcu_read_unlock(); - nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); - return NOTIFY_DONE; + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); + return 0; +nla_put_failure: + rcu_read_unlock(); + nlmsg_cancel(skb, nlh); nlmsg_failure: -nfattr_failure: kfree_skb(skb); - return NOTIFY_DONE; +errout: + nfnetlink_set_err(net, 0, 0, -ENOBUFS); + return 0; } #endif +static int ctnetlink_exp_done(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); + return 0; +} static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conntrack_expect *exp = NULL; - struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[0]; - struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); + struct net *net = sock_net(skb->sk); + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; + for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { +restart: + hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], + hnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, + 
NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + } +out: + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); + + return skb->len; +} + +static int +ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nf_conn *ct = cb->data; + struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; - read_lock_bh(&nf_conntrack_lock); - list_for_each_prev(i, &nf_conntrack_expect_list) { - exp = (struct nf_conntrack_expect *) i; + if (cb->args[0]) + return 0; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; +restart: + hlist_for_each_entry(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; - if (exp->id <= *id) - continue; - if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, - 1, exp) < 0) + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; goto out; - *id = exp->id; + } } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + cb->args[0] = 1; out: - read_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); return skb->len; } -static const size_t cta_min_exp[CTA_EXPECT_MAX] = { - [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), - [CTA_EXPECT_ID-1] = sizeof(u_int32_t) -}; +static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int err; + struct net *net = sock_net(ctnl); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + u16 zone = 0; + struct netlink_dump_control c = { + .dump = ctnetlink_exp_ct_dump_table, + .done = ctnetlink_exp_done, + }; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + } + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + c.data = ct; + + err = netlink_dump_start(ctnl, skb, nlh, &c); + nf_ct_put(ct); + + return err; +} static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[]) + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { + struct net *net = sock_net(ctnl); struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct sk_buff *skb2; - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - int err = 0; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; + u16 zone; + int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { - return netlink_dump_start(ctnl, skb, nlh, - ctnetlink_exp_dump_table, - ctnetlink_done); + if (cda[CTA_EXPECT_MASTER]) + return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda); + else { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_dump_table, + 
.done = ctnetlink_exp_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } } - if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_TUPLE]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + else if (cda[CTA_EXPECT_MASTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); else return -EINVAL; @@ -1275,128 +2643,272 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - exp = nf_conntrack_expect_find_get(&tuple); + exp = nf_ct_expect_find_get(net, zone, &tuple); if (!exp) return -ENOENT; - if (cda[CTA_EXPECT_ID-1]) { - __be32 id = *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - nf_conntrack_expect_put(exp); + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + if (ntohl(id) != (u32)(unsigned long)exp) { + nf_ct_expect_put(exp); return -ENOENT; } } err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb2) + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + nf_ct_expect_put(exp); goto out; + } - err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, - nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, - 1, exp); + rcu_read_lock(); + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); + rcu_read_unlock(); + nf_ct_expect_put(exp); if (err <= 0) goto free; - nf_conntrack_expect_put(exp); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (err < 0) + goto out; - return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + return 0; free: kfree_skb(skb2); out: - nf_conntrack_expect_put(exp); - return err; + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? 
-ENOBUFS : err; } static int ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[]) + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct nf_conntrack_expect *exp, *tmp; + struct net *net = sock_net(ctnl); + struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; - struct nf_conntrack_helper *h; - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; + unsigned int i; + u16 zone; int err; - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (cda[CTA_EXPECT_TUPLE-1]) { + if (cda[CTA_EXPECT_TUPLE]) { /* delete a single expect by tuple */ + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); if (err < 0) return err; /* bump usage count to 2 */ - exp = nf_conntrack_expect_find_get(&tuple); + exp = nf_ct_expect_find_get(net, zone, &tuple); if (!exp) return -ENOENT; - if (cda[CTA_EXPECT_ID-1]) { - __be32 id = *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - nf_conntrack_expect_put(exp); + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + if (ntohl(id) != (u32)(unsigned long)exp) { + nf_ct_expect_put(exp); return -ENOENT; } } /* after list removal, usage count == 1 */ - nf_conntrack_unexpect_related(exp); + spin_lock_bh(&nf_conntrack_expect_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + spin_unlock_bh(&nf_conntrack_expect_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ - nf_conntrack_expect_put(exp); - } else if (cda[CTA_EXPECT_HELP_NAME-1]) { - char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); + nf_ct_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME]) { + char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); + struct nf_conn_help *m_help; /* delete all expectations for this helper */ - write_lock_bh(&nf_conntrack_lock); - h = __nf_conntrack_helper_find_byname(name); - if (!h) { - write_unlock_bh(&nf_conntrack_lock); - return -EINVAL; - } - list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, - list) { - struct nf_conn_help *m_help = nfct_help(exp->master); - if (m_help->helper == h - && del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); - nf_conntrack_expect_put(exp); + spin_lock_bh(&nf_conntrack_expect_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &net->ct.expect_hash[i], + hnode) { + m_help = nfct_help(exp->master); + if (!strcmp(m_help->helper->name, name) && + del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } } } - write_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } else { /* This basically means we have to flush everything*/ - write_lock_bh(&nf_conntrack_lock); - list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, - list) { - if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); - nf_conntrack_expect_put(exp); + spin_lock_bh(&nf_conntrack_expect_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &net->ct.expect_hash[i], + hnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + 
nf_ct_expect_put(exp); + } } } - write_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; } static int -ctnetlink_change_expect(struct nf_conntrack_expect *x, struct nfattr *cda[]) +ctnetlink_change_expect(struct nf_conntrack_expect *x, + const struct nlattr * const cda[]) +{ + if (cda[CTA_EXPECT_TIMEOUT]) { + if (!del_timer(&x->timeout)) + return -ETIME; + + x->timeout.expires = jiffies + + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; + add_timer(&x->timeout); + } + return 0; +} + +static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { + [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, +}; + +static int +ctnetlink_parse_expect_nat(const struct nlattr *attr, + struct nf_conntrack_expect *exp, + u_int8_t u3) { +#ifdef CONFIG_NF_NAT_NEEDED + struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; + struct nf_conntrack_tuple nat_tuple = {}; + int err; + + err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) + return -EINVAL; + + err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, + &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); + if (err < 0) + return err; + + exp->saved_addr = nat_tuple.src.u3; + exp->saved_proto = nat_tuple.src.u; + exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); + + return 0; +#else return -EOPNOTSUPP; +#endif +} + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + u_int32_t class = 0; + struct nf_conntrack_expect *exp; + struct nf_conn_help *help; + int err; + + if (cda[CTA_EXPECT_CLASS] && helper) { + class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); + if (class > helper->expect_class_max) + return ERR_PTR(-EINVAL); + } + exp = nf_ct_expect_alloc(ct); + if (!exp) + return ERR_PTR(-ENOMEM); + + help = nfct_help(ct); + if (!help) { + if (!cda[CTA_EXPECT_TIMEOUT]) { + err = -EINVAL; + goto err_out; + } + exp->timeout.expires = + jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; + + exp->flags = NF_CT_EXPECT_USERSPACE; + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags |= + ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + } + } else { + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + exp->flags &= ~NF_CT_EXPECT_USERSPACE; + } else + exp->flags = 0; + } + if (cda[CTA_EXPECT_FN]) { + const char *name = nla_data(cda[CTA_EXPECT_FN]); + struct nf_ct_helper_expectfn *expfn; + + expfn = nf_ct_helper_expectfn_find_by_name(name); + if (expfn == NULL) { + err = -EINVAL; + goto err_out; + } + exp->expectfn = expfn->expectfn; + } else + exp->expectfn = NULL; + + exp->class = class; + exp->master = ct; + exp->helper = helper; + exp->tuple = *tuple; + exp->mask.src.u3 = mask->src.u3; + exp->mask.src.u.all = mask->src.u.all; + + if (cda[CTA_EXPECT_NAT]) { + err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], + exp, nf_ct_l3num(ct)); + if (err < 0) + goto err_out; + } + return exp; +err_out: + nf_ct_expect_put(exp); + return ERR_PTR(err); } static int -ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3) +ctnetlink_create_expect(struct net *net, u16 zone, + const struct nlattr * const cda[], + u_int8_t u3, u32 portid, int report) { struct nf_conntrack_tuple tuple, mask, master_tuple; struct nf_conntrack_tuple_hash *h = NULL; + struct 
nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; struct nf_conn *ct; - struct nf_conn_help *help; - int err = 0; + int err; /* caller guarantees that those three CTA_EXPECT_* exist */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); @@ -1410,118 +2922,224 @@ ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3) return err; /* Look for master conntrack of this expectation */ - h = nf_conntrack_find_get(&master_tuple, NULL); + h = nf_conntrack_find_get(net, zone, &master_tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); - help = nfct_help(ct); - if (!help || !help->helper) { - /* such conntrack hasn't got any helper, abort */ - err = -EINVAL; - goto out; - } + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); - exp = nf_conntrack_expect_alloc(ct); - if (!exp) { - err = -ENOMEM; - goto out; + helper = __nf_conntrack_helper_find(helpname, u3, + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto err_ct; + } + helper = __nf_conntrack_helper_find(helpname, u3, + nf_ct_protonum(ct)); + if (helper) { + err = -EAGAIN; + goto err_ct; + } +#endif + err = -EOPNOTSUPP; + goto err_ct; + } } - exp->expectfn = NULL; - exp->flags = 0; - exp->master = ct; - exp->helper = NULL; - memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); - memcpy(&exp->mask, &mask, sizeof(struct nf_conntrack_tuple)); + exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto err_ct; + } - err = nf_conntrack_expect_related(exp); - nf_conntrack_expect_put(exp); + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) + goto err_exp; -out: - nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); + return 0; +err_exp: + nf_ct_expect_put(exp); +err_ct: + nf_ct_put(ct); return err; } static int ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[]) + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { + struct net *net = sock_net(ctnl); struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - int err = 0; + u16 zone; + int err; - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + if (!cda[CTA_EXPECT_TUPLE] + || !cda[CTA_EXPECT_MASK] + || !cda[CTA_EXPECT_MASTER]) return -EINVAL; - if (!cda[CTA_EXPECT_TUPLE-1] - || !cda[CTA_EXPECT_MASK-1] - || !cda[CTA_EXPECT_MASTER-1]) - return -EINVAL; + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); if (err < 0) return err; - write_lock_bh(&nf_conntrack_lock); - exp = __nf_conntrack_expect_find(&tuple); + spin_lock_bh(&nf_conntrack_expect_lock); + exp = __nf_ct_expect_find(net, zone, &tuple); if (!exp) { - write_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_expect(cda, u3); + if (nlh->nlmsg_flags & NLM_F_CREATE) { + err = ctnetlink_create_expect(net, zone, cda, + u3, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } return err; } err = -EEXIST; if (!(nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); - write_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return err; } 
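
ctnetlink_new_expect() above closes out the expectation write path, and it follows the same netlink create-or-update convention as ctnetlink_new_conntrack() earlier in this file: NLM_F_CREATE permits creating a missing entry, NLM_F_EXCL refuses to touch an existing one. A minimal userspace reduction of that decision table is sketched below; new_entry() is a hypothetical name, and the real handlers additionally take nf_conntrack_expect_lock and report events, but the flag values are the standard netlink ones:

        /* Sketch of the create-or-update policy shared by
         * ctnetlink_new_conntrack() and ctnetlink_new_expect(). */
        #include <errno.h>
        #include <stdbool.h>
        #include <stdio.h>

        #define NLM_F_EXCL      0x200   /* do not touch an existing entry */
        #define NLM_F_CREATE    0x400   /* create if it does not exist */

        static int new_entry(bool exists, unsigned int flags)
        {
                if (!exists)
                        return (flags & NLM_F_CREATE) ? 0 : -ENOENT;
                if (flags & NLM_F_EXCL)
                        return -EEXIST;
                return 0;                       /* update in place */
        }

        int main(void)
        {
                printf("%d\n", new_entry(false, NLM_F_CREATE));                 /* 0: created */
                printf("%d\n", new_entry(false, 0));                            /* -ENOENT */
                printf("%d\n", new_entry(true, NLM_F_CREATE | NLM_F_EXCL));     /* -EEXIST */
                printf("%d\n", new_entry(true, 0));                             /* 0: changed */
                return 0;
        }
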
+static int +ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, + const struct ip_conntrack_stat *st) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0, event; + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(cpu); + + if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || + nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || + nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int cpu; + struct net *net = sock_net(skb->sk); + + if (cb->args[0] == nr_cpu_ids) + return 0; + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + const struct ip_conntrack_stat *st; + + if (!cpu_possible(cpu)) + continue; + + st = per_cpu_ptr(net->ct.stat, cpu); + if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + cpu, st) < 0) + break; + } + cb->args[0] = cpu; + + return skb->len; +} + +static int +ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_stat_cpu_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return 0; +} + #ifdef CONFIG_NF_CONNTRACK_EVENTS -static struct notifier_block ctnl_notifier = { - .notifier_call = ctnetlink_conntrack_event, +static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = ctnetlink_conntrack_event, }; -static struct notifier_block ctnl_notifier_exp = { - .notifier_call = ctnetlink_expect_event, +static struct nf_exp_event_notifier ctnl_notifier_exp = { + .fcn = ctnetlink_expect_event, }; #endif -static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { +static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, }, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, }, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, }, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, }, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu }, + [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct }, + [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying }, + [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed }, }; -static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { +static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, }, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, }, + .attr_count = 
CTA_EXPECT_MAX, + .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, }, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, + [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu }, }; -static struct nfnetlink_subsystem ctnl_subsys = { +static const struct nfnetlink_subsystem ctnl_subsys = { .name = "conntrack", .subsys_id = NFNL_SUBSYS_CTNETLINK, .cb_count = IPCTNL_MSG_MAX, .cb = ctnl_cb, }; -static struct nfnetlink_subsystem ctnl_exp_subsys = { +static const struct nfnetlink_subsystem ctnl_exp_subsys = { .name = "conntrack_expect", .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, .cb_count = IPCTNL_MSG_EXP_MAX, @@ -1532,45 +3150,84 @@ MODULE_ALIAS("ip_conntrack_netlink"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); +static int __net_init ctnetlink_net_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + int ret; + + ret = nf_conntrack_register_notifier(net, &ctnl_notifier); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register notifier.\n"); + goto err_out; + } + + ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); + if (ret < 0) { + pr_err("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: + nf_conntrack_unregister_notifier(net, &ctnl_notifier); +err_out: + return ret; +#endif +} + +static void ctnetlink_net_exit(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); + nf_conntrack_unregister_notifier(net, &ctnl_notifier); +#endif +} + +static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + ctnetlink_net_exit(net); +} + +static struct pernet_operations ctnetlink_net_ops = { + .init = ctnetlink_net_init, + .exit_batch = ctnetlink_net_exit_batch, +}; + static int __init ctnetlink_init(void) { int ret; - printk("ctnetlink v%s: registering with nfnetlink.\n", version); + pr_info("ctnetlink v%s: registering with nfnetlink.\n", version); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { - printk("ctnetlink_init: cannot register with nfnetlink.\n"); + pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); goto err_out; } ret = nfnetlink_subsys_register(&ctnl_exp_subsys); if (ret < 0) { - printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n"); goto err_unreg_subsys; } -#ifdef CONFIG_NF_CONNTRACK_EVENTS - ret = nf_conntrack_register_notifier(&ctnl_notifier); + ret = register_pernet_subsys(&ctnetlink_net_ops); if (ret < 0) { - printk("ctnetlink_init: cannot register notifier.\n"); + pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } - - ret = nf_conntrack_expect_register_notifier(&ctnl_notifier_exp); - if (ret < 0) { - printk("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT + /* setup interaction between nf_queue and nf_conntrack_netlink. 
*/ + RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook); #endif - return 0; -#ifdef CONFIG_NF_CONNTRACK_EVENTS -err_unreg_notifier: - nf_conntrack_unregister_notifier(&ctnl_notifier); err_unreg_exp_subsys: nfnetlink_subsys_unregister(&ctnl_exp_subsys); -#endif err_unreg_subsys: nfnetlink_subsys_unregister(&ctnl_subsys); err_out: @@ -1579,16 +3236,14 @@ err_out: static void __exit ctnetlink_exit(void) { - printk("ctnetlink: unregistering from nfnetlink.\n"); - -#ifdef CONFIG_NF_CONNTRACK_EVENTS - nf_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); - nf_conntrack_unregister_notifier(&ctnl_notifier); -#endif + pr_info("ctnetlink: unregistering from nfnetlink.\n"); + unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); - return; +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT + RCU_INIT_POINTER(nfq_ct_hook, NULL); +#endif } module_init(ctnetlink_init); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 115bcb5d5a7..825c3e3f830 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -11,10 +11,12 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * * Limitations: * - We blindly assume that control connections are always * established in PNS->PAC direction. This is a violation - * of RFFC2673 + * of RFC 2637 * - We can only support one single call within each session * TODO: * - testing of incoming PPTP calls @@ -28,6 +30,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> @@ -37,20 +40,21 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); MODULE_ALIAS("ip_conntrack_pptp"); +MODULE_ALIAS_NFCT_HELPER("pptp"); static DEFINE_SPINLOCK(nf_pptp_lock); int -(*nf_nat_pptp_hook_outbound)(struct sk_buff **pskb, +(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, + unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound); int -(*nf_nat_pptp_hook_inbound)(struct sk_buff **pskb, +(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, + unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); @@ -65,9 +69,9 @@ void struct nf_conntrack_expect *exp) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); -#if 0 +#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ -const char *pptp_msg_name[] = { +const char *const pptp_msg_name[] = { "UNKNOWN_MESSAGE", "START_SESSION_REQUEST", "START_SESSION_REPLY", @@ -86,9 +90,6 @@ const char *pptp_msg_name[] = { "SET_LINK_INFO" }; EXPORT_SYMBOL(pptp_msg_name); -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) 
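
The hunk above drops the file-local DEBUGP() macro, whose body was empty unless the surrounding #if 0 was flipped by hand, and the rest of the file switches to pr_debug(). One practical difference is that pr_debug() keeps its format string and arguments parsed and type-checked even when logging is compiled out. A userspace approximation of the two styles, for illustration only (##__VA_ARGS__ is the GNU extension the kernel itself relies on):

        #include <stdio.h>

        #ifdef DEBUG
        #define pr_debug(fmt, ...) fprintf(stderr, "debug: " fmt, ##__VA_ARGS__)
        #else
        /* still parses and type-checks its arguments, emits no output */
        #define pr_debug(fmt, ...) do { if (0) fprintf(stderr, fmt, ##__VA_ARGS__); } while (0)
        #endif

        int main(void)
        {
                pr_debug("inbound control message %s\n", "START_SESSION_REQUEST");
                return 0;
        }
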
#endif #define SECS *HZ @@ -101,8 +102,9 @@ EXPORT_SYMBOL(pptp_msg_name); static void pptp_expectfn(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + struct net *net = nf_ct_net(ct); typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn; - DEBUGP("increasing timeouts\n"); + pr_debug("increasing timeouts\n"); /* increase timeout of GRE data channel conntrack entry */ ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; @@ -121,35 +123,37 @@ static void pptp_expectfn(struct nf_conn *ct, /* obviously this tuple inversion only works until you do NAT */ nf_ct_invert_tuplepr(&inv_t, &exp->tuple); - DEBUGP("trying to unexpect other dir: "); - NF_CT_DUMP_TUPLE(&inv_t); + pr_debug("trying to unexpect other dir: "); + nf_ct_dump_tuple(&inv_t); - exp_other = nf_conntrack_expect_find_get(&inv_t); + exp_other = nf_ct_expect_find_get(net, nf_ct_zone(ct), &inv_t); if (exp_other) { /* delete other expectation. */ - DEBUGP("found\n"); - nf_conntrack_unexpect_related(exp_other); - nf_conntrack_expect_put(exp_other); + pr_debug("found\n"); + nf_ct_unexpect_related(exp_other); + nf_ct_expect_put(exp_other); } else { - DEBUGP("not found\n"); + pr_debug("not found\n"); } } rcu_read_unlock(); } -static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) +static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, + const struct nf_conntrack_tuple *t) { - struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; struct nf_conn *sibling; + u16 zone = nf_ct_zone(ct); - DEBUGP("trying to timeout ct or exp for tuple "); - NF_CT_DUMP_TUPLE(t); + pr_debug("trying to timeout ct or exp for tuple "); + nf_ct_dump_tuple(t); - h = nf_conntrack_find_get(t, NULL); + h = nf_conntrack_find_get(net, zone, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); - DEBUGP("setting timeout of conntrack %p to 0\n", sibling); + pr_debug("setting timeout of conntrack %p to 0\n", sibling); sibling->proto.gre.timeout = 0; sibling->proto.gre.stream_timeout = 0; if (del_timer(&sibling->timeout)) @@ -157,11 +161,11 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) nf_ct_put(sibling); return 1; } else { - exp = nf_conntrack_expect_find_get(t); + exp = nf_ct_expect_find_get(net, zone, t); if (exp) { - DEBUGP("unexpect_related of expect %p\n", exp); - nf_conntrack_unexpect_related(exp); - nf_conntrack_expect_put(exp); + pr_debug("unexpect_related of expect %p\n", exp); + nf_ct_unexpect_related(exp); + nf_ct_expect_put(exp); return 1; } } @@ -171,7 +175,8 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) /* timeout GRE data connections */ static void pptp_destroy_siblings(struct nf_conn *ct) { - struct nf_conn_help *help = nfct_help(ct); + struct net *net = nf_ct_net(ct); + const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_conntrack_tuple t; nf_ct_gre_keymap_destroy(ct); @@ -179,18 +184,18 @@ static void pptp_destroy_siblings(struct nf_conn *ct) /* try original (pns->pac) tuple */ memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t)); t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id; - t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id; - if (!destroy_sibling_or_exp(&t)) - DEBUGP("failed to timeout original pns->pac ct/exp\n"); + t.src.u.gre.key = ct_pptp_info->pns_call_id; + t.dst.u.gre.key = ct_pptp_info->pac_call_id; + if (!destroy_sibling_or_exp(net, ct, &t)) + pr_debug("failed to timeout original pns->pac ct/exp\n"); /* try reply (pac->pns) tuple */ 
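
pptp_destroy_siblings() above has just filled in the original-direction (PNS->PAC) GRE tuple from the call IDs recorded on the TCP control channel; the hunk that follows builds the reply-direction tuple with the two keys swapped. A standalone sketch of that derivation is below; struct gre_tuple and derive_gre_tuples() are hypothetical simplifications of struct nf_conntrack_tuple, kept only to show how one pair of call IDs yields both data-channel tuples:

        #include <stdint.h>
        #include <stdio.h>

        struct gre_tuple {
                uint16_t src_key;       /* sender's call ID   */
                uint16_t dst_key;       /* receiver's call ID */
        };

        static void derive_gre_tuples(uint16_t pns_call_id, uint16_t pac_call_id,
                                      struct gre_tuple *orig, struct gre_tuple *reply)
        {
                /* original direction: PNS -> PAC */
                orig->src_key = pns_call_id;
                orig->dst_key = pac_call_id;
                /* reply direction: PAC -> PNS, keys swapped */
                reply->src_key = pac_call_id;
                reply->dst_key = pns_call_id;
        }

        int main(void)
        {
                struct gre_tuple o, r;

                derive_gre_tuples(0x0001, 0x8001, &o, &r);
                printf("orig  %04x -> %04x\n", o.src_key, o.dst_key);
                printf("reply %04x -> %04x\n", r.src_key, r.dst_key);
                return 0;
        }
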
memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t)); t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id; - t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id; - if (!destroy_sibling_or_exp(&t)) - DEBUGP("failed to timeout reply pac->pns ct/exp\n"); + t.src.u.gre.key = ct_pptp_info->pac_call_id; + t.dst.u.gre.key = ct_pptp_info->pns_call_id; + if (!destroy_sibling_or_exp(net, ct, &t)) + pr_debug("failed to timeout reply pac->pns ct/exp\n"); } /* expect GRE connections (PNS->PAC and PAC->PNS direction) */ @@ -201,36 +206,38 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) int ret = 1; typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre; - exp_orig = nf_conntrack_expect_alloc(ct); + exp_orig = nf_ct_expect_alloc(ct); if (exp_orig == NULL) goto out; - exp_reply = nf_conntrack_expect_alloc(ct); + exp_reply = nf_ct_expect_alloc(ct); if (exp_reply == NULL) goto out_put_orig; /* original direction, PNS->PAC */ dir = IP_CT_DIR_ORIGINAL; - nf_conntrack_expect_init(exp_orig, ct->tuplehash[dir].tuple.src.l3num, - &ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[dir].tuple.dst.u3, - IPPROTO_GRE, &peer_callid, &callid); + nf_ct_expect_init(exp_orig, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[dir].tuple.dst.u3, + IPPROTO_GRE, &peer_callid, &callid); exp_orig->expectfn = pptp_expectfn; /* reply direction, PAC->PNS */ dir = IP_CT_DIR_REPLY; - nf_conntrack_expect_init(exp_reply, ct->tuplehash[dir].tuple.src.l3num, - &ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[dir].tuple.dst.u3, - IPPROTO_GRE, &callid, &peer_callid); + nf_ct_expect_init(exp_reply, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[dir].tuple.dst.u3, + IPPROTO_GRE, &callid, &peer_callid); exp_reply->expectfn = pptp_expectfn; nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre); if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK) nf_nat_pptp_exp_gre(exp_orig, exp_reply); - if (nf_conntrack_expect_related(exp_orig) != 0) + if (nf_ct_expect_related(exp_orig) != 0) goto out_put_both; - if (nf_conntrack_expect_related(exp_reply) != 0) + if (nf_ct_expect_related(exp_reply) != 0) goto out_unexpect_orig; /* Add GRE keymap entries */ @@ -243,34 +250,34 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) ret = 0; out_put_both: - nf_conntrack_expect_put(exp_reply); + nf_ct_expect_put(exp_reply); out_put_orig: - nf_conntrack_expect_put(exp_orig); + nf_ct_expect_put(exp_orig); out: return ret; out_unexpect_both: - nf_conntrack_unexpect_related(exp_reply); + nf_ct_unexpect_related(exp_reply); out_unexpect_orig: - nf_conntrack_unexpect_related(exp_orig); + nf_ct_unexpect_related(exp_orig); goto out_put_both; } static inline int -pptp_inbound_pkt(struct sk_buff **pskb, +pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq, unsigned int reqlen, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + struct nf_ct_pptp_master *info = nfct_help_data(ct); u_int16_t msg; __be16 cid = 0, pcid = 0; typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; msg = ntohs(ctlh->messageType); - DEBUGP("inbound control message %s\n", pptp_msg_name[msg]); + pr_debug("inbound control message %s\n", pptp_msg_name[msg]); switch (msg) { case PPTP_START_SESSION_REPLY: @@ -305,8 +312,8 @@ pptp_inbound_pkt(struct sk_buff 
**pskb, pcid = pptpReq->ocack.peersCallID; if (info->pns_call_id != pcid) goto invalid; - DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], - ntohs(cid), ntohs(pcid)); + pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], + ntohs(cid), ntohs(pcid)); if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { info->cstate = PPTP_CALL_OUT_CONF; @@ -322,7 +329,7 @@ pptp_inbound_pkt(struct sk_buff **pskb, goto invalid; cid = pptpReq->icreq.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); info->cstate = PPTP_CALL_IN_REQ; info->pac_call_id = cid; break; @@ -341,7 +348,7 @@ pptp_inbound_pkt(struct sk_buff **pskb, if (info->pns_call_id != pcid) goto invalid; - DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); + pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); info->cstate = PPTP_CALL_IN_CONF; /* we expect a GRE connection from PAC to PNS */ @@ -351,7 +358,7 @@ pptp_inbound_pkt(struct sk_buff **pskb, case PPTP_CALL_DISCONNECT_NOTIFY: /* server confirms disconnect */ cid = pptpReq->disc.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); info->cstate = PPTP_CALL_NONE; /* untrack this call id, unexpect GRE packets */ @@ -359,6 +366,7 @@ pptp_inbound_pkt(struct sk_buff **pskb, break; case PPTP_WAN_ERROR_NOTIFY: + case PPTP_SET_LINK_INFO: case PPTP_ECHO_REQUEST: case PPTP_ECHO_REPLY: /* I don't have to explain these ;) */ @@ -370,33 +378,34 @@ pptp_inbound_pkt(struct sk_buff **pskb, nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound); if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_inbound(pskb, ct, ctinfo, ctlh, pptpReq); + return nf_nat_pptp_inbound(skb, ct, ctinfo, + protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: - DEBUGP("invalid %s: type=%d cid=%u pcid=%u " - "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], - msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, - ntohs(info->pns_call_id), ntohs(info->pac_call_id)); + pr_debug("invalid %s: type=%d cid=%u pcid=%u " + "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", + msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : pptp_msg_name[0], + msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, + ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; } static inline int -pptp_outbound_pkt(struct sk_buff **pskb, +pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq, unsigned int reqlen, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + struct nf_ct_pptp_master *info = nfct_help_data(ct); u_int16_t msg; __be16 cid = 0, pcid = 0; typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; msg = ntohs(ctlh->messageType); - DEBUGP("outbound control message %s\n", pptp_msg_name[msg]); + pr_debug("outbound control message %s\n", pptp_msg_name[msg]); switch (msg) { case PPTP_START_SESSION_REQUEST: @@ -418,7 +427,7 @@ pptp_outbound_pkt(struct sk_buff **pskb, info->cstate = PPTP_CALL_OUT_REQ; /* track PNS call id */ cid = pptpReq->ocreq.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); info->pns_call_id = cid; break; @@ -432,8 +441,8 @@ pptp_outbound_pkt(struct sk_buff **pskb, pcid = pptpReq->icack.peersCallID; if (info->pac_call_id != pcid) goto invalid; - DEBUGP("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], - ntohs(cid), ntohs(pcid)); + pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], + ntohs(cid), ntohs(pcid)); if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { /* part two of the three-way handshake */ @@ -465,15 +474,16 @@ pptp_outbound_pkt(struct sk_buff **pskb, nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound); if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_outbound(pskb, ct, ctinfo, ctlh, pptpReq); + return nf_nat_pptp_outbound(skb, ct, ctinfo, + protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: - DEBUGP("invalid %s: type=%d cid=%u pcid=%u " - "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], - msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, - ntohs(info->pns_call_id), ntohs(info->pac_call_id)); + pr_debug("invalid %s: type=%d cid=%u pcid=%u " + "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", + msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : pptp_msg_name[0], + msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, + ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; } @@ -495,36 +505,37 @@ static const unsigned int pptp_msg_size[] = { /* track caller id inside control connection, call expect_related */ static int -conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff, +conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { int dir = CTINFO2DIR(ctinfo); - struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; - struct tcphdr _tcph, *tcph; - struct pptp_pkt_hdr _pptph, *pptph; + const struct nf_ct_pptp_master *info = nfct_help_data(ct); + const struct tcphdr *tcph; + struct tcphdr _tcph; + const struct pptp_pkt_hdr *pptph; + struct pptp_pkt_hdr _pptph; struct PptpControlHeader _ctlh, *ctlh; union pptp_ctrl_union _pptpReq, *pptpReq; - unsigned int tcplen = (*pskb)->len - protoff; + unsigned int tcplen = skb->len - protoff; unsigned int datalen, reqlen, nexthdr_off; int oldsstate, oldcstate; int ret; u_int16_t msg; /* don't do any tracking before tcp handshake complete */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; nexthdr_off = protoff; - tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph); + tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph); BUG_ON(!tcph); nexthdr_off += tcph->doff * 4; datalen = tcplen - tcph->doff * 4; - pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph); + pptph = skb_header_pointer(skb, nexthdr_off, sizeof(_pptph), &_pptph); if (!pptph) { - DEBUGP("no full PPTP header, can't track\n"); + pr_debug("no full PPTP header, can't track\n"); return NF_ACCEPT; } nexthdr_off += sizeof(_pptph); @@ -533,11 +544,11 @@ conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff, /* if it's not a control message we can't do anything with it */ if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a control packet\n"); + pr_debug("not a control packet\n"); return NF_ACCEPT; } - ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh); + ctlh = skb_header_pointer(skb, nexthdr_off, sizeof(_ctlh), &_ctlh); if (!ctlh) return NF_ACCEPT; nexthdr_off += sizeof(_ctlh); @@ -550,7 +561,7 @@ conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff, if (reqlen > sizeof(*pptpReq)) reqlen = sizeof(*pptpReq); - pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq); + pptpReq = skb_header_pointer(skb, nexthdr_off, reqlen, &_pptpReq); if (!pptpReq) return NF_ACCEPT; @@ -563,33 +574,35 @@ conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff, * established from PNS->PAC. 
However, RFC makes no guarantee */ if (dir == IP_CT_DIR_ORIGINAL) /* client -> server (PNS -> PAC) */ - ret = pptp_outbound_pkt(pskb, ctlh, pptpReq, reqlen, ct, + ret = pptp_outbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct, ctinfo); else /* server -> client (PAC -> PNS) */ - ret = pptp_inbound_pkt(pskb, ctlh, pptpReq, reqlen, ct, + ret = pptp_inbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct, ctinfo); - DEBUGP("sstate: %d->%d, cstate: %d->%d\n", - oldsstate, info->sstate, oldcstate, info->cstate); + pr_debug("sstate: %d->%d, cstate: %d->%d\n", + oldsstate, info->sstate, oldcstate, info->cstate); spin_unlock_bh(&nf_pptp_lock); return ret; } +static const struct nf_conntrack_expect_policy pptp_exp_policy = { + .max_expected = 2, + .timeout = 5 * 60, +}; + /* control protocol helper */ static struct nf_conntrack_helper pptp __read_mostly = { .name = "pptp", .me = THIS_MODULE, - .max_expected = 2, - .timeout = 5 * 60, + .data_len = sizeof(struct nf_ct_pptp_master), .tuple.src.l3num = AF_INET, - .tuple.src.u.tcp.port = __constant_htons(PPTP_CONTROL_PORT), + .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT), .tuple.dst.protonum = IPPROTO_TCP, - .mask.src.l3num = 0xffff, - .mask.src.u.tcp.port = __constant_htons(0xffff), - .mask.dst.protonum = 0xff, .help = conntrack_pptp_help, .destroy = pptp_destroy_siblings, + .expect_policy = &pptp_exp_policy, }; static int __init nf_conntrack_pptp_init(void) @@ -600,7 +613,6 @@ static int __init nf_conntrack_pptp_init(void) static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - nf_ct_gre_keymap_flush(); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 6d947068c58..b65d5864b6d 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -12,13 +13,12 @@ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/mutex.h> -#include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/percpu.h> -#include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -28,35 +28,40 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> -static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; -struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; +static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL static int -nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_table *path, - struct ctl_table *table, unsigned int *users) +nf_ct_register_sysctl(struct net *net, + struct ctl_table_header **header, + const char *path, + struct ctl_table *table) { if (*header == NULL) { - *header = nf_register_sysctl_table(path, table); + *header = register_net_sysctl(net, path, table); if (*header == NULL) return -ENOMEM; } - 
if (users != NULL) - (*users)++; + return 0; } static void nf_ct_unregister_sysctl(struct ctl_table_header **header, - struct ctl_table *table, unsigned int *users) + struct ctl_table **table, + unsigned int users) { - if (users != NULL && --*users > 0) + if (users > 0) return; - nf_unregister_sysctl_table(*header, table); + + unregister_net_sysctl_table(*header); + kfree(*table); *header = NULL; + *table = NULL; } #endif @@ -72,27 +77,6 @@ EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); /* this is guaranteed to always return a valid protocol helper, since * it falls back to generic_protocol */ -struct nf_conntrack_l4proto * -nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto) -{ - struct nf_conntrack_l4proto *p; - - rcu_read_lock(); - p = __nf_ct_l4proto_find(l3proto, l4proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l4proto_generic; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); - -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); - struct nf_conntrack_l3proto * nf_ct_l3proto_find_get(u_int16_t l3proto) { @@ -108,12 +92,6 @@ nf_ct_l3proto_find_get(u_int16_t l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); -void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); - int nf_ct_l3proto_try_module_get(unsigned short l3proto) { @@ -137,153 +115,264 @@ void nf_ct_l3proto_module_put(unsigned short l3proto) { struct nf_conntrack_l3proto *p; - /* rcu_read_lock not necessary since the caller holds a reference */ + /* rcu_read_lock not necessary since the caller holds a reference, but + * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find() + */ + rcu_read_lock(); p = __nf_ct_l3proto_find(l3proto); module_put(p->me); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); +struct nf_conntrack_l4proto * +nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) +{ + struct nf_conntrack_l4proto *p; + + rcu_read_lock(); + p = __nf_ct_l4proto_find(l3num, l4num); + if (!try_module_get(p->me)) + p = &nf_conntrack_l4proto_generic; + rcu_read_unlock(); + + return p; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); + +void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) +{ + module_put(p->me); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); + static int kill_l3proto(struct nf_conn *i, void *data) { - return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == - ((struct nf_conntrack_l3proto *)data)->l3proto); + return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; } static int kill_l4proto(struct nf_conn *i, void *data) { struct nf_conntrack_l4proto *l4proto; l4proto = (struct nf_conntrack_l4proto *)data; - return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == - l4proto->l4proto) && - (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == - l4proto->l3proto); + return nf_ct_protonum(i) == l4proto->l4proto && + nf_ct_l3num(i) == l4proto->l3proto; } -static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) +static struct nf_ip_net *nf_ct_l3proto_net(struct net *net, + struct nf_conntrack_l3proto *l3proto) { - int err = 0; + if (l3proto->l3proto == PF_INET) + return &net->ct.nf_ct_proto; + else + return NULL; +} -#ifdef CONFIG_SYSCTL - if (l3proto->ctl_table != NULL) { - err = nf_ct_register_sysctl(&l3proto->ctl_table_header, +static int nf_ct_l3proto_register_sysctl(struct net *net, + struct nf_conntrack_l3proto *l3proto) +{ + int err = 0; + struct nf_ip_net *in = 
nf_ct_l3proto_net(net, l3proto); + /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */ + if (in == NULL) + return 0; + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + if (in->ctl_table != NULL) { + err = nf_ct_register_sysctl(net, + &in->ctl_table_header, l3proto->ctl_table_path, - l3proto->ctl_table, NULL); + in->ctl_table); + if (err < 0) { + kfree(in->ctl_table); + in->ctl_table = NULL; + } } #endif return err; }
-static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto) +static void nf_ct_l3proto_unregister_sysctl(struct net *net, + struct nf_conntrack_l3proto *l3proto) { -#ifdef CONFIG_SYSCTL - if (l3proto->ctl_table_header != NULL) - nf_ct_unregister_sysctl(&l3proto->ctl_table_header, - l3proto->ctl_table, NULL); + struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); + + if (in == NULL) + return; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + if (in->ctl_table_header != NULL) + nf_ct_unregister_sysctl(&in->ctl_table_header, + &in->ctl_table, + 0); #endif }
-int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; + struct nf_conntrack_l3proto *old; if (proto->l3proto >= AF_MAX) return -EBUSY; + if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size) + return -EINVAL; + mutex_lock(&nf_ct_proto_mutex); - if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { + old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex)); + if (old != &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } - ret = nf_ct_l3proto_register_sysctl(proto); - if (ret < 0) - goto out_unlock; + if (proto->nlattr_tuple_size) + proto->nla_size = 3 * proto->nlattr_tuple_size(); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; + +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
+ +int nf_ct_l3proto_pernet_register(struct net *net, + struct nf_conntrack_l3proto *proto) +{ + int ret = 0; + + if (proto->init_net) { + ret = proto->init_net(net); + if (ret < 0) + return ret; + } + + return nf_ct_l3proto_register_sysctl(net, proto); } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
-void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); + BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != proto); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); - nf_ct_l3proto_unregister_sysctl(proto); mutex_unlock(&nf_ct_proto_mutex); synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
+ +void nf_ct_l3proto_pernet_unregister(struct net *net, + struct nf_conntrack_l3proto *proto) +{ + nf_ct_l3proto_unregister_sysctl(net, proto); /* Remove all conntrack entries for this protocol */ - nf_ct_iterate_cleanup(kill_l3proto, proto); + nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister);
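
(Aside for readers following the new locking scheme: updates to the nf_ct_l3protos[] and nf_ct_protos[] arrays are serialized by nf_ct_proto_mutex, which is why the functions above use rcu_dereference_protected() with a lockdep expression instead of taking rcu_read_lock(). A minimal sketch of that update-side pattern, illustrative only; my_current_l3proto() is a hypothetical helper, not part of the patch:)

	/* Caller must hold nf_ct_proto_mutex; the checked dereference
	 * documents that to lockdep, no RCU read side is required. */
	static struct nf_conntrack_l3proto *my_current_l3proto(u_int16_t l3num)
	{
		return rcu_dereference_protected(nf_ct_l3protos[l3num],
						 lockdep_is_held(&nf_ct_proto_mutex));
	}
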
-static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) +static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, + struct nf_conntrack_l4proto *l4proto) +{ + if (l4proto->get_net_proto) { + /* statically built-in protocols use static per-net */ + return l4proto->get_net_proto(net); + } else if (l4proto->net_id) { + /* ... and loadable protocols use dynamic per-net */ + return net_generic(net, *l4proto->net_id); + } + return NULL; +}
+ +static +int nf_ct_l4proto_register_sysctl(struct net *net, + struct nf_proto_net *pn, + struct nf_conntrack_l4proto *l4proto) { int err = 0; #ifdef CONFIG_SYSCTL - if (l4proto->ctl_table != NULL) { - err = nf_ct_register_sysctl(l4proto->ctl_table_header, - nf_net_netfilter_sysctl_path, - l4proto->ctl_table, - l4proto->ctl_table_users); - if (err < 0) - goto out; + if (pn->ctl_table != NULL) { + err = nf_ct_register_sysctl(net, + &pn->ctl_table_header, + "net/netfilter", + pn->ctl_table); + if (err < 0) { + if (!pn->users) { + kfree(pn->ctl_table); + pn->ctl_table = NULL; + } + } } #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->ctl_compat_table != NULL) { - err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header, - nf_net_ipv4_netfilter_sysctl_path, - l4proto->ctl_compat_table, NULL); + if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) { + if (err < 0) { + nf_ct_kfree_compat_sysctl_table(pn); + goto out; + } + err = nf_ct_register_sysctl(net, + &pn->ctl_compat_header, + "net/ipv4/netfilter", + pn->ctl_compat_table); if (err == 0) goto out; + + nf_ct_kfree_compat_sysctl_table(pn); - nf_ct_unregister_sysctl(l4proto->ctl_table_header, - l4proto->ctl_table, - l4proto->ctl_table_users); + nf_ct_unregister_sysctl(&pn->ctl_table_header, + &pn->ctl_table, + pn->users); } -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ out: +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ return err; }
-static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto) +static +void nf_ct_l4proto_unregister_sysctl(struct net *net, + struct nf_proto_net *pn, + struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_SYSCTL - if (l4proto->ctl_table_header != NULL && - *l4proto->ctl_table_header != NULL) - nf_ct_unregister_sysctl(l4proto->ctl_table_header, - l4proto->ctl_table, - l4proto->ctl_table_users); + if (pn->ctl_table_header != NULL) + nf_ct_unregister_sysctl(&pn->ctl_table_header, + &pn->ctl_table, + pn->users); + #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->ctl_compat_table_header != NULL) - nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header, - l4proto->ctl_compat_table, NULL); + if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL) + nf_ct_unregister_sysctl(&pn->ctl_compat_header, + &pn->ctl_compat_table, + 0); #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ }
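
(The path above assumes each tracker's nf_proto_net carries a private, kmemdup'd copy of a template ctl_table, so the .data pointers can be aimed at per-namespace storage; the dccp and generic trackers later in this diff follow exactly this convention. A minimal sketch, with struct my_net and the table contents hypothetical:)

	static struct ctl_table my_sysctl_template[] = {
		{
			.procname	= "nf_conntrack_my_timeout",
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_jiffies,
		},
		{ }
	};

	static int my_kmemdup_sysctl_table(struct nf_proto_net *pn,
					   struct my_net *mn)
	{
		pn->ctl_table = kmemdup(my_sysctl_template,
					sizeof(my_sysctl_template), GFP_KERNEL);
		if (!pn->ctl_table)
			return -ENOMEM;
		/* point the private copy at this namespace's value */
		pn->ctl_table[0].data = &mn->timeout;
		return 0;
	}
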
/* FIXME: Allow NULL functions and sub in pointers to generic for them. --RR */ -int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto) { int ret = 0; if (l4proto->l3proto >= PF_MAX) return -EBUSY; + if ((l4proto->to_nlattr && !l4proto->nlattr_size) + || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) + return -EINVAL; + mutex_lock(&nf_ct_proto_mutex); if (!nf_ct_protos[l4proto->l3proto]) { /* l3proto may be loaded later. */ - struct nf_conntrack_l4proto **proto_array; + struct nf_conntrack_l4proto __rcu **proto_array; int i; proto_array = kmalloc(MAX_NF_CT_PROTO * @@ -295,54 +384,130 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) } for (i = 0; i < MAX_NF_CT_PROTO; i++) - proto_array[i] = &nf_conntrack_l4proto_generic; + RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic); + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. + */ + smp_wmb(); + nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != - &nf_conntrack_l4proto_generic) { + } else if (rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != &nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } - ret = nf_ct_l4proto_register_sysctl(l4proto); - if (ret < 0) - goto out_unlock; + l4proto->nla_size = 0; + if (l4proto->nlattr_size) + l4proto->nla_size += l4proto->nlattr_size(); + if (l4proto->nlattr_tuple_size) + l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); - out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
-void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_pernet_register(struct net *net, + struct nf_conntrack_l4proto *l4proto) +{ + int ret = 0; + struct nf_proto_net *pn = NULL; + + if (l4proto->init_net) { + ret = l4proto->init_net(net, l4proto->l3proto); + if (ret < 0) + goto out; + } + + pn = nf_ct_l4proto_net(net, l4proto); + if (pn == NULL) + goto out; + + ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto); + if (ret < 0) + goto out; + + pn->users++; +out: + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+ +void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) { BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); + BUG_ON(rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != l4proto); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], &nf_conntrack_l4proto_generic); - nf_ct_l4proto_unregister_sysctl(l4proto); mutex_unlock(&nf_ct_proto_mutex); synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+ +void nf_ct_l4proto_pernet_unregister(struct net *net, + struct nf_conntrack_l4proto *l4proto) +{ + struct nf_proto_net *pn = NULL; + + pn = nf_ct_l4proto_net(net, l4proto); + if (pn == NULL) + return; + + pn->users--; + nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); /* Remove all conntrack entries for this protocol */ - nf_ct_iterate_cleanup(kill_l4proto, l4proto); + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0); } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
-int nf_conntrack_proto_init(void) +int nf_conntrack_proto_pernet_init(struct net *net) { - unsigned int i; int err; + struct nf_proto_net *pn = nf_ct_l4proto_net(net, + &nf_conntrack_l4proto_generic); - err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + err = nf_conntrack_l4proto_generic.init_net(net, + nf_conntrack_l4proto_generic.l3proto); + if (err < 0) + return err; + err = nf_ct_l4proto_register_sysctl(net, + pn, + &nf_conntrack_l4proto_generic); if (err < 0) return err; + pn->users++; + return 0; +}
+ +void nf_conntrack_proto_pernet_fini(struct net *net) +{ + struct nf_proto_net *pn = nf_ct_l4proto_net(net, + &nf_conntrack_l4proto_generic); + + pn->users--; + nf_ct_l4proto_unregister_sysctl(net, + pn, + &nf_conntrack_l4proto_generic); +}
+ +int nf_conntrack_proto_init(void) +{ + unsigned int i; for (i = 0; i < AF_MAX; i++) rcu_assign_pointer(nf_ct_l3protos[i], &nf_conntrack_l3proto_generic); @@ -352,9 +517,6 @@ int nf_conntrack_proto_init(void) void nf_conntrack_proto_fini(void) { unsigned int i; - - nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); - /* free l3proto protocol tables */ for (i = 0; i < PF_MAX; i++) kfree(nf_ct_protos[i]);
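
(With the global/per-net split above, an l4 tracker module brings itself up in two steps: per-net state first, then global publication; the new DCCP tracker in the next file is the in-tree example of this shape. A condensed sketch, with my_proto and my_net_ops hypothetical:)

	static int __init my_module_init(void)
	{
		int ret;

		/* per-net state (timeouts, sysctl tables) first ... */
		ret = register_pernet_subsys(&my_net_ops);
		if (ret < 0)
			return ret;
		/* ... then make the tracker globally visible */
		ret = nf_ct_l4proto_register(&my_proto);
		if (ret < 0)
			unregister_pernet_subsys(&my_net_ops);
		return ret;
	}

(Teardown runs in the reverse order, as nf_conntrack_proto_dccp_fini() below shows.)
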
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c new file mode 100644 index 00000000000..cb372f96f10 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -0,0 +1,1006 @@ +/* + * DCCP connection tracking protocol helper + * + * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/dccp.h> +#include <linux/slab.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h>
+ +/* Timeouts are based on values from RFC4340: + * + * - REQUEST: + * + * 8.1.2. Client Request + * + * A client MAY give up on its DCCP-Requests after some time + * (3 minutes, for example). + * + * - RESPOND: + * + * 8.1.3. Server Response + * + * It MAY also leave the RESPOND state for CLOSED after a timeout of + * not less than 4MSL (8 minutes); + * + * - PARTOPEN: + * + * 8.1.5. Handshake Completion + * + * If the client remains in PARTOPEN for more than 4MSL (8 minutes), + * it SHOULD reset the connection with Reset Code 2, "Aborted". + * + * - OPEN: + * + * The DCCP timestamp overflows after 11.9 hours. If the connection + * stays idle this long the sequence number won't be recognized + * as valid anymore. + * + * - CLOSEREQ/CLOSING: + * + * 8.3. Termination + * + * The retransmission timer should initially be set to go off in two + * round-trip times and should back off to not less than once every + * 64 seconds ... + * + * - TIMEWAIT: + * + * 4.3. States + * + * A server or client socket remains in this state for 2MSL (4 minutes) + * after the connection has been torn down, ... + */
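
(Reading aid, not patch content: DCCP_MSL is defined as two minutes just below, so the RFC guidance above maps onto the defaults that dccp_init_net() installs further down roughly as follows.)

	/*
	 * CT_DCCP_REQUEST           2 * DCCP_MSL   (4 min)
	 * CT_DCCP_RESPOND/PARTOPEN  4 * DCCP_MSL   (8 min, the "4MSL" bound)
	 * CT_DCCP_OPEN              12 * 3600 * HZ (12 h, timestamp lifetime)
	 * CT_DCCP_CLOSEREQ/CLOSING  64 * HZ        (retransmit backoff cap)
	 * CT_DCCP_TIMEWAIT          2 * DCCP_MSL   (2MSL = 4 min)
	 */
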
+ +#define DCCP_MSL (2 * 60 * HZ) + +static const char * const dccp_state_names[] = { + [CT_DCCP_NONE] = "NONE", + [CT_DCCP_REQUEST] = "REQUEST", + [CT_DCCP_RESPOND] = "RESPOND", + [CT_DCCP_PARTOPEN] = "PARTOPEN", + [CT_DCCP_OPEN] = "OPEN", + [CT_DCCP_CLOSEREQ] = "CLOSEREQ", + [CT_DCCP_CLOSING] = "CLOSING", + [CT_DCCP_TIMEWAIT] = "TIMEWAIT", + [CT_DCCP_IGNORE] = "IGNORE", + [CT_DCCP_INVALID] = "INVALID", +};
+ +#define sNO CT_DCCP_NONE +#define sRQ CT_DCCP_REQUEST +#define sRS CT_DCCP_RESPOND +#define sPO CT_DCCP_PARTOPEN +#define sOP CT_DCCP_OPEN +#define sCR CT_DCCP_CLOSEREQ +#define sCG CT_DCCP_CLOSING +#define sTW CT_DCCP_TIMEWAIT +#define sIG CT_DCCP_IGNORE +#define sIV CT_DCCP_INVALID
+ +/* + * DCCP state transition table + * + * The assumption is the same as for TCP tracking: + * + * We are the man in the middle. All the packets go through us but might + * get lost in transit to the destination. It is assumed that the destination + * can't receive segments we haven't seen. + * + * The following states exist: + * + * NONE: Initial state, expecting Request + * REQUEST: Request seen, waiting for Response from server + * RESPOND: Response from server seen, waiting for Ack from client + * PARTOPEN: Ack after Response seen, waiting for packet other than Response, + * Reset or Sync from server + * OPEN: Packet other than Response, Reset or Sync seen + * CLOSEREQ: CloseReq from server seen, expecting Close from client + * CLOSING: Close seen, expecting Reset + * TIMEWAIT: Reset seen + * IGNORE: Not determinable whether packet is valid + * + * Some states exist only on one side of the connection: REQUEST, RESPOND, + * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to + * the one it was in before. + * + * Packets are marked as ignored (sIG) if we don't know if they're valid + * (for example a reincarnation of a connection we didn't notice is dead + * already) and the server may send back a connection closing Reset or a + * Response. They're also used for Sync/SyncAck packets, which we don't + * care about. + */
+static const u_int8_t +dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { + [CT_DCCP_ROLE_CLIENT] = { + [DCCP_PKT_REQUEST] = { + /* + * sNO -> sRQ Regular Request + * sRQ -> sRQ Retransmitted Request or reincarnation + * sRS -> sRS Retransmitted Request (apparently Response + * got lost after we saw it) or reincarnation + * sPO -> sIG Ignore, conntrack might be out of sync + * sOP -> sIG Ignore, conntrack might be out of sync + * sCR -> sIG Ignore, conntrack might be out of sync + * sCG -> sIG Ignore, conntrack might be out of sync + * sTW -> sRQ Reincarnation + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, + }, + [DCCP_PKT_RESPONSE] = { + /* + * sNO -> sIV Invalid + * sRQ -> sIG Ignore, might be response to ignored Request + * sRS -> sIG Ignore, might be response to ignored Request + * sPO -> sIG Ignore, might be response to ignored Request + * sOP -> sIG Ignore, might be response to ignored Request + * sCR -> sIG Ignore, might be response to ignored Request + * sCG -> sIG Ignore, might be response to ignored Request + * sTW -> sIV Invalid, reincarnation in reverse direction + * goes through sRQ + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, + }, + [DCCP_PKT_ACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
+ * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN + * sOP -> sOP Regular ACK, remain in OPEN + * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV + }, + [DCCP_PKT_DATA] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) + * sOP -> sOP Regular Data packet + * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, + }, + [DCCP_PKT_DATAACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) + * sPO -> sPO Remain in PARTOPEN state + * sOP -> sOP Regular DataAck packet in OPEN state + * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV + }, + [DCCP_PKT_CLOSEREQ] = { + /* + * CLOSEREQ may only be sent by the server. + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV + }, + [DCCP_PKT_CLOSE] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sCG Client-initiated close + * sOP -> sCG Client-initiated close + * sCR -> sCG Close in response to CloseReq (8.3.) + * sCG -> sCG Retransmit + * sTW -> sIV Late retransmit, already in TIME_WAIT + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV + }, + [DCCP_PKT_RESET] = { + /* + * sNO -> sIV No connection + * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) + * sRS -> sTW Response received without Request + * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) + * sOP -> sTW Connection reset + * sCR -> sTW Connection reset + * sCG -> sTW Connection reset + * sTW -> sIG Ignore (don't refresh timer) + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG + }, + [DCCP_PKT_SYNC] = { + /* + * We currently ignore Sync packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + [DCCP_PKT_SYNCACK] = { + /* + * We currently ignore SyncAck packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + }, + [CT_DCCP_ROLE_SERVER] = { + [DCCP_PKT_REQUEST] = { + /* + * sNO -> sIV Invalid + * sRQ -> sIG Ignore, conntrack might be out of sync + * sRS -> sIG Ignore, conntrack might be out of sync + * sPO -> sIG Ignore, conntrack might be out of sync + * sOP -> sIG Ignore, conntrack might be out of sync + * sCR -> sIG Ignore, conntrack might be out of sync + * sCG -> sIG Ignore, conntrack might be out of sync + * sTW -> sRQ Reincarnation, must reverse roles + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ + }, + [DCCP_PKT_RESPONSE] = { + /* + * sNO -> sIV Response without Request + * sRQ -> sRS Response to clients Request + * sRS -> sRS Retransmitted Response (8.1.3. 
SHOULD NOT) + * sPO -> sIG Response to an ignored Request or late retransmit + * sOP -> sIG Ignore, might be response to ignored Request + * sCR -> sIG Ignore, might be response to ignored Request + * sCG -> sIG Ignore, might be response to ignored Request + * sTW -> sIV Invalid, Request from client in sTW moves to sRQ + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV + }, + [DCCP_PKT_ACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular Ack in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_DATA] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular Data packet in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_DATAACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular DataAck in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_CLOSEREQ] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) + * sOP -> sCR CloseReq in OPEN state + * sCR -> sCR Retransmit + * sCG -> sCR Simultaneous close, client sends another Close + * sTW -> sIV Already closed + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV + }, + [DCCP_PKT_CLOSE] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP -> sCG Move directly to CLOSING + * sOP -> sCG Move to CLOSING + * sCR -> sIV Close after CloseReq is invalid + * sCG -> sCG Retransmit + * sTW -> sIV Already closed + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV + }, + [DCCP_PKT_RESET] = { + /* + * sNO -> sIV No connection + * sRQ -> sTW Reset in response to Request + * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) + * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) + * sOP -> sTW + * sCR -> sTW + * sCG -> sTW + * sTW -> sIG Ignore (don't refresh timer) + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG + }, + [DCCP_PKT_SYNC] = { + /* + * We currently ignore Sync packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + [DCCP_PKT_SYNCACK] = { + /* + * We currently ignore SyncAck packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + }, +};
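
(Reading the table: a packet's next state is a three-way lookup keyed by the sender's tracked role, the DCCP packet type and the current conntrack state, which is exactly what dccp_packet() further down performs. In sketch form, with dir, dh and ct as used there:)

	/* role of sender x packet type x current state -> next state */
	u_int8_t new_state = dccp_state_table[ct->proto.dccp.role[dir]]
					     [dh->dccph_type]
					     [ct->proto.dccp.state];
	if (new_state == CT_DCCP_INVALID)
		return -NF_ACCEPT;	/* tell the core this packet is invalid */
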
+ +/* this module per-net specifics */ +static int dccp_net_id __read_mostly; +struct dccp_net { + struct nf_proto_net pn; + int dccp_loose; + unsigned int dccp_timeout[CT_DCCP_MAX + 1]; +}; + +static inline struct dccp_net *dccp_pernet(struct net *net) +{ + return net_generic(net, dccp_net_id); +}
+ +static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct dccp_hdr _hdr, *dh; + + dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (dh == NULL) + return false; + + tuple->src.u.dccp.port = dh->dccph_sport; + tuple->dst.u.dccp.port = dh->dccph_dport; + return true; +} + +static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, + const struct nf_conntrack_tuple *tuple) +{ + inv->src.u.dccp.port = tuple->dst.u.dccp.port; + inv->dst.u.dccp.port = tuple->src.u.dccp.port; + return true; +}
+ +static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + struct net *net = nf_ct_net(ct); + struct dccp_net *dn; + struct dccp_hdr _dh, *dh; + const char *msg; + u_int8_t state; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + BUG_ON(dh == NULL); + + state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; + switch (state) { + default: + dn = dccp_pernet(net); + if (dn->dccp_loose == 0) { + msg = "nf_ct_dccp: not picking up existing connection "; + goto out_invalid; + } + case CT_DCCP_REQUEST: + break; + case CT_DCCP_INVALID: + msg = "nf_ct_dccp: invalid state transition "; + goto out_invalid; + } + + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.state = CT_DCCP_NONE; + ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; + ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; + ct->proto.dccp.handshake_seq = 0; + return true; + +out_invalid: + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, + NULL, "%s", msg); + return false; +}
+ +static u64 dccp_ack_seq(const struct dccp_hdr *dh) +{ + const struct dccp_hdr_ack_bits *dhack; + + dhack = (void *)dh + __dccp_basic_hdr_len(dh); + return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + + ntohl(dhack->dccph_ack_nr_low); +} + +static unsigned int *dccp_get_timeouts(struct net *net) +{ + return dccp_pernet(net)->dccp_timeout; +}
+ +static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info ctinfo, + u_int8_t pf, unsigned int hooknum, + unsigned int *timeouts) +{ + struct net *net = nf_ct_net(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct dccp_hdr _dh, *dh; + u_int8_t type, old_state, new_state; + enum ct_dccp_roles role; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + BUG_ON(dh == NULL); + type = dh->dccph_type; + + if (type == DCCP_PKT_RESET && + !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + /* Tear down connection immediately if only reply is a
RESET */ + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_ACCEPT; + } + + spin_lock_bh(&ct->lock); + + role = ct->proto.dccp.role[dir]; + old_state = ct->proto.dccp.state; + new_state = dccp_state_table[role][type][old_state]; + + switch (new_state) { + case CT_DCCP_REQUEST: + if (old_state == CT_DCCP_TIMEWAIT && + role == CT_DCCP_ROLE_SERVER) { + /* Reincarnation in the reverse direction: reopen and + * reverse client/server roles. */ + ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; + } + break; + case CT_DCCP_RESPOND: + if (old_state == CT_DCCP_REQUEST) + ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); + break; + case CT_DCCP_PARTOPEN: + if (old_state == CT_DCCP_RESPOND && + type == DCCP_PKT_ACK && + dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) + set_bit(IPS_ASSURED_BIT, &ct->status); + break; + case CT_DCCP_IGNORE: + /* + * Connection tracking might be out of sync, so we ignore + * packets that might establish a new connection and resync + * if the server responds with a valid Response. + */ + if (ct->proto.dccp.last_dir == !dir && + ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && + type == DCCP_PKT_RESPONSE) { + ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); + new_state = CT_DCCP_RESPOND; + break; + } + ct->proto.dccp.last_dir = dir; + ct->proto.dccp.last_pkt = type; + + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_dccp: invalid packet ignored "); + return NF_ACCEPT; + case CT_DCCP_INVALID: + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_dccp: invalid state transition "); + return -NF_ACCEPT; + } + + ct->proto.dccp.last_dir = dir; + ct->proto.dccp.last_pkt = type; + ct->proto.dccp.state = new_state; + spin_unlock_bh(&ct->lock); + + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); + + return NF_ACCEPT; +} + +static int dccp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, unsigned int hooknum) +{ + struct dccp_hdr _dh, *dh; + unsigned int dccp_len = skb->len - dataoff; + unsigned int cscov; + const char *msg; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + if (dh == NULL) { + msg = "nf_ct_dccp: short packet "; + goto out_invalid; + } + + if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || + dh->dccph_doff * 4 > dccp_len) { + msg = "nf_ct_dccp: truncated/malformed packet "; + goto out_invalid; + } + + cscov = dccp_len; + if (dh->dccph_cscov) { + cscov = (dh->dccph_cscov - 1) * 4; + if (cscov > dccp_len) { + msg = "nf_ct_dccp: bad checksum coverage "; + goto out_invalid; + } + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, + pf)) { + msg = "nf_ct_dccp: bad checksum "; + goto out_invalid; + } + + if (dh->dccph_type >= DCCP_PKT_INVALID) { + msg = "nf_ct_dccp: reserved packet type "; + goto out_invalid; + } + + return NF_ACCEPT; + +out_invalid: + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); + return -NF_ACCEPT; +} + +static int dccp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu 
dport=%hu ", + ntohs(tuple->src.u.dccp.port), + ntohs(tuple->dst.u.dccp.port)); +} + +static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) +static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || + nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || + nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, + cpu_to_be64(ct->proto.dccp.handshake_seq))) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + spin_unlock_bh(&ct->lock); + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { + [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, +}; + +static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; + struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; + int err; + + if (!attr) + return 0; + + err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr, + dccp_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_PROTOINFO_DCCP_STATE] || + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { + return -EINVAL; + } + + spin_lock_bh(&ct->lock); + ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } + if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { + ct->proto.dccp.handshake_seq = + be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); + } + spin_unlock_bh(&ct->lock); + return 0; +} + +static int dccp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_DCCP */ + + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1); +} + +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + struct dccp_net *dn = dccp_pernet(net); + unsigned int *timeouts = data; + int i; + + /* set default DCCP timeouts. */ + for (i=0; i<CT_DCCP_MAX; i++) + timeouts[i] = dn->dccp_timeout[i]; + + /* there's a 1:1 mapping between attributes and protocol states. 
*/ + for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { + if (tb[i]) { + timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; + } + } + return 0; +} + +static int +dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + int i; + + for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { + if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) + goto nla_put_failure; + } + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { + [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +/* template, data assigned later */ +static struct ctl_table dccp_sysctl_table[] = { + { + .procname = "nf_conntrack_dccp_timeout_request", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_respond", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_partopen", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_open", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_closereq", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_closing", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_timewait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_loose", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, + struct dccp_net *dn) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(dccp_sysctl_table, + sizeof(dccp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; + pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; + pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; + pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; + pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; + pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; + pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; + pn->ctl_table[7].data = &dn->dccp_loose; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + pn->ctl_table[0].procname = NULL; +#endif + return 0; +} + +static int dccp_init_net(struct net *net, u_int16_t proto) +{ + struct dccp_net *dn = dccp_pernet(net); + struct nf_proto_net *pn = &dn->pn; + + if (!pn->users) { + /* default values */ + dn->dccp_loose = 1; + dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * 
DCCP_MSL; + dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + } + + return dccp_kmemdup_sysctl_table(net, pn, dn); +} + +static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { + .l3proto = AF_INET, + .l4proto = IPPROTO_DCCP, + .name = "dccp", + .pkt_to_tuple = dccp_pkt_to_tuple, + .invert_tuple = dccp_invert_tuple, + .new = dccp_new, + .packet = dccp_packet, + .get_timeouts = dccp_get_timeouts, + .error = dccp_error, + .print_tuple = dccp_print_tuple, + .print_conntrack = dccp_print_conntrack, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, + .from_nlattr = nlattr_to_dccp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = dccp_timeout_nlattr_to_obj, + .obj_to_nlattr = dccp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_DCCP_MAX, + .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, + .nla_policy = dccp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &dccp_net_id, + .init_net = dccp_init_net, +}; + +static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { + .l3proto = AF_INET6, + .l4proto = IPPROTO_DCCP, + .name = "dccp", + .pkt_to_tuple = dccp_pkt_to_tuple, + .invert_tuple = dccp_invert_tuple, + .new = dccp_new, + .packet = dccp_packet, + .get_timeouts = dccp_get_timeouts, + .error = dccp_error, + .print_tuple = dccp_print_tuple, + .print_conntrack = dccp_print_conntrack, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, + .from_nlattr = nlattr_to_dccp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = dccp_timeout_nlattr_to_obj, + .obj_to_nlattr = dccp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_DCCP_MAX, + .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, + .nla_policy = dccp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &dccp_net_id, + .init_net = dccp_init_net, +}; + +static __net_init int dccp_net_init(struct net *net) +{ + int ret = 0; + ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4); + if (ret < 0) { + pr_err("nf_conntrack_dccp4: pernet registration failed.\n"); + goto out; + } + ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6); + if (ret < 0) { + pr_err("nf_conntrack_dccp6: pernet registration failed.\n"); + goto cleanup_dccp4; + } + return 0; +cleanup_dccp4: + nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); +out: + return ret; +} + +static __net_exit void dccp_net_exit(struct net *net) +{ + nf_ct_l4proto_pernet_unregister(net, &dccp_proto6); + nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); +} + +static struct pernet_operations dccp_net_ops = { + .init = dccp_net_init, + .exit = dccp_net_exit, + .id = &dccp_net_id, + .size = sizeof(struct dccp_net), +}; + +static int __init nf_conntrack_proto_dccp_init(void) +{ + int ret; + + ret = register_pernet_subsys(&dccp_net_ops); + if (ret < 0) + goto out_pernet; + + 
ret = nf_ct_l4proto_register(&dccp_proto4); + if (ret < 0) + goto out_dccp4; + + ret = nf_ct_l4proto_register(&dccp_proto6); + if (ret < 0) + goto out_dccp6; + + return 0; +out_dccp6: + nf_ct_l4proto_unregister(&dccp_proto4); +out_dccp4: + unregister_pernet_subsys(&dccp_net_ops); +out_pernet: + return ret; +} + +static void __exit nf_conntrack_proto_dccp_fini(void) +{ + nf_ct_l4proto_unregister(&dccp_proto6); + nf_ct_l4proto_unregister(&dccp_proto4); + unregister_pernet_subsys(&dccp_net_ops); +} + +module_init(nf_conntrack_proto_dccp_init); +module_exit(nf_conntrack_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("DCCP connection tracking protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 6faf1bed722..d25f2937764 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -14,23 +14,28 @@ static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; -static int generic_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static inline struct nf_generic_net *generic_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.generic; +} + +static bool generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) { tuple->src.u.all = 0; tuple->dst.u.all = 0; - return 1; + return true; } -static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->src.u.all = 0; tuple->dst.u.all = 0; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -40,80 +45,172 @@ static int generic_print_tuple(struct seq_file *s, return 0; } -/* Print out the private part of the conntrack. */ -static int generic_print_conntrack(struct seq_file *s, - const struct nf_conn *state) +static unsigned int *generic_get_timeouts(struct net *net) { - return 0; + return &(generic_pernet(net)->timeout); } /* Returns verdict for packet, or -1 for invalid. */ -static int packet(struct nf_conn *conntrack, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - int pf, - unsigned int hooknum) +static int generic_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeout) { - nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ -static int new(struct nf_conn *conntrack, const struct sk_buff *skb, - unsigned int dataoff) +static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { - return 1; + return true; } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_generic_net *gn = generic_pernet(net); + + if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ; + else { + /* Set default generic timeout. 
*/ + *timeout = gn->timeout; + } + + return 0; +} + +static int +generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { + [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static struct ctl_table_header *generic_sysctl_header; static struct ctl_table generic_sysctl_table[] = { { - .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT, .procname = "nf_conntrack_generic_timeout", - .data = &nf_ct_generic_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table generic_compat_sysctl_table[] = { { - .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, .procname = "ip_conntrack_generic_timeout", - .data = &nf_ct_generic_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ -struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = +static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_generic_net *gn) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(generic_sysctl_table, + sizeof(generic_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &gn->timeout; +#endif + return 0; +} + +static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_generic_net *gn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table, + sizeof(generic_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &gn->timeout; +#endif +#endif + return 0; +} + +static int generic_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_generic_net *gn = generic_pernet(net); + struct nf_proto_net *pn = &gn->pn; + + gn->timeout = nf_ct_generic_timeout; + + ret = generic_kmemdup_compat_sysctl_table(pn, gn); + if (ret < 0) + return ret; + + ret = generic_kmemdup_sysctl_table(pn, gn); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + + return ret; +} + +static struct nf_proto_net *generic_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.generic.pn; +} + +struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .l4proto = 0, + .l4proto = 255, .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, - .print_conntrack = generic_print_conntrack, - .packet = packet, - .new = new, -#ifdef CONFIG_SYSCTL - .ctl_table_header = &generic_sysctl_header, - .ctl_table = generic_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = generic_compat_sysctl_table, -#endif -#endif + .packet = generic_packet, + .get_timeouts = generic_get_timeouts, + .new = generic_new, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = generic_timeout_nlattr_to_obj, + .obj_to_nlattr = generic_timeout_obj_to_nlattr, + 
.nlattr_max = CTA_TIMEOUT_GENERIC_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = generic_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = generic_init_net, + .get_net_proto = generic_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 5434472420f..d5665739e3b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -21,6 +21,7 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> @@ -29,38 +30,54 @@ #include <linux/list.h> #include <linux/seq_file.h> #include <linux/in.h> +#include <linux/netdevice.h> #include <linux/skbuff.h> - +#include <linux/slab.h> +#include <net/dst.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> -#define GRE_TIMEOUT (30 * HZ) -#define GRE_STREAM_TIMEOUT (180 * HZ) +enum grep_conntrack { + GRE_CT_UNREPLIED, + GRE_CT_REPLIED, + GRE_CT_MAX +}; -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#else -#define DEBUGP(x, args...) -#endif +static unsigned int gre_timeouts[GRE_CT_MAX] = { + [GRE_CT_UNREPLIED] = 30*HZ, + [GRE_CT_REPLIED] = 180*HZ, +}; + +static int proto_gre_net_id __read_mostly; +struct netns_proto_gre { + struct nf_proto_net nf; + rwlock_t keymap_lock; + struct list_head keymap_list; + unsigned int gre_timeouts[GRE_CT_MAX]; +}; -static DEFINE_RWLOCK(nf_ct_gre_lock); -static LIST_HEAD(gre_keymap_list); +static inline struct netns_proto_gre *gre_pernet(struct net *net) +{ + return net_generic(net, proto_gre_net_id); +} -void nf_ct_gre_keymap_flush(void) +static void nf_ct_gre_keymap_flush(struct net *net) { - struct list_head *pos, *n; + struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_ct_gre_keymap *km, *tmp; - write_lock_bh(&nf_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - list_del(pos); - kfree(pos); + write_lock_bh(&net_gre->keymap_lock); + list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { + list_del(&km->list); + kfree(km); } - write_unlock_bh(&nf_ct_gre_lock); + write_unlock_bh(&net_gre->keymap_lock); } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush); static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) @@ -73,22 +90,23 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, } /* look up the source key for a given tuple */ -static __be16 gre_keymap_lookup(struct nf_conntrack_tuple *t) +static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) { + struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km; __be16 key = 0; - read_lock_bh(&nf_ct_gre_lock); - list_for_each_entry(km, &gre_keymap_list, list) { + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t)) { key = km->tuple.src.u.gre.key; break; } } - read_unlock_bh(&nf_ct_gre_lock); + read_unlock_bh(&net_gre->keymap_lock); - DEBUGP("lookup src key 0x%x for ", key); - NF_CT_DUMP_TUPLE(t); + pr_debug("lookup src key 0x%x for ", key); + nf_ct_dump_tuple(t); return key; } @@ -97,19 +115,24 @@ static __be16 gre_keymap_lookup(struct 
nf_conntrack_tuple *t) int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t) { - struct nf_conn_help *help = nfct_help(ct); + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_ct_gre_keymap **kmp, *km; - BUG_ON(strcmp(help->helper->name, "pptp")); - kmp = &help->help.ct_pptp_info.keymap[dir]; + kmp = &ct_pptp_info->keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ - list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { + if (gre_key_cmpfn(km, t) && km == *kmp) { + read_unlock_bh(&net_gre->keymap_lock); return 0; + } } - DEBUGP("trying to override keymap_%s for ct %p\n", - dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); + read_unlock_bh(&net_gre->keymap_lock); + pr_debug("trying to override keymap_%s for ct %p\n", + dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; } @@ -119,12 +142,12 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, memcpy(&km->tuple, t, sizeof(*t)); *kmp = km; - DEBUGP("adding new entry %p: ", km); - NF_CT_DUMP_TUPLE(&km->tuple); + pr_debug("adding new entry %p: ", km); + nf_ct_dump_tuple(&km->tuple); - write_lock_bh(&nf_ct_gre_lock); - list_add_tail(&km->list, &gre_keymap_list); - write_unlock_bh(&nf_ct_gre_lock); + write_lock_bh(&net_gre->keymap_lock); + list_add_tail(&km->list, &net_gre->keymap_list); + write_unlock_bh(&net_gre->keymap_lock); return 0; } @@ -133,45 +156,48 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); /* destroy the keymap entries associated with specified master ct */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct) { - struct nf_conn_help *help = nfct_help(ct); + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); enum ip_conntrack_dir dir; - DEBUGP("entering for ct %p\n", ct); - BUG_ON(strcmp(help->helper->name, "pptp")); + pr_debug("entering for ct %p\n", ct); - write_lock_bh(&nf_ct_gre_lock); + write_lock_bh(&net_gre->keymap_lock); for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { - if (help->help.ct_pptp_info.keymap[dir]) { - DEBUGP("removing %p from list\n", - help->help.ct_pptp_info.keymap[dir]); - list_del(&help->help.ct_pptp_info.keymap[dir]->list); - kfree(help->help.ct_pptp_info.keymap[dir]); - help->help.ct_pptp_info.keymap[dir] = NULL; + if (ct_pptp_info->keymap[dir]) { + pr_debug("removing %p from list\n", + ct_pptp_info->keymap[dir]); + list_del(&ct_pptp_info->keymap[dir]->list); + kfree(ct_pptp_info->keymap[dir]); + ct_pptp_info->keymap[dir] = NULL; } } - write_unlock_bh(&nf_ct_gre_lock); + write_unlock_bh(&net_gre->keymap_lock); } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ /* invert gre part of tuple */ -static int gre_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->dst.u.gre.key = orig->src.u.gre.key; tuple->src.u.gre.key = orig->dst.u.gre.key; - return 1; + return true; } /* gre hdr info to tuple */ -static int gre_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int 
dataoff, + struct nf_conntrack_tuple *tuple) { - struct gre_hdr_pptp _pgrehdr, *pgrehdr; + struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev); + const struct gre_hdr_pptp *pgrehdr; + struct gre_hdr_pptp _pgrehdr; __be16 srckey; - struct gre_hdr _grehdr, *grehdr; + const struct gre_hdr *grehdr; + struct gre_hdr _grehdr; /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); @@ -179,24 +205,24 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; - return 1; + return true; } /* PPTP header is variable length, only need up to the call_id field */ pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); if (!pgrehdr) - return 1; + return true; if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { - DEBUGP("GRE_VERSION_PPTP but unknown proto\n"); - return 0; + pr_debug("GRE_VERSION_PPTP but unknown proto\n"); + return false; } tuple->dst.u.gre.key = pgrehdr->call_id; - srckey = gre_keymap_lookup(tuple); + srckey = gre_keymap_lookup(net, tuple); tuple->src.u.gre.key = srckey; - return 1; + return true; } /* print gre part of tuple */ @@ -209,21 +235,26 @@ static int gre_print_tuple(struct seq_file *s, } /* print private data for conntrack */ -static int gre_print_conntrack(struct seq_file *s, - const struct nf_conn *ct) +static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) { return seq_printf(s, "timeout=%u, stream_timeout=%u ", (ct->proto.gre.timeout / HZ), (ct->proto.gre.stream_timeout / HZ)); } +static unsigned int *gre_get_timeouts(struct net *net) +{ + return gre_pernet(net)->gre_timeouts; +} + /* Returns verdict for packet, and may modify conntrack */ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, - unsigned int hooknum) + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ @@ -231,8 +262,8 @@ static int gre_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout); /* Also, more likely to be important, and not a probe. */ - set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout); @@ -241,18 +272,18 @@ static int gre_packet(struct nf_conn *ct, } /* Called when a new connection for this protocol found. */ -static int gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) +static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { - DEBUGP(": "); - NF_CT_DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + pr_debug(": "); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); /* initialize to sane value. Ideally a conntrack helper * (e.g. 
in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; - ct->proto.gre.timeout = GRE_TIMEOUT; + ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; + ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; - return 1; + return true; } /* Called when a conntrack entry has already been removed from the hashes @@ -260,16 +291,78 @@ static int gre_new(struct nf_conn *ct, const struct sk_buff *skb, static void gre_destroy(struct nf_conn *ct) { struct nf_conn *master = ct->master; - DEBUGP(" entering\n"); + pr_debug(" entering\n"); if (!master) - DEBUGP("no master !?!\n"); + pr_debug("no master !?!\n"); else nf_ct_gre_keymap_destroy(master); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct netns_proto_gre *net_gre = gre_pernet(net); + + /* set default timeouts for GRE. */ + timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; + timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { + timeouts[GRE_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_GRE_REPLIED]) { + timeouts[GRE_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ; + } + return 0; +} + +static int +gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED, + htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED, + htonl(timeouts[GRE_CT_REPLIED] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { + [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +static int gre_init_net(struct net *net, u_int16_t proto) +{ + struct netns_proto_gre *net_gre = gre_pernet(net); + int i; + + rwlock_init(&net_gre->keymap_lock); + INIT_LIST_HEAD(&net_gre->keymap_list); + for (i = 0; i < GRE_CT_MAX; i++) + net_gre->gre_timeouts[i] = gre_timeouts[i]; + + return 0; +} + /* protocol helper struct */ -static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { +static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .name = "gre", @@ -277,25 +370,75 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .invert_tuple = gre_invert_tuple, .print_tuple = gre_print_tuple, .print_conntrack = gre_print_conntrack, + .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, .destroy = gre_destroy, .me = THIS_MODULE, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = gre_timeout_nlattr_to_obj, + .obj_to_nlattr = gre_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_GRE_MAX, + .obj_size = 
sizeof(unsigned int) * GRE_CT_MAX, + .nla_policy = gre_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &proto_gre_net_id, + .init_net = gre_init_net, +}; + +static int proto_gre_net_init(struct net *net) +{ + int ret = 0; + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4); + if (ret < 0) + pr_err("nf_conntrack_gre4: pernet registration failed.\n"); + return ret; +} + +static void proto_gre_net_exit(struct net *net) +{ + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4); + nf_ct_gre_keymap_flush(net); +} + +static struct pernet_operations proto_gre_net_ops = { + .init = proto_gre_net_init, + .exit = proto_gre_net_exit, + .id = &proto_gre_net_id, + .size = sizeof(struct netns_proto_gre), }; static int __init nf_ct_proto_gre_init(void) { - return nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); + int ret; + + ret = register_pernet_subsys(&proto_gre_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); + if (ret < 0) + goto out_gre4; + + return 0; +out_gre4: + unregister_pernet_subsys(&proto_gre_net_ops); +out_pernet: + return ret; } -static void nf_ct_proto_gre_fini(void) +static void __exit nf_ct_proto_gre_fini(void) { - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); - nf_ct_gre_keymap_flush(); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4); + unregister_pernet_subsys(&proto_gre_net_ops); } module_init(nf_ct_proto_gre_init); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 0d3254b974c..1314d33f6bc 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -1,6 +1,9 @@ /* * Connection tracking protocol helper module for SCTP. * + * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> + * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> + * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. * @@ -25,21 +28,12 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> -#if 0 -#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) -#else -#define DEBUGP(format, args...) -#endif - -/* Protects conntrack->proto.sctp */ -static DEFINE_RWLOCK(sctp_lock); - /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. 
--RR And so for me for SCTP :D -Kiran */ -static const char *sctp_conntrack_names[] = { +static const char *const sctp_conntrack_names[] = { "NONE", "CLOSED", "COOKIE_WAIT", @@ -55,24 +49,15 @@ static const char *sctp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int nf_ct_sctp_timeout_closed __read_mostly = 10 SECS; -static unsigned int nf_ct_sctp_timeout_cookie_wait __read_mostly = 3 SECS; -static unsigned int nf_ct_sctp_timeout_cookie_echoed __read_mostly = 3 SECS; -static unsigned int nf_ct_sctp_timeout_established __read_mostly = 5 DAYS; -static unsigned int nf_ct_sctp_timeout_shutdown_sent __read_mostly = 300 SECS / 1000; -static unsigned int nf_ct_sctp_timeout_shutdown_recd __read_mostly = 300 SECS / 1000; -static unsigned int nf_ct_sctp_timeout_shutdown_ack_sent __read_mostly = 3 SECS; - -static unsigned int * sctp_timeouts[] -= { NULL, /* SCTP_CONNTRACK_NONE */ - &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ - &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ - &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ - &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ - &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ - &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ - &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ - }; +static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { + [SCTP_CONNTRACK_CLOSED] = 10 SECS, + [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, + [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, + [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, +}; #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED @@ -116,7 +101,7 @@ cookie echoed to closed. 
*/ /* SCTP conntrack state transitions */ -static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ @@ -125,9 +110,9 @@ static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} }, { @@ -139,330 +124,327 @@ static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} } }; -static int sctp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static int sctp_net_id __read_mostly; +struct sctp_net { + struct nf_proto_net pn; + unsigned int timeouts[SCTP_CONNTRACK_MAX]; +}; + +static inline struct sctp_net *sctp_pernet(struct net *net) { - sctp_sctphdr_t _hdr, *hp; + return net_generic(net, sctp_net_id); +} - DEBUGP(__FUNCTION__); - DEBUGP("\n"); +static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct sctphdr *hp; + struct sctphdr _hdr; /* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) - return 0; + return false; tuple->src.u.sctp.port = hp->source; tuple->dst.u.sctp.port = hp->dest; - return 1; + return true; } -static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - tuple->src.u.sctp.port = orig->dst.u.sctp.port; tuple->dst.u.sctp.port = orig->src.u.sctp.port; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ static int sctp_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - return seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), ntohs(tuple->dst.u.sctp.port)); } /* Print out the private part of the conntrack. 
*/ -static int sctp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) +static int sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { enum sctp_conntrack state; - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - read_lock_bh(&sctp_lock); - state = conntrack->proto.sctp.state; - read_unlock_bh(&sctp_lock); + spin_lock_bh(&ct->lock); + state = ct->proto.sctp.state; + spin_unlock_bh(&ct->lock); return seq_printf(s, "%s ", sctp_conntrack_names[state]); } #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ -for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \ - offset < skb->len && \ - (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ - offset += (ntohs(sch->length) + 3) & ~3, count++) +for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ + (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ -static int do_basic_checks(struct nf_conn *conntrack, +static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - char *map) + unsigned long *map) { u_int32_t offset, count; sctp_chunkhdr_t _sch, *sch; int flag; - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK - || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK || + sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; - } /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. 
*/ - if (((sch->type == SCTP_CID_COOKIE_ACK - || sch->type == SCTP_CID_COOKIE_ECHO - || flag) - && count !=0) || !sch->length) { - DEBUGP("Basic checks failed\n"); + if (((sch->type == SCTP_CID_COOKIE_ACK || + sch->type == SCTP_CID_COOKIE_ECHO || + flag) && + count != 0) || !sch->length) { + pr_debug("Basic checks failed\n"); return 1; } - if (map) { - set_bit(sch->type, (void *)map); - } + if (map) + set_bit(sch->type, map); } - DEBUGP("Basic checks passed\n"); + pr_debug("Basic checks passed\n"); return count == 0; } -static int new_state(enum ip_conntrack_dir dir, - enum sctp_conntrack cur_state, - int chunk_type) +static int sctp_new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) { int i; - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - DEBUGP("Chunk type: %d\n", chunk_type); + pr_debug("Chunk type: %d\n", chunk_type); switch (chunk_type) { - case SCTP_CID_INIT: - DEBUGP("SCTP_CID_INIT\n"); - i = 0; break; - case SCTP_CID_INIT_ACK: - DEBUGP("SCTP_CID_INIT_ACK\n"); - i = 1; break; - case SCTP_CID_ABORT: - DEBUGP("SCTP_CID_ABORT\n"); - i = 2; break; - case SCTP_CID_SHUTDOWN: - DEBUGP("SCTP_CID_SHUTDOWN\n"); - i = 3; break; - case SCTP_CID_SHUTDOWN_ACK: - DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); - i = 4; break; - case SCTP_CID_ERROR: - DEBUGP("SCTP_CID_ERROR\n"); - i = 5; break; - case SCTP_CID_COOKIE_ECHO: - DEBUGP("SCTP_CID_COOKIE_ECHO\n"); - i = 6; break; - case SCTP_CID_COOKIE_ACK: - DEBUGP("SCTP_CID_COOKIE_ACK\n"); - i = 7; break; - case SCTP_CID_SHUTDOWN_COMPLETE: - DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); - i = 8; break; - default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ - DEBUGP("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); - return cur_state; + case SCTP_CID_INIT: + pr_debug("SCTP_CID_INIT\n"); + i = 0; + break; + case SCTP_CID_INIT_ACK: + pr_debug("SCTP_CID_INIT_ACK\n"); + i = 1; + break; + case SCTP_CID_ABORT: + pr_debug("SCTP_CID_ABORT\n"); + i = 2; + break; + case SCTP_CID_SHUTDOWN: + pr_debug("SCTP_CID_SHUTDOWN\n"); + i = 3; + break; + case SCTP_CID_SHUTDOWN_ACK: + pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; + break; + case SCTP_CID_ERROR: + pr_debug("SCTP_CID_ERROR\n"); + i = 5; + break; + case SCTP_CID_COOKIE_ECHO: + pr_debug("SCTP_CID_COOKIE_ECHO\n"); + i = 6; + break; + case SCTP_CID_COOKIE_ACK: + pr_debug("SCTP_CID_COOKIE_ACK\n"); + i = 7; + break; + case SCTP_CID_SHUTDOWN_COMPLETE: + pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; + break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + pr_debug("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; } - DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); return sctp_conntracks[dir][i][cur_state]; } -/* Returns verdict for packet, or -1 for invalid. */ -static int sctp_packet(struct nf_conn *conntrack, +static unsigned int *sctp_get_timeouts(struct net *net) +{ + return sctp_pernet(net)->timeouts; +} + +/* Returns verdict for packet, or -NF_ACCEPT for invalid. 
*/ +static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, - unsigned int hooknum) + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) { - enum sctp_conntrack newconntrack, oldsctpstate; - sctp_sctphdr_t _sctph, *sh; - sctp_chunkhdr_t _sch, *sch; + enum sctp_conntrack new_state, old_state; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + const struct sctphdr *sh; + struct sctphdr _sctph; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) - return -1; + goto out; - if (do_basic_checks(conntrack, skb, dataoff, map) != 0) - return -1; + if (do_basic_checks(ct, skb, dataoff, map) != 0) + goto out; /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) - && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) - && !test_bit(SCTP_CID_ABORT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) - && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - DEBUGP("Verification tag check failed\n"); - return -1; + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; } - oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + old_state = new_state = SCTP_CONNTRACK_NONE; + spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - write_lock_bh(&sctp_lock); - /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* Sec 8.5.1 (A) */ - if (sh->vtag != 0) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != 0) + goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* Sec 8.5.1 (B) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir]) + goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* Sec 8.5.1 (C) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)] - && (sch->flags & 1))) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir] && + sch->flags & SCTP_CHUNK_FLAG_T) + goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* Sec 8.5.1 (D) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != ct->proto.sctp.vtag[dir]) + goto out_unlock; } - oldsctpstate = conntrack->proto.sctp.state; - newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + old_state = ct->proto.sctp.state; + new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ - if (newconntrack == SCTP_CONNTRACK_MAX) { - DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", - CTINFO2DIR(ctinfo), sch->type, oldsctpstate); - 
write_unlock_bh(&sctp_lock); - return -1; + if (new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " + "conntrack=%u\n", + dir, sch->type, old_state); + goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK) { + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK) { sctp_inithdr_t _inithdr, *ih; ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), sizeof(_inithdr), &_inithdr); - if (ih == NULL) { - write_unlock_bh(&sctp_lock); - return -1; - } - DEBUGP("Setting vtag %x for dir %d\n", - ih->init_tag, !CTINFO2DIR(ctinfo)); - conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + if (ih == NULL) + goto out_unlock; + pr_debug("Setting vtag %x for dir %d\n", + ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } - conntrack->proto.sctp.state = newconntrack; - if (oldsctpstate != newconntrack) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); - write_unlock_bh(&sctp_lock); + ct->proto.sctp.state = new_state; + if (old_state != new_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); } + spin_unlock_bh(&ct->lock); - nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED - && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY - && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { - DEBUGP("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &conntrack->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && + dir == IP_CT_DIR_REPLY && + new_state == SCTP_CONNTRACK_ESTABLISHED) { + pr_debug("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); } return NF_ACCEPT; + +out_unlock: + spin_unlock_bh(&ct->lock); +out: + return -NF_ACCEPT; } /* Called when a new connection for this protocol found. 
*/ -static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, - unsigned int dataoff) +static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { - enum sctp_conntrack newconntrack; - sctp_sctphdr_t _sctph, *sh; - sctp_chunkhdr_t _sch, *sch; + enum sctp_conntrack new_state; + const struct sctphdr *sh; + struct sctphdr _sctph; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) - return 0; + return false; - if (do_basic_checks(conntrack, skb, dataoff, map) != 0) - return 0; + if (do_basic_checks(ct, skb, dataoff, map) != 0) + return false; /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ - if ((test_bit (SCTP_CID_ABORT, (void *)map)) - || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) - || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { - return 0; - } + if (test_bit(SCTP_CID_ABORT, map) || + test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || + test_bit(SCTP_CID_COOKIE_ACK, map)) + return false; - newconntrack = SCTP_CONNTRACK_MAX; + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); + new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Don't need lock here: this conntrack not in circulation yet */ - newconntrack = new_state(IP_CT_DIR_ORIGINAL, - SCTP_CONNTRACK_NONE, sch->type); + new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ - if (newconntrack == SCTP_CONNTRACK_MAX) { - DEBUGP("nf_conntrack_sctp: invalid new deleting.\n"); - return 0; + if (new_state == SCTP_CONNTRACK_NONE || + new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); + return false; } /* Copy the vtag into the state info */ @@ -473,163 +455,336 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), sizeof(_inithdr), &_inithdr); if (ih == NULL) - return 0; + return false; - DEBUGP("Setting vtag %x for new conn\n", - ih->init_tag); + pr_debug("Setting vtag %x for new conn\n", + ih->init_tag); - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else { /* Sec 8.5.1 (A) */ - return 0; + return false; } } /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ else { - DEBUGP("Setting vtag %x for new conn OOTB\n", - sh->vtag); - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + pr_debug("Setting vtag %x for new conn OOTB\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - conntrack->proto.sctp.state = newconntrack; + ct->proto.sctp.state = new_state; } - return 1; + return true; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) || + nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + 
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || + nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, + ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) + goto nla_put_failure; + + spin_unlock_bh(&ct->lock); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; } +static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { + [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, + [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, +}; + +static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; + struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; + int err; + + /* updates may not contain the internal protocol info, skip parsing */ + if (!attr) + return 0; + + err = nla_parse_nested(tb, + CTA_PROTOINFO_SCTP_MAX, + attr, + sctp_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_PROTOINFO_SCTP_STATE] || + !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || + !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) + return -EINVAL; + + spin_lock_bh(&ct->lock); + ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); + spin_unlock_bh(&ct->lock); + + return 0; +} + +static int sctp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_SCTP */ + + nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1); +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct sctp_net *sn = sctp_pernet(net); + int i; + + /* set default SCTP timeouts. */ + for (i=0; i<SCTP_CONNTRACK_MAX; i++) + timeouts[i] = sn->timeouts[i]; + + /* there's a 1:1 mapping between attributes and protocol states. 
*/ + for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { + if (tb[i]) { + timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; + } + } + return 0; +} + +static int +sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + int i; + + for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { + if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) + goto nla_put_failure; + } + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { + [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + + #ifdef CONFIG_SYSCTL -static unsigned int sctp_sysctl_table_users; -static struct ctl_table_header *sctp_sysctl_header; static struct ctl_table sctp_sysctl_table[] = { { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, .procname = "nf_conntrack_sctp_timeout_closed", - .data = &nf_ct_sctp_timeout_closed, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, .procname = "nf_conntrack_sctp_timeout_cookie_wait", - .data = &nf_ct_sctp_timeout_cookie_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, .procname = "nf_conntrack_sctp_timeout_cookie_echoed", - .data = &nf_ct_sctp_timeout_cookie_echoed, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, .procname = "nf_conntrack_sctp_timeout_established", - .data = &nf_ct_sctp_timeout_established, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, .procname = "nf_conntrack_sctp_timeout_shutdown_sent", - .data = &nf_ct_sctp_timeout_shutdown_sent, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, .procname = "nf_conntrack_sctp_timeout_shutdown_recd", - .data = &nf_ct_sctp_timeout_shutdown_recd, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &nf_ct_sctp_timeout_shutdown_ack_sent, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table sctp_compat_sysctl_table[] = { { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, .procname = "ip_conntrack_sctp_timeout_closed", - .data = &nf_ct_sctp_timeout_closed, .maxlen = sizeof(unsigned int), .mode = 0644, - 
.proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .data = &nf_ct_sctp_timeout_cookie_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .data = &nf_ct_sctp_timeout_cookie_echoed, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, .procname = "ip_conntrack_sctp_timeout_established", - .data = &nf_ct_sctp_timeout_established, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .data = &nf_ct_sctp_timeout_shutdown_sent, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .data = &nf_ct_sctp_timeout_shutdown_recd, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &nf_ct_sctp_timeout_shutdown_ack_sent, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif -struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { +static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct sctp_net *sn) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(sctp_sysctl_table, + sizeof(sctp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; + pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; + pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; + pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; + pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; + pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; + pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; +#endif + return 0; +} + +static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct sctp_net *sn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table, + sizeof(sctp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; + pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; + pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; + pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; + pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; + pn->ctl_compat_table[5].data = 
&sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; + pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; +#endif +#endif + return 0; +} + +static int sctp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct sctp_net *sn = sctp_pernet(net); + struct nf_proto_net *pn = &sn->pn; + + if (!pn->users) { + int i; + + for (i = 0; i < SCTP_CONNTRACK_MAX; i++) + sn->timeouts[i] = sctp_timeouts[i]; + } + + if (proto == AF_INET) { + ret = sctp_kmemdup_compat_sysctl_table(pn, sn); + if (ret < 0) + return ret; + + ret = sctp_kmemdup_sysctl_table(pn, sn); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + } else + ret = sctp_kmemdup_sysctl_table(pn, sn); + + return ret; +} + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, .name = "sctp", @@ -638,19 +793,32 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .print_tuple = sctp_print_tuple, .print_conntrack = sctp_print_conntrack, .packet = sctp_packet, + .get_timeouts = sctp_get_timeouts, .new = sctp_new, .me = THIS_MODULE, -#ifdef CONFIG_SYSCTL - .ctl_table_users = &sctp_sysctl_table_users, - .ctl_table_header = &sctp_sysctl_header, - .ctl_table = sctp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = sctp_compat_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = sctp_to_nlattr, + .nlattr_size = sctp_nlattr_size, + .from_nlattr = nlattr_to_sctp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = sctp_timeout_nlattr_to_obj, + .obj_to_nlattr = sctp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_SCTP_MAX, + .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, + .nla_policy = sctp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &sctp_net_id, + .init_net = sctp_init_net, }; -struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { +static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, .name = "sctp", @@ -659,45 +827,96 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .print_tuple = sctp_print_tuple, .print_conntrack = sctp_print_conntrack, .packet = sctp_packet, + .get_timeouts = sctp_get_timeouts, .new = sctp_new, .me = THIS_MODULE, -#ifdef CONFIG_SYSCTL - .ctl_table_users = &sctp_sysctl_table_users, - .ctl_table_header = &sctp_sysctl_header, - .ctl_table = sctp_sysctl_table, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = sctp_to_nlattr, + .nlattr_size = sctp_nlattr_size, + .from_nlattr = nlattr_to_sctp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = sctp_timeout_nlattr_to_obj, + .obj_to_nlattr = sctp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_SCTP_MAX, + .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, + .nla_policy = sctp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ #endif + .net_id = &sctp_net_id, + .init_net = sctp_init_net, }; -int __init nf_conntrack_proto_sctp_init(void) +static int sctp_net_init(struct net *net) { - int ret; + int ret = 0; - ret = 
nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4); - if (ret) { - printk("nf_conntrack_l4proto_sctp4: protocol register failed\n"); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4); + if (ret < 0) { + pr_err("nf_conntrack_sctp4: pernet registration failed.\n"); goto out; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6); - if (ret) { - printk("nf_conntrack_l4proto_sctp6: protocol register failed\n"); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6); + if (ret < 0) { + pr_err("nf_conntrack_sctp6: pernet registration failed.\n"); goto cleanup_sctp4; } + return 0; +cleanup_sctp4: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); +out: return ret; +} + +static void sctp_net_exit(struct net *net) +{ + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); +} + +static struct pernet_operations sctp_net_ops = { + .init = sctp_net_init, + .exit = sctp_net_exit, + .id = &sctp_net_id, + .size = sizeof(struct sctp_net), +}; + +static int __init nf_conntrack_proto_sctp_init(void) +{ + int ret; + + ret = register_pernet_subsys(&sctp_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4); + if (ret < 0) + goto out_sctp4; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6); + if (ret < 0) + goto out_sctp6; - cleanup_sctp4: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); - out: - DEBUGP("SCTP conntrack module loading %s\n", - ret ? "failed": "succeeded"); + return 0; +out_sctp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +out_sctp4: + unregister_pernet_subsys(&sctp_net_ops); +out_pernet: return ret; } -void __exit nf_conntrack_proto_sctp_fini(void) +static void __exit nf_conntrack_proto_sctp_fini(void) { - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); - DEBUGP("SCTP conntrack module unloaded\n"); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); + unregister_pernet_subsys(&sctp_net_ops); } module_init(nf_conntrack_proto_sctp_init); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index ccdd5d231e0..44d1ea32570 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,5 +1,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -8,7 +10,6 @@ #include <linux/types.h> #include <linux/timer.h> -#include <linux/netfilter.h> #include <linux/module.h> #include <linux/in.h> #include <linux/tcp.h> @@ -16,6 +17,7 @@ #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> +#include <asm/unaligned.h> #include <net/tcp.h> @@ -25,16 +27,11 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> - -#if 0 -#define DEBUGP printk -#define DEBUGP_VARS -#else -#define DEBUGP(format, args...) 
-#endif - -/* Protects conntrack->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* "Be conservative in what you do, be liberal in what you accept from others." @@ -53,7 +50,7 @@ static int nf_ct_tcp_max_retrans __read_mostly = 3; /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ -static const char *tcp_conntrack_names[] = { +static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", @@ -63,7 +60,7 @@ static const char *tcp_conntrack_names[] = { "LAST_ACK", "TIME_WAIT", "CLOSE", - "LISTEN" + "SYN_SENT2", }; #define SECS * HZ @@ -71,32 +68,22 @@ static const char *tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int nf_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_established __read_mostly = 5 DAYS; -static unsigned int nf_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; -static unsigned int nf_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close __read_mostly = 10 SECS; - +static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, + [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, + [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE] = 10 SECS, + [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. 
*/ -static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; - -static unsigned int * tcp_timeouts[] = { - NULL, /* TCP_CONNTRACK_NONE */ - &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ - &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ - &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ - &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ - &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ - &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ - &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ - &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ - NULL, /* TCP_CONNTRACK_LISTEN */ - }; + [TCP_CONNTRACK_RETRANS] = 5 MINS, + [TCP_CONNTRACK_UNACK] = 5 MINS, +}; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT @@ -107,7 +94,7 @@ static unsigned int * tcp_timeouts[] = { #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE -#define sLI TCP_CONNTRACK_LISTEN +#define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE @@ -137,15 +124,14 @@ enum tcp_bit_set { * * NONE: initial state * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen - * CLOSE: closed connection - * - * LISTEN state is not used. + * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid @@ -153,18 +139,18 @@ enum tcp_bit_set { * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): - * if they are invalid - * or we do not support the request (simultaneous open) + * if we regard them as truly invalid packets */ -static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { +static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN - * sSR -> sIG Late retransmitted SYN? + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. @@ -175,22 +161,27 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* - * A SYN/ACK from the client is always invalid: - * - either it tries to set up a simultaneous open, which is - * not supported; - * - or the firewall has just been inserted between the two hosts - * during the session set-up. The SYN will be retransmitted - * by the true client (or it'll time out). 
+ * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open + * sES -> sIV Invalid SYN/ACK packets sent by the client + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV + * sCL -> sIV */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client might not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for @@ -201,11 +192,12 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -214,30 +206,32 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 }, /* * sNO -> sIV Never reached. - * sSS -> sIV Simultaneous open, not supported - * sSR -> sIV Simultaneous open, not supported. - * sES -> sIV Server may not initiate a connection. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV Reopened connection, but server may not do it. * sCL -> sIV */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. - * sSR -> sSR Retransmitted SYN/ACK. + * sS2 -> sSR Simultaneous open + * sSR -> sIG Retransmitted SYN/ACK, ignore it. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG @@ -245,10 +239,11 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sIG * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. 
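[Annotation: the rewritten tcp_conntracks table in the hunks above is the heart of this change; simultaneous open is now tracked through the new sS2 state instead of being dropped as invalid. The verdict is a plain array lookup, tcp_conntracks[dir][get_conntrack_index(tcph)][old_state], where the middle index is derived from the TCP flags (see the get_conntrack_index context further down). A minimal user-space model of that lookup, trimmed to the two segment types and five states this hunk touches; the toy_conntracks name and the reduced dimensions are illustrative, not the kernel's.]

#include <stdio.h>

enum { sNO, sSS, sSR, sES, sS2, TCP_CONNTRACK_MAX, sIV, sIG };
enum { TCP_SYN_SET, TCP_SYNACK_SET, TCP_INDEX_MAX };
enum { IP_CT_DIR_ORIGINAL, IP_CT_DIR_REPLY };

/* abbreviated transition table, modeled on the rows added by this patch */
static const unsigned char toy_conntracks[2][TCP_INDEX_MAX][TCP_CONNTRACK_MAX] = {
	{ /* ORIGINAL */
	/*          sNO  sSS  sSR  sES  sS2 */
	/*syn*/   { sSS, sSS, sIG, sIG, sS2 },
	/*synack*/{ sIV, sIV, sSR, sIV, sSR }, /* sS2 -> sSR: answer to the 2nd SYN */
	},
	{ /* REPLY */
	/*syn*/   { sIV, sS2, sIV, sIV, sS2 }, /* sSS -> sS2: simultaneous open */
	/*synack*/{ sIV, sSR, sIG, sIG, sSR },
	},
};

int main(void)
{
	/* we sent a SYN (sSS) and now see a SYN from the peer: simultaneous open */
	unsigned char next = toy_conntracks[IP_CT_DIR_REPLY][TCP_SYN_SET][sSS];
	printf("sSS + reply SYN    -> %u (sS2=%u)\n", next, sS2);

	/* the SYN/ACK answering that second SYN moves the state on to SYN_RECV */
	next = toy_conntracks[IP_CT_DIR_ORIGINAL][TCP_SYNACK_SET][sS2];
	printf("sS2 + orig SYN/ACK -> %u (sSR=%u)\n", next, sSR);
	return 0;
}

[Both lookups follow the new rows: a SYN in the reply direction while in sSS yields sS2, and the SYN/ACK answering it in the original direction yields sSR, which is exactly the simultaneous-open path the old table rejected as sIV.]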
@@ -257,10 +252,11 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -269,35 +265,40 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; -static int tcp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static inline struct nf_tcp_net *tcp_pernet(struct net *net) { - struct tcphdr _hdr, *hp; + return &net->ct.nf_ct_proto.tcp; +} + +static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct tcphdr *hp; + struct tcphdr _hdr; /* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) - return 0; + return false; tuple->src.u.tcp.port = hp->source; tuple->dst.u.tcp.port = hp->dest; - return 1; + return true; } -static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->src.u.tcp.port = orig->dst.u.tcp.port; tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -310,14 +311,13 @@ static int tcp_print_tuple(struct seq_file *s, } /* Print out the private part of the conntrack. */ -static int tcp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) +static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); - state = conntrack->proto.tcp.state; - read_unlock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); + state = ct->proto.tcp.state; + spin_unlock_bh(&ct->lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -334,8 +334,8 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph) /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. - http://www.nluug.nl/events/sane2000/papers.html - http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + http://www.sane.nl/events/sane2000/papers.html + http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ The boundaries and the conditions are changed according to RFC793: the packet must intersect the window (i.e. segments may be @@ -350,19 +350,20 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph) I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin - III. Upper bound for valid ack: sack <= receiver.td_end - IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + III. Upper bound for valid (s)ack: sack <= receiver.td_end + IV. 
Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW - where sack is the highest right edge of sack block found in the packet. + where sack is the highest right edge of sack block found in the packet + or ack in the case of packet without SACK option. - The upper bound limit for a valid ack is not ignored - + The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, - struct tcphdr *tcph) + const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ @@ -381,11 +382,11 @@ static inline __u32 segment_seq_plus_len(__u32 seq, */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, - struct tcphdr *tcph, + const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; + const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) @@ -413,7 +414,7 @@ static void tcp_options(const struct sk_buff *skb, if (opsize < 2) /* "silly options" */ return; if (opsize > length) - break; /* don't parse partial options */ + return; /* don't parse partial options */ if (opcode == TCPOPT_SACK_PERM && opsize == TCPOLEN_SACK_PERM) @@ -436,10 +437,10 @@ static void tcp_options(const struct sk_buff *skb, } static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, - struct tcphdr *tcph, __u32 *sack) + const struct tcphdr *tcph, __u32 *sack) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; + const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); __u32 tmp; @@ -451,7 +452,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, BUG_ON(ptr == NULL); /* Fast path for timestamp-only option */ - if (length == TCPOLEN_TSTAMP_ALIGNED*4 + if (length == TCPOLEN_TSTAMP_ALIGNED && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) @@ -473,7 +474,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, if (opsize < 2) /* "silly options" */ return; if (opsize > length) - break; /* don't parse partial options */ + return; /* don't parse partial options */ if (opcode == TCPOPT_SACK && opsize >= (TCPOLEN_SACK_BASE @@ -483,7 +484,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { - tmp = ntohl(*((__be32 *)(ptr+i)+1)); + tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; @@ -496,18 +497,23 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static int tcp_in_window(struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - unsigned int dataoff, - struct tcphdr *tcph, - int pf) +static bool tcp_in_window(const struct nf_conn *ct, + struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u_int8_t pf) { + struct net *net = nf_ct_net(ct); + struct nf_tcp_net *tn = tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - int res; + s32 receiver_offset; + bool res, in_recv_win; /* * Get the required data from the packet. 
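[Editor's note -- illustration, not part of the patch. Rules I-IV above only hold if every comparison is done in modular 32-bit sequence space. A self-contained sketch of the combined check (demo_in_window() is hypothetical; MAXACKWINDOW(sender) is reduced to a plain parameter, and before()/after() follow the usual serial-number idiom):

#include <stdio.h>
#include <stdint.h>

/* sequence-number comparisons, safe across 2^32 wraparound */
static int before(uint32_t s1, uint32_t s2) { return (int32_t)(s1 - s2) < 0; }
static int after(uint32_t s1, uint32_t s2)  { return before(s2, s1); }

struct demo_side { uint32_t td_end, td_maxend, td_maxwin; };

/* rules I-IV for one segment [seq, end] carrying (s)ack */
static int demo_in_window(const struct demo_side *snd,
			  const struct demo_side *rcv,
			  uint32_t seq, uint32_t end, uint32_t sack,
			  uint32_t maxackwin)
{
	return before(seq, snd->td_maxend + 1) &&               /* I   */
	       after(end, snd->td_end - rcv->td_maxwin - 1) &&  /* II  */
	       before(sack, rcv->td_end + 1) &&                 /* III */
	       after(sack, rcv->td_end - maxackwin - 1);        /* IV  */
}

int main(void)
{
	struct demo_side snd = { 1000, 2000, 500 }, rcv = { 5000, 6000, 500 };

	/* in-window segment acking everything the receiver has sent: 1 */
	printf("%d\n", demo_in_window(&snd, &rcv, 1000, 1100, 5000, 66000));
	/* seq past the advertised window, rule I fails: 0 */
	printf("%d\n", demo_in_window(&snd, &rcv, 3000, 3100, 5000, 66000));
	return 0;
}

In the patched kernel, rule II is additionally waived while receiver->td_maxwin is still zero, i.e. before any reply traffic has been seen -- the in_recv_win test introduced further down.]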
@@ -520,26 +526,31 @@ static int tcp_in_window(struct ip_ct_tcp *state, if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); - DEBUGP("tcp_in_window: START\n"); - DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "seq=%u ack=%u sack=%u win=%u end=%u\n", - NIPQUAD(iph->saddr), ntohs(tcph->source), - NIPQUAD(iph->daddr), ntohs(tcph->dest), - seq, ack, sack, win, end); - DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - - if (sender->td_end == 0) { + /* Take into account NAT sequence number mangling */ + receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); + ack -= receiver_offset; + sack -= receiver_offset; + + pr_debug("tcp_in_window: START\n"); + pr_debug("tcp_in_window: "); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_maxwin == 0) { /* * Initialize sender data. */ - if (tcph->syn && tcph->ack) { + if (tcph->syn) { /* - * Outgoing SYN-ACK in reply to a SYN. + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; @@ -555,6 +566,9 @@ static int tcp_in_window(struct ip_ct_tcp *state, && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; + if (!tcph->ack) + /* Simultaneous open */ + return true; } else { /* * We are in the middle of a connection, @@ -562,8 +576,16 @@ static int tcp_in_window(struct ip_ct_tcp *state, * Let's try to use the data from the packet. */ sender->td_end = end; - sender->td_maxwin = (win == 0 ? 1 : win); + swin = win << sender->td_scale; + sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. + */ + if (receiver->td_maxwin == 0) + receiver->td_end = receiver->td_maxend = sack; } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) @@ -597,40 +619,37 @@ static int tcp_in_window(struct ip_ct_tcp *state, ack = sack = receiver->td_end; } - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* - * Packets contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN. + * RST sent answering SYN. 
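+ * (Editor's note: such a RST carries seq == 0 and so cannot be
+ * validated by its own sequence number; it is instead treated as
+ * sitting exactly at the sender's last known td_end and then run
+ * through window checks I-IV as usual.)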
*/ seq = end = sender->td_end; - DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "seq=%u ack=%u sack =%u win=%u end=%u\n", - NIPQUAD(iph->saddr), ntohs(tcph->source), - NIPQUAD(iph->daddr), ntohs(tcph->dest), - seq, ack, sack, win, end); - DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - - DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", - before(seq, sender->td_maxend + 1), - after(end, sender->td_end - receiver->td_maxwin - 1), - before(sack, receiver->td_end + 1), - after(ack, receiver->td_end - MAXACKWINDOW(sender))); + pr_debug("tcp_in_window: "); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + /* Is the ending sequence in the receive window (if available)? */ + in_recv_win = !receiver->td_maxwin || + after(end, sender->td_end - receiver->td_maxwin - 1); + + pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + (in_recv_win ? 1 : 0), + before(sack, receiver->td_end + 1), + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && - after(end, sender->td_end - receiver->td_maxwin - 1) && + in_recv_win && before(sack, receiver->td_end + 1) && - after(ack, receiver->td_end - MAXACKWINDOW(sender))) { + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* * Take into account window scaling (RFC 1323). */ @@ -643,18 +662,30 @@ static int tcp_in_window(struct ip_ct_tcp *state, swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; - if (after(end, sender->td_end)) + if (after(end, sender->td_end)) { sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + /* * Update receiver data. */ - if (after(end, sender->td_maxend)) + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; if (win == 0) receiver->td_maxend++; } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* * Check retransmissions. @@ -675,115 +706,74 @@ static int tcp_in_window(struct ip_ct_tcp *state, state->retrans = 0; } } - res = 1; + res = true; } else { - res = 0; + res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - nf_ct_tcp_be_liberal) - res = 1; - if (!res && LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + tn->tcp_be_liberal) + res = true; + if (!res && LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? - after(end, sender->td_end - receiver->td_maxwin - 1) ? + in_recv_win ? before(sack, receiver->td_end + 1) ? 
- after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); } - DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " - "receiver end=%u maxend=%u maxwin=%u\n", - res, sender->td_end, sender->td_maxend, sender->td_maxwin, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); return res; } -#ifdef CONFIG_NF_NAT_NEEDED -/* Update sender->td_end after NAT successfully mangled the packet */ -/* Caller must linearize skb at tcp header. */ -void nf_conntrack_tcp_update(struct sk_buff *skb, - unsigned int dataoff, - struct nf_conn *conntrack, - int dir) -{ - struct tcphdr *tcph = (void *)skb->data + dataoff; - __u32 end; -#ifdef DEBUGP_VARS - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; -#endif - - end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - - write_lock_bh(&tcp_lock); - /* - * We have to worry for the ack in the reply packet only... - */ - if (after(end, conntrack->proto.tcp.seen[dir].td_end)) - conntrack->proto.tcp.seen[dir].td_end = end; - conntrack->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); - DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update); -#endif - -#define TH_FIN 0x01 -#define TH_SYN 0x02 -#define TH_RST 0x04 -#define TH_PUSH 0x08 -#define TH_ACK 0x10 -#define TH_URG 0x20 -#define TH_ECE 0x40 -#define TH_CWR 0x80 - /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ -static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = +static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| + TCPHDR_URG) + 1] = { - [TH_SYN] = 1, - [TH_SYN|TH_URG] = 1, - [TH_SYN|TH_ACK] = 1, - [TH_RST] = 1, - [TH_RST|TH_ACK] = 1, - [TH_FIN|TH_ACK] = 1, - [TH_FIN|TH_ACK|TH_URG] = 1, - [TH_ACK] = 1, - [TH_ACK|TH_URG] = 1, + [TCPHDR_SYN] = 1, + [TCPHDR_SYN|TCPHDR_URG] = 1, + [TCPHDR_SYN|TCPHDR_ACK] = 1, + [TCPHDR_RST] = 1, + [TCPHDR_RST|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, + [TCPHDR_ACK] = 1, + [TCPHDR_ACK|TCPHDR_URG] = 1, }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, +static int tcp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { - struct tcphdr _tcph, *th; + const struct tcphdr *th; + struct tcphdr _tcph; unsigned int tcplen = skb->len - dataoff; u_int8_t tcpflags; /* Smaller that minimal TCP header? 
*/ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -793,21 +783,19 @@ static int tcp_error(struct sk_buff *skb, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && - ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } /* Check TCP flags. */ - tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); + tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -815,107 +803,190 @@ static int tcp_error(struct sk_buff *skb, return NF_ACCEPT; } +static unsigned int *tcp_get_timeouts(struct net *net) +{ + return tcp_pernet(net)->timeouts; +} + /* Returns verdict for packet, or -1 for invalid. */ -static int tcp_packet(struct nf_conn *conntrack, +static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, - unsigned int hooknum) + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) { + struct net *net = nf_ct_net(ct); + struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; - struct tcphdr *th, _tcph; + const struct tcphdr *th; + struct tcphdr _tcph; unsigned long timeout; unsigned int index; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); - old_state = conntrack->proto.tcp.state; + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; + tuple = &ct->tuplehash[dir].tuple; switch (new_state) { + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + /* RFC 1122: "When a connection is closed actively, + * it MUST linger in TIME-WAIT state for a time 2xMSL + * (Maximum Segment Lifetime). However, it MAY accept + * a new SYN from the remote TCP to reopen the connection + * directly from TIME-WAIT state, if..." + * We ignore the conditions because we are in the + * TIME-WAIT state anyway. + * + * Handle aborted connections: we and the server + * think there is an existing connection but the client + * aborts it and starts a new one. 
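+ * (Editor's note: the reopen path below kills the old conntrack
+ * entry and returns -NF_REPEAT so the same SYN is run through
+ * conntrack again and starts a fresh entry -- but only when
+ * nf_ct_kill() actually removed the timer, since destruction may
+ * already be running in process context.)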
+ */ + if (((ct->proto.tcp.seen[dir].flags + | ct->proto.tcp.seen[!dir].flags) + & IP_CT_TCP_FLAG_CLOSE_INIT) + || (ct->proto.tcp.last_dir == dir + && ct->proto.tcp.last_index == TCP_RST_SET)) { + /* Attempt to reopen a closed/aborted connection. + * Delete this connection and look up again. */ + spin_unlock_bh(&ct->lock); + + /* Only repeat if we can actually remove the timer. + * Destruction may already be in progress in process + * context and we must give it a chance to terminate. + */ + if (nf_ct_kill(ct)) + return -NF_REPEAT; + return NF_DROP; + } + /* Fall through */ case TCP_CONNTRACK_IGNORE: /* Ignored packets: * + * Our connection entry may be out of sync, so ignore + * packets which may signal the real connection between + * the client and the server. + * * a) SYN in ORIGINAL * b) SYN/ACK in REPLY * c) ACK in reply direction after initial SYN in original. + * + * If the ignored packet is invalid, the receiver will send + * a RST we'll catch below. */ if (index == TCP_SYNACK_SET - && conntrack->proto.tcp.last_index == TCP_SYN_SET - && conntrack->proto.tcp.last_dir != dir - && ntohl(th->ack_seq) == - conntrack->proto.tcp.last_end) { - /* This SYN/ACK acknowledges a SYN that we earlier + && ct->proto.tcp.last_index == TCP_SYN_SET + && ct->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { + /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is - * not. We kill this session and block the SYN/ACK so - * that the client cannot but retransmit its SYN and - * thus initiate a clean new session. + * not. We get in sync from the previously annotated + * values. */ - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: killing out of sync session "); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_DROP; + old_state = TCP_CONNTRACK_SYN_SENT; + new_state = TCP_CONNTRACK_SYN_RECV; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = + ct->proto.tcp.last_win == 0 ? + 1 : ct->proto.tcp.last_win; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = + ct->proto.tcp.last_wscale; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = + ct->proto.tcp.last_flags; + memset(&ct->proto.tcp.seen[dir], 0, + sizeof(struct ip_ct_tcp_state)); + break; } - conntrack->proto.tcp.last_index = index; - conntrack->proto.tcp.last_dir = dir; - conntrack->proto.tcp.last_seq = ntohl(th->seq); - conntrack->proto.tcp.last_end = + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + ct->proto.tcp.last_seq = ntohl(th->seq); + ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packed ignored "); + ct->proto.tcp.last_win = ntohs(th->window); + + /* a) This is a SYN in ORIGINAL. The client and the server + * may be in sync but we are not. In that case, we annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server ignores it. 
*/ + if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { + struct ip_ct_tcp_state seen = {}; + + ct->proto.tcp.last_flags = + ct->proto.tcp.last_wscale = 0; + tcp_options(skb, dataoff, th, &seen); + if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + ct->proto.tcp.last_wscale = seen.td_scale; + } + if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_SACK_PERM; + } + } + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packet ignored in " + "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: + /* Special case for SYN proxy: when the SYN to the server or + * the SYN/ACK from the server is lost, the client may transmit + * a keep-alive packet while in SYN_SENT state. This needs to + * be associated with the original conntrack entry in order to + * generate a new SYN with the correct sequence number. + */ + if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { + pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + } + /* Invalid packet */ - DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", - dir, get_conntrack_index(th), - old_state); - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), old_state); + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; - case TCP_CONNTRACK_SYN_SENT: - if (old_state < TCP_CONNTRACK_TIME_WAIT) - break; - if ((conntrack->proto.tcp.seen[dir].flags & - IP_CT_TCP_FLAG_CLOSE_INIT) - || after(ntohl(th->seq), - conntrack->proto.tcp.seen[dir].td_end)) { - /* Attempt to reopen a closed connection. - * Delete this connection and look up again. 
*/ - write_unlock_bh(&tcp_lock); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_REPEAT; - } else { - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, - NULL, "nf_ct_tcp: invalid SYN"); + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid RST "); return -NF_ACCEPT; } - case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET) - || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_ACK_SET)) - && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_ACK_SET)) + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * @@ -933,457 +1004,638 @@ static int tcp_packet(struct nf_conn *conntrack, break; } - if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } in_window: /* From now on we have got in-window packets */ - conntrack->proto.tcp.last_index = index; + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; - DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest), - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); + pr_debug("tcp_conntracks: "); + nf_ct_dump_tuple(tuple); + pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); - conntrack->proto.tcp.state = new_state; + ct->proto.tcp.state = new_state; if (old_state != new_state - && (new_state == TCP_CONNTRACK_FIN_WAIT - || new_state == TCP_CONNTRACK_CLOSE)) - conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans - ? 
nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); - - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + && new_state == TCP_CONNTRACK_FIN_WAIT) + ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; + else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & + IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && + timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) + timeout = timeouts[TCP_CONNTRACK_UNACK]; + else + timeout = timeouts[new_state]; + spin_unlock_bh(&ct->lock); + if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); + nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } - } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection + * pickup with loose=1. Avoid large ESTABLISHED timeout. + */ + if (new_state == TCP_CONNTRACK_ESTABLISHED && + timeout > timeouts[TCP_CONNTRACK_UNACK]) + timeout = timeouts[TCP_CONNTRACK_UNACK]; + } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); } - nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. 
*/ -static int tcp_new(struct nf_conn *conntrack, - const struct sk_buff *skb, - unsigned int dataoff) +static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { enum tcp_conntrack new_state; - struct tcphdr *th, _tcph; -#ifdef DEBUGP_VARS - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; -#endif + const struct tcphdr *th; + struct tcphdr _tcph; + struct net *net = nf_ct_net(ct); + struct nf_tcp_net *tn = tcp_pernet(net); + const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; + const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); /* Don't need lock here: this conntrack not in circulation yet */ - new_state - = tcp_conntracks[0][get_conntrack_index(th)] - [TCP_CONNTRACK_NONE]; + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { - DEBUGP("nf_ct_tcp: invalid new deleting.\n"); - return 0; + pr_debug("nf_ct_tcp: invalid new deleting.\n"); + return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ - conntrack->proto.tcp.seen[0].td_end = + ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end; - - tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); - conntrack->proto.tcp.seen[1].flags = 0; - } else if (nf_ct_tcp_loose == 0) { + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + } else if (tn->tcp_loose == 0) { /* Don't try to pick up connections. */ - return 0; + return false; } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. 
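 * (Editor's note: on this mid-stream pickup path the window scale
 * negotiated at SYN time is unknown, which is why both directions
 * are marked IP_CT_TCP_FLAG_SACK_PERM | IP_CT_TCP_FLAG_BE_LIBERAL
 * below instead of guessing td_scale.)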
*/ - conntrack->proto.tcp.seen[0].td_end = + ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end + - conntrack->proto.tcp.seen[0].td_maxwin; - conntrack->proto.tcp.seen[0].td_scale = 0; + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end + + ct->proto.tcp.seen[0].td_maxwin; /* We assume SACK and liberal window checking to handle * window scaling */ - conntrack->proto.tcp.seen[0].flags = - conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | - IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[0].flags = + ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | + IP_CT_TCP_FLAG_BE_LIBERAL; } - conntrack->proto.tcp.seen[1].td_end = 0; - conntrack->proto.tcp.seen[1].td_maxend = 0; - conntrack->proto.tcp.seen[1].td_maxwin = 1; - conntrack->proto.tcp.seen[1].td_scale = 0; - /* tcp_packet will set them */ - conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; - conntrack->proto.tcp.last_index = TCP_NONE_SET; - - DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - return 1; + ct->proto.tcp.last_index = TCP_NONE_SET; + + pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return true; } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> -static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, - const struct nf_conn *ct) +static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) { - struct nfattr *nest_parms; + struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); - nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); - NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), - &ct->proto.tcp.state); - - NFA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, sizeof(u_int8_t), - &ct->proto.tcp.seen[0].td_scale); + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; - NFA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, sizeof(u_int8_t), - &ct->proto.tcp.seen[1].td_scale); + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || + nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + ct->proto.tcp.seen[0].td_scale) || + nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, + ct->proto.tcp.seen[1].td_scale)) + goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[0].flags; - NFA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, - sizeof(struct nf_ct_tcp_flags), &tmp); + if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, + sizeof(struct nf_ct_tcp_flags), &tmp)) + goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[1].flags; - NFA_PUT(skb, 
CTA_PROTOINFO_TCP_FLAGS_REPLY, - sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, + sizeof(struct nf_ct_tcp_flags), &tmp)) + goto nla_put_failure; + spin_unlock_bh(&ct->lock); - NFA_NEST_END(skb, nest_parms); + nla_nest_end(skb, nest_parms); return 0; -nfattr_failure: - read_unlock_bh(&tcp_lock); +nla_put_failure: + spin_unlock_bh(&ct->lock); return -1; } -static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { - [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), - [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL-1] = sizeof(u_int8_t), - [CTA_PROTOINFO_TCP_WSCALE_REPLY-1] = sizeof(u_int8_t), - [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL-1] = sizeof(struct nf_ct_tcp_flags), - [CTA_PROTOINFO_TCP_FLAGS_REPLY-1] = sizeof(struct nf_ct_tcp_flags) +static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { + [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, + [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; -static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct) +static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { - struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; - struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; + struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; + struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; + int err; /* updates could not contain anything about the private * protocol info, in that case skip the parsing */ - if (!attr) + if (!pattr) return 0; - nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy); + if (err < 0) + return err; - if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + if (tb[CTA_PROTOINFO_TCP_STATE] && + nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - if (!tb[CTA_PROTOINFO_TCP_STATE-1]) - return -EINVAL; - - write_lock_bh(&tcp_lock); - ct->proto.tcp.state = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]); + spin_lock_bh(&ct->lock); + if (tb[CTA_PROTOINFO_TCP_STATE]) + ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); - if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL-1]) { + if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = - NFA_DATA(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL-1]); + nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); ct->proto.tcp.seen[0].flags &= ~attr->mask; ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; } - if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY-1]) { + if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { struct nf_ct_tcp_flags *attr = - NFA_DATA(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY-1]); + nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); ct->proto.tcp.seen[1].flags &= ~attr->mask; ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; } - if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL-1] && - tb[CTA_PROTOINFO_TCP_WSCALE_REPLY-1] && + if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && + tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { - ct->proto.tcp.seen[0].td_scale = *(u_int8_t *) - NFA_DATA(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL-1]); - ct->proto.tcp.seen[1].td_scale = *(u_int8_t *) - NFA_DATA(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY-1]); + ct->proto.tcp.seen[0].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); 
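+ /* (Editor's note: window scale is restored only when both WSCALE
+  * attributes and both IP_CT_TCP_FLAG_WINDOW_SCALE bits are present,
+  * so a partial netlink update cannot leave the two directions with
+  * inconsistent scaling.) */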
+ ct->proto.tcp.seen[1].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return 0; } + +static int tcp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_TCP */ + + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); +} + +static int tcp_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct nf_tcp_net *tn = tcp_pernet(net); + int i; + + /* set default TCP timeouts. */ + for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) + timeouts[i] = tn->timeouts[i]; + + if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { + timeouts[TCP_CONNTRACK_SYN_SENT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { + timeouts[TCP_CONNTRACK_SYN_RECV] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { + timeouts[TCP_CONNTRACK_ESTABLISHED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { + timeouts[TCP_CONNTRACK_FIN_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { + timeouts[TCP_CONNTRACK_CLOSE_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { + timeouts[TCP_CONNTRACK_LAST_ACK] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { + timeouts[TCP_CONNTRACK_TIME_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_CLOSE]) { + timeouts[TCP_CONNTRACK_CLOSE] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { + timeouts[TCP_CONNTRACK_SYN_SENT2] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_RETRANS]) { + timeouts[TCP_CONNTRACK_RETRANS] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_UNACK]) { + timeouts[TCP_CONNTRACK_UNACK] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; + } + return 0; +} + +static int +tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, + htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, + htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, + htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, + htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, + htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, + htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, + htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, + htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, + htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, + htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, + 
htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { + [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static unsigned int tcp_sysctl_table_users; -static struct ctl_table_header *tcp_sysctl_header; static struct ctl_table tcp_sysctl_table[] = { { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "nf_conntrack_tcp_timeout_syn_sent", - .data = &nf_ct_tcp_timeout_syn_sent, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, .procname = "nf_conntrack_tcp_timeout_syn_recv", - .data = &nf_ct_tcp_timeout_syn_recv, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, .procname = "nf_conntrack_tcp_timeout_established", - .data = &nf_ct_tcp_timeout_established, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, .procname = "nf_conntrack_tcp_timeout_fin_wait", - .data = &nf_ct_tcp_timeout_fin_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, .procname = "nf_conntrack_tcp_timeout_close_wait", - .data = &nf_ct_tcp_timeout_close_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, .procname = "nf_conntrack_tcp_timeout_last_ack", - .data = &nf_ct_tcp_timeout_last_ack, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, .procname = "nf_conntrack_tcp_timeout_time_wait", - .data = &nf_ct_tcp_timeout_time_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, .procname = "nf_conntrack_tcp_timeout_close", - .data = &nf_ct_tcp_timeout_close, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, .procname = "nf_conntrack_tcp_timeout_max_retrans", - .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_unacknowledged", + 
.maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, .procname = "nf_conntrack_tcp_loose", - .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, .procname = "nf_conntrack_tcp_be_liberal", - .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, .procname = "nf_conntrack_tcp_max_retrans", - .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table tcp_compat_sysctl_table[] = { { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "ip_conntrack_tcp_timeout_syn_sent", - .data = &nf_ct_tcp_timeout_syn_sent, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_sent2", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, .procname = "ip_conntrack_tcp_timeout_syn_recv", - .data = &nf_ct_tcp_timeout_syn_recv, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, .procname = "ip_conntrack_tcp_timeout_established", - .data = &nf_ct_tcp_timeout_established, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, .procname = "ip_conntrack_tcp_timeout_fin_wait", - .data = &nf_ct_tcp_timeout_fin_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, .procname = "ip_conntrack_tcp_timeout_close_wait", - .data = &nf_ct_tcp_timeout_close_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, .procname = "ip_conntrack_tcp_timeout_last_ack", - .data = &nf_ct_tcp_timeout_last_ack, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, .procname = "ip_conntrack_tcp_timeout_time_wait", - .data = &nf_ct_tcp_timeout_time_wait, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, .procname = "ip_conntrack_tcp_timeout_close", - .data = &nf_ct_tcp_timeout_close, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, .procname = "ip_conntrack_tcp_timeout_max_retrans", - .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = 
&proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, .procname = "ip_conntrack_tcp_loose", - .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, .procname = "ip_conntrack_tcp_be_liberal", - .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, .procname = "ip_conntrack_tcp_max_retrans", - .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ -struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = +static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_tcp_net *tn) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(tcp_sysctl_table, + sizeof(tcp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; + pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; + pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; + pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; + pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; + pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; + pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; + pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; + pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK]; + pn->ctl_table[10].data = &tn->tcp_loose; + pn->ctl_table[11].data = &tn->tcp_be_liberal; + pn->ctl_table[12].data = &tn->tcp_max_retrans; +#endif + return 0; +} + +static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_tcp_net *tn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, + sizeof(tcp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; + pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2]; + pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; + pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; + pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; + pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; + pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; + pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; + pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; + pn->ctl_compat_table[10].data = &tn->tcp_loose; + pn->ctl_compat_table[11].data = &tn->tcp_be_liberal; + pn->ctl_compat_table[12].data = &tn->tcp_max_retrans; +#endif +#endif + return 0; +} + +static int tcp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_proto_net *pn = &tn->pn; + + if (!pn->users) { + int i; + + for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) + tn->timeouts[i] = tcp_timeouts[i]; + 
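+ /* (Editor's note: this !pn->users branch runs once per network
+  * namespace, seeding tn->timeouts[] and the three tunables below
+  * from the module-wide defaults; the kmemdup'ed sysctl tables set
+  * up above then point their .data fields at these per-netns copies,
+  * so writes to the sysctls stay local to the owning namespace.) */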
+ tn->tcp_loose = nf_ct_tcp_loose; + tn->tcp_be_liberal = nf_ct_tcp_be_liberal; + tn->tcp_max_retrans = nf_ct_tcp_max_retrans; + } + + if (proto == AF_INET) { + ret = tcp_kmemdup_compat_sysctl_table(pn, tn); + if (ret < 0) + return ret; + + ret = tcp_kmemdup_sysctl_table(pn, tn); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + } else + ret = tcp_kmemdup_sysctl_table(pn, tn); + + return ret; +} + +static struct nf_proto_net *tcp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.tcp.pn; +} + +struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, @@ -1393,26 +1645,34 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, + .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .to_nfattr = tcp_to_nfattr, - .from_nfattr = nfattr_to_tcp, - .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, -#endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &tcp_sysctl_table_users, - .ctl_table_header = &tcp_sysctl_header, - .ctl_table = tcp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = tcp_compat_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, + .from_nlattr = nlattr_to_tcp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = tcp_timeout_nlattr_to_obj, + .obj_to_nlattr = tcp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_TCP_MAX, + .obj_size = sizeof(unsigned int) * + TCP_CONNTRACK_TIMEOUT_MAX, + .nla_policy = tcp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = tcp_init_net, + .get_net_proto = tcp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); -struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = +struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, @@ -1422,18 +1682,29 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, + .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .to_nfattr = tcp_to_nfattr, - .from_nfattr = nfattr_to_tcp, - .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, -#endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &tcp_sysctl_table_users, - .ctl_table_header = &tcp_sysctl_header, - .ctl_table = tcp_sysctl_table, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, + .from_nlattr = nlattr_to_tcp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = tcp_timeout_nlattr_to_obj, + .obj_to_nlattr = tcp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_TCP_MAX, + .obj_size = sizeof(unsigned int) * + TCP_CONNTRACK_TIMEOUT_MAX, + .nla_policy 
= tcp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = tcp_init_net, + .get_net_proto = tcp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3620ecc095f..9d7721cbce4 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,7 +10,6 @@ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> -#include <linux/netfilter.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <linux/skbuff.h> @@ -22,33 +22,44 @@ #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> + +static unsigned int udp_timeouts[UDP_CT_MAX] = { + [UDP_CT_UNREPLIED] = 30*HZ, + [UDP_CT_REPLIED] = 180*HZ, +}; -static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ; -static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ; +static inline struct nf_udp_net *udp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.udp; +} -static int udp_pkt_to_tuple(const struct sk_buff *skb, +static bool udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { - struct udphdr _hdr, *hp; + const struct udphdr *hp; + struct udphdr _hdr; /* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) - return 0; + return false; tuple->src.u.udp.port = hp->source; tuple->dst.u.udp.port = hp->dest; - return 1; + return true; } -static int udp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->src.u.udp.port = orig->dst.u.udp.port; tuple->dst.u.udp.port = orig->src.u.udp.port; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -60,63 +71,64 @@ static int udp_print_tuple(struct seq_file *s, ntohs(tuple->dst.u.udp.port)); } -/* Print out the private part of the conntrack. */ -static int udp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) +static unsigned int *udp_get_timeouts(struct net *net) { - return 0; + return udp_pernet(net)->timeouts; } /* Returns verdict for packet, and may modify conntracktype */ -static int udp_packet(struct nf_conn *conntrack, +static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, - unsigned int hooknum) + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. 
*/ - if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - nf_ct_refresh_acct(conntrack, ctinfo, skb, - nf_ct_udp_timeout_stream); + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_REPLIED]); /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); - } else - nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout); - + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_UNREPLIED]); + } return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ -static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb, - unsigned int dataoff) +static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { - return 1; + return true; } -static int udp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, - int pf, +static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, + u_int8_t pf, unsigned int hooknum) { unsigned int udplen = skb->len - dataoff; - struct udphdr _hdr, *hdr; + const struct udphdr *hdr; + struct udphdr _hdr; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: short packet "); return -NF_ACCEPT; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -129,12 +141,10 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. * FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && - ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } @@ -142,56 +152,158 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, return NF_ACCEPT; } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct nf_udp_net *un = udp_pernet(net); + + /* set default timeouts for UDP. 
*/ + timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; + timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { + timeouts[UDP_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_UDP_REPLIED]) { + timeouts[UDP_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; + } + return 0; +} + +static int +udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, + htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, + htonl(timeouts[UDP_CT_REPLIED] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { + [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static unsigned int udp_sysctl_table_users; -static struct ctl_table_header *udp_sysctl_header; static struct ctl_table udp_sysctl_table[] = { { - .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT, .procname = "nf_conntrack_udp_timeout", - .data = &nf_ct_udp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, .procname = "nf_conntrack_udp_timeout_stream", - .data = &nf_ct_udp_timeout_stream, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table udp_compat_sysctl_table[] = { { - .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, .procname = "ip_conntrack_udp_timeout", - .data = &nf_ct_udp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, .procname = "ip_conntrack_udp_timeout_stream", - .data = &nf_ct_udp_timeout_stream, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ -struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = +static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_udp_net *un) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + pn->ctl_table = kmemdup(udp_sysctl_table, + sizeof(udp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; + pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED]; +#endif + return 0; +} + +static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_udp_net *un) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table, + sizeof(udp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; + pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED]; +#endif +#endif + return 0; +} + +static int udp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_udp_net *un = udp_pernet(net); + struct nf_proto_net *pn 
= &un->pn; + + if (!pn->users) { + int i; + + for (i = 0; i < UDP_CT_MAX; i++) + un->timeouts[i] = udp_timeouts[i]; + } + + if (proto == AF_INET) { + ret = udp_kmemdup_compat_sysctl_table(pn, un); + if (ret < 0) + return ret; + + ret = udp_kmemdup_sysctl_table(pn, un); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + } else + ret = udp_kmemdup_sysctl_table(pn, un); + + return ret; +} + +static struct nf_proto_net *udp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.udp.pn; +} + +struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_UDP, @@ -199,26 +311,31 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, - .print_conntrack = udp_print_conntrack, .packet = udp_packet, + .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, -#endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &udp_sysctl_table_users, - .ctl_table_header = &udp_sysctl_header, - .ctl_table = udp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = udp_compat_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = udp_init_net, + .get_net_proto = udp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); -struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = +struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, @@ -226,18 +343,26 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, - .print_conntrack = udp_print_conntrack, .packet = udp_packet, + .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, -#endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &udp_sysctl_table_users, - .ctl_table_header = &udp_sysctl_header, - .ctl_table = udp_sysctl_table, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = udp_init_net, + .get_net_proto = udp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); 
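
The nf_conntrack_proto_udp.c changes above replace the old global timeouts with per-netns state: defaults live in struct nf_udp_net, each namespace gets a kmemdup()'d copy of the sysctl table whose .data pointers are rewired into that namespace's timeout slots, and the optional cttimeout path accepts CTA_TIMEOUT_UDP_* attributes in seconds and stores them as jiffies. Below is a minimal standalone C sketch of that resolve-defaults-then-override logic; it is not kernel code. HZ is assumed to be 100 here, and fake_udp_net, fake_attr and resolve_udp_timeouts() are illustrative stand-ins for struct nf_udp_net, the parsed nlattr array and udp_timeout_nlattr_to_obj().

/* Standalone sketch of the per-net timeout pattern used above.
 * Everything here (the HZ value, fake_udp_net, fake_attr) is a
 * userspace stand-in, not kernel API: nla_get_be32()/ntohl() are
 * modelled by a plain host-order field so the example stays runnable.
 */
#include <stdio.h>

#define HZ 100			/* assumed tick rate, as with CONFIG_HZ=100 */

enum { UDP_CT_UNREPLIED, UDP_CT_REPLIED, UDP_CT_MAX };

/* Stand-in for struct nf_udp_net: per-namespace defaults, in jiffies. */
struct fake_udp_net {
	unsigned int timeouts[UDP_CT_MAX];
};

/* Stand-in for one parsed CTA_TIMEOUT_UDP_* attribute (in seconds). */
struct fake_attr {
	int present;
	unsigned int seconds;
};

/* Mirrors the shape of udp_timeout_nlattr_to_obj(): start from the
 * per-net defaults, then let each present attribute override its slot,
 * converting seconds to jiffies the way ntohl(nla_get_be32(...)) * HZ
 * does in the real function. */
static void resolve_udp_timeouts(const struct fake_udp_net *un,
				 const struct fake_attr attrs[UDP_CT_MAX],
				 unsigned int timeouts[UDP_CT_MAX])
{
	int i;

	for (i = 0; i < UDP_CT_MAX; i++) {
		timeouts[i] = un->timeouts[i];
		if (attrs[i].present)
			timeouts[i] = attrs[i].seconds * HZ;
	}
}

int main(void)
{
	struct fake_udp_net un = {
		/* same defaults as the udp_timeouts[] array above */
		.timeouts = { [UDP_CT_UNREPLIED] = 30 * HZ,
			      [UDP_CT_REPLIED]   = 180 * HZ },
	};
	/* A policy that overrides only the replied-side timeout, to 300s. */
	struct fake_attr attrs[UDP_CT_MAX] = {
		[UDP_CT_REPLIED] = { .present = 1, .seconds = 300 },
	};
	unsigned int t[UDP_CT_MAX];

	resolve_udp_timeouts(&un, attrs, t);
	/* Dumping reverses the conversion, as in
	 * udp_timeout_obj_to_nlattr(): jiffies / HZ. */
	printf("unreplied=%us replied=%us\n",
	       t[UDP_CT_UNREPLIED] / HZ, t[UDP_CT_REPLIED] / HZ);
	return 0;
}

The kmemdup() step in the patch is what makes a single static ctl_table template safe across namespaces: each namespace's copy has its .data fields repointed into that namespace's nf_udp_net, so a write to /proc/sys/net/netfilter/nf_conntrack_udp_timeout* only affects the owning netns.
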
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c new file mode 100644 index 00000000000..2750e6c69f8 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -0,0 +1,405 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/checksum.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> + +enum udplite_conntrack { + UDPLITE_CT_UNREPLIED, + UDPLITE_CT_REPLIED, + UDPLITE_CT_MAX +}; + +static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { + [UDPLITE_CT_UNREPLIED] = 30*HZ, + [UDPLITE_CT_REPLIED] = 180*HZ, +}; + +static int udplite_net_id __read_mostly; +struct udplite_net { + struct nf_proto_net pn; + unsigned int timeouts[UDPLITE_CT_MAX]; +}; + +static inline struct udplite_net *udplite_pernet(struct net *net) +{ + return net_generic(net, udplite_net_id); +} + +static bool udplite_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct udphdr *hp; + struct udphdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + return true; +} + +static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int udplite_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +static unsigned int *udplite_get_timeouts(struct net *net) +{ + return udplite_pernet(net)->timeouts; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udplite_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDPLITE_CT_REPLIED]); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDPLITE_CT_UNREPLIED]); + } + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
*/ +static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + return true; +} + +static int udplite_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + unsigned int cscov; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: short packet "); + return -NF_ACCEPT; + } + + cscov = ntohs(hdr->len); + if (cscov == 0) + cscov = udplen; + else if (cscov < sizeof(*hdr) || cscov > udplen) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: invalid checksum coverage "); + return -NF_ACCEPT; + } + + /* UDPLITE mandates checksums */ + if (!hdr->check) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: checksum missing "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, + pf)) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: bad UDPLite checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct udplite_net *un = udplite_pernet(net); + + /* set default timeouts for UDPlite. 
*/ + timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED]; + timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) { + timeouts[UDPLITE_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) { + timeouts[UDPLITE_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ; + } + return 0; +} + +static int +udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED, + htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED, + htonl(timeouts[UDPLITE_CT_REPLIED] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = { + [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table udplite_sysctl_table[] = { + { + .procname = "nf_conntrack_udplite_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_udplite_timeout_stream", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct udplite_net *un) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(udplite_sysctl_table, + sizeof(udplite_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED]; + pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED]; +#endif + return 0; +} + +static int udplite_init_net(struct net *net, u_int16_t proto) +{ + struct udplite_net *un = udplite_pernet(net); + struct nf_proto_net *pn = &un->pn; + + if (!pn->users) { + int i; + + for (i = 0 ; i < UDPLITE_CT_MAX; i++) + un->timeouts[i] = udplite_timeouts[i]; + } + + return udplite_kmemdup_sysctl_table(pn, un); +} + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .pkt_to_tuple = udplite_pkt_to_tuple, + .invert_tuple = udplite_invert_tuple, + .print_tuple = udplite_print_tuple, + .packet = udplite_packet, + .get_timeouts = udplite_get_timeouts, + .new = udplite_new, + .error = udplite_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udplite_timeout_nlattr_to_obj, + .obj_to_nlattr = udplite_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, + .obj_size = sizeof(unsigned int) * + CTA_TIMEOUT_UDPLITE_MAX, + .nla_policy = udplite_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &udplite_net_id, + .init_net = udplite_init_net, +}; + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .pkt_to_tuple = udplite_pkt_to_tuple, + .invert_tuple = 
udplite_invert_tuple, + .print_tuple = udplite_print_tuple, + .packet = udplite_packet, + .get_timeouts = udplite_get_timeouts, + .new = udplite_new, + .error = udplite_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udplite_timeout_nlattr_to_obj, + .obj_to_nlattr = udplite_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, + .obj_size = sizeof(unsigned int) * + CTA_TIMEOUT_UDPLITE_MAX, + .nla_policy = udplite_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &udplite_net_id, + .init_net = udplite_init_net, +}; + +static int udplite_net_init(struct net *net) +{ + int ret = 0; + + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4); + if (ret < 0) { + pr_err("nf_conntrack_udplite4: pernet registration failed.\n"); + goto out; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6); + if (ret < 0) { + pr_err("nf_conntrack_udplite6: pernet registration failed.\n"); + goto cleanup_udplite4; + } + return 0; + +cleanup_udplite4: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); +out: + return ret; +} + +static void udplite_net_exit(struct net *net) +{ + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); +} + +static struct pernet_operations udplite_net_ops = { + .init = udplite_net_init, + .exit = udplite_net_exit, + .id = &udplite_net_id, + .size = sizeof(struct udplite_net), +}; + +static int __init nf_conntrack_proto_udplite_init(void) +{ + int ret; + + ret = register_pernet_subsys(&udplite_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4); + if (ret < 0) + goto out_udplite4; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6); + if (ret < 0) + goto out_udplite6; + + return 0; +out_udplite6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +out_udplite4: + unregister_pernet_subsys(&udplite_net_ops); +out_pernet: + return ret; +} + +static void __exit nf_conntrack_proto_udplite_exit(void) +{ + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); + unregister_pernet_subsys(&udplite_net_ops); +} + +module_init(nf_conntrack_proto_udplite_init); +module_exit(nf_conntrack_proto_udplite_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index eb2d1dc46d4..4a2134fd3fc 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> +#include <linux/slab.h> #include <linux/in.h> #include <linux/tcp.h> #include <net/netfilter/nf_conntrack.h> @@ -30,6 +31,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>"); MODULE_DESCRIPTION("SANE connection tracking helper"); +MODULE_ALIAS_NFCT_HELPER("sane"); static char *sane_buffer; @@ -40,12 +42,6 @@ static u_int16_t ports[MAX_PORTS]; static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - struct sane_request { __be32 RPC_code; #define SANE_NET_START 7 /* RPC code */ @@ -62,50 +58,49 @@ struct sane_reply_net_start { /* other fields aren't interesting for conntrack */ }; -static int help(struct sk_buff **pskb, +static int help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { unsigned int dataoff, datalen; - struct tcphdr _tcph, *th; - char *sb_ptr; + const struct tcphdr *th; + struct tcphdr _tcph; + void *sb_ptr; int ret = NF_ACCEPT; int dir = CTINFO2DIR(ctinfo); - struct nf_ct_sane_master *ct_sane_info; + struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple *tuple; struct sane_request *req; struct sane_reply_net_start *reply; - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - ct_sane_info = &nfct_help(ct)->help.ct_sane_info; /* Until there's been traffic both ways, don't look in packets. */ if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) + ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; /* Not a full tcp header? */ - th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) return NF_ACCEPT; /* No data? */ dataoff = protoff + th->doff * 4; - if (dataoff >= (*pskb)->len) + if (dataoff >= skb->len) return NF_ACCEPT; - datalen = (*pskb)->len - dataoff; + datalen = skb->len - dataoff; spin_lock_bh(&nf_sane_lock); - sb_ptr = skb_header_pointer(*pskb, dataoff, datalen, sane_buffer); + sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer); BUG_ON(sb_ptr == NULL); if (dir == IP_CT_DIR_ORIGINAL) { if (datalen != sizeof(struct sane_request)) goto out; - req = (struct sane_request *)sb_ptr; + req = sb_ptr; if (req->RPC_code != htonl(SANE_NET_START)) { /* Not an interesting command */ ct_sane_info->state = SANE_STATE_NORMAL; @@ -125,15 +120,15 @@ static int help(struct sk_buff **pskb, ct_sane_info->state = SANE_STATE_NORMAL; if (datalen < sizeof(struct sane_reply_net_start)) { - DEBUGP("nf_ct_sane: NET_START reply too short\n"); + pr_debug("nf_ct_sane: NET_START reply too short\n"); goto out; } - reply = (struct sane_reply_net_start *)sb_ptr; + reply = sb_ptr; if (reply->status != htonl(SANE_STATUS_SUCCESS)) { /* saned refused the command */ - DEBUGP("nf_ct_sane: unsuccessful SANE_STATUS = %u\n", - ntohl(reply->status)); + pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n", + ntohl(reply->status)); goto out; } @@ -141,35 +136,40 @@ static int help(struct sk_buff **pskb, if (reply->zero != 0) goto out; - exp = nf_conntrack_expect_alloc(ct); + exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - nf_conntrack_expect_init(exp, family, - &tuple->src.u3, &tuple->dst.u3, - IPPROTO_TCP, - NULL, &reply->port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_TCP, NULL, &reply->port); - DEBUGP("nf_ct_sane: expect: "); - NF_CT_DUMP_TUPLE(&exp->tuple); - NF_CT_DUMP_TUPLE(&exp->mask); + pr_debug("nf_ct_sane: expect: "); + nf_ct_dump_tuple(&exp->tuple); /* Can't expect this? Best to drop packet now. 
*/ - if (nf_conntrack_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); out: spin_unlock_bh(&nf_sane_lock); return ret; } -static struct nf_conntrack_helper sane[MAX_PORTS][2]; -static char sane_names[MAX_PORTS][2][sizeof("sane-65535")]; +static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; + +static const struct nf_conntrack_expect_policy sane_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_sane_fini(void) @@ -178,9 +178,9 @@ static void nf_conntrack_sane_fini(void) for (i = 0; i < ports_c; i++) { for (j = 0; j < 2; j++) { - DEBUGP("nf_ct_sane: unregistering helper for pf: %d " - "port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); + pr_debug("nf_ct_sane: unregistering helper for pf: %d " + "port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); nf_conntrack_helper_unregister(&sane[i][j]); } } @@ -191,7 +191,6 @@ static void nf_conntrack_sane_fini(void) static int __init nf_conntrack_sane_init(void) { int i, j = -1, ret = 0; - char *tmpname; sane_buffer = kmalloc(65536, GFP_KERNEL); if (!sane_buffer) @@ -206,24 +205,20 @@ static int __init nf_conntrack_sane_init(void) sane[i][0].tuple.src.l3num = PF_INET; sane[i][1].tuple.src.l3num = PF_INET6; for (j = 0; j < 2; j++) { + sane[i][j].data_len = sizeof(struct nf_ct_sane_master); sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); sane[i][j].tuple.dst.protonum = IPPROTO_TCP; - sane[i][j].mask.src.u.tcp.port = 0xFFFF; - sane[i][j].mask.dst.protonum = 0xFF; - sane[i][j].max_expected = 1; - sane[i][j].timeout = 5 * 60; /* 5 Minutes */ + sane[i][j].expect_policy = &sane_exp_policy; sane[i][j].me = THIS_MODULE; sane[i][j].help = help; - tmpname = &sane_names[i][j][0]; if (ports[i] == SANE_PORT) - sprintf(tmpname, "sane"); + sprintf(sane[i][j].name, "sane"); else - sprintf(tmpname, "sane-%d", ports[i]); - sane[i][j].name = tmpname; + sprintf(sane[i][j].name, "sane-%d", ports[i]); - DEBUGP("nf_ct_sane: registering helper for pf: %d " - "port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); + pr_debug("nf_ct_sane: registering helper for pf: %d " + "port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); ret = nf_conntrack_helper_register(&sane[i][j]); if (ret) { printk(KERN_ERR "nf_ct_sane: failed to " diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c new file mode 100644 index 00000000000..f6e2ae91a80 --- /dev/null +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -0,0 +1,243 @@ +#include <linux/types.h> +#include <linux/netfilter.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> + +int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_conn_seqadj *seqadj; + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + seqadj = nfct_seqadj(ct); + this_way = &seqadj->seq[dir]; + this_way->offset_before = off; + this_way->offset_after = off; + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); + +int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct 
nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + if (unlikely(!seqadj)) { + WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); + return 0; + } + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + spin_lock_bh(&ct->lock); + this_way = &seqadj->seq[dir]; + if (this_way->offset_before == this_way->offset_after || + before(this_way->correction_pos, ntohl(seq))) { + this_way->correction_pos = ntohl(seq); + this_way->offset_before = this_way->offset_after; + this_way->offset_after += off; + } + spin_unlock_bh(&ct->lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); + +void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); + +/* Adjust one found SACK option including checksum correction */ +static void nf_ct_sack_block_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct nf_ct_seqadj *seq) +{ + while (sackoff < sackend) { + struct tcp_sack_block_wire *sack; + __be32 new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - seq->offset_before, + seq->correction_pos)) + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_after); + else + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_before); + + if (after(ntohl(sack->end_seq) - seq->offset_before, + seq->correction_pos)) + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_after); + else + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_before); + + pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *tcph, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + tcph->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK && + op[1] >= 2+TCPOLEN_SACK_PERBLOCK && + ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + nf_ct_sack_block_adjust(skb, tcph, optoff + 2, + optoff+op[1], + &seqadj->seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. 
Returns 1 on success, 0 on failure */ +int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct tcphdr *tcph; + __be32 newseq, newack; + s32 seqoff, ackoff; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way, *other_way; + int res; + + this_way = &seqadj->seq[dir]; + other_way = &seqadj->seq[!dir]; + + if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + return 0; + + tcph = (void *)skb->data + protoff; + spin_lock_bh(&ct->lock); + if (after(ntohl(tcph->seq), this_way->correction_pos)) + seqoff = this_way->offset_after; + else + seqoff = this_way->offset_before; + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + ackoff = other_way->offset_after; + else + ackoff = other_way->offset_before; + + newseq = htonl(ntohl(tcph->seq) + seqoff); + newack = htonl(ntohl(tcph->ack_seq) - ackoff); + + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + + pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); + spin_unlock_bh(&ct->lock); + + return res; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); + +s32 nf_ct_seq_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way; + + if (!seqadj) + return 0; + + this_way = &seqadj->seq[dir]; + return after(seq, this_way->correction_pos) ? + this_way->offset_after : this_way->offset_before; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_offset); + +static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { + .len = sizeof(struct nf_conn_seqadj), + .align = __alignof__(struct nf_conn_seqadj), + .id = NF_CT_EXT_SEQADJ, +}; + +int nf_conntrack_seqadj_init(void) +{ + return nf_ct_extend_register(&nf_ct_seqadj_extend); +} + +void nf_conntrack_seqadj_fini(void) +{ + nf_ct_extend_unregister(&nf_ct_seqadj_extend); +} diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 7aaa8c91b29..4c3ba1c8d68 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -2,6 +2,8 @@ * * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> * based on RR's ip_conntrack_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,27 +16,25 @@ #include <linux/inet.h> #include <linux/in.h> #include <linux/udp.h> +#include <linux/tcp.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_sip.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); MODULE_DESCRIPTION("SIP connection tracking helper"); MODULE_ALIAS("ip_conntrack_sip"); +MODULE_ALIAS_NFCT_HELPER("sip"); #define MAX_PORTS 8 static unsigned short ports[MAX_PORTS]; -static int ports_c; +static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of SIP servers"); @@ -42,250 +42,145 @@ static unsigned int sip_timeout __read_mostly = SIP_TIMEOUT; module_param(sip_timeout, uint, 0600); MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); -unsigned int (*nf_nat_sip_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct nf_conn *ct, - const char **dptr) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_hook); - -unsigned int (*nf_nat_sdp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_expect *exp, - const char *dptr) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_hook); - -static int digits_len(struct nf_conn *, const char *, const char *, int *); -static int epaddr_len(struct nf_conn *, const char *, const char *, int *); -static int skp_digits_len(struct nf_conn *, const char *, const char *, int *); -static int skp_epaddr_len(struct nf_conn *, const char *, const char *, int *); - -struct sip_header_nfo { - const char *lname; - const char *sname; - const char *ln_str; - size_t lnlen; - size_t snlen; - size_t ln_strlen; - int case_sensitive; - int (*match_len)(struct nf_conn *, const char *, - const char *, int *); -}; +static int sip_direct_signalling __read_mostly = 1; +module_param(sip_direct_signalling, int, 0600); +MODULE_PARM_DESC(sip_direct_signalling, "expect incoming calls from registrar " + "only (default 1)"); -static const struct sip_header_nfo ct_sip_hdrs[] = { - [POS_REG_REQ_URI] = { /* SIP REGISTER request URI */ - .lname = "sip:", - .lnlen = sizeof("sip:") - 1, - .ln_str = ":", - .ln_strlen = sizeof(":") - 1, - .match_len = epaddr_len, - }, - [POS_REQ_URI] = { /* SIP request URI */ - .lname = "sip:", - .lnlen = sizeof("sip:") - 1, - .ln_str = "@", - .ln_strlen = sizeof("@") - 1, - .match_len = epaddr_len, - }, - [POS_FROM] = { /* SIP From header */ - .lname = "From:", - .lnlen = sizeof("From:") - 1, - .sname = "\r\nf:", - .snlen = sizeof("\r\nf:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len, - }, - [POS_TO] = { /* SIP To header */ - .lname = "To:", - .lnlen = sizeof("To:") - 1, - .sname = "\r\nt:", - .snlen = sizeof("\r\nt:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len - }, - [POS_VIA] = { /* SIP Via header */ - .lname = "Via:", - .lnlen = sizeof("Via:") - 1, - .sname = "\r\nv:", - .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ - .ln_str = "UDP ", - .ln_strlen = sizeof("UDP ") - 1, - .match_len = epaddr_len, - }, - [POS_CONTACT] = { /* SIP Contact header */ - .lname = "Contact:", - .lnlen = sizeof("Contact:") - 1, - .sname = "\r\nm:", - .snlen = sizeof("\r\nm:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len - }, - [POS_CONTENT] = { /* SIP Content length header */ - .lname = "Content-Length:", - .lnlen = sizeof("Content-Length:") - 1, - .sname = "\r\nl:", - .snlen = sizeof("\r\nl:") - 1, - .ln_str = ":", - .ln_strlen = sizeof(":") - 1, - .match_len = skp_digits_len - }, - [POS_MEDIA] = { /* SDP media info */ - .case_sensitive = 1, - .lname = "\nm=", - .lnlen = sizeof("\nm=") - 1, - .sname = "\rm=", - .snlen = 
sizeof("\rm=") - 1, - .ln_str = "audio ", - .ln_strlen = sizeof("audio ") - 1, - .match_len = digits_len - }, - [POS_OWNER_IP4] = { /* SDP owner address*/ - .case_sensitive = 1, - .lname = "\no=", - .lnlen = sizeof("\no=") - 1, - .sname = "\ro=", - .snlen = sizeof("\ro=") - 1, - .ln_str = "IN IP4 ", - .ln_strlen = sizeof("IN IP4 ") - 1, - .match_len = epaddr_len - }, - [POS_CONNECTION_IP4] = {/* SDP connection info */ - .case_sensitive = 1, - .lname = "\nc=", - .lnlen = sizeof("\nc=") - 1, - .sname = "\rc=", - .snlen = sizeof("\rc=") - 1, - .ln_str = "IN IP4 ", - .ln_strlen = sizeof("IN IP4 ") - 1, - .match_len = epaddr_len - }, - [POS_OWNER_IP6] = { /* SDP owner address*/ - .case_sensitive = 1, - .lname = "\no=", - .lnlen = sizeof("\no=") - 1, - .sname = "\ro=", - .snlen = sizeof("\ro=") - 1, - .ln_str = "IN IP6 ", - .ln_strlen = sizeof("IN IP6 ") - 1, - .match_len = epaddr_len - }, - [POS_CONNECTION_IP6] = {/* SDP connection info */ - .case_sensitive = 1, - .lname = "\nc=", - .lnlen = sizeof("\nc=") - 1, - .sname = "\rc=", - .snlen = sizeof("\rc=") - 1, - .ln_str = "IN IP6 ", - .ln_strlen = sizeof("IN IP6 ") - 1, - .match_len = epaddr_len - }, - [POS_SDP_HEADER] = { /* SDP version header */ - .case_sensitive = 1, - .lname = "\nv=", - .lnlen = sizeof("\nv=") - 1, - .sname = "\rv=", - .snlen = sizeof("\rv=") - 1, - .ln_str = "=", - .ln_strlen = sizeof("=") - 1, - .match_len = digits_len - } -}; +static int sip_direct_media __read_mostly = 1; +module_param(sip_direct_media, int, 0600); +MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling " + "endpoints only (default 1)"); -/* get line lenght until first CR or LF seen. */ -int ct_sip_lnlen(const char *line, const char *limit) -{ - const char *k = line; +const struct nf_nat_sip_hooks *nf_nat_sip_hooks; +EXPORT_SYMBOL_GPL(nf_nat_sip_hooks); - while ((line <= limit) && (*line == '\r' || *line == '\n')) - line++; +static int string_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = 0; - while (line <= limit) { - if (*line == '\r' || *line == '\n') - break; - line++; + while (dptr < limit && isalpha(*dptr)) { + dptr++; + len++; } - return line - k; + return len; } -EXPORT_SYMBOL_GPL(ct_sip_lnlen); -/* Linear string search, case sensitive. */ -const char *ct_sip_search(const char *needle, const char *haystack, - size_t needle_len, size_t haystack_len, - int case_sensitive) +static int digits_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) { - const char *limit = haystack + (haystack_len - needle_len); - - while (haystack <= limit) { - if (case_sensitive) { - if (strncmp(haystack, needle, needle_len) == 0) - return haystack; - } else { - if (strnicmp(haystack, needle, needle_len) == 0) - return haystack; - } - haystack++; + int len = 0; + while (dptr < limit && isdigit(*dptr)) { + dptr++; + len++; } - return NULL; + return len; } -EXPORT_SYMBOL_GPL(ct_sip_search); -static int digits_len(struct nf_conn *ct, const char *dptr, - const char *limit, int *shift) +static int iswordc(const char c) +{ + if (isalnum(c) || c == '!' || c == '"' || c == '%' || + (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' || + c == '?' 
|| (c >= '[' && c <= ']') || c == '_' || c == '`' || + c == '{' || c == '}' || c == '~') + return 1; + return 0; +} + +static int word_len(const char *dptr, const char *limit) { int len = 0; - while (dptr <= limit && isdigit(*dptr)) { + while (dptr < limit && iswordc(*dptr)) { dptr++; len++; } return len; } -/* get digits lenght, skiping blank spaces. */ -static int skp_digits_len(struct nf_conn *ct, const char *dptr, - const char *limit, int *shift) +static int callid_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) { - for (; dptr <= limit && *dptr == ' '; dptr++) - (*shift)++; + int len, domain_len; + + len = word_len(dptr, limit); + dptr += len; + if (!len || dptr == limit || *dptr != '@') + return len; + dptr++; + len++; - return digits_len(ct, dptr, limit, shift); + domain_len = word_len(dptr, limit); + if (!domain_len) + return 0; + return len + domain_len; } -static int parse_addr(struct nf_conn *ct, const char *cp, const char **endp, - union nf_conntrack_address *addr, const char *limit) +/* get media type + port length */ +static int media_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = string_len(ct, dptr, limit, shift); + + dptr += len; + if (dptr >= limit || *dptr != ' ') + return 0; + len++; + dptr++; + + return len + digits_len(ct, dptr, limit, shift); +} + +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - int ret = 0; + int ret; - switch (family) { + if (!ct) + return 0; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; } /* skip ip address. returns its length. */ -static int epaddr_len(struct nf_conn *ct, const char *dptr, +static int epaddr_len(const struct nf_conn *ct, const char *dptr, const char *limit, int *shift) { - union nf_conntrack_address addr; + union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { - DEBUGP("ip: %s parse failed.!\n", dptr); + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { + pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -298,188 +193,1431 @@ static int epaddr_len(struct nf_conn *ct, const char *dptr, } /* get address length, skiping user info. */ -static int skp_epaddr_len(struct nf_conn *ct, const char *dptr, +static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr, const char *limit, int *shift) { + const char *start = dptr; int s = *shift; /* Search for @, but stop at the end of the line. * We are inside a sip: URI, so we don't need to worry about * continuation lines. */ - while (dptr <= limit && + while (dptr < limit && *dptr != '@' && *dptr != '\r' && *dptr != '\n') { (*shift)++; dptr++; } - if (dptr <= limit && *dptr == '@') { + if (dptr < limit && *dptr == '@') { dptr++; (*shift)++; - } else + } else { + dptr = start; *shift = s; + } return epaddr_len(ct, dptr, limit, shift); } -/* Returns 0 if not found, -1 error parsing. 
*/ -int ct_sip_get_info(struct nf_conn *ct, - const char *dptr, size_t dlen, - unsigned int *matchoff, - unsigned int *matchlen, - enum sip_header_pos pos) +/* Parse a SIP request line of the form: + * + * Request-Line = Method SP Request-URI SP SIP-Version CRLF + * + * and return the offset and length of the address contained in the Request-URI. + */ +int ct_sip_parse_request(const struct nf_conn *ct, + const char *dptr, unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, __be16 *port) { - const struct sip_header_nfo *hnfo = &ct_sip_hdrs[pos]; - const char *limit, *aux, *k = dptr; + const char *start = dptr, *limit = dptr + datalen, *end; + unsigned int mlen; + unsigned int p; int shift = 0; - limit = dptr + (dlen - hnfo->lnlen); + /* Skip method and following whitespace */ + mlen = string_len(ct, dptr, limit, NULL); + if (!mlen) + return 0; + dptr += mlen; + if (++dptr >= limit) + return 0; - while (dptr <= limit) { - if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && - (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { - dptr++; - continue; + /* Find SIP URI */ + for (; dptr < limit - strlen("sip:"); dptr++) { + if (*dptr == '\r' || *dptr == '\n') + return -1; + if (strnicmp(dptr, "sip:", strlen("sip:")) == 0) { + dptr += strlen("sip:"); + break; } - aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, - ct_sip_lnlen(dptr, limit), - hnfo->case_sensitive); - if (!aux) { - DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, - hnfo->lname); + } + if (!skp_epaddr_len(ct, dptr, limit, &shift)) + return 0; + dptr += shift; + + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) + return -1; + if (end < limit && *end == ':') { + end++; + p = simple_strtoul(end, (char **)&end, 10); + if (p < 1024 || p > 65535) return -1; + *port = htons(p); + } else + *port = htons(SIP_PORT); + + if (end == dptr) + return 0; + *matchoff = dptr - start; + *matchlen = end - dptr; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_request); + +/* SIP header parsing: SIP headers are located at the beginning of a line, but + * may span several lines, in which case the continuation lines begin with a + * whitespace character. RFC 2543 allows lines to be terminated with CR, LF or + * CRLF, RFC 3261 allows only CRLF, we support both. + * + * Headers are followed by (optionally) whitespace, a colon, again (optionally) + * whitespace and the values. Whitespace in this context means any amount of + * tabs, spaces and continuation lines, which are treated as a single whitespace + * character. + * + * Some headers may appear multiple times. A comma separated list of values is + * equivalent to multiple headers. 
+ */ +static const struct sip_header ct_sip_hdrs[] = { + [SIP_HDR_CSEQ] = SIP_HDR("CSeq", NULL, NULL, digits_len), + [SIP_HDR_FROM] = SIP_HDR("From", "f", "sip:", skp_epaddr_len), + [SIP_HDR_TO] = SIP_HDR("To", "t", "sip:", skp_epaddr_len), + [SIP_HDR_CONTACT] = SIP_HDR("Contact", "m", "sip:", skp_epaddr_len), + [SIP_HDR_VIA_UDP] = SIP_HDR("Via", "v", "UDP ", epaddr_len), + [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len), + [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len), + [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len), + [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len), +}; + +static const char *sip_follow_continuation(const char *dptr, const char *limit) +{ + /* Walk past newline */ + if (++dptr >= limit) + return NULL; + + /* Skip '\n' in CR LF */ + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + return NULL; + } + + /* Continuation line? */ + if (*dptr != ' ' && *dptr != '\t') + return NULL; + + /* skip leading whitespace */ + for (; dptr < limit; dptr++) { + if (*dptr != ' ' && *dptr != '\t') + break; + } + return dptr; +} + +static const char *sip_skip_whitespace(const char *dptr, const char *limit) +{ + for (; dptr < limit; dptr++) { + if (*dptr == ' ') + continue; + if (*dptr != '\r' && *dptr != '\n') + break; + dptr = sip_follow_continuation(dptr, limit); + if (dptr == NULL) + return NULL; + } + return dptr; +} + +/* Search within a SIP header value, dealing with continuation lines */ +static const char *ct_sip_header_search(const char *dptr, const char *limit, + const char *needle, unsigned int len) +{ + for (limit -= len; dptr < limit; dptr++) { + if (*dptr == '\r' || *dptr == '\n') { + dptr = sip_follow_continuation(dptr, limit); + if (dptr == NULL) + break; + continue; + } + + if (strnicmp(dptr, needle, len) == 0) + return dptr; + } + return NULL; +} + +int ct_sip_get_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sip_hdrs[type]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + for (dptr += dataoff; dptr < limit; dptr++) { + /* Find beginning of line */ + if (*dptr != '\r' && *dptr != '\n') + continue; + if (++dptr >= limit) + break; + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + break; } - aux += hnfo->ln_strlen; - *matchlen = hnfo->match_len(ct, aux, limit, &shift); + /* Skip continuation lines */ + if (*dptr == ' ' || *dptr == '\t') + continue; + + /* Find header. Compact headers must be followed by a + * non-alphabetic character to avoid mismatches. 
*/ + if (limit - dptr >= hdr->len && + strnicmp(dptr, hdr->name, hdr->len) == 0) + dptr += hdr->len; + else if (hdr->cname && limit - dptr >= hdr->clen + 1 && + strnicmp(dptr, hdr->cname, hdr->clen) == 0 && + !isalpha(*(dptr + hdr->clen))) + dptr += hdr->clen; + else + continue; + + /* Find and skip colon */ + dptr = sip_skip_whitespace(dptr, limit); + if (dptr == NULL) + break; + if (*dptr != ':' || ++dptr >= limit) + break; + + /* Skip whitespace after colon */ + dptr = sip_skip_whitespace(dptr, limit); + if (dptr == NULL) + break; + + *matchoff = dptr - start; + if (hdr->search) { + dptr = ct_sip_header_search(dptr, limit, hdr->search, + hdr->slen); + if (!dptr) + return -1; + dptr += hdr->slen; + } + + *matchlen = hdr->match_len(ct, dptr, limit, &shift); if (!*matchlen) return -1; + *matchoff = dptr - start + shift; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(ct_sip_get_header); + +/* Get next header field in a list of comma separated values */ +static int ct_sip_next_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sip_hdrs[type]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + dptr += dataoff; + + dptr = ct_sip_header_search(dptr, limit, ",", strlen(",")); + if (!dptr) + return 0; + + dptr = ct_sip_header_search(dptr, limit, hdr->search, hdr->slen); + if (!dptr) + return 0; + dptr += hdr->slen; + + *matchoff = dptr - start; + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff += shift; + return 1; +} + +/* Walk through headers until a parsable one is found or no header of the + * given type is left. */ +static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, int *in_header, + unsigned int *matchoff, unsigned int *matchlen) +{ + int ret; + + if (in_header && *in_header) { + while (1) { + ret = ct_sip_next_header(ct, dptr, dataoff, datalen, + type, matchoff, matchlen); + if (ret > 0) + return ret; + if (ret == 0) + break; + dataoff += *matchoff; + } + *in_header = 0; + } + + while (1) { + ret = ct_sip_get_header(ct, dptr, dataoff, datalen, + type, matchoff, matchlen); + if (ret > 0) + break; + if (ret == 0) + return ret; + dataoff += *matchoff; + } + + if (in_header) + *in_header = 1; + return 1; +} + +/* Locate a SIP header, parse the URI and return the offset and length of + * the address as well as the address and port themselves. A stream of + * headers can be parsed by handing in a non-NULL datalen and in_header + * pointer. + */ +int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, + unsigned int *dataoff, unsigned int datalen, + enum sip_header_types type, int *in_header, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, __be16 *port) +{ + const char *c, *limit = dptr + datalen; + unsigned int p; + int ret; + + ret = ct_sip_walk_headers(ct, dptr, dataoff ? 
*dataoff : 0, datalen, + type, in_header, matchoff, matchlen); + WARN_ON(ret < 0); + if (ret == 0) + return ret; + + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) + return -1; + if (*c == ':') { + c++; + p = simple_strtoul(c, (char **)&c, 10); + if (p < 1024 || p > 65535) + return -1; + *port = htons(p); + } else + *port = htons(SIP_PORT); + + if (dataoff) + *dataoff = c - dptr; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_header_uri); + +static int ct_sip_parse_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen) +{ + const char *limit = dptr + datalen; + const char *start; + const char *end; - *matchoff = (aux - k) + shift; + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + start += strlen(name); + + end = ct_sip_header_search(start, limit, ";", strlen(";")); + if (!end) + end = limit; + + *matchoff = start - dptr; + *matchlen = end - start; + return 1; +} + +/* Parse address from header parameter and return address, offset and length */ +int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, bool delim) +{ + const char *limit = dptr + datalen; + const char *start, *end; - DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, - *matchlen); + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + + start += strlen(name); + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) + return 0; + *matchoff = start - dptr; + *matchlen = end - start; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_address_param); + +/* Parse numerical header parameter and return value, offset and length */ +int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen, + unsigned int *val) +{ + const char *limit = dptr + datalen; + const char *start; + char *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + + start += strlen(name); + *val = simple_strtoul(start, &end, 0); + if (start == end) + return 0; + if (matchoff && matchlen) { + *matchoff = start - dptr; + *matchlen = end - start; + } + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_numerical_param); + +static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + u8 *proto) +{ + unsigned int matchoff, matchlen; + + if (ct_sip_parse_param(ct, dptr, dataoff, datalen, "transport=", + &matchoff, &matchlen)) { + if (!strnicmp(dptr + matchoff, "TCP", strlen("TCP"))) + *proto = IPPROTO_TCP; + else if (!strnicmp(dptr + matchoff, "UDP", strlen("UDP"))) + *proto = IPPROTO_UDP; + else + return 0; + + if (*proto != nf_ct_protonum(ct)) + return 0; + } else + *proto = nf_ct_protonum(ct); + + return 1; +} + +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, 
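/* (sdp_parse_addr() parameters, continued: cp points into the message
 * at a textual address such as "192.0.2.1" or "2001:db8::1" -
 * hypothetical values - and on success *endp is left at the first
 * character past the literal) */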
union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + +/* SDP header parsing: a SDP session description contains an ordered set of + * headers, starting with a section containing general session parameters, + * optionally followed by multiple media descriptions. + * + * SDP headers always start at the beginning of a line. According to RFC 2327: + * "The sequence CRLF (0x0d0a) is used to end a record, although parsers should + * be tolerant and also accept records terminated with a single newline + * character". We handle both cases. + */ +static const struct sip_header ct_sdp_hdrs_v4[] = { + [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), + [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), +}; + +static const struct sip_header ct_sdp_hdrs_v6[] = { + [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), + [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), +}; + +/* Linear string search within SDP header values */ +static const char *ct_sdp_header_search(const char *dptr, const char *limit, + const char *needle, unsigned int len) +{ + for (limit -= len; dptr < limit; dptr++) { + if (*dptr == '\r' || *dptr == '\n') + break; + if (strncmp(dptr, needle, len) == 0) + return dptr; + } + return NULL; +} + +/* Locate a SDP header (optionally a substring within the header value), + * optionally stopping at the first occurrence of the term header, parse + * it and return the offset and length of the data we're interested in. + */ +int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sdp_header_types type, + enum sdp_header_types term, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdrs, *hdr, *thdr; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + hdrs = nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
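/* (the table is chosen per address family: an IPv4 conntrack matches
 * the "IN IP4 " owner/connection variants, an IPv6 one "IN IP6 ") */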
ct_sdp_hdrs_v4 : ct_sdp_hdrs_v6; + hdr = &hdrs[type]; + thdr = &hdrs[term]; + + for (dptr += dataoff; dptr < limit; dptr++) { + /* Find beginning of line */ + if (*dptr != '\r' && *dptr != '\n') + continue; + if (++dptr >= limit) + break; + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + break; + } + + if (term != SDP_HDR_UNSPEC && + limit - dptr >= thdr->len && + strnicmp(dptr, thdr->name, thdr->len) == 0) + break; + else if (limit - dptr >= hdr->len && + strnicmp(dptr, hdr->name, hdr->len) == 0) + dptr += hdr->len; + else + continue; + + *matchoff = dptr - start; + if (hdr->search) { + dptr = ct_sdp_header_search(dptr, limit, hdr->search, + hdr->slen); + if (!dptr) + return -1; + dptr += hdr->slen; + } + + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff = dptr - start + shift; return 1; } - DEBUGP("%s header not found.\n", hnfo->lname); return 0; } -EXPORT_SYMBOL_GPL(ct_sip_get_info); +EXPORT_SYMBOL_GPL(ct_sip_get_sdp_header); + +static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sdp_header_types type, + enum sdp_header_types term, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr) +{ + int ret; + + ret = ct_sip_get_sdp_header(ct, dptr, dataoff, datalen, type, term, + matchoff, matchlen); + if (ret <= 0) + return ret; + + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) + return -1; + return 1; +} -static int set_expected_rtp(struct sk_buff **pskb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - union nf_conntrack_address *addr, - __be16 port, - const char *dptr) +static int refresh_signalling_expectation(struct nf_conn *ct, + union nf_inet_addr *addr, + u8 proto, __be16 port, + unsigned int expires) { + struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; + struct hlist_node *next; + int found = 0; + + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { + if (exp->class != SIP_EXPECT_SIGNALLING || + !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || + exp->tuple.dst.protonum != proto || + exp->tuple.dst.u.udp.port != port) + continue; + if (!del_timer(&exp->timeout)) + continue; + exp->flags &= ~NF_CT_EXPECT_INACTIVE; + exp->timeout.expires = jiffies + expires * HZ; + add_timer(&exp->timeout); + found = 1; + break; + } + spin_unlock_bh(&nf_conntrack_expect_lock); + return found; +} + +static void flush_expectations(struct nf_conn *ct, bool media) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *next; + + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { + if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) + continue; + if (!del_timer(&exp->timeout)) + continue; + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + if (!media) + break; + } + spin_unlock_bh(&nf_conntrack_expect_lock); +} + +static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + union nf_inet_addr *daddr, __be16 port, + enum sip_expectation_classes class, + unsigned int mediaoff, unsigned int medialen) +{ + struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - int 
family = ct->tuplehash[!dir].tuple.src.l3num; + union nf_inet_addr *saddr; + struct nf_conntrack_tuple tuple; + int direct_rtp = 0, skip_expect = 0, ret = NF_DROP; + u_int16_t base_port; + __be16 rtp_port, rtcp_port; + const struct nf_nat_sip_hooks *hooks; + + saddr = NULL; + if (sip_direct_media) { + if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3)) + return NF_ACCEPT; + saddr = &ct->tuplehash[!dir].tuple.src.u3; + } + + /* We need to check whether the registration exists before attempting + * to register it since we can see the same media description multiple + * times on different connections in case multiple endpoints receive + * the same call. + * + * RTP optimization: if we find a matching media channel expectation + * and both the expectation and this connection are SNATed, we assume + * both sides can reach each other directly and use the final + * destination address from the expectation. We still need to keep + * the NATed expectations for media that might arrive from the + * outside, and additionally need to expect the direct RTP stream + * in case it passes through us even without NAT. + */ + memset(&tuple, 0, sizeof(tuple)); + if (saddr) + tuple.src.u3 = *saddr; + tuple.src.l3num = nf_ct_l3num(ct); + tuple.dst.protonum = IPPROTO_UDP; + tuple.dst.u3 = *daddr; + tuple.dst.u.udp.port = port; + + rcu_read_lock(); + do { + exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); + + if (!exp || exp->master == ct || + nfct_help(exp->master)->helper != nfct_help(ct)->helper || + exp->class != class) + break; +#ifdef CONFIG_NF_NAT_NEEDED + if (!direct_rtp && + (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) || + exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) && + ct->status & IPS_NAT_MASK) { + *daddr = exp->saved_addr; + tuple.dst.u3 = exp->saved_addr; + tuple.dst.u.udp.port = exp->saved_proto.udp.port; + direct_rtp = 1; + } else +#endif + skip_expect = 1; + } while (!skip_expect); + + base_port = ntohs(tuple.dst.u.udp.port) & ~1; + rtp_port = htons(base_port); + rtcp_port = htons(base_port + 1); + + if (direct_rtp) { + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && + !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen, + mediaoff, medialen, ntohs(rtp_port))) + goto err1; + } + + if (skip_expect) { + rcu_read_unlock(); + return NF_ACCEPT; + } + + rtp_exp = nf_ct_expect_alloc(ct); + if (rtp_exp == NULL) + goto err1; + nf_ct_expect_init(rtp_exp, class, nf_ct_l3num(ct), saddr, daddr, + IPPROTO_UDP, NULL, &rtp_port); + + rtcp_exp = nf_ct_expect_alloc(ct); + if (rtcp_exp == NULL) + goto err2; + nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr, + IPPROTO_UDP, NULL, &rtcp_port); + + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp) + ret = hooks->sdp_media(skb, protoff, dataoff, dptr, + datalen, rtp_exp, rtcp_exp, + mediaoff, medialen, daddr); + else { + if (nf_ct_expect_related(rtp_exp) == 0) { + if (nf_ct_expect_related(rtcp_exp) != 0) + nf_ct_unexpect_related(rtp_exp); + else + ret = NF_ACCEPT; + } + } + nf_ct_expect_put(rtcp_exp); +err2: + nf_ct_expect_put(rtp_exp); +err1: + rcu_read_unlock(); + return ret; +} + +static const struct sdp_media_type sdp_media_types[] = { + SDP_MEDIA_TYPE("audio ", SIP_EXPECT_AUDIO), + SDP_MEDIA_TYPE("video ", SIP_EXPECT_VIDEO), + SDP_MEDIA_TYPE("image ", SIP_EXPECT_IMAGE), +}; + +static const struct sdp_media_type *sdp_media_type(const char *dptr, + unsigned int matchoff, + unsigned int matchlen) +{ + const struct sdp_media_type *t; + unsigned int 
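/* illustrative match: in a (hypothetical) media description
 * "m=audio 49170 RTP/AVP 0", the "audio " prefix selects
 * SIP_EXPECT_AUDIO; the caller then parses the port 49170 that
 * follows the type */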
i; + + for (i = 0; i < ARRAY_SIZE(sdp_media_types); i++) { + t = &sdp_media_types[i]; + if (matchlen < t->len || + strncmp(dptr + matchoff, t->name, t->len)) + continue; + return t; + } + return NULL; +} + +static int process_sdp(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + unsigned int mediaoff, medialen; + unsigned int sdpoff; + unsigned int caddr_len, maddr_len; + unsigned int i; + union nf_inet_addr caddr, maddr, rtp_addr; + const struct nf_nat_sip_hooks *hooks; + unsigned int port; + const struct sdp_media_type *t; + int ret = NF_ACCEPT; + + hooks = rcu_dereference(nf_nat_sip_hooks); + + /* Find beginning of session description */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return NF_ACCEPT; + sdpoff = matchoff; + + /* The connection information is contained in the session description + * and/or once per media description. The first media description marks + * the end of the session description. */ + caddr_len = 0; + if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen, + SDP_HDR_CONNECTION, SDP_HDR_MEDIA, + &matchoff, &matchlen, &caddr) > 0) + caddr_len = matchlen; + + mediaoff = sdpoff; + for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) { + if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen, + SDP_HDR_MEDIA, SDP_HDR_UNSPEC, + &mediaoff, &medialen) <= 0) + break; + + /* Get media type and port number. A media port value of zero + * indicates an inactive stream. */ + t = sdp_media_type(*dptr, mediaoff, medialen); + if (!t) { + mediaoff += medialen; + continue; + } + mediaoff += t->len; + medialen -= t->len; + + port = simple_strtoul(*dptr + mediaoff, NULL, 10); + if (port == 0) + continue; + if (port < 1024 || port > 65535) { + nf_ct_helper_log(skb, ct, "wrong port %u", port); + return NF_DROP; + } + + /* The media description overrides the session description. 
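 * An illustrative fragment (hypothetical addresses):
 *
 *	c=IN IP4 192.0.2.1		session-level connection
 *	m=audio 49170 RTP/AVP 0
 *	c=IN IP4 192.0.2.99		overrides the address above,
 *					for this stream only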
*/ + maddr_len = 0; + if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen, + SDP_HDR_CONNECTION, SDP_HDR_MEDIA, + &matchoff, &matchlen, &maddr) > 0) { + maddr_len = matchlen; + memcpy(&rtp_addr, &maddr, sizeof(rtp_addr)); + } else if (caddr_len) + memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); + else { + nf_ct_helper_log(skb, ct, "cannot parse SDP message"); + return NF_DROP; + } + + ret = set_expected_rtp_rtcp(skb, protoff, dataoff, + dptr, datalen, + &rtp_addr, htons(port), t->class, + mediaoff, medialen); + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, + "cannot add expectation for voice"); + return ret; + } + + /* Update media connection address if present */ + if (maddr_len && hooks && ct->status & IPS_NAT_MASK) { + ret = hooks->sdp_addr(skb, protoff, dataoff, + dptr, datalen, mediaoff, + SDP_HDR_CONNECTION, + SDP_HDR_MEDIA, + &rtp_addr); + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, "cannot mangle SDP"); + return ret; + } + } + i++; + } + + /* Update session connection and owner addresses */ + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && ct->status & IPS_NAT_MASK) + ret = hooks->sdp_session(skb, protoff, dataoff, + dptr, datalen, sdpoff, + &rtp_addr); + + return ret; +} +static int process_invite_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + else if (ct_sip_info->invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_update_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + else if (ct_sip_info->invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_prack_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + else if (ct_sip_info->invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_invite_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + unsigned int ret; + + flush_expectations(ct, true); + ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + if (ret == NF_ACCEPT) + ct_sip_info->invite_cseq = cseq; + return ret; +} + +static int process_bye_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const 
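/* (BYE tears the dialog down: flush_expectations(ct, true) below
 * removes any pending RTP/RTCP media expectations while the
 * signalling expectation stays in place) */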
char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + flush_expectations(ct, true); + return NF_ACCEPT; +} + +/* Parse a REGISTER request and create a permanent expectation for incoming + * signalling connections. The expectation is marked inactive and is activated + * when receiving a response indicating success from the registrar. + */ +static int process_register_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int matchoff, matchlen; + struct nf_conntrack_expect *exp; + union nf_inet_addr *saddr, daddr; + const struct nf_nat_sip_hooks *hooks; + __be16 port; + u8 proto; + unsigned int expires = 0; int ret; - typeof(nf_nat_sdp_hook) nf_nat_sdp; - exp = nf_conntrack_expect_alloc(ct); - if (exp == NULL) + /* Expected connections can not register again. */ + if (ct->status & IPS_EXPECTED) + return NF_ACCEPT; + + /* We must check the expiration time: a value of zero signals the + * registrar to release the binding. We'll remove our expectation + * when receiving the new bindings in the response, but we don't + * want to create new ones. + * + * The expiration time may be contained in Expires: header, the + * Contact: header parameters or the URI parameters. + */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, + &matchoff, &matchlen) > 0) + expires = simple_strtoul(*dptr + matchoff, NULL, 10); + + ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_CONTACT, NULL, + &matchoff, &matchlen, &daddr, &port); + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse contact"); return NF_DROP; - nf_conntrack_expect_init(exp, family, - &ct->tuplehash[!dir].tuple.src.u3, addr, - IPPROTO_UDP, NULL, &port); + } else if (ret == 0) + return NF_ACCEPT; + + /* We don't support third-party registrations */ + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &daddr)) + return NF_ACCEPT; - nf_nat_sdp = rcu_dereference(nf_nat_sdp_hook); - if (nf_nat_sdp && ct->status & IPS_NAT_MASK) - ret = nf_nat_sdp(pskb, ctinfo, exp, dptr); + if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, *datalen, + &proto) == 0) + return NF_ACCEPT; + + if (ct_sip_parse_numerical_param(ct, *dptr, + matchoff + matchlen, *datalen, + "expires=", NULL, NULL, &expires) < 0) { + nf_ct_helper_log(skb, ct, "cannot parse expires"); + return NF_DROP; + } + + if (expires == 0) { + ret = NF_ACCEPT; + goto store_cseq; + } + + exp = nf_ct_expect_alloc(ct); + if (!exp) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); + return NF_DROP; + } + + saddr = NULL; + if (sip_direct_signalling) + saddr = &ct->tuplehash[!dir].tuple.src.u3; + + nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), + saddr, &daddr, proto, NULL, &port); + exp->timeout.expires = sip_timeout * HZ; + exp->helper = nfct_help(ct)->helper; + exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; + + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && ct->status & IPS_NAT_MASK) + ret = hooks->expect(skb, protoff, dataoff, dptr, datalen, + exp, matchoff, matchlen); else { - if (nf_conntrack_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - else + 
} else ret = NF_ACCEPT; } - nf_conntrack_expect_put(exp); + nf_ct_expect_put(exp); +store_cseq: + if (ret == NF_ACCEPT) + ct_sip_info->register_cseq = cseq; return ret; } -static int sip_help(struct sk_buff **pskb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +static int process_register_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) { - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - union nf_conntrack_address addr; - unsigned int dataoff, datalen; - const char *dptr; - int ret = NF_ACCEPT; - int matchoff, matchlen; - u_int16_t port; - enum sip_header_pos pos; - typeof(nf_nat_sip_hook) nf_nat_sip; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + union nf_inet_addr addr; + __be16 port; + u8 proto; + unsigned int matchoff, matchlen, coff = 0; + unsigned int expires = 0; + int in_contact = 0, ret; - /* No Data ? */ - dataoff = protoff + sizeof(struct udphdr); - if (dataoff >= (*pskb)->len) + /* According to RFC 3261, "UAs MUST NOT send a new registration until + * they have received a final response from the registrar for the + * previous one or the previous REGISTER request has timed out". + * + * However, some servers fail to detect retransmissions and send late + * responses, so we store the sequence number of the last valid + * request and compare it here. + */ + if (ct_sip_info->register_cseq != cseq) + return NF_ACCEPT; + + if (code >= 100 && code <= 199) return NF_ACCEPT; + if (code < 200 || code > 299) + goto flush; - nf_ct_refresh(ct, *pskb, sip_timeout * HZ); + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, + &matchoff, &matchlen) > 0) + expires = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!skb_is_nonlinear(*pskb)) - dptr = (*pskb)->data + dataoff; - else { - DEBUGP("Copy of skbuff not supported yet.\n"); - goto out; - } + while (1) { + unsigned int c_expires = expires; - nf_nat_sip = rcu_dereference(nf_nat_sip_hook); - if (nf_nat_sip && ct->status & IPS_NAT_MASK) { - if (!nf_nat_sip(pskb, ctinfo, ct, &dptr)) { - ret = NF_DROP; - goto out; + ret = ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, + SIP_HDR_CONTACT, &in_contact, + &matchoff, &matchlen, + &addr, &port); + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse contact"); + return NF_DROP; + } else if (ret == 0) + break; + + /* We don't support third-party registrations */ + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &addr)) + continue; + + if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, + *datalen, &proto) == 0) + continue; + + ret = ct_sip_parse_numerical_param(ct, *dptr, + matchoff + matchlen, + *datalen, "expires=", + NULL, NULL, &c_expires); + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse expires"); + return NF_DROP; } + if (c_expires == 0) + break; + if (refresh_signalling_expectation(ct, &addr, proto, port, + c_expires)) + return NF_ACCEPT; } - datalen = (*pskb)->len - dataoff; - if (datalen < sizeof("SIP/2.0 200") - 1) - goto out; +flush: + flush_expectations(ct, false); + return NF_ACCEPT; +} + +static const struct sip_handler sip_handlers[] = { + SIP_HANDLER("INVITE", process_invite_request, process_invite_response), + SIP_HANDLER("UPDATE", process_sdp, process_update_response), + SIP_HANDLER("ACK", process_sdp, NULL), + SIP_HANDLER("PRACK", 
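/* (dispatch sketch: requests are matched on the method at the start
 * of the message, responses on the method echoed after the sequence
 * number in CSeq, e.g. a hypothetical "CSeq: 1 PRACK") */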
process_sdp, process_prack_response), + SIP_HANDLER("BYE", process_bye_request, NULL), + SIP_HANDLER("REGISTER", process_register_request, process_register_response), +}; + +static int process_sip_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen, matchend; + unsigned int code, cseq, i; + + if (*datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); + if (!code) { + nf_ct_helper_log(skb, ct, "cannot get code"); + return NF_DROP; + } + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, + &matchoff, &matchlen) <= 0) { + nf_ct_helper_log(skb, ct, "cannot parse cseq"); + return NF_DROP; + } + cseq = simple_strtoul(*dptr + matchoff, NULL, 10); + if (!cseq) { + nf_ct_helper_log(skb, ct, "cannot get cseq"); + return NF_DROP; + } + matchend = matchoff + matchlen + 1; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + + handler = &sip_handlers[i]; + if (handler->response == NULL) + continue; + if (*datalen < matchend + handler->len || + strnicmp(*dptr + matchend, handler->method, handler->len)) + continue; + return handler->response(skb, protoff, dataoff, dptr, datalen, + cseq, code); + } + return NF_ACCEPT; +} + +static int process_sip_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int matchoff, matchlen; + unsigned int cseq, i; + union nf_inet_addr addr; + __be16 port; + + /* Many Cisco IP phones use a high source port for SIP requests, but + * listen for the response on port 5060. If we are the local + * router for one of these phones, save the port number from the + * Via: header so that nf_nat_sip can redirect the responses to + * the correct port. + */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_VIA_UDP, NULL, &matchoff, + &matchlen, &addr, &port) > 0 && + port != ct->tuplehash[dir].tuple.src.u.udp.port && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3)) + ct_sip_info->forced_dport = port; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + + handler = &sip_handlers[i]; + if (handler->request == NULL) + continue; + if (*datalen < handler->len || + strnicmp(*dptr, handler->method, handler->len)) + continue; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, + &matchoff, &matchlen) <= 0) { + nf_ct_helper_log(skb, ct, "cannot parse cseq"); + return NF_DROP; + } + cseq = simple_strtoul(*dptr + matchoff, NULL, 10); + if (!cseq) { + nf_ct_helper_log(skb, ct, "cannot get cseq"); + return NF_DROP; + } - /* RTP info only in some SDP pkts */ - if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && - memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { - goto out; + return handler->request(skb, protoff, dataoff, dptr, datalen, + cseq); } - /* Get address and port from SDP packet. */ - pos = family == AF_INET ? 
POS_CONNECTION_IP4 : POS_CONNECTION_IP6; - if (ct_sip_get_info(ct, dptr, datalen, &matchoff, &matchlen, pos) > 0) { + return NF_ACCEPT; +} + +static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct, + unsigned int protoff, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + const struct nf_nat_sip_hooks *hooks; + int ret; + + if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0) + ret = process_sip_request(skb, protoff, dataoff, dptr, datalen); + else + ret = process_sip_response(skb, protoff, dataoff, dptr, datalen); - /* We'll drop only if there are parse problems. */ - if (!parse_addr(ct, dptr + matchoff, NULL, &addr, - dptr + datalen)) { + if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && !hooks->msg(skb, protoff, dataoff, + dptr, datalen)) { + nf_ct_helper_log(skb, ct, "cannot NAT SIP message"); ret = NF_DROP; - goto out; } - if (ct_sip_get_info(ct, dptr, datalen, &matchoff, &matchlen, - POS_MEDIA) > 0) { + } + + return ret; +} + +static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + struct tcphdr *th, _tcph; + unsigned int dataoff, datalen; + unsigned int matchoff, matchlen, clen; + unsigned int msglen, origlen; + const char *dptr, *end; + s16 diff, tdiff = 0; + int ret = NF_ACCEPT; + bool term; + + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; - port = simple_strtoul(dptr + matchoff, NULL, 10); - if (port < 1024) { - ret = NF_DROP; - goto out; + /* No Data ? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + dataoff = protoff + th->doff * 4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + nf_ct_refresh(ct, skb, sip_timeout * HZ); + + if (unlikely(skb_linearize(skb))) + return NF_DROP; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + if (datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + + while (1) { + if (ct_sip_get_header(ct, dptr, 0, datalen, + SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) + break; + + clen = simple_strtoul(dptr + matchoff, (char **)&end, 10); + if (dptr + matchoff == end) + break; + + term = false; + for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) { + if (end[0] == '\r' && end[1] == '\n' && + end[2] == '\r' && end[3] == '\n') { + term = true; + break; } - ret = set_expected_rtp(pskb, ct, ctinfo, &addr, - htons(port), dptr); } + if (!term) + break; + end += strlen("\r\n\r\n") + clen; + + msglen = origlen = end - dptr; + if (msglen > datalen) + return NF_ACCEPT; + + ret = process_sip_msg(skb, ct, protoff, dataoff, + &dptr, &msglen); + /* process_sip_* functions report why this packet is dropped */ + if (ret != NF_ACCEPT) + break; + diff = msglen - origlen; + tdiff += diff; + + dataoff += msglen; + dptr += msglen; + datalen = datalen + diff - msglen; + } + + if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { + const struct nf_nat_sip_hooks *hooks; + + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks) + hooks->seq_adjust(skb, protoff, tdiff); } -out: + return ret; } -static struct nf_conntrack_helper sip[MAX_PORTS][2] __read_mostly; -static char sip_names[MAX_PORTS][2][sizeof("sip-65535")] __read_mostly; +static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + + /* No Data ? 
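 * (the SIP payload begins right after the 8-byte UDP header; an
 * empty datagram has nothing to parse)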
*/ + dataoff = protoff + sizeof(struct udphdr); + if (dataoff >= skb->len) + return NF_ACCEPT; + + nf_ct_refresh(ct, skb, sip_timeout * HZ); + + if (unlikely(skb_linearize(skb))) + return NF_DROP; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + if (datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + + return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen); +} + +static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; + +static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { + [SIP_EXPECT_SIGNALLING] = { + .name = "signalling", + .max_expected = 1, + .timeout = 3 * 60, + }, + [SIP_EXPECT_AUDIO] = { + .name = "audio", + .max_expected = 2 * IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, + [SIP_EXPECT_VIDEO] = { + .name = "video", + .max_expected = 2 * IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, + [SIP_EXPECT_IMAGE] = { + .name = "image", + .max_expected = IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, +}; static void nf_conntrack_sip_fini(void) { int i, j; for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { + for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { if (sip[i][j].me == NULL) continue; nf_conntrack_helper_unregister(&sip[i][j]); @@ -490,7 +1628,6 @@ static void nf_conntrack_sip_fini(void) static int __init nf_conntrack_sip_init(void) { int i, j, ret; - char *tmpname; if (ports_c == 0) ports[ports_c++] = SIP_PORT; @@ -499,31 +1636,37 @@ static int __init nf_conntrack_sip_init(void) memset(&sip[i], 0, sizeof(sip[i])); sip[i][0].tuple.src.l3num = AF_INET; - sip[i][1].tuple.src.l3num = AF_INET6; - for (j = 0; j < 2; j++) { - sip[i][j].tuple.dst.protonum = IPPROTO_UDP; + sip[i][0].tuple.dst.protonum = IPPROTO_UDP; + sip[i][0].help = sip_help_udp; + sip[i][1].tuple.src.l3num = AF_INET; + sip[i][1].tuple.dst.protonum = IPPROTO_TCP; + sip[i][1].help = sip_help_tcp; + + sip[i][2].tuple.src.l3num = AF_INET6; + sip[i][2].tuple.dst.protonum = IPPROTO_UDP; + sip[i][2].help = sip_help_udp; + sip[i][3].tuple.src.l3num = AF_INET6; + sip[i][3].tuple.dst.protonum = IPPROTO_TCP; + sip[i][3].help = sip_help_tcp; + + for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { + sip[i][j].data_len = sizeof(struct nf_ct_sip_master); sip[i][j].tuple.src.u.udp.port = htons(ports[i]); - sip[i][j].mask.src.l3num = 0xFFFF; - sip[i][j].mask.src.u.udp.port = htons(0xFFFF); - sip[i][j].mask.dst.protonum = 0xFF; - sip[i][j].max_expected = 2; - sip[i][j].timeout = 3 * 60; /* 3 minutes */ + sip[i][j].expect_policy = sip_exp_policy; + sip[i][j].expect_class_max = SIP_EXPECT_MAX; sip[i][j].me = THIS_MODULE; - sip[i][j].help = sip_help; - tmpname = &sip_names[i][j][0]; if (ports[i] == SIP_PORT) - sprintf(tmpname, "sip"); + sprintf(sip[i][j].name, "sip"); else - sprintf(tmpname, "sip-%u", i); - sip[i][j].name = tmpname; + sprintf(sip[i][j].name, "sip-%u", i); - DEBUGP("port #%u: %u\n", i, ports[i]); + pr_debug("port #%u: %u\n", i, ports[i]); ret = nf_conntrack_helper_register(&sip[i][j]); if (ret) { - printk("nf_ct_sip: failed to register helper " - "for pf: %u port: %u\n", + printk(KERN_ERR "nf_ct_sip: failed to register" + " helper for pf: %u port: %u\n", sip[i][j].tuple.src.l3num, ports[i]); nf_conntrack_sip_fini(); return ret; diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c new file mode 100644 index 00000000000..87b95a2c270 --- /dev/null +++ b/net/netfilter/nf_conntrack_snmp.c @@ -0,0 +1,78 @@ +/* + * SNMP service broadcast connection tracking helper + * + * (c) 2011 Jiri Olsa <jolsa@redhat.com> + * + * This program is free software; you 
can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/in.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_snmp.h> + +#define SNMP_PORT 161 + +MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>"); +MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFCT_HELPER("snmp"); + +static unsigned int timeout __read_mostly = 30; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); + +static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + typeof(nf_nat_snmp_hook) nf_nat_snmp; + + nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + + nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); + if (nf_nat_snmp && ct->status & IPS_NAT_MASK) + return nf_nat_snmp(skb, protoff, ct, ctinfo); + + return NF_ACCEPT; +} + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "snmp", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = snmp_conntrack_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_snmp_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_snmp_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_snmp_init); +module_exit(nf_conntrack_snmp_fini); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 45baeb0e30f..f641751dba9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -8,12 +9,15 @@ #include <linux/types.h> #include <linux/netfilter.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> +#include <linux/security.h> +#include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -24,71 +28,67 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
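/* (for reference: the DEBUGP() wrappers removed here, as in the other
 * helpers, are superseded by pr_debug(), which with CONFIG_DYNAMIC_DEBUG
 * can be enabled at run time, e.g.:
 *	echo 'module nf_conntrack +p' > \
 *		/sys/kernel/debug/dynamic_debug/control
 * assuming debugfs is mounted at the usual location) */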
-#endif +#include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <linux/rculist_nulls.h> MODULE_LICENSE("GPL"); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_NF_CONNTRACK_PROCFS int print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) { return l3proto->print_tuple(s, tuple) || l4proto->print_tuple(s, tuple); } EXPORT_SYMBOL_GPL(print_tuple); -#ifdef CONFIG_NF_CT_ACCT -static unsigned int -seq_print_counters(struct seq_file *s, - const struct ip_conntrack_counter *counter) -{ - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)counter->packets, - (unsigned long long)counter->bytes); -} -#else -#define seq_print_counters(x, y) 0 -#endif - struct ct_iter_state { + struct seq_net_private p; unsigned int bucket; + u_int64_t time_now; }; -static struct list_head *ct_get_first(struct seq_file *seq) +static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; + struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < net->ct.htable_size; st->bucket++) { - if (!list_empty(&nf_conntrack_hash[st->bucket])) - return nf_conntrack_hash[st->bucket].next; + n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + if (!is_a_nulls(n)) + return n; } return NULL; } -static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, + struct hlist_nulls_node *head) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = head->next; - while (head == &nf_conntrack_hash[st->bucket]) { - if (++st->bucket >= nf_conntrack_htable_size) - return NULL; - head = nf_conntrack_hash[st->bucket].next; + head = rcu_dereference(hlist_nulls_next_rcu(head)); + while (is_a_nulls(head)) { + if (likely(get_nulls_value(head) == st->bucket)) { + if (++st->bucket >= net->ct.htable_size) + return NULL; + } + head = rcu_dereference( + hlist_nulls_first_rcu( + &net->ct.hash[st->bucket])); } return head; } -static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) { - struct list_head *head = ct_get_first(seq); + struct hlist_nulls_node *head = ct_get_first(seq); if (head) while (pos && (head = ct_get_next(seq, head))) @@ -97,8 +97,12 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { - read_lock_bh(&nf_conntrack_lock); + struct ct_iter_state *st = seq->private; + + st->time_now = ktime_to_ns(ktime_get_real()); + rcu_read_lock(); return ct_get_idx(seq, *pos); } @@ -109,88 +113,143 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) } static void ct_seq_stop(struct seq_file *s, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} 
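/* With the helper above, a conntrack listing gains a secctx= field;
 * an illustrative /proc/net/nf_conntrack line (all values made up):
 *
 *	ipv4 2 tcp 6 431999 ESTABLISHED src=10.0.0.1 dst=10.0.0.2
 *	sport=33000 dport=22 ... secctx=system_u:object_r:unlabeled_t:s0
 *	use=1
 */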
+#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + struct ct_iter_state *st = s->private; + struct nf_conn_tstamp *tstamp; + s64 delta_time; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + delta_time = st->time_now - tstamp->start; + if (delta_time > 0) + delta_time = div_s64(delta_time, NSEC_PER_SEC); + else + delta_time = 0; + + return seq_printf(s, "delta-time=%llu ", + (unsigned long long)delta_time); + } + return 0; +} +#else +static inline int +ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { - read_unlock_bh(&nf_conntrack_lock); + return 0; } +#endif /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { - const struct nf_conntrack_tuple_hash *hash = v; - const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash); - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple_hash *hash = v; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + int ret = 0; - NF_CT_ASSERT(conntrack); + NF_CT_ASSERT(ct); + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) - return 0; - - l3proto = __nf_ct_l3proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num); + goto release; + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); NF_CT_ASSERT(l3proto); - l4proto = __nf_ct_l4proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); NF_CT_ASSERT(l4proto); + ret = -ENOSPC; if (seq_printf(s, "%-8s %u %-8s %u %ld ", - l3proto->name, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, - l4proto->name, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, - timer_pending(&conntrack->timeout) - ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) - return -ENOSPC; + l3proto->name, nf_ct_l3num(ct), + l4proto->name, nf_ct_protonum(ct), + timer_pending(&ct->timeout) + ? 
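/* seconds left on the conntrack timer, or 0 once it has fired: */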
(long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) + goto release; - if (l3proto->print_conntrack(s, conntrack)) - return -ENOSPC; + if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) + goto release; - if (l4proto->print_conntrack(s, conntrack)) - return -ENOSPC; - - if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l3proto, l4proto)) - return -ENOSPC; + goto release; - if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) - return -ENOSPC; + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) + goto release; - if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) if (seq_printf(s, "[UNREPLIED] ")) - return -ENOSPC; + goto release; - if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto)) - return -ENOSPC; + goto release; - if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) - return -ENOSPC; + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) + goto release; - if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (test_bit(IPS_ASSURED_BIT, &ct->status)) if (seq_printf(s, "[ASSURED] ")) - return -ENOSPC; + goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%u ", conntrack->mark)) - return -ENOSPC; + if (seq_printf(s, "mark=%u ", ct->mark)) + goto release; #endif -#ifdef CONFIG_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", conntrack->secmark)) - return -ENOSPC; + if (ct_show_secctx(s, ct)) + goto release; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) + goto release; #endif - if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) - return -ENOSPC; - - return 0; + if (ct_show_delta_time(s, ct)) + goto release; + + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + goto release; + + ret = 0; +release: + nf_ct_put(ct); + return ret; } -static struct seq_operations ct_seq_ops = { +static const struct seq_operations ct_seq_ops = { .start = ct_seq_start, .next = ct_seq_next, .stop = ct_seq_stop, @@ -199,23 +258,8 @@ static struct seq_operations ct_seq_ops = { static int ct_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct ct_iter_state *st; - int ret; - - st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); - if (st == NULL) - return -ENOMEM; - ret = seq_open(file, &ct_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; - seq->private = st; - memset(st, 0, sizeof(struct ct_iter_state)); - return ret; -out_free: - kfree(st); - return ret; + return seq_open_net(inode, file, &ct_seq_ops, + sizeof(struct ct_iter_state)); } static const struct file_operations ct_file_ops = { @@ -223,21 +267,22 @@ static const struct file_operations ct_file_ops = { .open = ct_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) return SEQ_START_TOKEN; - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -245,13 +290,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) static void *ct_cpu_seq_next(struct seq_file *seq, 
void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -263,16 +309,17 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); - struct ip_conntrack_stat *st = v; + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); + const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " - "%08x %08x %08x %08x %08x %08x %08x %08x \n", + "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, st->searched, st->found, @@ -289,12 +336,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->expect_new, st->expect_create, - st->expect_delete + st->expect_delete, + st->search_restart ); return 0; } -static struct seq_operations ct_cpu_seq_ops = { +static const struct seq_operations ct_cpu_seq_ops = { .start = ct_cpu_seq_start, .next = ct_cpu_seq_next, .stop = ct_cpu_seq_stop, @@ -303,7 +351,8 @@ static struct seq_operations ct_cpu_seq_ops = { static int ct_cpu_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &ct_cpu_seq_ops); + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ct_cpu_seq_fops = { @@ -311,164 +360,249 @@ static const struct file_operations ct_cpu_seq_fops = { .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -#endif /* CONFIG_PROC_FS */ -/* Sysctl support */ +static int nf_conntrack_standalone_init_proc(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops); + if (!pde) + goto out_nf_conntrack; + + pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, + &ct_cpu_seq_fops); + if (!pde) + goto out_stat_nf_conntrack; + return 0; -int nf_conntrack_checksum __read_mostly = 1; -EXPORT_SYMBOL_GPL(nf_conntrack_checksum); +out_stat_nf_conntrack: + remove_proc_entry("nf_conntrack", net->proc_net); +out_nf_conntrack: + return -ENOMEM; +} + +static void nf_conntrack_standalone_fini_proc(struct net *net) +{ + remove_proc_entry("nf_conntrack", net->proc_net_stat); + remove_proc_entry("nf_conntrack", net->proc_net); +} +#else +static int nf_conntrack_standalone_init_proc(struct net *net) +{ + return 0; +} + +static void nf_conntrack_standalone_fini_proc(struct net *net) +{ +} +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ + +/* Sysctl support */ #ifdef CONFIG_SYSCTL /* Log invalid packets of a given protocol */ static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -static struct ctl_table_header *nf_ct_sysctl_header; +static struct ctl_table_header *nf_ct_netfilter_header; -static ctl_table nf_ct_sysctl_table[] = { +static struct ctl_table nf_ct_sysctl_table[] 
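/* (the data pointers below are template values; entries 1-4 are
 * re-pointed at the per-namespace counterparts in
 * nf_conntrack_standalone_init_sysctl() before registration) */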
= { { - .ctl_name = NET_NF_CONNTRACK_MAX, .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_COUNT, .procname = "nf_conntrack_count", - .data = &nf_conntrack_count, + .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_BUCKETS, .procname = "nf_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_CHECKSUM, .procname = "nf_conntrack_checksum", - .data = &nf_conntrack_checksum, + .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, .procname = "nf_conntrack_log_invalid", - .data = &nf_ct_log_invalid, + .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &log_invalid_proto_min, .extra2 = &log_invalid_proto_max, }, - - { .ctl_name = 0 } + { + .procname = "nf_conntrack_expect_max", + .data = &nf_ct_expect_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } }; #define NET_NF_CONNTRACK_MAX 2089 -static ctl_table nf_ct_netfilter_table[] = { - { - .ctl_name = NET_NETFILTER, - .procname = "netfilter", - .mode = 0555, - .child = nf_ct_sysctl_table, - }, +static struct ctl_table nf_ct_netfilter_table[] = { { - .ctl_name = NET_NF_CONNTRACK_MAX, .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; -static ctl_table nf_ct_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nf_ct_netfilter_table, - }, - { .ctl_name = 0 } -}; -EXPORT_SYMBOL_GPL(nf_ct_log_invalid); +static int nf_conntrack_standalone_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), + GFP_KERNEL); + if (!table) + goto out_kmemdup; + + table[1].data = &net->ct.count; + table[2].data = &net->ct.htable_size; + table[3].data = &net->ct.sysctl_checksum; + table[4].data = &net->ct.sysctl_log_invalid; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); + if (!net->ct.sysctl_header) + goto out_unregister_netfilter; + + return 0; + +out_unregister_netfilter: + kfree(table); +out_kmemdup: + return -ENOMEM; +} + +static void nf_conntrack_standalone_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_standalone_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_standalone_fini_sysctl(struct net *net) +{ +} #endif /* CONFIG_SYSCTL */ -static int __init nf_conntrack_standalone_init(void) +static int nf_conntrack_pernet_init(struct net *net) { -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc, 
*proc_exp, *proc_stat; -#endif - int ret = 0; + int ret; - ret = nf_conntrack_init(); + ret = nf_conntrack_init_net(net); if (ret < 0) - return ret; + goto out_init; -#ifdef CONFIG_PROC_FS - proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops); - if (!proc) goto cleanup_init; + ret = nf_conntrack_standalone_init_proc(net); + if (ret < 0) + goto out_proc; - proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440, - &exp_file_ops); - if (!proc_exp) goto cleanup_proc; + net->ct.sysctl_checksum = 1; + net->ct.sysctl_log_invalid = 0; + ret = nf_conntrack_standalone_init_sysctl(net); + if (ret < 0) + goto out_sysctl; - proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat); - if (!proc_stat) - goto cleanup_proc_exp; + return 0; + +out_sysctl: + nf_conntrack_standalone_fini_proc(net); +out_proc: + nf_conntrack_cleanup_net(net); +out_init: + return ret; +} + +static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + nf_conntrack_standalone_fini_sysctl(net); + nf_conntrack_standalone_fini_proc(net); + } + nf_conntrack_cleanup_net_list(net_exit_list); +} + +static struct pernet_operations nf_conntrack_net_ops = { + .init = nf_conntrack_pernet_init, + .exit_batch = nf_conntrack_pernet_exit, +}; + +static int __init nf_conntrack_standalone_init(void) +{ + int ret = nf_conntrack_init_start(); + if (ret < 0) + goto out_start; - proc_stat->proc_fops = &ct_cpu_seq_fops; - proc_stat->owner = THIS_MODULE; -#endif #ifdef CONFIG_SYSCTL - nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table); - if (nf_ct_sysctl_header == NULL) { - printk("nf_conntrack: can't register to sysctl.\n"); + nf_ct_netfilter_header = + register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); + if (!nf_ct_netfilter_header) { + pr_err("nf_conntrack: can't register to sysctl.\n"); ret = -ENOMEM; - goto cleanup_proc_stat; + goto out_sysctl; } #endif - return ret; + ret = register_pernet_subsys(&nf_conntrack_net_ops); + if (ret < 0) + goto out_pernet; + + nf_conntrack_init_end(); + return 0; + +out_pernet: #ifdef CONFIG_SYSCTL - cleanup_proc_stat: + unregister_net_sysctl_table(nf_ct_netfilter_header); +out_sysctl: #endif -#ifdef CONFIG_PROC_FS - remove_proc_entry("nf_conntrack", proc_net_stat); - cleanup_proc_exp: - proc_net_remove("nf_conntrack_expect"); - cleanup_proc: - proc_net_remove("nf_conntrack"); - cleanup_init: -#endif /* CNFIG_PROC_FS */ - nf_conntrack_cleanup(); + nf_conntrack_cleanup_end(); +out_start: return ret; } static void __exit nf_conntrack_standalone_fini(void) { + nf_conntrack_cleanup_start(); + unregister_pernet_subsys(&nf_conntrack_net_ops); #ifdef CONFIG_SYSCTL - unregister_sysctl_table(nf_ct_sysctl_header); + unregister_net_sysctl_table(nf_ct_netfilter_header); #endif -#ifdef CONFIG_PROC_FS - remove_proc_entry("nf_conntrack", proc_net_stat); - proc_net_remove("nf_conntrack_expect"); - proc_net_remove("nf_conntrack"); -#endif /* CNFIG_PROC_FS */ - nf_conntrack_cleanup(); + nf_conntrack_cleanup_end(); } module_init(nf_conntrack_standalone_init); diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 37c4542e311..e68ab4fbd71 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -1,5 +1,5 @@ /* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public 
License version 2 as * published by the Free Software Foundation. @@ -22,38 +22,32 @@ MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); MODULE_DESCRIPTION("TFTP connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_tftp"); +MODULE_ALIAS_NFCT_HELPER("tftp"); #define MAX_PORTS 8 static unsigned short ports[MAX_PORTS]; -static int ports_c; +static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "Port numbers of TFTP servers"); -#if 0 -#define DEBUGP(format, args...) printk("%s:%s:" format, \ - __FILE__, __FUNCTION__ , ## args) -#else -#define DEBUGP(format, args...) -#endif - -unsigned int (*nf_nat_tftp_hook)(struct sk_buff **pskb, +unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, struct nf_conntrack_expect *exp) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_tftp_hook); -static int tftp_help(struct sk_buff **pskb, +static int tftp_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - struct tftphdr _tftph, *tfh; + const struct tftphdr *tfh; + struct tftphdr _tftph; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple *tuple; unsigned int ret = NF_ACCEPT; - int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; typeof(nf_nat_tftp_hook) nf_nat_tftp; - tfh = skb_header_pointer(*pskb, protoff + sizeof(struct udphdr), + tfh = skb_header_pointer(skb, protoff + sizeof(struct udphdr), sizeof(_tftph), &_tftph); if (tfh == NULL) return NF_ACCEPT; @@ -62,45 +56,51 @@ static int tftp_help(struct sk_buff **pskb, case TFTP_OPCODE_READ: case TFTP_OPCODE_WRITE: /* RRQ and WRQ works the same way */ - DEBUGP(""); - NF_CT_DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - NF_CT_DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - exp = nf_conntrack_expect_alloc(ct); - if (exp == NULL) + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); return NF_DROP; + } tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - nf_conntrack_expect_init(exp, family, - &tuple->src.u3, &tuple->dst.u3, - IPPROTO_UDP, - NULL, &tuple->dst.u.udp.port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_UDP, NULL, &tuple->dst.u.udp.port); - DEBUGP("expect: "); - NF_CT_DUMP_TUPLE(&exp->tuple); - NF_CT_DUMP_TUPLE(&exp->mask); + pr_debug("expect: "); + nf_ct_dump_tuple(&exp->tuple); nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook); if (nf_nat_tftp && ct->status & IPS_NAT_MASK) - ret = nf_nat_tftp(pskb, ctinfo, exp); - else if (nf_conntrack_expect_related(exp) != 0) + ret = nf_nat_tftp(skb, ctinfo, exp); + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - nf_conntrack_expect_put(exp); + } + nf_ct_expect_put(exp); break; case TFTP_OPCODE_DATA: case TFTP_OPCODE_ACK: - DEBUGP("Data/ACK opcode\n"); + pr_debug("Data/ACK opcode\n"); break; case TFTP_OPCODE_ERROR: - DEBUGP("Error opcode\n"); + pr_debug("Error opcode\n"); break; default: - DEBUGP("Unknown opcode\n"); + pr_debug("Unknown opcode\n"); } return ret; } static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; -static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy tftp_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; static void 
nf_conntrack_tftp_fini(void) { @@ -115,7 +115,6 @@ static void nf_conntrack_tftp_fini(void) static int __init nf_conntrack_tftp_init(void) { int i, j, ret; - char *tmpname; if (ports_c == 0) ports[ports_c++] = TFTP_PORT; @@ -128,25 +127,19 @@ static int __init nf_conntrack_tftp_init(void) for (j = 0; j < 2; j++) { tftp[i][j].tuple.dst.protonum = IPPROTO_UDP; tftp[i][j].tuple.src.u.udp.port = htons(ports[i]); - tftp[i][j].mask.src.l3num = 0xFFFF; - tftp[i][j].mask.dst.protonum = 0xFF; - tftp[i][j].mask.src.u.udp.port = htons(0xFFFF); - tftp[i][j].max_expected = 1; - tftp[i][j].timeout = 5 * 60; /* 5 minutes */ + tftp[i][j].expect_policy = &tftp_exp_policy; tftp[i][j].me = THIS_MODULE; tftp[i][j].help = tftp_help; - tmpname = &tftp_names[i][j][0]; if (ports[i] == TFTP_PORT) - sprintf(tmpname, "tftp"); + sprintf(tftp[i][j].name, "tftp"); else - sprintf(tmpname, "tftp-%u", i); - tftp[i][j].name = tmpname; + sprintf(tftp[i][j].name, "tftp-%u", i); ret = nf_conntrack_helper_register(&tftp[i][j]); if (ret) { - printk("nf_ct_tftp: failed to register helper " - "for pf: %u port: %u\n", + printk(KERN_ERR "nf_ct_tftp: failed to register" + " helper for pf: %u port: %u\n", tftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_tftp_fini(); return ret; diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c new file mode 100644 index 00000000000..93da609d9d2 --- /dev/null +++ b/net/netfilter/nf_conntrack_timeout.c @@ -0,0 +1,51 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +struct ctnl_timeout * +(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); + +void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); + +static struct nf_ct_ext_type timeout_extend __read_mostly = { + .len = sizeof(struct nf_conn_timeout), + .align = __alignof__(struct nf_conn_timeout), + .id = NF_CT_EXT_TIMEOUT, +}; + +int nf_conntrack_timeout_init(void) +{ + int ret = nf_ct_extend_register(&timeout_extend); + if (ret < 0) + pr_err("nf_ct_timeout: Unable to register timeout extension.\n"); + return ret; +} + +void nf_conntrack_timeout_fini(void) +{ + nf_ct_extend_unregister(&timeout_extend); +} diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c new file mode 100644 index 00000000000..7a394df0deb --- /dev/null +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -0,0 +1,114 @@ +/* + * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
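The pair of hooks exported by nf_conntrack_timeout.c above is the whole contract between the conntrack core and the (possibly modular) nfnetlink_cttimeout frontend: the module installs the pointers at load time, and every other caller dereferences them under RCU and tolerates NULL. A minimal consumer sketch, with an invented policy name ("my-policy" is illustrative, not from the patch):

	struct ctnl_timeout *timeout = NULL;
	typeof(nf_ct_timeout_find_get_hook) find_get;

	rcu_read_lock();
	find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
	if (find_get != NULL)
		timeout = find_get("my-policy");	/* takes a reference */
	rcu_read_unlock();
	/* ...use it, then drop the reference the same way through
	 * nf_ct_timeout_put_hook... */

The same publish/consume pattern recurs throughout this series (nf_nat_tftp_hook, nf_nat_amanda_hook, nf_nat_ftp_hook): RCU_INIT_POINTER at module init, clear plus synchronize_rcu() at module exit.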
+ */ + +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timestamp.h> + +static bool nf_ct_tstamp __read_mostly; + +module_param_named(tstamp, nf_ct_tstamp, bool, 0644); +MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table tstamp_sysctl_table[] = { + { + .procname = "nf_conntrack_timestamp", + .data = &init_net.ct.sysctl_tstamp, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type tstamp_extend __read_mostly = { + .len = sizeof(struct nf_conn_tstamp), + .align = __alignof__(struct nf_conn_tstamp), + .id = NF_CT_EXT_TSTAMP, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_tstamp; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter", + table); + if (!net->ct.tstamp_sysctl_header) { + printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.tstamp_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_tstamp_pernet_init(struct net *net) +{ + net->ct.sysctl_tstamp = nf_ct_tstamp; + return nf_conntrack_tstamp_init_sysctl(net); +} + +void nf_conntrack_tstamp_pernet_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); +} + +int nf_conntrack_tstamp_init(void) +{ + int ret; + ret = nf_ct_extend_register(&tstamp_extend); + if (ret < 0) + pr_err("nf_ct_tstamp: Unable to register extension\n"); + return ret; +} + +void nf_conntrack_tstamp_fini(void) +{ + nf_ct_extend_unregister(&tstamp_extend); +} diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 0df7fff196a..61a3c927e63 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -6,33 +6,27 @@ #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_DEBUG -#define NFDEBUG(format, args...) printk(format , ## args) +#define NFDEBUG(format, args...) printk(KERN_DEBUG format , ## args) #else #define NFDEBUG(format, args...) 
#endif /* core.c */ -extern unsigned int nf_iterate(struct list_head *head, - struct sk_buff **skb, - int hook, - const struct net_device *indev, - const struct net_device *outdev, - struct list_head **i, - int (*okfn)(struct sk_buff *), - int hook_thresh); +unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, + unsigned int hook, const struct net_device *indev, + const struct net_device *outdev, + struct nf_hook_ops **elemp, + int (*okfn)(struct sk_buff *), int hook_thresh); /* nf_queue.c */ -extern int nf_queue(struct sk_buff *skb, - struct list_head *elem, - int pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum); -extern int __init netfilter_queue_init(void); +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, + unsigned int hook, struct net_device *indev, + struct net_device *outdev, int (*okfn)(struct sk_buff *), + unsigned int queuenum); +int __init netfilter_queue_init(void); /* nf_log.c */ -extern int __init netfilter_log_init(void); +int __init netfilter_log_init(void); #endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 91b220cf5a1..85296d4eac0 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -6,96 +6,149 @@ #include <linux/netfilter.h> #include <linux/seq_file.h> #include <net/protocol.h> +#include <net/netfilter/nf_log.h> #include "nf_internals.h" -/* Internal logging interface, which relies on the real +/* Internal logging interface, which relies on the real LOG target modules */ #define NF_LOG_PREFIXLEN 128 +#define NFLOGGER_NAME_LEN 64 -static struct nf_logger *nf_loggers[NPROTO]; +static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); -/* return EBUSY if somebody else is registered, EEXIST if the same logger - * is registred, 0 on success. */ -int nf_log_register(int pf, struct nf_logger *logger) +static struct nf_logger *__find_logger(int pf, const char *str_logger) { - int ret; + struct nf_logger *t; - if (pf >= NPROTO) - return -EINVAL; + list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) { + if (!strnicmp(str_logger, t->name, strlen(t->name))) + return t; + } - /* Any setup of logging members must be done before - * substituting pointer. */ - ret = mutex_lock_interruptible(&nf_log_mutex); - if (ret < 0) - return ret; + return NULL; +} - if (!nf_loggers[pf]) - rcu_assign_pointer(nf_loggers[pf], logger); - else if (nf_loggers[pf] == logger) - ret = -EEXIST; - else - ret = -EBUSY; +void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +{ + const struct nf_logger *log; + + if (pf == NFPROTO_UNSPEC) + return; + + mutex_lock(&nf_log_mutex); + log = rcu_dereference_protected(net->nf.nf_loggers[pf], + lockdep_is_held(&nf_log_mutex)); + if (log == NULL) + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); - return ret; } -EXPORT_SYMBOL(nf_log_register); +EXPORT_SYMBOL(nf_log_set); -void nf_log_unregister_pf(int pf) +void nf_log_unset(struct net *net, const struct nf_logger *logger) { - if (pf >= NPROTO) - return; + int i; + const struct nf_logger *log; + mutex_lock(&nf_log_mutex); - rcu_assign_pointer(nf_loggers[pf], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = rcu_dereference_protected(net->nf.nf_loggers[i], + lockdep_is_held(&nf_log_mutex)); + if (log == logger) + RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); + } mutex_unlock(&nf_log_mutex); - - /* Give time to concurrent readers. 
*/ synchronize_rcu(); } -EXPORT_SYMBOL(nf_log_unregister_pf); +EXPORT_SYMBOL(nf_log_unset); -void nf_log_unregister(struct nf_logger *logger) +/* return EEXIST if the same logger is registered, 0 on success. */ +int nf_log_register(u_int8_t pf, struct nf_logger *logger) { int i; + if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(logger->list); i++) + INIT_LIST_HEAD(&logger->list[i]); + mutex_lock(&nf_log_mutex); - for (i = 0; i < NPROTO; i++) { - if (nf_loggers[i] == logger) - rcu_assign_pointer(nf_loggers[i], NULL); + + if (pf == NFPROTO_UNSPEC) { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + list_add_tail(&(logger->list[i]), &(nf_loggers_l[i])); + } else { + /* register at end of list to honor first register win */ + list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); } + mutex_unlock(&nf_log_mutex); - synchronize_rcu(); + return 0; +} +EXPORT_SYMBOL(nf_log_register); + +void nf_log_unregister(struct nf_logger *logger) +{ + int i; + + mutex_lock(&nf_log_mutex); + for (i = 0; i < NFPROTO_NUMPROTO; i++) + list_del(&logger->list[i]); + mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unregister); -void nf_log_packet(int pf, +int nf_log_bind_pf(struct net *net, u_int8_t pf, + const struct nf_logger *logger) +{ + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) + return -EINVAL; + mutex_lock(&nf_log_mutex); + if (__find_logger(pf, logger->name) == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); + mutex_unlock(&nf_log_mutex); + return 0; +} +EXPORT_SYMBOL(nf_log_bind_pf); + +void nf_log_unbind_pf(struct net *net, u_int8_t pf) +{ + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) + return; + mutex_lock(&nf_log_mutex); + RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL); + mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_unbind_pf); + +void nf_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - struct nf_loginfo *loginfo, + const struct nf_loginfo *loginfo, const char *fmt, ...) { va_list args; char prefix[NF_LOG_PREFIXLEN]; - struct nf_logger *logger; + const struct nf_logger *logger; rcu_read_lock(); - logger = rcu_dereference(nf_loggers[pf]); + logger = rcu_dereference(net->nf.nf_loggers[pf]); if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); - /* We must read logging before nf_logfn[pf] */ - logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); - } else if (net_ratelimit()) { - printk(KERN_WARNING "nf_log_packet: can\'t log since " - "no backend logging module loaded in! 
Please either " - "load one, or disable logging explicitly\n"); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); } rcu_read_unlock(); } @@ -104,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet); #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { - rcu_read_lock(); + struct net *net = seq_file_net(seq); + + mutex_lock(&nf_log_mutex); - if (*pos >= NPROTO) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -114,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos) static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct net *net = seq_file_net(s); + (*pos)++; - if (*pos >= NPROTO) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -124,23 +181,43 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) static void seq_stop(struct seq_file *s, void *v) { - rcu_read_unlock(); + mutex_unlock(&nf_log_mutex); } static int seq_show(struct seq_file *s, void *v) { loff_t *pos = v; const struct nf_logger *logger; + struct nf_logger *t; + int ret; + struct net *net = seq_file_net(s); - logger = rcu_dereference(nf_loggers[*pos]); + logger = rcu_dereference_protected(net->nf.nf_loggers[*pos], + lockdep_is_held(&nf_log_mutex)); if (!logger) - return seq_printf(s, "%2lld NONE\n", *pos); + ret = seq_printf(s, "%2lld NONE (", *pos); + else + ret = seq_printf(s, "%2lld %s (", *pos, logger->name); + + if (ret < 0) + return ret; + + list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) { + ret = seq_printf(s, "%s", t->name); + if (ret < 0) + return ret; + if (&t->list[*pos] != nf_loggers_l[*pos].prev) { + ret = seq_printf(s, ","); + if (ret < 0) + return ret; + } + } - return seq_printf(s, "%2lld %s\n", *pos, logger->name); + return seq_printf(s, ")\n"); } -static struct seq_operations nflog_seq_ops = { +static const struct seq_operations nflog_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, @@ -149,7 +226,8 @@ static struct seq_operations nflog_seq_ops = { static int nflog_open(struct inode *inode, struct file *file) { - return seq_open(file, &nflog_seq_ops); + return seq_open_net(inode, file, &nflog_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations nflog_file_ops = { @@ -157,22 +235,168 @@ static const struct file_operations nflog_file_ops = { .open = nflog_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; + #endif /* PROC_FS */ +#ifdef CONFIG_SYSCTL +static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; -int __init netfilter_log_init(void) +static int nf_log_proc_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const struct nf_logger *logger; + char buf[NFLOGGER_NAME_LEN]; + size_t size = *lenp; + int r = 0; + int tindex = (unsigned long)table->extra1; + struct net *net = current->nsproxy->net_ns; + + if (write) { + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, buffer, size)) + return -EFAULT; + + if (!strcmp(buf, "NONE")) { + nf_log_unbind_pf(net, tindex); + return 0; + } + mutex_lock(&nf_log_mutex); + logger = __find_logger(tindex, buf); + if (logger == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); + mutex_unlock(&nf_log_mutex); + } else { + mutex_lock(&nf_log_mutex); + logger = rcu_dereference_protected(net->nf.nf_loggers[tindex], + 
lockdep_is_held(&nf_log_mutex)); + if (!logger) + table->data = "NONE"; + else + table->data = logger->name; + r = proc_dostring(table, write, buffer, lenp, ppos); + mutex_unlock(&nf_log_mutex); + } + + return r; +} + +static int netfilter_log_sysctl_init(struct net *net) { + int i; + struct ctl_table *table; + + table = nf_log_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_log_sysctl_table, + sizeof(nf_log_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } else { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + snprintf(nf_log_sysctl_fnames[i], + 3, "%d", i); + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = + nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = + (void *)(unsigned long) i; + } + } + + net->nf.nf_log_dir_header = register_net_sysctl(net, + "net/netfilter/nf_log", + table); + if (!net->nf.nf_log_dir_header) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void netfilter_log_sysctl_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->nf.nf_log_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_log_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} +#else +static int netfilter_log_sysctl_init(struct net *net) +{ + return 0; +} + +static void netfilter_log_sysctl_exit(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +static int __net_init nf_log_net_init(struct net *net) +{ + int ret = -ENOMEM; + #ifdef CONFIG_PROC_FS - struct proc_dir_entry *pde; + if (!proc_create("nf_log", S_IRUGO, + net->nf.proc_netfilter, &nflog_file_ops)) + return ret; +#endif + ret = netfilter_log_sysctl_init(net); + if (ret < 0) + goto out_sysctl; + + return 0; - pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); - if (!pde) - return -1; +out_sysctl: +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif + return ret; +} - pde->proc_fops = &nflog_file_ops; +static void __net_exit nf_log_net_exit(struct net *net) +{ + netfilter_log_sysctl_exit(net); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_log", net->nf.proc_netfilter); #endif +} + +static struct pernet_operations nf_log_net_ops = { + .init = nf_log_net_init, + .exit = nf_log_net_exit, +}; + +int __init netfilter_log_init(void) +{ + int i, ret; + + ret = register_pernet_subsys(&nf_log_net_ops); + if (ret < 0) + return ret; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&(nf_loggers_l[i])); + return 0; } diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c new file mode 100644 index 00000000000..eb772380a20 --- /dev/null +++ b/net/netfilter/nf_nat_amanda.c @@ -0,0 +1,90 @@ +/* Amanda extension for TCP NAT alteration. + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on a copy of HW's ip_nat_irc.c as well as other modules + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
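With loggers now stored per netns, any hook in that namespace can emit a line through whichever backend is bound for its protocol family; if none is bound, nf_log_packet() now returns silently instead of the old ratelimited "no backend" printk. A hedged call-site sketch (the prefix text is invented; net, skb, in, out and hooknum are assumed to be in scope, and a NULL loginfo means the logger's defaults):

	nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, NULL,
		      "sketch-drop: invalid %s packet ", "TCP");

In-tree callers usually end the prefix with a space so the backend's own fields do not run into it.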
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/udp.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_amanda.h> + +MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); +MODULE_DESCRIPTION("Amanda NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_amanda"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + char buffer[sizeof("65535")]; + u_int16_t port; + unsigned int ret; + + /* Connection comes from client. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_ORIGINAL; + + /* When you see the packet, we need to NAT it the same as the + * this one (ie. same IP: it will be TCP and master is UDP). */ + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int res; + + exp->tuple.dst.u.tcp.port = htons(port); + res = nf_ct_expect_related(exp); + if (res == 0) + break; + else if (res != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, exp->master, "all ports in use"); + return NF_DROP; + } + + sprintf(buffer, "%u", port); + ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, + protoff, matchoff, matchlen, + buffer, strlen(buffer)); + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, exp->master, "cannot mangle packet"); + nf_ct_unexpect_related(exp); + } + return ret; +} + +static void __exit nf_nat_amanda_fini(void) +{ + RCU_INIT_POINTER(nf_nat_amanda_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_amanda_init(void) +{ + BUG_ON(nf_nat_amanda_hook != NULL); + RCU_INIT_POINTER(nf_nat_amanda_hook, help); + return 0; +} + +module_init(nf_nat_amanda_init); +module_exit(nf_nat_amanda_fini); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c new file mode 100644 index 00000000000..a49907b1dab --- /dev/null +++ b/net/netfilter/nf_nat_core.c @@ -0,0 +1,898 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
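One subtlety in the Amanda helper's port-search loop above deserves a note: port is a u_int16_t, so the port++ wrap from 65535 back to 0 is what bounds the scan, and port == 0 afterwards doubles as the "all ports in use" signal. The same loop, condensed and annotated (no new API, comments added for clarity):

	u_int16_t port;
	int res;

	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
		exp->tuple.dst.u.tcp.port = htons(port);
		res = nf_ct_expect_related(exp);
		if (res == 0)
			break;		/* expectation registered */
		if (res != -EBUSY) {
			port = 0;	/* hard error: give up, drop */
			break;
		}
		/* -EBUSY: tuple already expected, try the next port */
	}

The identical idiom reappears below in the FTP and IRC NAT helpers.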
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/gfp.h> +#include <net/xfrm.h> +#include <linux/jhash.h> +#include <linux/rtnetlink.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <linux/netfilter/nf_nat.h> + +static DEFINE_SPINLOCK(nf_nat_lock); + +static DEFINE_MUTEX(nf_nat_proto_mutex); +static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] + __read_mostly; +static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] + __read_mostly; + + +inline const struct nf_nat_l3proto * +__nf_nat_l3proto_find(u8 family) +{ + return rcu_dereference(nf_nat_l3protos[family]); +} + +inline const struct nf_nat_l4proto * +__nf_nat_l4proto_find(u8 family, u8 protonum) +{ + return rcu_dereference(nf_nat_l4protos[family][protonum]); +} +EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find); + +#ifdef CONFIG_XFRM +static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + unsigned long statusbit; + u8 family; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return; + + family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + rcu_read_lock(); + l3proto = __nf_nat_l3proto_find(family); + if (l3proto == NULL) + goto out; + + dir = CTINFO2DIR(ctinfo); + if (dir == IP_CT_DIR_ORIGINAL) + statusbit = IPS_DST_NAT; + else + statusbit = IPS_SRC_NAT; + + l3proto->decode_session(skb, ct, dir, statusbit, fl); +out: + rcu_read_unlock(); +} + +int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) +{ + struct flowi fl; + unsigned int hh_len; + struct dst_entry *dst; + int err; + + err = xfrm_decode_session(skb, &fl, family); + if (err < 0) + return err; + + dst = skb_dst(skb); + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + dst_hold(dst); + + dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(nf_xfrm_me_harder); +#endif /* CONFIG_XFRM */ + +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + unsigned int hash; + + /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); + return ((u64)hash * net->ct.nat_htable_size) >> 32; +} + +/* Is this tuple already taken? (not by us) */ +int +nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + /* Conntrack tracking doesn't keep track of outgoing tuples; only + * incoming ones. 
NAT means they don't have a fixed mapping, + * so we invert the tuple and look for the incoming reply. + * + * We could keep a separate hash if this proves too slow. + */ + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuplepr(&reply, tuple); + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); +} +EXPORT_SYMBOL(nf_nat_used_tuple); + +/* If we source map this tuple so reply looks like reply_tuple, will + * that meet the constraints of range. + */ +static int in_range(const struct nf_nat_l3proto *l3proto, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range) +{ + /* If we are supposed to map IPs, then we must be in the + * range specified, otherwise let this drag us onto a new src IP. + */ + if (range->flags & NF_NAT_RANGE_MAP_IPS && + !l3proto->in_range(tuple, range)) + return 0; + + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) || + l4proto->in_range(tuple, NF_NAT_MANIP_SRC, + &range->min_proto, &range->max_proto)) + return 1; + + return 0; +} + +static inline int +same_src(const struct nf_conn *ct, + const struct nf_conntrack_tuple *tuple) +{ + const struct nf_conntrack_tuple *t; + + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + return (t->dst.protonum == tuple->dst.protonum && + nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && + t->src.u.all == tuple->src.u.all); +} + +/* Only called for SRC manip */ +static int +find_appropriate_src(struct net *net, u16 zone, + const struct nf_nat_l3proto *l3proto, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *result, + const struct nf_nat_range *range) +{ + unsigned int h = hash_by_src(net, zone, tuple); + const struct nf_conn_nat *nat; + const struct nf_conn *ct; + + hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + ct = nat->ct; + if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } + } + return 0; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus + * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + * 1-65535, we don't do pro-rata allocation based on ports; we choose + * the ip with the lowest src-ip/dst-ip/proto usage. + */ +static void +find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + const struct nf_conn *ct, + enum nf_nat_manip_type maniptype) +{ + union nf_inet_addr *var_ipp; + unsigned int i, max; + /* Host order */ + u32 minip, maxip, j, dist; + bool full_range; + + /* No IP mapping? Do nothing. */ + if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) + return; + + if (maniptype == NF_NAT_MANIP_SRC) + var_ipp = &tuple->src.u3; + else + var_ipp = &tuple->dst.u3; + + /* Fast path: only one choice. */ + if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { + *var_ipp = range->min_addr; + return; + } + + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + max = sizeof(var_ipp->ip) / sizeof(u32) - 1; + else + max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; + + /* Hashing source and destination IPs gives a fairly even + * spread in practice (if there are a small number of IPs + * involved, there usually aren't that many connections + * anyway). 
The consistency means that servers see the same + * client coming from the same IP (some Internet Banking sites + * like this), even across reboots. + */ + j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), + range->flags & NF_NAT_RANGE_PERSISTENT ? + 0 : (__force u32)tuple->dst.u3.all[max] ^ zone); + + full_range = false; + for (i = 0; i <= max; i++) { + /* If first bytes of the address are at the maximum, use the + * distance. Otherwise use the full range. + */ + if (!full_range) { + minip = ntohl((__force __be32)range->min_addr.all[i]); + maxip = ntohl((__force __be32)range->max_addr.all[i]); + dist = maxip - minip + 1; + } else { + minip = 0; + dist = ~0; + } + + var_ipp->all[i] = (__force __u32) + htonl(minip + (((u64)j * dist) >> 32)); + if (var_ipp->all[i] != range->max_addr.all[i]) + full_range = true; + + if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) + j ^= (__force u32)tuple->dst.u3.all[i]; + } +} + +/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, + * we change the source to map into the range. For NF_INET_PRE_ROUTING + * and NF_INET_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void +get_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig_tuple, + const struct nf_nat_range *range, + struct nf_conn *ct, + enum nf_nat_manip_type maniptype) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_nat_l4proto *l4proto; + struct net *net = nf_ct_net(ct); + u16 zone = nf_ct_zone(ct); + + rcu_read_lock(); + l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); + l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, + orig_tuple->dst.protonum); + + /* 1) If this srcip/proto/src-proto-part is currently mapped, + * and that same mapping gives a unique tuple within the given + * range, use that. + * + * This is only required for source (ie. NAT/masq) mappings. + * So far, we don't do local source mappings, so multiple + * manips not an issue. + */ + if (maniptype == NF_NAT_MANIP_SRC && + !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { + /* try the original tuple first */ + if (in_range(l3proto, l4proto, orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + goto out; + } + } else if (find_appropriate_src(net, zone, l3proto, l4proto, + orig_tuple, tuple, range)) { + pr_debug("get_unique_tuple: Found current src map\n"); + if (!nf_nat_used_tuple(tuple, ct)) + goto out; + } + } + + /* 2) Select the least-used IP/proto combination in the given range */ + *tuple = *orig_tuple; + find_best_ips_proto(zone, tuple, range, ct, maniptype); + + /* 3) The per-protocol part of the manip is made to map into + * the range to make a unique tuple. + */ + + /* Only bother mapping if it's not already in range and unique */ + if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + if (l4proto->in_range(tuple, maniptype, + &range->min_proto, + &range->max_proto) && + (range->min_proto.all == range->max_proto.all || + !nf_nat_used_tuple(tuple, ct))) + goto out; + } else if (!nf_nat_used_tuple(tuple, ct)) { + goto out; + } + } + + /* Last change: get protocol to try to obtain unique tuple. 
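The address-selection step above (find_best_ips_proto()) is deterministic rather than random: it hashes the connection's addresses with jhash2() and scales the 32-bit result into the configured range with the same multiply-shift idiom hash_by_src() uses for bucket selection. A worked example with made-up numbers:

	/* range 10.0.0.1 - 10.0.0.4: minip = 0x0a000001, dist = 4 */
	u32 j = 0x9e3779b9;	/* pretend jhash2() returned this */
	u32 pick = minip + (u32)(((u64)j * dist) >> 32);
	/* (0x9e3779b9ULL * 4) >> 32 == 2, so pick == 10.0.0.3 */

Because the hash depends only on the tuple's addresses (and zone), a given client keeps landing on the same NAT address across flows, which is the consistency property the comment above describes; NF_NAT_RANGE_PERSISTENT additionally drops the destination from the hash input so the mapping also survives a change of peer.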
*/ + l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); +out: + rcu_read_unlock(); +} + +struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + if (nat) + return nat; + + if (!nf_ct_is_confirmed(ct)) + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + + return nat; +} +EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); + +unsigned int +nf_nat_setup_info(struct nf_conn *ct, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_tuple curr_tuple, new_tuple; + struct nf_conn_nat *nat; + + /* nat helper or nfctnetlink also setup binding */ + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || + maniptype == NF_NAT_MANIP_DST); + BUG_ON(nf_nat_initialized(ct, maniptype)); + + /* What we've got will look like inverse of reply. Normally + * this is what is in the conntrack, except for prior + * manipulations (future optimization: if num_manips == 0, + * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) + */ + nf_ct_invert_tuplepr(&curr_tuple, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); + + if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { + struct nf_conntrack_tuple reply; + + /* Alter conntrack table so will recognize replies. */ + nf_ct_invert_tuplepr(&reply, &new_tuple); + nf_conntrack_alter_reply(ct, &reply); + + /* Non-atomic: we own this at the moment. */ + if (maniptype == NF_NAT_MANIP_SRC) + ct->status |= IPS_SRC_NAT; + else + ct->status |= IPS_DST_NAT; + + if (nfct_help(ct)) + nfct_seqadj_ext_add(ct); + } + + if (maniptype == NF_NAT_MANIP_SRC) { + unsigned int srchash; + + srchash = hash_by_src(net, nf_ct_zone(ct), + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + /* nf_conntrack_alter_reply might re-allocate extension aera */ + nat = nfct_nat(ct); + nat->ct = ct; + hlist_add_head_rcu(&nat->bysource, + &net->ct.nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); + } + + /* It's done. */ + if (maniptype == NF_NAT_MANIP_DST) + ct->status |= IPS_DST_NAT_DONE; + else + ct->status |= IPS_SRC_NAT_DONE; + + return NF_ACCEPT; +} +EXPORT_SYMBOL(nf_nat_setup_info); + +static unsigned int +__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) +{ + /* Force range to this IP; let proto decide mapping for + * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + * Use reply in case it's already been mangled (eg local packet). + */ + union nf_inet_addr ip = + (manip == NF_NAT_MANIP_SRC ? + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); + struct nf_nat_range range = { + .flags = NF_NAT_RANGE_MAP_IPS, + .min_addr = ip, + .max_addr = ip, + }; + return nf_nat_setup_info(ct, &range, manip); +} + +unsigned int +nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); +} +EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); + +/* Do packet manipulations according to nf_nat_setup_info. 
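nf_nat_setup_info() is the single entry point that the NAT targets funnel into, and __nf_nat_alloc_null_binding() just above is its smallest caller. A hedged sketch of a SNAT-style caller (the address is an example; per the assertions in the function, the conntrack must not yet be initialized for this manip type):

	struct nf_nat_range range = {
		.flags       = NF_NAT_RANGE_MAP_IPS,
		.min_addr.ip = htonl(0xc0a80001),	/* 192.168.0.1 */
		.max_addr.ip = htonl(0xc0a80001),
	};
	unsigned int verdict;

	verdict = nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);

Pinning min_addr == max_addr fixes the address and leaves the per-proto part (the port) to the l4proto, exactly the shape the null-binding helper builds.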
*/ +unsigned int nf_nat_packet(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff *skb) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_nat_l4proto *l4proto; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned long statusbit; + enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); + + if (mtype == NF_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + /* Non-atomic: these bits don't change. */ + if (ct->status & statusbit) { + struct nf_conntrack_tuple target; + + /* We are aiming to look like inverse of other direction. */ + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + + l3proto = __nf_nat_l3proto_find(target.src.l3num); + l4proto = __nf_nat_l4proto_find(target.src.l3num, + target.dst.protonum); + if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) + return NF_DROP; + } + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_nat_packet); + +struct nf_nat_proto_clean { + u8 l3proto; + u8 l4proto; +}; + +/* kill conntracks with affected NAT section */ +static int nf_nat_proto_remove(struct nf_conn *i, void *data) +{ + const struct nf_nat_proto_clean *clean = data; + struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + + if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || + (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) + return 0; + + return i->status & IPS_NAT_MASK ? 1 : 0; +} + +static int nf_nat_proto_clean(struct nf_conn *ct, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + + if (nf_nat_proto_remove(ct, data)) + return 1; + + if (!nat || !nat->ct) + return 0; + + /* This netns is being destroyed, and conntrack has nat null binding. + * Remove it from bysource hash, as the table will be freed soon. + * + * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() + * will delete entry from already-freed table. + */ + if (!del_timer(&ct->timeout)) + return 1; + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + ct->status &= ~IPS_NAT_DONE_MASK; + nat->ct = NULL; + spin_unlock_bh(&nf_nat_lock); + + add_timer(&ct->timeout); + + /* don't delete conntrack. Although that would make things a lot + * simpler, we'd end up flushing all conntracks on nat rmmod. + */ + return 0; +} + +static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) +{ + struct nf_nat_proto_clean clean = { + .l3proto = l3proto, + .l4proto = l4proto, + }; + struct net *net; + + rtnl_lock(); + for_each_net(net) + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); + rtnl_unlock(); +} + +static void nf_nat_l3proto_clean(u8 l3proto) +{ + struct nf_nat_proto_clean clean = { + .l3proto = l3proto, + }; + struct net *net; + + rtnl_lock(); + + for_each_net(net) + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); + rtnl_unlock(); +} + +/* Protocol registration. */ +int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto) +{ + const struct nf_nat_l4proto **l4protos; + unsigned int i; + int ret = 0; + + mutex_lock(&nf_nat_proto_mutex); + if (nf_nat_l4protos[l3proto] == NULL) { + l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *), + GFP_KERNEL); + if (l4protos == NULL) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < IPPROTO_MAX; i++) + RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown); + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. 
+ */ + smp_wmb(); + + nf_nat_l4protos[l3proto] = l4protos; + } + + if (rcu_dereference_protected( + nf_nat_l4protos[l3proto][l4proto->l4proto], + lockdep_is_held(&nf_nat_proto_mutex) + ) != &nf_nat_l4proto_unknown) { + ret = -EBUSY; + goto out; + } + RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto); + out: + mutex_unlock(&nf_nat_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_register); + +/* No one stores the protocol anywhere; simply delete it. */ +void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto) +{ + mutex_lock(&nf_nat_proto_mutex); + RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], + &nf_nat_l4proto_unknown); + mutex_unlock(&nf_nat_proto_mutex); + synchronize_rcu(); + + nf_nat_l4proto_clean(l3proto, l4proto->l4proto); +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister); + +int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) +{ + int err; + + err = nf_ct_l3proto_try_module_get(l3proto->l3proto); + if (err < 0) + return err; + + mutex_lock(&nf_nat_proto_mutex); + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], + &nf_nat_l4proto_tcp); + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP], + &nf_nat_l4proto_udp); + mutex_unlock(&nf_nat_proto_mutex); + + RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto); + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_register); + +void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto) +{ + mutex_lock(&nf_nat_proto_mutex); + RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL); + mutex_unlock(&nf_nat_proto_mutex); + synchronize_rcu(); + + nf_nat_l3proto_clean(l3proto->l3proto); + nf_ct_l3proto_module_put(l3proto->l3proto); +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); + +/* No one using conntrack by the time this called. 
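For completeness, this is how protocol modules are expected to hook into the tables above: TCP and UDP are pre-seeded by nf_nat_l3proto_register(), and every other l4proto registers itself per family. A sketch modeled on the in-tree DCCP NAT module of this era (error handling abbreviated; treat the exact unwind policy as illustrative):

	static int __init my_nat_proto_init(void)
	{
		int err;

		err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
		if (err < 0)
			return err;
		err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
		if (err < 0)
			nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
		return err;
	}

A slot already claimed by another module yields -EBUSY, since the table entry is only writable while it still points at nf_nat_l4proto_unknown.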
*/ +static void nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); + + if (nat == NULL || nat->ct == NULL) + return; + + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + spin_unlock_bh(&nf_nat_lock); +} + +static void nf_nat_move_storage(void *new, void *old) +{ + struct nf_conn_nat *new_nat = new; + struct nf_conn_nat *old_nat = old; + struct nf_conn *ct = old_nat->ct; + + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) + return; + + spin_lock_bh(&nf_nat_lock); + hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); + spin_unlock_bh(&nf_nat_lock); +} + +static struct nf_ct_ext_type nat_extend __read_mostly = { + .len = sizeof(struct nf_conn_nat), + .align = __alignof__(struct nf_conn_nat), + .destroy = nf_nat_cleanup_conntrack, + .move = nf_nat_move_storage, + .id = NF_CT_EXT_NAT, + .flags = NF_CT_EXT_F_PREALLOC, +}; + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { + [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, + [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int nfnetlink_parse_nat_proto(struct nlattr *attr, + const struct nf_conn *ct, + struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_PROTONAT_MAX+1]; + const struct nf_nat_l4proto *l4proto; + int err; + + err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); + if (err < 0) + return err; + + l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->nlattr_to_range) + err = l4proto->nlattr_to_range(tb, range); + + return err; +} + +static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { + [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, + [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, + [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, + [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, + [CTA_NAT_PROTO] = { .type = NLA_NESTED }, +}; + +static int +nfnetlink_parse_nat(const struct nlattr *nat, + const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_nat_l3proto *l3proto) +{ + struct nlattr *tb[CTA_NAT_MAX+1]; + int err; + + memset(range, 0, sizeof(*range)); + + err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); + if (err < 0) + return err; + + err = l3proto->nlattr_to_range(tb, range); + if (err < 0) + return err; + + if (!tb[CTA_NAT_PROTO]) + return 0; + + return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); +} + +/* This function is called under rcu_read_lock() */ +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + struct nf_nat_range range; + const struct nf_nat_l3proto *l3proto; + int err; + + /* Should not happen, restricted to creating new conntracks + * via ctnetlink. + */ + if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) + return -EEXIST; + + /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to + * attach the null binding, otherwise this may oops. 
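The parser above consumes a nested layout that, on the ctnetlink side, hangs off CTA_NAT_SRC or CTA_NAT_DST. Roughly, as an attribute-tree sketch rather than a wire dump:

	CTA_NAT_SRC / CTA_NAT_DST               (NLA_NESTED)
	    CTA_NAT_V4_MINIP, CTA_NAT_V4_MAXIP  (NLA_U32, network order)
	    CTA_NAT_V6_MINIP, CTA_NAT_V6_MAXIP  (struct in6_addr)
	    CTA_NAT_PROTO                       (NLA_NESTED)
	        CTA_PROTONAT_PORT_MIN           (NLA_U16)
	        CTA_PROTONAT_PORT_MAX           (NLA_U16)

l3proto->nlattr_to_range() turns the address attributes into min_addr/max_addr and sets NF_NAT_RANGE_MAP_IPS; the nested CTA_NAT_PROTO, when present, contributes the port range via the l4proto's nlattr_to_range() and NF_NAT_RANGE_PROTO_SPECIFIED. (The flag-setting details live in the l3proto/l4proto implementations outside this hunk.)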
+ */ + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + if (l3proto == NULL) + return -EAGAIN; + + /* No NAT information has been passed, allocate the null-binding */ + if (attr == NULL) + return __nf_nat_alloc_null_binding(ct, manip); + + err = nfnetlink_parse_nat(attr, ct, &range, l3proto); + if (err < 0) + return err; + + return nf_nat_setup_info(ct, &range, manip); +} +#else +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +static int __net_init nf_nat_net_init(struct net *net) +{ + /* Leave them the same for the moment. */ + net->ct.nat_htable_size = net->ct.htable_size; + net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); + if (!net->ct.nat_bysource) + return -ENOMEM; + return 0; +} + +static void __net_exit nf_nat_net_exit(struct net *net) +{ + struct nf_nat_proto_clean clean = {}; + + nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); + synchronize_rcu(); + nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); +} + +static struct pernet_operations nf_nat_net_ops = { + .init = nf_nat_net_init, + .exit = nf_nat_net_exit, +}; + +static struct nf_ct_helper_expectfn follow_master_nat = { + .name = "nat-follow-master", + .expectfn = nf_nat_follow_master, +}; + +static int __init nf_nat_init(void) +{ + int ret; + + ret = nf_ct_extend_register(&nat_extend); + if (ret < 0) { + printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + return ret; + } + + ret = register_pernet_subsys(&nf_nat_net_ops); + if (ret < 0) + goto cleanup_extend; + + nf_ct_helper_expectfn_register(&follow_master_nat); + + /* Initialize fake conntrack so that NAT will skip it */ + nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); + + BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); + RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, + nfnetlink_parse_nat_setup); +#ifdef CONFIG_XFRM + BUG_ON(nf_nat_decode_session_hook != NULL); + RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); +#endif + return 0; + + cleanup_extend: + nf_ct_extend_unregister(&nat_extend); + return ret; +} + +static void __exit nf_nat_cleanup(void) +{ + unsigned int i; + + unregister_pernet_subsys(&nf_nat_net_ops); + nf_ct_extend_unregister(&nat_extend); + nf_ct_helper_expectfn_unregister(&follow_master_nat); + RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); +#ifdef CONFIG_XFRM + RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); +#endif + for (i = 0; i < NFPROTO_NUMPROTO; i++) + kfree(nf_nat_l4protos[i]); + synchronize_net(); +} + +MODULE_LICENSE("GPL"); + +module_init(nf_nat_init); +module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c new file mode 100644 index 00000000000..e84a578dbe3 --- /dev/null +++ b/net/netfilter/nf_nat_ftp.c @@ -0,0 +1,146 @@ +/* FTP extension for TCP NAT alteration. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
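nf_ct_iterate_cleanup() appears twice above (protocol unregister and netns teardown) and is the generic way to evict a subset of conntrack entries: the callback is a predicate, and every entry for which it returns nonzero is killed. A minimal sketch with an invented predicate:

	static int kill_udp_nat(struct nf_conn *ct, void *data)
	{
		/* nonzero return == delete this conntrack */
		return nf_ct_protonum(ct) == IPPROTO_UDP &&
		       (ct->status & IPS_NAT_MASK);
	}

	/* with a struct net *net in hand: */
	nf_ct_iterate_cleanup(net, kill_udp_nat, NULL, 0, 0);

The trailing (0, 0) arguments are the (portid, report) pair passed through to conntrack event delivery when the deletions are announced.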
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/inet.h> +#include <linux/tcp.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_ftp.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp NAT helper"); +MODULE_ALIAS("ip_nat_ftp"); + +/* FIXME: Time out? --RR */ + +static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type, + char *buffer, size_t buflen, + union nf_inet_addr *addr, u16 port) +{ + switch (type) { + case NF_CT_FTP_PORT: + case NF_CT_FTP_PASV: + return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&addr->ip)[0], + ((unsigned char *)&addr->ip)[1], + ((unsigned char *)&addr->ip)[2], + ((unsigned char *)&addr->ip)[3], + port >> 8, + port & 0xFF); + case NF_CT_FTP_EPRT: + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + return snprintf(buffer, buflen, "|1|%pI4|%u|", + &addr->ip, port); + else + return snprintf(buffer, buflen, "|2|%pI6|%u|", + &addr->ip6, port); + case NF_CT_FTP_EPSV: + return snprintf(buffer, buflen, "|||%u|", port); + } + + return 0; +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int nf_nat_ftp(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + union nf_inet_addr newaddr; + u_int16_t port; + int dir = CTINFO2DIR(ctinfo); + struct nf_conn *ct = exp->master; + char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN]; + unsigned int buflen; + + pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + + /* Connection will come from wherever this packet goes, hence !dir */ + newaddr = ct->tuplehash[!dir].tuple.dst.u3; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use"); + return NF_DROP; + } + + buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer), + &newaddr, port); + if (!buflen) + goto out; + + pr_debug("calling nf_nat_mangle_tcp_packet\n"); + + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, + matchlen, buffer, buflen)) + goto out; + + return NF_ACCEPT; + +out: + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + nf_ct_unexpect_related(exp); + return NF_DROP; +} + +static void __exit nf_nat_ftp_fini(void) +{ + RCU_INIT_POINTER(nf_nat_ftp_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_ftp_init(void) +{ + BUG_ON(nf_nat_ftp_hook != NULL); + RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp); + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. 
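For concreteness, the command strings nf_nat_ftp_fmt_cmd() above produces for the example address 192.168.1.2 and port 50000 (values invented) look like this:

	PORT/PASV:   "192,168,1,2,195,80"    /* 50000 == 195*256 + 80 */
	EPRT (IPv4): "|1|192.168.1.2|50000|"
	EPRT (IPv6): "|2|2001:db8::1|50000|"
	EPSV:        "|||50000|"

The buffer in nf_nat_ftp() is sized as sizeof("|1||65535|") + INET6_ADDRSTRLEN, which covers the worst case of every variant.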
*/ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_ftp_init); +module_exit(nf_nat_ftp_fini); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c new file mode 100644 index 00000000000..2840abb5bb9 --- /dev/null +++ b/net/netfilter/nf_nat_helper.c @@ -0,0 +1,212 @@ +/* nf_nat_helper.c - generic support functions for NAT helpers + * + * (C) 2000-2002 Harald Welte <laforge@netfilter.org> + * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007-2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> + +/* Frobs data inside this packet, which is linear. */ +static void mangle_contents(struct sk_buff *skb, + unsigned int dataoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + unsigned char *data; + + BUG_ON(skb_is_nonlinear(skb)); + data = skb_network_header(skb) + dataoff; + + /* move post-replacement */ + memmove(data + match_offset + rep_len, + data + match_offset + match_len, + skb_tail_pointer(skb) - (skb_network_header(skb) + dataoff + + match_offset + match_len)); + + /* insert data from buffer */ + memcpy(data + match_offset, rep_buffer, rep_len); + + /* update skb info */ + if (rep_len > match_len) { + pr_debug("nf_nat_mangle_packet: Extending packet by " + "%u from %u bytes\n", rep_len - match_len, skb->len); + skb_put(skb, rep_len - match_len); + } else { + pr_debug("nf_nat_mangle_packet: Shrinking packet from " + "%u from %u bytes\n", match_len - rep_len, skb->len); + __skb_trim(skb, skb->len + rep_len - match_len); + } + + if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) { + /* fix IP hdr checksum information */ + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + } else + ipv6_hdr(skb)->payload_len = + htons(skb->len - sizeof(struct ipv6hdr)); +} + +/* Unusual, but possible case. */ +static int enlarge_skb(struct sk_buff *skb, unsigned int extra) +{ + if (skb->len + extra > 65535) + return 0; + + if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) + return 0; + + return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX + * command in FTP). + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... 
+ * + * */ +int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len, bool adjust) +{ + const struct nf_nat_l3proto *l3proto; + struct tcphdr *tcph; + int oldlen, datalen; + + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (rep_len > match_len && + rep_len - match_len > skb_tailroom(skb) && + !enlarge_skb(skb, rep_len - match_len)) + return 0; + + SKB_LINEAR_ASSERT(skb); + + tcph = (void *)skb->data + protoff; + + oldlen = skb->len - protoff; + mangle_contents(skb, protoff + tcph->doff*4, + match_offset, match_len, rep_buffer, rep_len); + + datalen = skb->len - protoff; + + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check, + datalen, oldlen); + + if (adjust && rep_len != match_len) + nf_ct_seqadj_set(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); + + return 1; +} +EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); + +/* Generic function for mangling variable-length address changes inside + * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX + * command in the Amanda protocol) + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * XXX - This function could be merged with nf_nat_mangle_tcp_packet which + * should be fairly easy to do. + */ +int +nf_nat_mangle_udp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + const struct nf_nat_l3proto *l3proto; + struct udphdr *udph; + int datalen, oldlen; + + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (rep_len > match_len && + rep_len - match_len > skb_tailroom(skb) && + !enlarge_skb(skb, rep_len - match_len)) + return 0; + + udph = (void *)skb->data + protoff; + + oldlen = skb->len - protoff; + mangle_contents(skb, protoff + sizeof(*udph), + match_offset, match_len, rep_buffer, rep_len); + + /* update the length of the UDP packet */ + datalen = skb->len - protoff; + udph->len = htons(datalen); + + /* fix udp checksum if udp checksum was previously calculated */ + if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) + return 1; + + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check, + datalen, oldlen); + + return 1; +} +EXPORT_SYMBOL(nf_nat_mangle_udp_packet); + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void nf_nat_follow_master(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.dst.u3; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); + + /* For DST manip, map port here to where it's expected. 
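
[Editor's note: __nf_nat_mangle_tcp_packet() above ends by calling nf_ct_seqadj_set() whenever the rewrite changed the payload length, because every byte added or removed shifts all later TCP sequence numbers; the UDP variant needs no such bookkeeping. A deliberately simplified model of the adjustment (the real code keeps per-direction state in the conntrack entry and patches both seq and ack):

    #include <stdio.h>
    #include <stdint.h>

    /* Segments at or after the edit point are shifted by the size
     * delta the payload rewrite introduced. */
    struct seqadj { uint32_t from_seq; int32_t delta; };

    static uint32_t adjust_seq(uint32_t seq, const struct seqadj *adj)
    {
        return seq >= adj->from_seq ? seq + adj->delta : seq;
    }

    int main(void)
    {
        struct seqadj adj = { .from_seq = 1000, .delta = 3 }; /* grew 3 bytes */

        printf("%u\n", adjust_seq(900, &adj));   /* 900: before edit */
        printf("%u\n", adjust_seq(1500, &adj));  /* 1503: shifted */
        return 0;
    }
]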
*/ + range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); + range.min_proto = range.max_proto = exp->saved_proto; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.src.u3; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} +EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c new file mode 100644 index 00000000000..1fb2258c353 --- /dev/null +++ b/net/netfilter/nf_nat_irc.c @@ -0,0 +1,119 @@ +/* IRC extension for TCP NAT alteration. + * + * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> + * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * based on a copy of RR's ip_nat_ftp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/tcp.h> +#include <linux/kernel.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_irc.h> + +MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); +MODULE_DESCRIPTION("IRC (DCC) NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_irc"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + char buffer[sizeof("4294967296 65635")]; + struct nf_conn *ct = exp->master; + union nf_inet_addr newaddr; + u_int16_t port; + unsigned int ret; + + /* Reply comes from server. */ + newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3; + + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use"); + return NF_DROP; + } + + /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, + * 255.255.255.255==4294967296, 10 digits) + * P: bound port (min 1 d, max 5d (65635)) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + /* AAA = "us", ie. where server normally talks to. 
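
[Editor's note: the IRC helper above writes the NATed endpoint as "%u %u" because DCC carries the IPv4 address as a single decimal 32-bit number in host byte order, which is also why the buffer is sized via sizeof("4294967296 65635"). (Both maxima in that literal are off by one, the true bounds being 4294967295 and 65535, but the buffer is still large enough.) A user-space sketch of the encoding:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* DCC CHAT/SEND carry the IPv4 address as one decimal u32 in
     * host order, then the port: the "%u %u" the helper writes. */
    int main(void)
    {
        uint32_t be = inet_addr("192.0.2.10");
        char buf[sizeof("4294967295 65535")];

        snprintf(buf, sizeof(buf), "%u %u", ntohl(be), 50000u);
        printf("%s\n", buf);    /* 3221225994 50000 */
        return 0;
    }

For 192.0.2.10 (0xC000020A) this prints 3221225994 50000, the form spliced over the original address and port.]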
*/ + snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port); + pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", + buffer, &newaddr.ip, port); + + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, + matchlen, buffer, strlen(buffer)); + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + nf_ct_unexpect_related(exp); + } + + return ret; +} + +static void __exit nf_nat_irc_fini(void) +{ + RCU_INIT_POINTER(nf_nat_irc_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_irc_init(void) +{ + BUG_ON(nf_nat_irc_hook != NULL); + RCU_INIT_POINTER(nf_nat_irc_hook, help); + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_irc_init); +module_exit(nf_nat_irc_fini); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c new file mode 100644 index 00000000000..83a72a235ca --- /dev/null +++ b/net/netfilter/nf_nat_proto_common.c @@ -0,0 +1,114 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/random.h> +#include <linux/netfilter.h> +#include <linux/export.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + __be16 port; + + if (maniptype == NF_NAT_MANIP_SRC) + port = tuple->src.u.all; + else + port = tuple->dst.u.all; + + return ntohs(port) >= ntohs(min->all) && + ntohs(port) <= ntohs(max->all); +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); + +void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct, + u16 *rover) +{ + unsigned int range_size, min, i; + __be16 *portptr; + u_int16_t off; + + if (maniptype == NF_NAT_MANIP_SRC) + portptr = &tuple->src.u.all; + else + portptr = &tuple->dst.u.all; + + /* If no range specified... */ + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == NF_NAT_MANIP_DST) + return; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr) < 512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min_proto.all); + range_size = ntohs(range->max_proto.all) - min + 1; + } + + if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) { + off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC + ? 
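
[Editor's note: nf_nat_l4proto_unique_tuple(), whose search loop continues just below, walks a candidate port range starting from a rover or random offset and stops at the first port not already claimed by another tuple; when no explicit range is given it stays inside the band the original port was in (1-511, 600-1023, or 1024-65535). A simplified user-space model of the search, with port_in_use standing in for nf_nat_used_tuple():

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static bool port_in_use(const uint8_t *bm, uint16_t p)
    {
        return bm[p >> 3] & (1 << (p & 7));
    }

    /* Probe range_size candidates starting at 'off' into the range. */
    static uint16_t pick_port(const uint8_t *bm, uint16_t min,
                              uint32_t range_size, uint16_t off)
    {
        for (uint32_t i = 0; i < range_size; i++, off++) {
            uint16_t p = min + off % range_size;

            if (!port_in_use(bm, p))
                return p;
        }
        return 0;   /* range exhausted */
    }

    int main(void)
    {
        static uint8_t bm[8192];            /* 65536 bits, all free */

        bm[1024 >> 3] |= 1 << (1024 & 7);   /* pretend 1024 is taken */
        printf("%u\n", pick_port(bm, 1024, 65535 - 1024 + 1, 0)); /* 1025 */
        return 0;
    }
]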
tuple->dst.u.all + : tuple->src.u.all); + } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { + off = prandom_u32(); + } else { + off = *rover; + } + + for (i = 0; ; ++off) { + *portptr = htons(min + off % range_size); + if (++i != range_size && nf_nat_used_tuple(tuple, ct)) + continue; + if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) + *rover = off; + return; + } +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_PROTONAT_PORT_MIN]) { + range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); + range->max_proto.all = range->min_proto.all; + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + if (tb[CTA_PROTONAT_PORT_MAX]) { + range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range); +#endif diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c new file mode 100644 index 00000000000..c8be2cdac0b --- /dev/null +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -0,0 +1,116 @@ +/* + * DCCP NAT protocol helper + * + * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/dccp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u_int16_t dccp_port_rover; + +static void +dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &dccp_port_rover); +} + +static bool +dccp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct dccp_hdr *hdr; + __be16 *portptr, oldport, newport; + int hdrsize = 8; /* DCCP connection tracking guarantees this much */ + + if (skb->len >= hdroff + sizeof(struct dccp_hdr)) + hdrsize = sizeof(struct dccp_hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct dccp_hdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + newport = tuple->src.u.dccp.port; + portptr = &hdr->dccph_sport; + } else { + newport = tuple->dst.u.dccp.port; + portptr = &hdr->dccph_dport; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, + tuple, maniptype); + inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, + 0); + return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { + .l4proto = IPPROTO_DCCP, + .manip_pkt = dccp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = dccp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int 
__init nf_nat_proto_dccp_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp); + if (err < 0) + goto err1; + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp); + if (err < 0) + goto err2; + return 0; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); +err1: + return err; +} + +static void __exit nf_nat_proto_dccp_fini(void) +{ + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); + +} + +module_init(nf_nat_proto_dccp_init); +module_exit(nf_nat_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("DCCP NAT protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c new file mode 100644 index 00000000000..754536f2c67 --- /dev/null +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sctp.h> +#include <linux/module.h> +#include <net/sctp/checksum.h> + +#include <net/netfilter/nf_nat_l4proto.h> + +static u_int16_t nf_sctp_port_rover; + +static void +sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &nf_sctp_port_rover); +} + +static bool +sctp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + sctp_sctphdr_t *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct sctphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + hdr->source = tuple->src.u.sctp.port; + } else { + /* Get rid of dst port */ + hdr->dest = tuple->dst.u.sctp.port; + } + + hdr->checksum = sctp_compute_cksum(skb, hdroff); + + return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { + .l4proto = IPPROTO_SCTP, + .manip_pkt = sctp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = sctp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_sctp_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp); + if (err < 0) + goto err1; + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp); + if (err < 0) + goto err2; + return 0; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +err1: + return err; +} + +static void __exit nf_nat_proto_sctp_exit(void) +{ + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +} + +module_init(nf_nat_proto_sctp_init); +module_exit(nf_nat_proto_sctp_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCTP NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c 
new file mode 100644 index 00000000000..83ec8a6e4c3 --- /dev/null +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -0,0 +1,85 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> + +static u16 tcp_port_rover; + +static void +tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &tcp_port_rover); +} + +static bool +tcp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct tcphdr *hdr; + __be16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if (skb->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct tcphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_tcp = { + .l4proto = IPPROTO_TCP, + .manip_pkt = tcp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = tcp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c new file mode 100644 index 00000000000..7df613fb34a --- /dev/null +++ b/net/netfilter/nf_nat_proto_udp.c @@ -0,0 +1,76 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
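
[Editor's note: tcp_manip_pkt() above is content with an 8-byte header because the TCP header being mangled may be the fragment quoted inside an ICMP error, which is only guaranteed to carry the first 8 bytes of the transport header: enough for both ports, not enough for the checksum at offset 16, hence the early return before the checksum update. The offsets can be checked from user space with the uapi header (Linux-specific):

    #include <stdio.h>
    #include <stddef.h>
    #include <linux/tcp.h>

    int main(void)
    {
        printf("source @ %zu\n", offsetof(struct tcphdr, source)); /* 0 */
        printf("dest   @ %zu\n", offsetof(struct tcphdr, dest));   /* 2 */
        printf("check  @ %zu\n", offsetof(struct tcphdr, check));  /* 16 */
        return 0;
    }
]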
+ */ + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u16 udp_port_rover; + +static void +udp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &udp_port_rover); +} + +static bool +udp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { + l3proto->csum_update(skb, iphdroff, &hdr->check, + tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, + 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + } + *portptr = newport; + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_udp = { + .l4proto = IPPROTO_UDP, + .manip_pkt = udp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = udp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c new file mode 100644 index 00000000000..776a0d1317b --- /dev/null +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -0,0 +1,106 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
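
[Editor's note: udp_manip_pkt() above skips the checksum update when the datagram carries none, since a zero UDP checksum over IPv4 means "not computed" (RFC 768). For the same reason, a recomputed checksum that folds to zero must be stored as all-ones, which is what the CSUM_MANGLED_0 substitution does. Sketch of that rule:

    #include <stdio.h>
    #include <stdint.h>

    /* A computed UDP checksum of 0 would read as "no checksum", so
     * it is transmitted as 0xFFFF instead. */
    static uint16_t udp_csum_store(uint16_t computed)
    {
        return computed == 0 ? 0xFFFF : computed;
    }

    int main(void)
    {
        printf("%#x\n", udp_csum_store(0));       /* 0xffff */
        printf("%#x\n", udp_csum_store(0x1234));  /* 0x1234 */
        return 0;
    }

UDP-Lite, by contrast (next file), has a mandatory checksum, so its manip_pkt applies the update unconditionally.]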
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/module.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u16 udplite_port_rover; + +static void +udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &udplite_port_rover); +} + +static bool +udplite_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of source port */ + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + + l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + + *portptr = newport; + return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { + .l4proto = IPPROTO_UDPLITE, + .manip_pkt = udplite_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = udplite_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_udplite_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite); + if (err < 0) + goto err1; + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite); + if (err < 0) + goto err2; + return 0; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +err1: + return err; +} + +static void __exit nf_nat_proto_udplite_fini(void) +{ + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +} + +module_init(nf_nat_proto_udplite_init); +module_exit(nf_nat_proto_udplite_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c new file mode 100644 index 00000000000..6e494d58441 --- /dev/null +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -0,0 +1,54 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type manip_type, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + return true; +} + +static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + * anything. + */ + return; +} + +static bool +unknown_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_unknown = { + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, +}; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c new file mode 100644 index 00000000000..b4d691db955 --- /dev/null +++ b/net/netfilter/nf_nat_sip.c @@ -0,0 +1,653 @@ +/* SIP extension for NAT alteration. + * + * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> + * based on RR's ip_nat_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet.h> +#include <linux/udp.h> +#include <linux/tcp.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); +MODULE_DESCRIPTION("SIP NAT helper"); +MODULE_ALIAS("ip_nat_sip"); + + +static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + const char *buffer, unsigned int buflen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct tcphdr *th; + unsigned int baseoff; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) { + th = (struct tcphdr *)(skb->data + protoff); + baseoff = protoff + th->doff * 4; + matchoff += dataoff - baseoff; + + if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + protoff, matchoff, matchlen, + buffer, buflen, false)) + return 0; + } else { + baseoff = protoff + sizeof(struct udphdr); + matchoff += dataoff - baseoff; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, + protoff, matchoff, matchlen, + buffer, buflen)) + return 0; + } + + /* Reload data pointer and adjust datalen value */ + *dptr = skb->data + dataoff; + *datalen += buflen - matchlen; + return 1; +} + +static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer, + const union nf_inet_addr *addr, bool delim) +{ + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + return sprintf(buffer, "%pI4", &addr->ip); + else { + if (delim) + return sprintf(buffer, 
"[%pI6c]", &addr->ip6); + else + return sprintf(buffer, "%pI6c", &addr->ip6); + } +} + +static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer, + const union nf_inet_addr *addr, u16 port) +{ + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + return sprintf(buffer, "%pI4:%u", &addr->ip, port); + else + return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port); +} + +static int map_addr(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + union nf_inet_addr *addr, __be16 port) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; + unsigned int buflen; + union nf_inet_addr newaddr; + __be16 newport; + + if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) && + ct->tuplehash[dir].tuple.src.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.dst.u3; + newport = ct->tuplehash[!dir].tuple.dst.u.udp.port; + } else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) && + ct->tuplehash[dir].tuple.dst.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.src.u3; + newport = ct_sip_info->forced_dport ? : + ct->tuplehash[!dir].tuple.src.u.udp.port; + } else + return 1; + + if (nf_inet_addr_cmp(&newaddr, addr) && newport == port) + return 1; + + buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport)); + return mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen); +} + +static int map_sip_addr(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + enum sip_header_types type) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + union nf_inet_addr addr; + __be16 port; + + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, + &matchoff, &matchlen, &addr, &port) <= 0) + return 1; + return map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, &addr, port); +} + +static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + unsigned int coff, matchoff, matchlen; + enum sip_header_types hdr; + union nf_inet_addr addr; + __be16 port; + int request, in_header; + + /* Basic rules: requests and responses. 
*/ + if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) { + if (ct_sip_parse_request(ct, *dptr, *datalen, + &matchoff, &matchlen, + &addr, &port) > 0 && + !map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle SIP message"); + return NF_DROP; + } + request = 1; + } else + request = 0; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) + hdr = SIP_HDR_VIA_TCP; + else + hdr = SIP_HDR_VIA_UDP; + + /* Translate topmost Via header and parameters */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + hdr, NULL, &matchoff, &matchlen, + &addr, &port) > 0) { + unsigned int olen, matchend, poff, plen, buflen, n; + char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; + + /* We're only interested in headers related to this + * connection */ + if (request) { + if (!nf_inet_addr_cmp(&addr, + &ct->tuplehash[dir].tuple.src.u3) || + port != ct->tuplehash[dir].tuple.src.u.udp.port) + goto next; + } else { + if (!nf_inet_addr_cmp(&addr, + &ct->tuplehash[dir].tuple.dst.u3) || + port != ct->tuplehash[dir].tuple.dst.u.udp.port) + goto next; + } + + olen = *datalen; + if (!map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle Via header"); + return NF_DROP; + } + + matchend = matchoff + matchlen + *datalen - olen; + + /* The maddr= parameter (RFC 2361) specifies where to send + * the reply. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "maddr=", &poff, &plen, + &addr, true) > 0 && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) && + !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) { + buflen = sip_sprintf_addr(ct, buffer, + &ct->tuplehash[!dir].tuple.dst.u3, + true); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle maddr"); + return NF_DROP; + } + } + + /* The received= parameter (RFC 2361) contains the address + * from which the server received the request. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "received=", &poff, &plen, + &addr, false) > 0 && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) && + !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) { + buflen = sip_sprintf_addr(ct, buffer, + &ct->tuplehash[!dir].tuple.src.u3, + false); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle received"); + return NF_DROP; + } + } + + /* The rport= parameter (RFC 3581) contains the port number + * from which the server received the request. 
*/ + if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, + "rport=", &poff, &plen, + &n) > 0 && + htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && + htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { + __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; + buflen = sprintf(buffer, "%u", ntohs(p)); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle rport"); + return NF_DROP; + } + } + } + +next: + /* Translate Contact headers */ + coff = 0; + in_header = 0; + while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, + SIP_HDR_CONTACT, &in_header, + &matchoff, &matchlen, + &addr, &port) > 0) { + if (!map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, + &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle contact"); + return NF_DROP; + } + } + + if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) || + !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) { + nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to"); + return NF_DROP; + } + + /* Mangle destination port for Cisco phones, then fix up checksums */ + if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { + struct udphdr *uh; + + if (!skb_make_writable(skb, skb->len)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + + uh = (void *)skb->data + protoff; + uh->dest = ct_sip_info->forced_dport; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff, + 0, 0, NULL, 0)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + } + + return NF_ACCEPT; +} + +static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, + s16 off) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) + return; + + th = (struct tcphdr *)(skb->data + protoff); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} + +/* Handles expected signalling connections and media streams */ +static void nf_nat_sip_expected(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); + range.min_proto = range.max_proto = exp->saved_proto; + range.min_addr = range.max_addr = exp->saved_addr; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); + + /* Change src to where master sends to, but only if the connection + * actually came from the same source. 
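
[Editor's note: the rport= handling just above compares the parsed decimal port against tuple ports that are kept in network byte order, hence htons(n) == port rather than converting the stored value; both directions of the comparison are equivalent, the kernel just converts the freshly parsed number:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    int main(void)
    {
        unsigned int n = 5060;          /* parsed from "rport=5060" */
        uint16_t be_port = htons(5060); /* as stored in the tuple */

        printf("%d\n", htons(n) == be_port);  /* 1 */
        printf("%d\n", n == ntohs(be_port));  /* 1 */
        return 0;
    }
]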
*/ + if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + &ct->master->tuplehash[exp->dir].tuple.src.u3)) { + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.dst.u3; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); + } +} + +static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *exp, + unsigned int matchoff, + unsigned int matchlen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + union nf_inet_addr newaddr; + u_int16_t port; + __be16 srcport; + char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; + unsigned int buflen; + + /* Connection will come from reply */ + if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3)) + newaddr = exp->tuple.dst.u3; + else + newaddr = ct->tuplehash[!dir].tuple.dst.u3; + + /* If the signalling port matches the connection's source port in the + * original direction, try to use the destination port in the opposite + * direction. */ + srcport = ct_sip_info->forced_dport ? : + ct->tuplehash[dir].tuple.src.u.udp.port; + if (exp->tuple.dst.u.udp.port == srcport) + port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); + else + port = ntohs(exp->tuple.dst.u.udp.port); + + exp->saved_addr = exp->tuple.dst.u3; + exp->tuple.dst.u3 = newaddr; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + exp->expectfn = nf_nat_sip_expected; + + for (; port != 0; port++) { + int ret; + + exp->tuple.dst.u.udp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use for SIP"); + return NF_DROP; + } + + if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) || + exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { + buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + goto err; + } + } + return NF_ACCEPT; + +err: + nf_ct_unexpect_related(exp); + return NF_DROP; +} + +static int mangle_content_len(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + char buffer[sizeof("65536")]; + int buflen, c_len; + + /* Get actual SDP length */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return 0; + c_len = *datalen - matchoff + strlen("v="); + + /* Now, update SDP length */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) + return 0; + + buflen = sprintf(buffer, "%u", c_len); + return mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen); +} + +static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + char *buffer, int buflen) +{ + enum ip_conntrack_info ctinfo; + struct 
nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + + if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, + &matchoff, &matchlen) <= 0) + return -ENOENT; + return mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL; +} + +static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + const union nf_inet_addr *addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + char buffer[INET6_ADDRSTRLEN]; + unsigned int buflen; + + buflen = sip_sprintf_addr(ct, buffer, addr, false); + if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, + sdpoff, type, term, buffer, buflen)) + return 0; + + return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, + unsigned int matchlen, + u_int16_t port) +{ + char buffer[sizeof("nnnnn")]; + unsigned int buflen; + + buflen = sprintf(buffer, "%u", port); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) + return 0; + + return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + const union nf_inet_addr *addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + char buffer[INET6_ADDRSTRLEN]; + unsigned int buflen; + + /* Mangle session description owner and contact addresses */ + buflen = sip_sprintf_addr(ct, buffer, addr, false); + if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, + SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen)) + return 0; + + switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, + SDP_HDR_CONNECTION, SDP_HDR_MEDIA, + buffer, buflen)) { + case 0: + /* + * RFC 2327: + * + * Session description + * + * c=* (connection information - not required if included in all media) + */ + case -ENOENT: + break; + default: + return 0; + } + + return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. 
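
[Editor's note: mangle_content_len() above recomputes Content-Length after every SDP rewrite, measuring the body from the v= version line to the end of the message; matchoff points just past "v=", so strlen("v=") is added back. A user-space model of the calculation, on a hypothetical sample message:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *msg =
            "INVITE sip:b@example.org SIP/2.0\r\n"
            "Content-Length: 0\r\n\r\n"
            "v=0\r\no=a 1 1 IN IP4 192.0.2.1\r\n";
        /* "v=" is present in the sample, so strstr() cannot fail here */
        const char *v = strstr(msg, "v=");
        size_t datalen = strlen(msg);
        size_t matchoff = (size_t)(v - msg) + strlen("v=");

        /* body length = datalen - matchoff + strlen("v=") */
        printf("Content-Length: %zu\n",
               datalen - matchoff + strlen("v="));  /* 31 */
        return 0;
    }
]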
*/ +static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp, + unsigned int mediaoff, + unsigned int medialen, + union nf_inet_addr *rtp_addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int16_t port; + + /* Connection will come from reply */ + if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3)) + *rtp_addr = rtp_exp->tuple.dst.u3; + else + *rtp_addr = ct->tuplehash[!dir].tuple.dst.u3; + + rtp_exp->saved_addr = rtp_exp->tuple.dst.u3; + rtp_exp->tuple.dst.u3 = *rtp_addr; + rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; + rtp_exp->dir = !dir; + rtp_exp->expectfn = nf_nat_sip_expected; + + rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3; + rtcp_exp->tuple.dst.u3 = *rtp_addr; + rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; + rtcp_exp->dir = !dir; + rtcp_exp->expectfn = nf_nat_sip_expected; + + /* Try to get same pair of ports: if not, try to change them. */ + for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); + port != 0; port += 2) { + int ret; + + rtp_exp->tuple.dst.u.udp.port = htons(port); + ret = nf_ct_expect_related(rtp_exp); + if (ret == -EBUSY) + continue; + else if (ret < 0) { + port = 0; + break; + } + rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { + nf_ct_unexpect_related(rtp_exp); + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use for SDP media"); + goto err1; + } + + /* Update media port. */ + if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && + !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen, + mediaoff, medialen, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle SDP message"); + goto err2; + } + + return NF_ACCEPT; + +err2: + nf_ct_unexpect_related(rtp_exp); + nf_ct_unexpect_related(rtcp_exp); +err1: + return NF_DROP; +} + +static struct nf_ct_helper_expectfn sip_nat = { + .name = "sip", + .expectfn = nf_nat_sip_expected, +}; + +static void __exit nf_nat_sip_fini(void) +{ + RCU_INIT_POINTER(nf_nat_sip_hooks, NULL); + + nf_ct_helper_expectfn_unregister(&sip_nat); + synchronize_rcu(); +} + +static const struct nf_nat_sip_hooks sip_hooks = { + .msg = nf_nat_sip, + .seq_adjust = nf_nat_sip_seq_adjust, + .expect = nf_nat_sip_expect, + .sdp_addr = nf_nat_sdp_addr, + .sdp_port = nf_nat_sdp_port, + .sdp_session = nf_nat_sdp_session, + .sdp_media = nf_nat_sdp_media, +}; + +static int __init nf_nat_sip_init(void) +{ + BUG_ON(nf_nat_sip_hooks != NULL); + RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks); + nf_ct_helper_expectfn_register(&sip_nat); + return 0; +} + +module_init(nf_nat_sip_init); +module_exit(nf_nat_sip_fini); diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c new file mode 100644 index 00000000000..7f67e1d5310 --- /dev/null +++ b/net/netfilter/nf_nat_tftp.c @@ -0,0 +1,52 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
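
[Editor's note: nf_nat_sdp_media(), added just below, must reserve two consecutive ports, RTP on the lower one and RTCP directly above it, so its search steps by two and releases the RTP expectation again whenever the RTCP port turns out to be taken. A toy model of the pairing, with busy_demo standing in for nf_ct_expect_related() failing with -EBUSY:

    #include <stdio.h>
    #include <stdbool.h>

    /* Find p such that both p and p+1 are free, stepping by 2. */
    static bool reserve_pair(bool (*busy)(int), int *port)
    {
        for (int p = *port; p != 0 && p < 65535; p += 2) {
            if (busy(p) || busy(p + 1))
                continue;
            *port = p;
            return true;
        }
        return false;
    }

    static bool busy_demo(int p) { return p == 30000 || p == 30003; }

    int main(void)
    {
        int port = 30000;

        if (reserve_pair(busy_demo, &port))
            printf("rtp=%d rtcp=%d\n", port, port + 1); /* rtp=30004 rtcp=30005 */
        return 0;
    }
]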
+ */ + +#include <linux/module.h> +#include <linux/udp.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_tftp.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("TFTP NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_tftp"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) +{ + const struct nf_conn *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = nf_nat_follow_master; + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, exp->master, "cannot add expectation"); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit nf_nat_tftp_fini(void) +{ + RCU_INIT_POINTER(nf_nat_tftp_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_tftp_init(void) +{ + BUG_ON(nf_nat_tftp_hook != NULL); + RCU_INIT_POINTER(nf_nat_tftp_hook, help); + return 0; +} + +module_init(nf_nat_tftp_init); +module_exit(nf_nat_tftp_fini); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b1f2ace96f6..5d24b1fdb59 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -1,4 +1,10 @@ +/* + * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 + */ + #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> @@ -7,350 +13,216 @@ #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/protocol.h> +#include <net/netfilter/nf_queue.h> +#include <net/dst.h> #include "nf_internals.h" /* - * A queue handler may be registered for each protocol. Each is protected by - * long term mutex. The handler must provide an an outfn() to accept packets - * for queueing and must reinject all packets it receives, no matter what. + * Hook for nfnetlink_queue to register its queue handler. + * We do this so that most of the NFQUEUE code can be modular. + * + * Once the queue is registered it must reinject all packets it + * receives, no matter what. */ -static struct nf_queue_handler *queue_handler[NPROTO]; - -static DEFINE_RWLOCK(queue_handler_lock); +static const struct nf_queue_handler __rcu *queue_handler __read_mostly; /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. 
*/ -int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh) { - int ret; - - if (pf >= NPROTO) - return -EINVAL; - - write_lock_bh(&queue_handler_lock); - if (queue_handler[pf] == qh) - ret = -EEXIST; - else if (queue_handler[pf]) - ret = -EBUSY; - else { - queue_handler[pf] = qh; - ret = 0; - } - write_unlock_bh(&queue_handler_lock); - - return ret; + /* should never happen, we only have one queueing backend in kernel */ + WARN_ON(rcu_access_pointer(queue_handler)); + rcu_assign_pointer(queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(int pf) +void nf_unregister_queue_handler(void) { - if (pf >= NPROTO) - return -EINVAL; + RCU_INIT_POINTER(queue_handler, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_unregister_queue_handler); - write_lock_bh(&queue_handler_lock); - queue_handler[pf] = NULL; - write_unlock_bh(&queue_handler_lock); +void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +{ + /* Release those devices we held, or Alexey will kill me. */ + if (entry->indev) + dev_put(entry->indev); + if (entry->outdev) + dev_put(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; - return 0; + if (nf_bridge->physindev) + dev_put(nf_bridge->physindev); + if (nf_bridge->physoutdev) + dev_put(nf_bridge->physoutdev); + } +#endif + /* Drop reference to owner of hook which queued us. */ + module_put(entry->elem->owner); } -EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); -void nf_unregister_queue_handlers(struct nf_queue_handler *qh) +/* Bump dev refs so they don't vanish while packet is out */ +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { - int pf; + if (!try_module_get(entry->elem->owner)) + return false; + + if (entry->indev) + dev_hold(entry->indev); + if (entry->outdev) + dev_hold(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + struct net_device *physdev; - write_lock_bh(&queue_handler_lock); - for (pf = 0; pf < NPROTO; pf++) { - if (queue_handler[pf] == qh) - queue_handler[pf] = NULL; + physdev = nf_bridge->physindev; + if (physdev) + dev_hold(physdev); + physdev = nf_bridge->physoutdev; + if (physdev) + dev_hold(physdev); } - write_unlock_bh(&queue_handler_lock); +#endif + + return true; } -EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); +EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); /* * Any packet that leaves via this function must come back * through nf_reinject(). */ -static int __nf_queue(struct sk_buff *skb, - struct list_head *elem, - int pf, unsigned int hook, +int nf_queue(struct sk_buff *skb, + struct nf_hook_ops *elem, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), unsigned int queuenum) { - int status; - struct nf_info *info; -#ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev = NULL; - struct net_device *physoutdev = NULL; -#endif - struct nf_afinfo *afinfo; - - /* QUEUE == DROP if noone is waiting, to be safe. 
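
[Editor's note: nf_queue_entry_get_refs()/nf_queue_entry_release_refs() above pin everything a queued packet still points at (input/output devices, bridge physdevs, the hook's owning module) for as long as the packet sits in the userspace queue, and drop those references exactly once when the verdict comes back. The generic shape of that pattern, as a toy refcount:

    #include <stdio.h>

    struct obj { int refs; const char *name; };

    static void get(struct obj *o) { o->refs++; }
    static void put(struct obj *o)
    {
        if (--o->refs == 0)
            printf("%s freed\n", o->name);
    }

    int main(void)
    {
        struct obj indev = { 1, "eth0" };

        get(&indev);   /* packet queued: nf_queue_entry_get_refs() */
        put(&indev);   /* device layer drops its own reference */
        printf("%d\n", indev.refs); /* 1: still pinned by the entry */
        put(&indev);   /* verdict arrives: release_refs(); now freed */
        return 0;
    }
]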
*/ - read_lock(&queue_handler_lock); - if (!queue_handler[pf]) { - read_unlock(&queue_handler_lock); - kfree_skb(skb); - return 1; - } + int status = -ENOENT; + struct nf_queue_entry *entry = NULL; + const struct nf_afinfo *afinfo; + const struct nf_queue_handler *qh; - afinfo = nf_get_afinfo(pf); - if (!afinfo) { - read_unlock(&queue_handler_lock); - kfree_skb(skb); - return 1; - } - - info = kmalloc(sizeof(*info) + afinfo->route_key_size, GFP_ATOMIC); - if (!info) { - if (net_ratelimit()) - printk(KERN_ERR "OOM queueing packet %p\n", - skb); - read_unlock(&queue_handler_lock); - kfree_skb(skb); - return 1; - } - - *info = (struct nf_info) { - (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + /* QUEUE == DROP if no one is waiting, to be safe. */ + rcu_read_lock(); - /* If it's going away, ignore hook. */ - if (!try_module_get(info->elem->owner)) { - read_unlock(&queue_handler_lock); - kfree(info); - return 0; + qh = rcu_dereference(queue_handler); + if (!qh) { + status = -ESRCH; + goto err_unlock; } - /* Bump dev refs so they don't vanish while packet is out */ - if (indev) dev_hold(indev); - if (outdev) dev_hold(outdev); - -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; - if (physindev) dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev) dev_hold(physoutdev); - } -#endif - afinfo->saveroute(skb, info); - status = queue_handler[pf]->outfn(skb, info, queuenum, - queue_handler[pf]->data); + afinfo = nf_get_afinfo(pf); + if (!afinfo) + goto err_unlock; + + entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); + if (!entry) { + status = -ENOMEM; + goto err_unlock; + } + + *entry = (struct nf_queue_entry) { + .skb = skb, + .elem = elem, + .pf = pf, + .hook = hook, + .indev = indev, + .outdev = outdev, + .okfn = okfn, + .size = sizeof(*entry) + afinfo->route_key_size, + }; + + if (!nf_queue_entry_get_refs(entry)) { + status = -ECANCELED; + goto err_unlock; + } + skb_dst_force(skb); + afinfo->saveroute(skb, entry); + status = qh->outfn(entry, queuenum); - read_unlock(&queue_handler_lock); + rcu_read_unlock(); if (status < 0) { - /* James M doesn't say fuck enough. 
*/ - if (indev) dev_put(indev); - if (outdev) dev_put(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (physindev) dev_put(physindev); - if (physoutdev) dev_put(physoutdev); -#endif - module_put(info->elem->owner); - kfree(info); - kfree_skb(skb); - - return 1; + nf_queue_entry_release_refs(entry); + goto err; } - return 1; -} - -int nf_queue(struct sk_buff *skb, - struct list_head *elem, - int pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum) -{ - struct sk_buff *segs; - - if (!skb_is_gso(skb)) - return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - queuenum); - - switch (pf) { - case AF_INET: - skb->protocol = htons(ETH_P_IP); - break; - case AF_INET6: - skb->protocol = htons(ETH_P_IPV6); - break; - } - - segs = skb_gso_segment(skb, 0); - kfree_skb(skb); - if (unlikely(IS_ERR(segs))) - return 1; - - do { - struct sk_buff *nskb = segs->next; + return 0; - segs->next = NULL; - if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn, - queuenum)) - kfree_skb(segs); - segs = nskb; - } while (segs); - return 1; +err_unlock: + rcu_read_unlock(); +err: + kfree(entry); + return status; } -void nf_reinject(struct sk_buff *skb, struct nf_info *info, - unsigned int verdict) +void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { - struct list_head *elem = &info->elem->list; - struct list_head *i; - struct nf_afinfo *afinfo; + struct sk_buff *skb = entry->skb; + struct nf_hook_ops *elem = entry->elem; + const struct nf_afinfo *afinfo; + int err; rcu_read_lock(); - /* Release those devices we held, or Alexey will kill me. */ - if (info->indev) dev_put(info->indev); - if (info->outdev) dev_put(info->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - if (skb->nf_bridge->physindev) - dev_put(skb->nf_bridge->physindev); - if (skb->nf_bridge->physoutdev) - dev_put(skb->nf_bridge->physoutdev); - } -#endif - - /* Drop reference to owner of hook which queued us. */ - module_put(info->elem->owner); - - list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { - if (i == elem) - break; - } - - if (i == &nf_hooks[info->pf][info->hook]) { - /* The module which sent it to userspace is gone. */ - NFDEBUG("%s: module disappeared, dropping packet.\n", - __FUNCTION__); - verdict = NF_DROP; - } + nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... 
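
[Editor's note: the reworked nf_reinject() below unpacks the verdict word: the low bits carry the verdict code, the upper 16 bits the target queue number (verdict >> NF_VERDICT_QBITS), and bit 15 marks queue-bypass, which turns a missing queue handler (-ESRCH) into "continue traversal" instead of a drop. The constants below restate uapi/linux/netfilter.h values for illustration:

    #include <stdio.h>

    #define NF_VERDICT_MASK              0x000000ff
    #define NF_VERDICT_FLAG_QUEUE_BYPASS 0x00008000
    #define NF_VERDICT_QBITS             16
    #define NF_QUEUE                     3

    int main(void)
    {
        unsigned int verdict = NF_QUEUE | NF_VERDICT_FLAG_QUEUE_BYPASS |
                               (7u << NF_VERDICT_QBITS);

        printf("verdict=%u queue=%u bypass=%d\n",
               verdict & NF_VERDICT_MASK,       /* 3: NF_QUEUE */
               verdict >> NF_VERDICT_QBITS,     /* 7 */
               !!(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)); /* 1 */
        return 0;
    }
]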
*/ if (verdict == NF_REPEAT) { - elem = elem->prev; + elem = list_entry(elem->list.prev, struct nf_hook_ops, list); verdict = NF_ACCEPT; } if (verdict == NF_ACCEPT) { - afinfo = nf_get_afinfo(info->pf); - if (!afinfo || afinfo->reroute(&skb, info) < 0) + afinfo = nf_get_afinfo(entry->pf); + if (!afinfo || afinfo->reroute(skb, entry) < 0) verdict = NF_DROP; } if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[info->pf][info->hook], - &skb, info->hook, - info->indev, info->outdev, &elem, - info->okfn, INT_MIN); + verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook], + skb, entry->hook, + entry->indev, entry->outdev, &elem, + entry->okfn, INT_MIN); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: - info->okfn(skb); - case NF_STOLEN: + local_bh_disable(); + entry->okfn(skb); + local_bh_enable(); break; case NF_QUEUE: - if (!__nf_queue(skb, elem, info->pf, info->hook, - info->indev, info->outdev, info->okfn, - verdict >> NF_VERDICT_BITS)) - goto next_hook; + err = nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } + break; + case NF_STOLEN: break; default: kfree_skb(skb); } rcu_read_unlock(); - kfree(info); - return; + kfree(entry); } EXPORT_SYMBOL(nf_reinject); - -#ifdef CONFIG_PROC_FS -static void *seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos >= NPROTO) - return NULL; - - return pos; -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - - if (*pos >= NPROTO) - return NULL; - - return pos; -} - -static void seq_stop(struct seq_file *s, void *v) -{ - -} - -static int seq_show(struct seq_file *s, void *v) -{ - int ret; - loff_t *pos = v; - struct nf_queue_handler *qh; - - read_lock_bh(&queue_handler_lock); - qh = queue_handler[*pos]; - if (!qh) - ret = seq_printf(s, "%2lld NONE\n", *pos); - else - ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); - read_unlock_bh(&queue_handler_lock); - - return ret; -} - -static struct seq_operations nfqueue_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nfqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &nfqueue_seq_ops); -} - -static const struct file_operations nfqueue_file_ops = { - .owner = THIS_MODULE, - .open = nfqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif /* PROC_FS */ - - -int __init netfilter_queue_init(void) -{ -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *pde; - - pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter); - if (!pde) - return -1; - pde->proc_fops = &nfqueue_file_ops; -#endif - return 0; -} - diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 8b8ece75031..f042ae52155 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -23,14 +23,13 @@ static inline int overlap(int min1, int max1, int min2, int max2) /* Functions to register sockopt ranges (exclusive). 
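 *
 * A protocol module claims a contiguous option range for one family
 * roughly like this (a sketch; the names are invented, the fields are
 * the ones the overlap check below looks at):
 *
 *	static struct nf_sockopt_ops my_sockopt_ops = {
 *		.pf		= PF_INET,
 *		.set_optmin	= MY_SO_BASE,
 *		.set_optmax	= MY_SO_BASE + MY_SO_NUM,
 *		.set		= my_set,
 *		.get_optmin	= MY_SO_BASE,
 *		.get_optmax	= MY_SO_BASE + MY_SO_NUM,
 *		.get		= my_get,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	err = nf_register_sockopt(&my_sockopt_ops);
 *
 * Registration fails when the range collides with one already
 * registered for the same pf; as noted above, optmax is exclusive.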
*/ int nf_register_sockopt(struct nf_sockopt_ops *reg) { - struct list_head *i; + struct nf_sockopt_ops *ops; int ret = 0; if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) return -EINTR; - list_for_each(i, &nf_sockopts) { - struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == reg->pf && (overlap(ops->set_optmin, ops->set_optmax, reg->set_optmin, reg->set_optmax) @@ -55,144 +54,113 @@ EXPORT_SYMBOL(nf_register_sockopt); void nf_unregister_sockopt(struct nf_sockopt_ops *reg) { - /* No point being interruptible: we're probably in cleanup_module() */ - restart: mutex_lock(&nf_sockopt_mutex); - if (reg->use != 0) { - /* To be woken by nf_sockopt call... */ - /* FIXME: Stuart Young's name appears gratuitously. */ - set_current_state(TASK_UNINTERRUPTIBLE); - reg->cleanup_task = current; - mutex_unlock(&nf_sockopt_mutex); - schedule(); - goto restart; - } list_del(®->list); mutex_unlock(&nf_sockopt_mutex); } EXPORT_SYMBOL(nf_unregister_sockopt); -/* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, int pf, int val, - char __user *opt, int *len, int get) +static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, + int val, int get) { - struct list_head *i; struct nf_sockopt_ops *ops; - int ret; if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; + return ERR_PTR(-EINTR); - list_for_each(i, &nf_sockopts) { - ops = (struct nf_sockopt_ops *)i; + list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == pf) { + if (!try_module_get(ops->owner)) + goto out_nosup; + if (get) { - if (val >= ops->get_optmin - && val < ops->get_optmax) { - ops->use++; - mutex_unlock(&nf_sockopt_mutex); - ret = ops->get(sk, val, opt, len); + if (val >= ops->get_optmin && + val < ops->get_optmax) goto out; - } } else { - if (val >= ops->set_optmin - && val < ops->set_optmax) { - ops->use++; - mutex_unlock(&nf_sockopt_mutex); - ret = ops->set(sk, val, opt, *len); + if (val >= ops->set_optmin && + val < ops->set_optmax) goto out; - } } + module_put(ops->owner); } } +out_nosup: + ops = ERR_PTR(-ENOPROTOOPT); +out: mutex_unlock(&nf_sockopt_mutex); - return -ENOPROTOOPT; + return ops; +} - out: - mutex_lock(&nf_sockopt_mutex); - ops->use--; - if (ops->cleanup_task) - wake_up_process(ops->cleanup_task); - mutex_unlock(&nf_sockopt_mutex); +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, + char __user *opt, int *len, int get) +{ + struct nf_sockopt_ops *ops; + int ret; + + ops = nf_sockopt_find(sk, pf, val, get); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + if (get) + ret = ops->get(sk, val, opt, len); + else + ret = ops->set(sk, val, opt, *len); + + module_put(ops->owner); return ret; } -int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, - int len) +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + unsigned int len) { return nf_sockopt(sk, pf, val, opt, &len, 0); } EXPORT_SYMBOL(nf_setsockopt); -int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + int *len) { return nf_sockopt(sk, pf, val, opt, len, 1); } EXPORT_SYMBOL(nf_getsockopt); #ifdef CONFIG_COMPAT -static int compat_nf_sockopt(struct sock *sk, int pf, int val, +static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len, int get) { - struct list_head *i; struct nf_sockopt_ops *ops; int ret; - if 
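/*
 * Both the native and the compat paths now funnel through
 * nf_sockopt_find(), which returns with ops->owner pinned by
 * try_module_get(); the caller issues the matching module_put() once
 * the get/set callback has run, so a handler module cannot be
 * unloaded in the middle of a call. This replaces the use counter and
 * cleanup_task handshake removed above.
 */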
(mutex_lock_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - - list_for_each(i, &nf_sockopts) { - ops = (struct nf_sockopt_ops *)i; - if (ops->pf == pf) { - if (get) { - if (val >= ops->get_optmin - && val < ops->get_optmax) { - ops->use++; - mutex_unlock(&nf_sockopt_mutex); - if (ops->compat_get) - ret = ops->compat_get(sk, - val, opt, len); - else - ret = ops->get(sk, - val, opt, len); - goto out; - } - } else { - if (val >= ops->set_optmin - && val < ops->set_optmax) { - ops->use++; - mutex_unlock(&nf_sockopt_mutex); - if (ops->compat_set) - ret = ops->compat_set(sk, - val, opt, *len); - else - ret = ops->set(sk, - val, opt, *len); - goto out; - } - } - } + ops = nf_sockopt_find(sk, pf, val, get); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + if (get) { + if (ops->compat_get) + ret = ops->compat_get(sk, val, opt, len); + else + ret = ops->get(sk, val, opt, len); + } else { + if (ops->compat_set) + ret = ops->compat_set(sk, val, opt, *len); + else + ret = ops->set(sk, val, opt, *len); } - mutex_unlock(&nf_sockopt_mutex); - return -ENOPROTOOPT; - out: - mutex_lock(&nf_sockopt_mutex); - ops->use--; - if (ops->cleanup_task) - wake_up_process(ops->cleanup_task); - mutex_unlock(&nf_sockopt_mutex); + module_put(ops->owner); return ret; } -int compat_nf_setsockopt(struct sock *sk, int pf, - int val, char __user *opt, int len) +int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, + int val, char __user *opt, unsigned int len) { return compat_nf_sockopt(sk, pf, val, opt, &len, 0); } EXPORT_SYMBOL(compat_nf_setsockopt); -int compat_nf_getsockopt(struct sock *sk, int pf, +int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { return compat_nf_sockopt(sk, pf, val, opt, len, 1); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c new file mode 100644 index 00000000000..52e20c9a46a --- /dev/null +++ b/net/netfilter/nf_synproxy_core.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> +#include <net/tcp.h> +#include <net/netns/generic.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +int synproxy_net_id; +EXPORT_SYMBOL_GPL(synproxy_net_id); + +bool +synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, struct synproxy_options *opts) +{ + int length = (th->doff * 4) - sizeof(*th); + u8 buf[40], *ptr; + + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); + if (ptr == NULL) + return false; + + opts->options = 0; + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return true; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return true; + if (opsize > length) + return true; + + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS) { + opts->mss = get_unaligned_be16(ptr); + opts->options |= XT_SYNPROXY_OPT_MSS; + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW) { + opts->wscale = *ptr; + if (opts->wscale > 14) + opts->wscale = 14; + opts->options |= XT_SYNPROXY_OPT_WSCALE; + } + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP) { + opts->tsval = get_unaligned_be32(ptr); + opts->tsecr = get_unaligned_be32(ptr + 4); + opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + break; + } + + ptr += opsize - 2; + length -= opsize; + } + } + return true; +} +EXPORT_SYMBOL_GPL(synproxy_parse_options); + +unsigned int synproxy_options_size(const struct synproxy_options *opts) +{ + unsigned int size = 0; + + if (opts->options & XT_SYNPROXY_OPT_MSS) + size += TCPOLEN_MSS_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + size += TCPOLEN_TSTAMP_ALIGNED; + else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + size += TCPOLEN_SACKPERM_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + size += TCPOLEN_WSCALE_ALIGNED; + + return size; +} +EXPORT_SYMBOL_GPL(synproxy_options_size); + +void +synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) +{ + __be32 *ptr = (__be32 *)(th + 1); + u8 options = opts->options; + + if (options & XT_SYNPROXY_OPT_MSS) + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + + if (options & XT_SYNPROXY_OPT_TIMESTAMP) { + if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + + if (options & XT_SYNPROXY_OPT_WSCALE) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->wscale); +} +EXPORT_SYMBOL_GPL(synproxy_build_options); + +void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options 
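+/*
+ * Timestamp cookie layout used by the helpers below: the tsval sent
+ * to the client is tcp_time_stamp with the low six bits replaced by
+ * encoded state, which the client later echoes back in tsecr:
+ *
+ *	bits 0-3: client's window scale (0xf when no WSCALE option)
+ *	bit    4: SACK permitted
+ *	bit    5: ECN
+ */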
*opts) +{ + opts->tsecr = opts->tsval; + opts->tsval = tcp_time_stamp & ~0x3f; + + if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + opts->tsval |= opts->wscale; + opts->wscale = info->wscale; + } else + opts->tsval |= 0xf; + + if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + opts->tsval |= 1 << 4; + + if (opts->options & XT_SYNPROXY_OPT_ECN) + opts->tsval |= 1 << 5; +} +EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); + +void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +{ + opts->wscale = opts->tsecr & 0xf; + if (opts->wscale != 0xf) + opts->options |= XT_SYNPROXY_OPT_WSCALE; + + opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + + opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; +} +EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); + +unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) +{ + unsigned int optoff, optend; + u32 *ptr, old; + + if (synproxy->tsoff == 0) + return 1; + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + th->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + while (optoff < optend) { + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_TIMESTAMP && + op[1] == TCPOLEN_TIMESTAMP) { + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + ptr = (u32 *)&op[2]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) - + synproxy->tsoff); + } else { + ptr = (u32 *)&op[6]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) + + synproxy->tsoff); + } + inet_proto_csum_replace4(&th->check, skb, + old, *ptr, 0); + return 1; + } + optoff += op[1]; + } + } + return 1; +} +EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); + +static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { + .len = sizeof(struct nf_conn_synproxy), + .align = __alignof__(struct nf_conn_synproxy), + .id = NF_CT_EXT_SYNPROXY, +}; + +#ifdef CONFIG_PROC_FS +static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) +{ + return; +} + +static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct synproxy_stats *stats = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries\t\tsyn_received\t" + "cookie_invalid\tcookie_valid\t" + "cookie_retrans\tconn_reopened\n"); + return 0; + } + + seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, + stats->syn_received, + stats->cookie_invalid, + stats->cookie_valid, + stats->cookie_retrans, + stats->conn_reopened); + + return 0; +} + +static const struct seq_operations synproxy_cpu_seq_ops = { + .start = synproxy_cpu_seq_start, + .next = 
synproxy_cpu_seq_next, + .stop = synproxy_cpu_seq_stop, + .show = synproxy_cpu_seq_show, +}; + +static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &synproxy_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations synproxy_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = synproxy_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init synproxy_proc_init(struct net *net) +{ + if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + &synproxy_cpu_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + remove_proc_entry("synproxy", net->proc_net_stat); +} +#else +static int __net_init synproxy_proc_init(struct net *net) +{ + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init synproxy_net_init(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct nf_conntrack_tuple t; + struct nf_conn *ct; + int err = -ENOMEM; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto err1; + } + + if (!nfct_seqadj_ext_add(ct)) + goto err2; + if (!nfct_synproxy_ext_add(ct)) + goto err2; + + nf_conntrack_tmpl_insert(net, ct); + snet->tmpl = ct; + + snet->stats = alloc_percpu(struct synproxy_stats); + if (snet->stats == NULL) + goto err2; + + err = synproxy_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + free_percpu(snet->stats); +err2: + nf_conntrack_free(ct); +err1: + return err; +} + +static void __net_exit synproxy_net_exit(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + + nf_ct_put(snet->tmpl); + synproxy_proc_exit(net); + free_percpu(snet->stats); +} + +static struct pernet_operations synproxy_net_ops = { + .init = synproxy_net_init, + .exit = synproxy_net_exit, + .id = &synproxy_net_id, + .size = sizeof(struct synproxy_net), +}; + +static int __init synproxy_core_init(void) +{ + int err; + + err = nf_ct_extend_register(&nf_ct_synproxy_extend); + if (err < 0) + goto err1; + + err = register_pernet_subsys(&synproxy_net_ops); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +err1: + return err; +} + +static void __exit synproxy_core_exit(void) +{ + unregister_pernet_subsys(&synproxy_net_ops); + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +} + +module_init(synproxy_core_init); +module_exit(synproxy_core_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_sysctl.c b/net/netfilter/nf_sysctl.c deleted file mode 100644 index ee34589e48a..00000000000 --- a/net/netfilter/nf_sysctl.c +++ /dev/null @@ -1,134 +0,0 @@ -/* nf_sysctl.c netfilter sysctl registration/unregistation - * - * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> - */ -#include <linux/module.h> -#include <linux/sysctl.h> -#include <linux/string.h> -#include <linux/slab.h> - -static void -path_free(struct ctl_table *path, struct ctl_table *table) -{ - struct ctl_table *t, *next; - - for (t = path; t != NULL && t != table; t = next) { - next = t->child; - kfree(t); - } -} - -static struct ctl_table * -path_dup(struct ctl_table *path, struct ctl_table *table) -{ - struct ctl_table *t, *last = NULL, *tmp; - - for (t = path; t != NULL; t = t->child) { - /* twice the size since path 
elements are terminated by an - * empty element */ - tmp = kmemdup(t, 2 * sizeof(*t), GFP_KERNEL); - if (tmp == NULL) { - if (last != NULL) - path_free(path, table); - return NULL; - } - - if (last != NULL) - last->child = tmp; - else - path = tmp; - last = tmp; - } - - if (last != NULL) - last->child = table; - else - path = table; - - return path; -} - -struct ctl_table_header * -nf_register_sysctl_table(struct ctl_table *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - - path = path_dup(path, table); - if (path == NULL) - return NULL; - header = register_sysctl_table(path); - if (header == NULL) - path_free(path, table); - return header; -} -EXPORT_SYMBOL_GPL(nf_register_sysctl_table); - -void -nf_unregister_sysctl_table(struct ctl_table_header *header, - struct ctl_table *table) -{ - struct ctl_table *path = header->ctl_table; - - unregister_sysctl_table(header); - path_free(path, table); -} -EXPORT_SYMBOL_GPL(nf_unregister_sysctl_table); - -/* net/netfilter */ -static struct ctl_table nf_net_netfilter_table[] = { - { - .ctl_name = NET_NETFILTER, - .procname = "netfilter", - .mode = 0555, - }, - { - .ctl_name = 0 - } -}; -struct ctl_table nf_net_netfilter_sysctl_path[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nf_net_netfilter_table, - }, - { - .ctl_name = 0 - } -}; -EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); - -/* net/ipv4/netfilter */ -static struct ctl_table nf_net_ipv4_netfilter_table[] = { - { - .ctl_name = NET_IPV4_NETFILTER, - .procname = "netfilter", - .mode = 0555, - }, - { - .ctl_name = 0 - } -}; -static struct ctl_table nf_net_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = nf_net_ipv4_netfilter_table, - }, - { - .ctl_name = 0 - } -}; -struct ctl_table nf_net_ipv4_netfilter_sysctl_path[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nf_net_ipv4_table, - }, - { - .ctl_name = 0 - } -}; -EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c new file mode 100644 index 00000000000..8746ff9a835 --- /dev/null +++ b/net/netfilter/nf_tables_api.c @@ -0,0 +1,4041 @@ +/* + * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +static LIST_HEAD(nf_tables_expressions); + +/** + * nft_register_afinfo - register nf_tables address family info + * + * @afi: address family info to register + * + * Register the address family for use with nf_tables. Returns zero on + * success or a negative errno code otherwise. 
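+ *
+ * Address family modules call this from their pernet init path,
+ * along the lines of (sketch only; the fields shown are the ones
+ * this file uses):
+ *
+ *	static struct nft_af_info nft_af_ipv4 __read_mostly = {
+ *		.family	= NFPROTO_IPV4,
+ *		.nhooks	= NF_INET_NUMHOOKS,
+ *		.owner	= THIS_MODULE,
+ *	};
+ *
+ *	nft_register_afinfo(net, &nft_af_ipv4);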
+ */ +int nft_register_afinfo(struct net *net, struct nft_af_info *afi) +{ + INIT_LIST_HEAD(&afi->tables); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_tail_rcu(&afi->list, &net->nft.af_info); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return 0; +} +EXPORT_SYMBOL_GPL(nft_register_afinfo); + +/** + * nft_unregister_afinfo - unregister nf_tables address family info + * + * @afi: address family info to unregister + * + * Unregister the address family for use with nf_tables. + */ +void nft_unregister_afinfo(struct nft_af_info *afi) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&afi->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_afinfo); + +static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family) +{ + struct nft_af_info *afi; + + list_for_each_entry(afi, &net->nft.af_info, list) { + if (afi->family == family) + return afi; + } + return NULL; +} + +static struct nft_af_info * +nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) +{ + struct nft_af_info *afi; + + afi = nft_afinfo_lookup(net, family); + if (afi != NULL) + return afi; +#ifdef CONFIG_MODULES + if (autoload) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-afinfo-%u", family); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + afi = nft_afinfo_lookup(net, family); + if (afi != NULL) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-EAFNOSUPPORT); +} + +static void nft_ctx_init(struct nft_ctx *ctx, + const struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nft_af_info *afi, + struct nft_table *table, + struct nft_chain *chain, + const struct nlattr * const *nla) +{ + ctx->net = sock_net(skb->sk); + ctx->afi = afi; + ctx->table = table; + ctx->chain = chain; + ctx->nla = nla; + ctx->portid = NETLINK_CB(skb).portid; + ctx->report = nlmsg_report(nlh); + ctx->seq = nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, + u32 size) +{ + struct nft_trans *trans; + + trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); + if (trans == NULL) + return NULL; + + trans->msg_type = msg_type; + trans->ctx = *ctx; + + return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + list_del(&trans->list); + kfree(trans); +} + +/* + * Tables + */ + +static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, + const struct nlattr *nla) +{ + struct nft_table *table; + + list_for_each_entry(table, &afi->tables, list) { + if (!nla_strcmp(nla, table->name)) + return table; + } + return NULL; +} + +static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, + const struct nlattr *nla) +{ + struct nft_table *table; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + table = nft_table_lookup(afi, nla); + if (table != NULL) + return table; + + return ERR_PTR(-ENOENT); +} + +static inline u64 nf_tables_alloc_handle(struct nft_table *table) +{ + return ++table->hgenerator; +} + +static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; + +static const struct nf_chain_type * +__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +{ + int i; + + for (i = 0; i < NFT_CHAIN_T_MAX; i++) { + if (chain_type[family][i] != NULL && + !nla_strcmp(nla, chain_type[family][i]->name)) + return chain_type[family][i]; + } + return NULL; +} + +static const struct nf_chain_type * +nf_tables_chain_type_lookup(const struct nft_af_info *afi, + const struct nlattr *nla, + bool autoload) +{ + const struct nf_chain_type *type; + + type = 
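+	/*
+	 * If the chain type is not known yet and autoloading is allowed,
+	 * the nfnetlink mutex is dropped so request_module() can pull in
+	 * "nft-chain-<family>-<name>"; returning -EAGAIN afterwards makes
+	 * nfnetlink replay the message, and the second lookup then finds
+	 * the freshly loaded type.
+	 */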
__nf_tables_chain_type_lookup(afi->family, nla); + if (type != NULL) + return type; +#ifdef CONFIG_MODULES + if (autoload) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-chain-%u-%.*s", afi->family, + nla_len(nla), (const char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + type = __nf_tables_chain_type_lookup(afi->family, nla); + if (type != NULL) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { + [NFTA_TABLE_NAME] = { .type = NLA_STRING }, + [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, +}; + +static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, u32 flags, int family, + const struct nft_table *table) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || + nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || + nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_dump_tables(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_table_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWTABLE, + NLM_F_MULTI, + afi->family, table) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +done: + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + +static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + struct sk_buff *skb2; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct 
netlink_dump_control c = { + .dump = nf_tables_dump_tables, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, + family, table); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: + kfree_skb(skb2); + return err; +} + +static int nf_tables_table_enable(const struct nft_af_info *afi, + struct nft_table *table) +{ + struct nft_chain *chain; + int err, i = 0; + + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + if (err < 0) + goto err; + + i++; + } + return 0; +err: + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + if (i-- <= 0) + break; + + nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); + } + return err; +} + +static void nf_tables_table_disable(const struct nft_af_info *afi, + struct nft_table *table) +{ + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) { + if (chain->flags & NFT_BASE_CHAIN) + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } +} + +static int nf_tables_updtable(struct nft_ctx *ctx) +{ + struct nft_trans *trans; + u32 flags; + int ret = 0; + + if (!ctx->nla[NFTA_TABLE_FLAGS]) + return 0; + + flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + + if (flags == ctx->table->flags) + return 0; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, + sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if ((flags & NFT_TABLE_F_DORMANT) && + !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { + nft_trans_table_enable(trans) = false; + } else if (!(flags & NFT_TABLE_F_DORMANT) && + ctx->table->flags & NFT_TABLE_F_DORMANT) { + ret = nf_tables_table_enable(ctx->afi, ctx->table); + if (ret >= 0) { + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + nft_trans_table_enable(trans) = true; + } + } + if (ret < 0) + goto err; + + nft_trans_table_update(trans) = true; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +err: + nft_trans_destroy(trans); + return ret; +} + +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nlattr *name; + struct nft_af_info *afi; + struct nft_table *table; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + u32 flags = 0; + struct nft_ctx ctx; + int err; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + name = nla[NFTA_TABLE_NAME]; + table = 
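+	/*
+	 * Table creation is transactional: the new table is linked into
+	 * afi->tables right away but carries the internal
+	 * NFT_TABLE_INACTIVE flag until the transaction queued on
+	 * net->nft.commit_list is committed, and the lookup paths above
+	 * report -ENOENT for a still-inactive table.
+	 */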
nf_tables_table_lookup(afi, name); + if (IS_ERR(table)) { + if (PTR_ERR(table) != -ENOENT) + return PTR_ERR(table); + table = NULL; + } + + if (table != NULL) { + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + return nf_tables_updtable(&ctx); + } + + if (nla[NFTA_TABLE_FLAGS]) { + flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + } + + if (!try_module_get(afi->owner)) + return -EAFNOSUPPORT; + + table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); + if (table == NULL) { + module_put(afi->owner); + return -ENOMEM; + } + + nla_strlcpy(table->name, name, nla_len(name)); + INIT_LIST_HEAD(&table->chains); + INIT_LIST_HEAD(&table->sets); + table->flags = flags; + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); + if (err < 0) { + kfree(table); + module_put(afi->owner); + return err; + } + list_add_tail_rcu(&table->list, &afi->tables); + return 0; +} + +static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct nft_table *table; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family, err; + struct nft_ctx ctx; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + if (table->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + + list_del_rcu(&table->list); + return 0; +} + +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ + BUG_ON(ctx->table->use > 0); + + kfree(ctx->table); + module_put(ctx->afi->owner); +} + +int nft_register_chain_type(const struct nf_chain_type *ctype) +{ + int err = 0; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (chain_type[ctype->family][ctype->type] != NULL) { + err = -EBUSY; + goto out; + } + chain_type[ctype->family][ctype->type] = ctype; +out: + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return err; +} +EXPORT_SYMBOL_GPL(nft_register_chain_type); + +void nft_unregister_chain_type(const struct nf_chain_type *ctype) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + chain_type[ctype->family][ctype->type] = NULL; + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_chain_type); + +/* + * Chains + */ + +static struct nft_chain * +nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) +{ + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) { + if (chain->handle == handle) + return chain; + } + + return ERR_PTR(-ENOENT); +} + +static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, + const struct nlattr *nla) +{ + struct nft_chain *chain; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + list_for_each_entry(chain, &table->chains, list) { + if (!nla_strcmp(nla, chain->name)) + return chain; + } + + return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { + [NFTA_CHAIN_TABLE] = { .type = NLA_STRING }, + [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 
}, + [NFTA_CHAIN_NAME] = { .type = NLA_STRING, + .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, + [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, + [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { + [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, + [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, +}; + +static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +{ + struct nft_stats *cpu_stats, total; + struct nlattr *nest; + unsigned int seq; + u64 pkts, bytes; + int cpu; + + memset(&total, 0, sizeof(total)); + for_each_possible_cpu(cpu) { + cpu_stats = per_cpu_ptr(stats, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + pkts = cpu_stats->pkts; + bytes = cpu_stats->bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + total.pkts += pkts; + total.bytes += bytes; + } + nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) || + nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, u32 flags, int family, + const struct nft_table *table, + const struct nft_chain *chain) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle))) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) + goto nla_put_failure; + + if (chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = nft_base_chain(chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + struct nlattr *nest; + + nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + nla_nest_end(skb, nest); + + if (nla_put_be32(skb, NFTA_CHAIN_POLICY, + htonl(basechain->policy))) + goto nla_put_failure; + + if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) + goto nla_put_failure; + + if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) + goto nla_put_failure; + } + + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, 
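+	/*
+	 * Note the u64_stats_fetch_begin_irq()/retry() loop in
+	 * nft_dump_stats() above: on 32-bit machines a 64-bit counter
+	 * update is not atomic, so each per-cpu sample is re-read until
+	 * it is known not to have been torn by a concurrent writer; on
+	 * 64-bit kernels the sequence counter costs nothing.
+	 */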
ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_dump_chains(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWCHAIN, + NLM_F_MULTI, + afi->family, table, chain) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } +done: + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + struct sk_buff *skb2; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_chains, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, + family, table, chain); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: + kfree_skb(skb2); + return err; +} + +static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { + [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, + [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, +}; + +static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) +{ + struct nlattr *tb[NFTA_COUNTER_MAX+1]; + struct nft_stats __percpu *newstats; + struct nft_stats *stats; + int err; + + err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); + if (err < 0) + return ERR_PTR(err); + + if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) + return ERR_PTR(-EINVAL); + + newstats = netdev_alloc_pcpu_stats(struct nft_stats); + if (newstats == NULL) + return ERR_PTR(-ENOMEM); + + /* Restore old counters on this cpu, no problem. Per-cpu statistics + * are not exposed to userspace. 
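+ * Filling in only the local CPU's slot is sufficient because every
+ * reader (see nft_dump_stats() above) sums the counters across all
+ * possible CPUs anyway.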
+ */ + stats = this_cpu_ptr(newstats); + stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + + return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, + struct nft_stats __percpu *newstats) +{ + if (chain->stats) { + struct nft_stats __percpu *oldstats = + nft_dereference(chain->stats); + + rcu_assign_pointer(chain->stats, newstats); + synchronize_rcu(); + free_percpu(oldstats); + } else + rcu_assign_pointer(chain->stats, newstats); +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ + BUG_ON(chain->use > 0); + + if (chain->flags & NFT_BASE_CHAIN) { + module_put(nft_base_chain(chain)->type->owner); + free_percpu(nft_base_chain(chain)->stats); + kfree(nft_base_chain(chain)); + } else { + kfree(chain); + } +} + +static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nlattr * uninitialized_var(name); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + struct nft_base_chain *basechain = NULL; + struct nlattr *ha[NFTA_HOOK_MAX + 1]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + u8 policy = NF_ACCEPT; + u64 handle = 0; + unsigned int i; + struct nft_stats __percpu *stats; + int err; + bool create; + struct nft_ctx ctx; + + create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + + chain = NULL; + name = nla[NFTA_CHAIN_NAME]; + + if (nla[NFTA_CHAIN_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); + chain = nf_tables_chain_lookup_byhandle(table, handle); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { + chain = nf_tables_chain_lookup(table, name); + if (IS_ERR(chain)) { + if (PTR_ERR(chain) != -ENOENT) + return PTR_ERR(chain); + chain = NULL; + } + } + + if (nla[NFTA_CHAIN_POLICY]) { + if ((chain != NULL && + !(chain->flags & NFT_BASE_CHAIN)) || + nla[NFTA_CHAIN_HOOK] == NULL) + return -EOPNOTSUPP; + + policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); + switch (policy) { + case NF_DROP: + case NF_ACCEPT: + break; + default: + return -EINVAL; + } + } + + if (chain != NULL) { + struct nft_stats *stats = NULL; + struct nft_trans *trans; + + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (nla[NFTA_CHAIN_HANDLE] && name && + !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) + return -EEXIST; + + if (nla[NFTA_CHAIN_COUNTERS]) { + if (!(chain->flags & NFT_BASE_CHAIN)) + return -EOPNOTSUPP; + + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) + return PTR_ERR(stats); + } + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, + sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + nft_trans_chain_stats(trans) = stats; + nft_trans_chain_update(trans) = true; + + if (nla[NFTA_CHAIN_POLICY]) + nft_trans_chain_policy(trans) = policy; + else + nft_trans_chain_policy(trans) = -1; + + if (nla[NFTA_CHAIN_HANDLE] && name) { + nla_strlcpy(nft_trans_chain_name(trans), name, + NFT_CHAIN_MAXNAMELEN); + } + list_add_tail(&trans->list, &net->nft.commit_list); + return 0; + } + + if (table->use == UINT_MAX) + return -EOVERFLOW; + + if (nla[NFTA_CHAIN_HOOK]) { + const struct nf_chain_type *type; + struct nf_hook_ops *ops; + nf_hookfn *hookfn; + u32 hooknum, priority; + + type = chain_type[family][NFT_CHAIN_T_DEFAULT]; + if (nla[NFTA_CHAIN_TYPE]) { + type = nf_tables_chain_type_lookup(afi, + nla[NFTA_CHAIN_TYPE], + create); + if (IS_ERR(type)) + return PTR_ERR(type); + } + + err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], + nft_hook_policy); + if (err < 0) + return err; + if (ha[NFTA_HOOK_HOOKNUM] == NULL || + ha[NFTA_HOOK_PRIORITY] == NULL) + return -EINVAL; + + hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + if (hooknum >= afi->nhooks) + return -EINVAL; + priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + + if (!(type->hook_mask & (1 << hooknum))) + return -EOPNOTSUPP; + if (!try_module_get(type->owner)) + return -ENOENT; + hookfn = type->hooks[hooknum]; + + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); + if (basechain == NULL) + return -ENOMEM; + + if (nla[NFTA_CHAIN_COUNTERS]) { + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) { + module_put(type->owner); + kfree(basechain); + return PTR_ERR(stats); + } + basechain->stats = stats; + } else { + stats = netdev_alloc_pcpu_stats(struct nft_stats); + if (IS_ERR(stats)) { + module_put(type->owner); + kfree(basechain); + return PTR_ERR(stats); + } + rcu_assign_pointer(basechain->stats, 
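+	/*
+	 * A base chain is the only kind attached to the netfilter hooks:
+	 * the loop below fills in one nf_hook_ops per afi->nops, each
+	 * pointing back at the chain through ops->priv and using the
+	 * family's hook function unless the chain type supplies its own.
+	 * Chains without NFT_BASE_CHAIN are reachable only via jumps.
+	 */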
stats); + } + + basechain->type = type; + chain = &basechain->chain; + + for (i = 0; i < afi->nops; i++) { + ops = &basechain->ops[i]; + ops->pf = family; + ops->owner = afi->owner; + ops->hooknum = hooknum; + ops->priority = priority; + ops->priv = chain; + ops->hook = afi->hooks[ops->hooknum]; + if (hookfn) + ops->hook = hookfn; + if (afi->hook_ops_init) + afi->hook_ops_init(ops, i); + } + + chain->flags |= NFT_BASE_CHAIN; + basechain->policy = policy; + } else { + chain = kzalloc(sizeof(*chain), GFP_KERNEL); + if (chain == NULL) + return -ENOMEM; + } + + INIT_LIST_HEAD(&chain->rules); + chain->handle = nf_tables_alloc_handle(table); + chain->net = net; + chain->table = table; + nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) { + err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + if (err < 0) + goto err1; + } + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); + if (err < 0) + goto err2; + + table->use++; + list_add_tail_rcu(&chain->list, &table->chains); + return 0; +err2: + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } +err1: + nf_tables_chain_destroy(chain); + return err; +} + +static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + struct nft_ctx ctx; + int err; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + if (chain->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); + if (err < 0) + return err; + + table->use--; + list_del_rcu(&chain->list); + return 0; +} + +/* + * Expressions + */ + +/** + * nft_register_expr - register nf_tables expr type + * @ops: expr type + * + * Registers the expr type for use with nf_tables. Returns zero on + * success or a negative errno code otherwise. + */ +int nft_register_expr(struct nft_expr_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (type->family == NFPROTO_UNSPEC) + list_add_tail_rcu(&type->list, &nf_tables_expressions); + else + list_add_rcu(&type->list, &nf_tables_expressions); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return 0; +} +EXPORT_SYMBOL_GPL(nft_register_expr); + +/** + * nft_unregister_expr - unregister nf_tables expr type + * @ops: expr type + * + * Unregisters the expr typefor use with nf_tables. 
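+ *
+ * Note the ordering in nft_register_expr() above: family-specific
+ * types go to the head of the list and NFPROTO_UNSPEC ones to the
+ * tail, so __nft_expr_type_get() prefers a family-specific
+ * implementation whenever both exist.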
+ */ +void nft_unregister_expr(struct nft_expr_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&type->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_expr); + +static const struct nft_expr_type *__nft_expr_type_get(u8 family, + struct nlattr *nla) +{ + const struct nft_expr_type *type; + + list_for_each_entry(type, &nf_tables_expressions, list) { + if (!nla_strcmp(nla, type->name) && + (!type->family || type->family == family)) + return type; + } + return NULL; +} + +static const struct nft_expr_type *nft_expr_type_get(u8 family, + struct nlattr *nla) +{ + const struct nft_expr_type *type; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + type = __nft_expr_type_get(family, nla); + if (type != NULL && try_module_get(type->owner)) + return type; + +#ifdef CONFIG_MODULES + if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%.*s", + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { + [NFTA_EXPR_NAME] = { .type = NLA_STRING }, + [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, +}; + +static int nf_tables_fill_expr_info(struct sk_buff *skb, + const struct nft_expr *expr) +{ + if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) + goto nla_put_failure; + + if (expr->ops->dump) { + struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA); + if (data == NULL) + goto nla_put_failure; + if (expr->ops->dump(skb, expr) < 0) + goto nla_put_failure; + nla_nest_end(skb, data); + } + + return skb->len; + +nla_put_failure: + return -1; +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nf_tables_expr_parse(const struct nft_ctx *ctx, + const struct nlattr *nla, + struct nft_expr_info *info) +{ + const struct nft_expr_type *type; + const struct nft_expr_ops *ops; + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy); + if (err < 0) + return err; + + type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); + if (IS_ERR(type)) + return PTR_ERR(type); + + if (tb[NFTA_EXPR_DATA]) { + err = nla_parse_nested(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], type->policy); + if (err < 0) + goto err1; + } else + memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1)); + + if (type->select_ops != NULL) { + ops = type->select_ops(ctx, + (const struct nlattr * const *)info->tb); + if (IS_ERR(ops)) { + err = PTR_ERR(ops); + goto err1; + } + } else + ops = type->ops; + + info->ops = ops; + return 0; + +err1: + module_put(type->owner); + return err; +} + +static int nf_tables_newexpr(const struct nft_ctx *ctx, + const struct nft_expr_info *info, + struct nft_expr *expr) +{ + const struct nft_expr_ops *ops = info->ops; + int err; + + expr->ops = ops; + if (ops->init) { + err = ops->init(ctx, expr, (const struct nlattr **)info->tb); + if (err < 0) + goto err1; + } + + return 0; + +err1: + expr->ops = NULL; + return err; +} + +static void nf_tables_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) +{ + if (expr->ops->destroy) + expr->ops->destroy(ctx, 
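+	/*
+	 * The module reference taken in nft_expr_type_get() is held for
+	 * the whole lifetime of the expression and dropped only after
+	 * ->destroy(), so an expression type cannot be unloaded while a
+	 * rule still uses it.
+	 */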
expr); + module_put(expr->ops->type->owner); +} + +/* + * Rules + */ + +static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, + u64 handle) +{ + struct nft_rule *rule; + + // FIXME: this sucks + list_for_each_entry(rule, &chain->rules, list) { + if (handle == rule->handle) + return rule; + } + + return ERR_PTR(-ENOENT); +} + +static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, + const struct nlattr *nla) +{ + if (nla == NULL) + return ERR_PTR(-EINVAL); + + return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); +} + +static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { + [NFTA_RULE_TABLE] = { .type = NLA_STRING }, + [NFTA_RULE_CHAIN] = { .type = NLA_STRING, + .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, + [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, + [NFTA_RULE_POSITION] = { .type = NLA_U64 }, + [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, +}; + +static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, u32 flags, int family, + const struct nft_table *table, + const struct nft_chain *chain, + const struct nft_rule *rule) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + const struct nft_expr *expr, *next; + struct nlattr *list; + const struct nft_rule *prule; + int type = event | NFNL_SUBSYS_NFTABLES << 8; + + nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle))) + goto nla_put_failure; + + if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { + prule = list_entry(rule->list.prev, struct nft_rule, list); + if (nla_put_be64(skb, NFTA_RULE_POSITION, + cpu_to_be64(prule->handle))) + goto nla_put_failure; + } + + list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS); + if (list == NULL) + goto nla_put_failure; + nft_rule_for_each_expr(expr, next, rule) { + struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM); + if (elem == NULL) + goto nla_put_failure; + if (nf_tables_fill_expr_info(skb, expr) < 0) + goto nla_put_failure; + nla_nest_end(skb, elem); + } + nla_nest_end(skb, list); + + if (rule->ulen && + nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_rule_notify(const struct nft_ctx *ctx, + const struct nft_rule *rule, + int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain, rule); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static inline bool 
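+/*
+ * Rule visibility works on a two-generation bitmask: the bit selected
+ * by net->nft.gencursor marks the current generation, and a clear bit
+ * means "active". A freshly added rule has only the current bit set
+ * (invisible now, visible once the cursor flips at commit time), a
+ * rule scheduled for deletion has the next-generation bit set, and a
+ * genmask of zero is active in both generations.
+ */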
+nft_rule_is_active(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << net->nft.gencursor)) == 0; +} + +static inline int gencursor_next(struct net *net) +{ + return net->nft.gencursor+1 == 1 ? 1 : 0; +} + +static inline int +nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << gencursor_next(net))) == 0; +} + +static inline void +nft_rule_activate_next(struct net *net, struct nft_rule *rule) +{ + /* Now inactive, will be active in the future */ + rule->genmask = (1 << net->nft.gencursor); +} + +static inline void +nft_rule_disactivate_next(struct net *net, struct nft_rule *rule) +{ + rule->genmask = (1 << gencursor_next(net)); +} + +static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) +{ + rule->genmask = 0; +} + +static int nf_tables_dump_rules(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + const struct nft_rule *rule; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { + if (!nft_rule_is_active(net, rule)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWRULE, + NLM_F_MULTI | NLM_F_APPEND, + afi->family, table, chain, rule) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } + } +done: + rcu_read_unlock(); + + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + const struct nft_rule *rule; + struct sk_buff *skb2; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_rules, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + + rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) + return PTR_ERR(rule); + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, + family, table, chain, rule); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: + 
kfree_skb(skb2); + return err; +} + +static void nf_tables_rule_destroy(const struct nft_ctx *ctx, + struct nft_rule *rule) +{ + struct nft_expr *expr; + + /* + * Careful: some expressions might not be initialized in case this + * is called on error from nf_tables_newrule(). + */ + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + nf_tables_expr_destroy(ctx, expr); + expr = nft_expr_next(expr); + } + kfree(rule); +} + +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; + + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return trans; +} + +#define NFT_RULE_MAXEXPRS 128 + +static struct nft_expr_info *info; + +static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct net *net = sock_net(skb->sk); + struct nft_table *table; + struct nft_chain *chain; + struct nft_rule *rule, *old_rule = NULL; + struct nft_trans *trans = NULL; + struct nft_expr *expr; + struct nft_ctx ctx; + struct nlattr *tmp; + unsigned int size, i, n, ulen = 0; + int err, rem; + bool create; + u64 handle, pos_handle; + + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + + afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + + if (nla[NFTA_RULE_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); + rule = __nf_tables_rule_lookup(chain, handle); + if (IS_ERR(rule)) + return PTR_ERR(rule); + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + old_rule = rule; + else + return -EOPNOTSUPP; + } else { + if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) + return -EINVAL; + handle = nf_tables_alloc_handle(table); + + if (chain->use == UINT_MAX) + return -EOVERFLOW; + } + + if (nla[NFTA_RULE_POSITION]) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EOPNOTSUPP; + + pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); + old_rule = __nf_tables_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) + return PTR_ERR(old_rule); + } + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + + n = 0; + size = 0; + if (nla[NFTA_RULE_EXPRESSIONS]) { + nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { + err = -EINVAL; + if (nla_type(tmp) != NFTA_LIST_ELEM) + goto err1; + if (n == NFT_RULE_MAXEXPRS) + goto err1; + err = nf_tables_expr_parse(&ctx, tmp, &info[n]); + if (err < 0) + goto err1; + size += info[n].ops->size; + n++; + } + } + + if (nla[NFTA_RULE_USERDATA]) + ulen = nla_len(nla[NFTA_RULE_USERDATA]); + + err = -ENOMEM; + rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); + if (rule == NULL) + goto err1; + + nft_rule_activate_next(net, rule); + + rule->handle = handle; + rule->dlen = size; + rule->ulen = ulen; + + if (ulen) + nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + + expr = nft_expr_first(rule); + for (i = 0; i < n; i++) { + err = nf_tables_newexpr(&ctx, &info[i], expr); + if (err < 0) + goto err2; + 
info[i].ops = NULL;
+		expr = nft_expr_next(expr);
+	}
+
+	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+		if (nft_rule_is_active_next(net, old_rule)) {
+			trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
+						   old_rule);
+			if (trans == NULL) {
+				err = -ENOMEM;
+				goto err2;
+			}
+			nft_rule_disactivate_next(net, old_rule);
+			chain->use--;
+			list_add_tail_rcu(&rule->list, &old_rule->list);
+		} else {
+			err = -ENOENT;
+			goto err2;
+		}
+	} else if (nlh->nlmsg_flags & NLM_F_APPEND) {
+		if (old_rule)
+			list_add_rcu(&rule->list, &old_rule->list);
+		else
+			list_add_tail_rcu(&rule->list, &chain->rules);
+	} else {
+		if (old_rule)
+			list_add_tail_rcu(&rule->list, &old_rule->list);
+		else
+			list_add_rcu(&rule->list, &chain->rules);
+	}
+
+	if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+		err = -ENOMEM;
+		goto err3;
+	}
+	chain->use++;
+	return 0;
+
+err3:
+	list_del_rcu(&rule->list);
+	if (trans) {
+		list_del_rcu(&nft_trans_rule(trans)->list);
+		nft_rule_clear(net, nft_trans_rule(trans));
+		nft_trans_destroy(trans);
+		chain->use++;
+	}
+err2:
+	nf_tables_rule_destroy(&ctx, rule);
+err1:
+	for (i = 0; i < n; i++) {
+		if (info[i].ops != NULL)
+			module_put(info[i].ops->type->owner);
+	}
+	return err;
+}
+
+static int
+nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule)
+{
+	/* You cannot delete the same rule twice */
+	if (nft_rule_is_active_next(ctx->net, rule)) {
+		if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL)
+			return -ENOMEM;
+		nft_rule_disactivate_next(ctx->net, rule);
+		ctx->chain->use--;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+static int nf_table_delrule_by_chain(struct nft_ctx *ctx)
+{
+	struct nft_rule *rule;
+	int err;
+
+	list_for_each_entry(rule, &ctx->chain->rules, list) {
+		err = nf_tables_delrule_one(ctx, rule);
+		if (err < 0)
+			return err;
+	}
+	return 0;
+}
+
+static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
+	struct nft_table *table;
+	struct nft_chain *chain = NULL;
+	struct nft_rule *rule;
+	int family = nfmsg->nfgen_family, err = 0;
+	struct nft_ctx ctx;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+	if (nla[NFTA_RULE_CHAIN]) {
+		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+	}
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+	if (chain) {
+		if (nla[NFTA_RULE_HANDLE]) {
+			rule = nf_tables_rule_lookup(chain,
+						     nla[NFTA_RULE_HANDLE]);
+			if (IS_ERR(rule))
+				return PTR_ERR(rule);
+
+			err = nf_tables_delrule_one(&ctx, rule);
+		} else {
+			err = nf_table_delrule_by_chain(&ctx);
+		}
+	} else {
+		list_for_each_entry(chain, &table->chains, list) {
+			ctx.chain = chain;
+			err = nf_table_delrule_by_chain(&ctx);
+			if (err < 0)
+				break;
+		}
+	}
+
+	return err;
+}
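The add and delete paths above never splice the live ruleset directly: nf_tables_newrule() stages a rule that is invisible to the current generation, and nf_tables_delrule_one() only marks a rule as going away in the next one. A standalone model of the two-generation genmask scheme used by the helpers earlier in this file (a sketch for illustration; all kernel state is stubbed out):

/* Model of nf_tables' two-generation rule visibility: gencursor selects
 * the current generation (0 or 1), and bit n set in genmask means
 * "inactive in generation n". Illustrative only, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct rule { unsigned int genmask; };

static unsigned int gencursor;	/* stands in for net->nft.gencursor */

static unsigned int gencursor_next(void)
{
	return gencursor + 1 == 1 ? 1 : 0;
}

static bool active_now(const struct rule *r)
{
	return (r->genmask & (1 << gencursor)) == 0;
}

static bool active_next(const struct rule *r)
{
	return (r->genmask & (1 << gencursor_next())) == 0;
}

int main(void)
{
	/* a freshly added rule: hidden now, visible in the next generation */
	struct rule r = { .genmask = 1 << gencursor };

	printf("before commit: now=%d next=%d\n", active_now(&r), active_next(&r));

	gencursor = gencursor_next();	/* the commit step flips the cursor */
	r.genmask = 0;			/* nft_rule_clear(): active everywhere */

	printf("after commit:  now=%d next=%d\n", active_now(&r), active_next(&r));
	return 0;
}

Before the flip this prints now=0 next=1; afterwards now=1, which is why an aborted transaction can simply drop a staged rule without any packet ever having seen it.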
+
+/*
+ * Sets
+ */
+
+static LIST_HEAD(nf_tables_set_ops);
+
+int nft_register_set(struct nft_set_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail_rcu(&ops->list, &nf_tables_set_ops);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_set);
+
+void nft_unregister_set(struct nft_set_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del_rcu(&ops->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_set);
+
+/*
+ * Select a set implementation based on the data characteristics and the
+ * given policy. The total memory use might not be known if no size is
+ * given; in that case the amount of memory per element is used.
+ */
+static const struct nft_set_ops *
+nft_select_set_ops(const struct nlattr * const nla[],
+		   const struct nft_set_desc *desc,
+		   enum nft_set_policies policy)
+{
+	const struct nft_set_ops *ops, *bops;
+	struct nft_set_estimate est, best;
+	u32 features;
+
+#ifdef CONFIG_MODULES
+	if (list_empty(&nf_tables_set_ops)) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-set");
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (!list_empty(&nf_tables_set_ops))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	features = 0;
+	if (nla[NFTA_SET_FLAGS] != NULL) {
+		features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+		features &= NFT_SET_INTERVAL | NFT_SET_MAP;
+	}
+
+	bops = NULL;
+	best.size = ~0;
+	best.class = ~0;
+
+	list_for_each_entry(ops, &nf_tables_set_ops, list) {
+		if ((ops->features & features) != features)
+			continue;
+		if (!ops->estimate(desc, features, &est))
+			continue;
+
+		switch (policy) {
+		case NFT_SET_POL_PERFORMANCE:
+			if (est.class < best.class)
+				break;
+			if (est.class == best.class && est.size < best.size)
+				break;
+			continue;
+		case NFT_SET_POL_MEMORY:
+			if (est.size < best.size)
+				break;
+			if (est.size == best.size && est.class < best.class)
+				break;
+			continue;
+		default:
+			break;
+		}
+
+		if (!try_module_get(ops->owner))
+			continue;
+		if (bops != NULL)
+			module_put(bops->owner);
+
+		bops = ops;
+		best = est;
+	}
+
+	if (bops != NULL)
+		return bops;
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+	[NFTA_SET_TABLE] = { .type = NLA_STRING },
+	[NFTA_SET_NAME] = { .type = NLA_STRING,
+			    .len = IFNAMSIZ - 1 },
+	[NFTA_SET_FLAGS] = { .type = NLA_U32 },
+	[NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
+	[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
+	[NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
+	[NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
+	[NFTA_SET_POLICY] = { .type = NLA_U32 },
+	[NFTA_SET_DESC] = { .type = NLA_NESTED },
+	[NFTA_SET_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
+	[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+				     const struct sk_buff *skb,
+				     const struct nlmsghdr *nlh,
+				     const struct nlattr * const nla[])
+{
+	struct net *net = sock_net(skb->sk);
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_af_info *afi = NULL;
+	struct nft_table *table = NULL;
+
+	if (nfmsg->nfgen_family != NFPROTO_UNSPEC) {
+		afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+		if (IS_ERR(afi))
+			return PTR_ERR(afi);
+	}
+
+	if (nla[NFTA_SET_TABLE] != NULL) {
+		if (afi == NULL)
+			return -EAFNOSUPPORT;
+
+		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+		if (IS_ERR(table))
+			return PTR_ERR(table);
+		if (table->flags & NFT_TABLE_INACTIVE)
+			return -ENOENT;
+	}
+
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+	return 0;
+}
+
+struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+				     const struct nlattr *nla)
+{
+	struct nft_set *set;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(set, &table->sets, list) {
+		if (!nla_strcmp(nla, set->name))
+			return set;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
+struct nft_set *nf_tables_set_lookup_byid(const struct net
*net, + const struct nlattr *nla) +{ + struct nft_trans *trans; + u32 id = ntohl(nla_get_be32(nla)); + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET && + id == nft_trans_set_id(trans)) + return nft_trans_set(trans); + } + return ERR_PTR(-ENOENT); +} + +static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, + const char *name) +{ + const struct nft_set *i; + const char *p; + unsigned long *inuse; + unsigned int n = 0, min = 0; + + p = strnchr(name, IFNAMSIZ, '%'); + if (p != NULL) { + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (inuse == NULL) + return -ENOMEM; +cont: + list_for_each_entry(i, &ctx->table->sets, list) { + int tmp; + + if (!sscanf(i->name, name, &tmp)) + continue; + if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) + continue; + + set_bit(tmp - min, inuse); + } + + n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); + if (n >= BITS_PER_BYTE * PAGE_SIZE) { + min += BITS_PER_BYTE * PAGE_SIZE; + memset(inuse, 0, PAGE_SIZE); + goto cont; + } + free_page((unsigned long)inuse); + } + + snprintf(set->name, sizeof(set->name), name, min + n); + list_for_each_entry(i, &ctx->table->sets, list) { + if (!strcmp(set->name, i->name)) + return -ENFILE; + } + return 0; +} + +static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, + const struct nft_set *set, u16 event, u16 flags) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *desc; + u32 portid = ctx->portid; + u32 seq = ctx->seq; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + if (set->flags != 0) + if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) + goto nla_put_failure; + + if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen))) + goto nla_put_failure; + if (set->flags & NFT_SET_MAP) { + if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) + goto nla_put_failure; + } + + desc = nla_nest_start(skb, NFTA_SET_DESC); + if (desc == NULL) + goto nla_put_failure; + if (set->size && + nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + goto nla_put_failure; + nla_nest_end(skb, desc); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_set_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + int event, gfp_t gfp_flags) +{ + struct sk_buff *skb; + u32 portid = ctx->portid; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); + if (skb == NULL) + goto err; + + err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, + ctx->report, gfp_flags); +err: + if (err < 0) + nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static 
int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx = 0, s_idx = cb->args[0]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + cb->args[1] = 1; +done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx, s_idx = cb->args[0]; + struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(table, &ctx->afi->tables, list) { + if (cur_table) { + if (cur_table != table) + continue; + + cur_table = NULL; + } + ctx->table = table; + idx = 0; + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + cb->args[1] = 1; +done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx, s_idx = cb->args[0]; + struct nft_af_info *afi; + struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + struct net *net = sock_net(skb->sk); + int cur_family = cb->args[3]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (cur_family) { + if (afi->family != cur_family) + continue; + + cur_family = 0; + } + + list_for_each_entry_rcu(table, &afi->tables, list) { + if (cur_table) { + if (cur_table != table) + continue; + + cur_table = NULL; + } + + ctx->table = table; + ctx->afi = afi; + idx = 0; + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, + NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + cb->args[3] = afi->family; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + if (s_idx) + s_idx = 0; + } + } + cb->args[1] = 1; +done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nlattr *nla[NFTA_SET_MAX + 1]; + struct nft_ctx ctx; + int err, ret; + + err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX, + nft_set_policy); + if (err < 0) + return err; + + err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla); + if (err < 0) + return err; + + if (ctx.table == NULL) { + if (ctx.afi == NULL) + ret = nf_tables_dump_sets_all(&ctx, skb, cb); + else + ret = nf_tables_dump_sets_family(&ctx, skb, cb); + } else + ret = nf_tables_dump_sets_table(&ctx, skb, cb); + + return ret; +} + +#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ + +static int nf_tables_getset(struct 
sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nft_set *set;
+	struct nft_ctx ctx;
+	struct sk_buff *skb2;
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	int err;
+
+	/* Verify existence before starting dump */
+	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_sets,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	/* Only accept unspec with dump */
+	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+		return -EAFNOSUPPORT;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (set->flags & NFT_SET_INACTIVE)
+		return -ENOENT;
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
+				    struct nft_set_desc *desc,
+				    const struct nlattr *nla)
+{
+	struct nlattr *da[NFTA_SET_DESC_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
+	if (err < 0)
+		return err;
+
+	if (da[NFTA_SET_DESC_SIZE] != NULL)
+		desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+
+	return 0;
+}
+
+static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+			     struct nft_set *set)
+{
+	struct nft_trans *trans;
+
+	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
+		nft_trans_set_id(trans) =
+			ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
+		set->flags |= NFT_SET_INACTIVE;
+	}
+	nft_trans_set(trans) = set;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+	return 0;
+}
+
+static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_set_ops *ops;
+	struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
+	struct nft_table *table;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	char name[IFNAMSIZ];
+	unsigned int size;
+	bool create;
+	u32 ktype, dtype, flags, policy;
+	struct nft_set_desc desc;
+	int err;
+
+	if (nla[NFTA_SET_TABLE] == NULL ||
+	    nla[NFTA_SET_NAME] == NULL ||
+	    nla[NFTA_SET_KEY_LEN] == NULL ||
+	    nla[NFTA_SET_ID] == NULL)
+		return -EINVAL;
+
+	memset(&desc, 0, sizeof(desc));
+
+	ktype = NFT_DATA_VALUE;
+	if (nla[NFTA_SET_KEY_TYPE] != NULL) {
+		ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+		if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+			return -EINVAL;
+	}
+
+	desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+	if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	flags = 0;
+	if (nla[NFTA_SET_FLAGS] != NULL) {
+		flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
+			      NFT_SET_INTERVAL | NFT_SET_MAP))
+			return -EINVAL;
+	}
+
+	dtype = 0;
+	if (nla[NFTA_SET_DATA_TYPE] != NULL) {
+		if (!(flags & NFT_SET_MAP))
+			return -EINVAL;
+
+		dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+		if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+		    dtype != NFT_DATA_VERDICT)
+			return
-EINVAL; + + if (dtype != NFT_DATA_VERDICT) { + if (nla[NFTA_SET_DATA_LEN] == NULL) + return -EINVAL; + desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); + if (desc.dlen == 0 || + desc.dlen > FIELD_SIZEOF(struct nft_data, data)) + return -EINVAL; + } else + desc.dlen = sizeof(struct nft_data); + } else if (flags & NFT_SET_MAP) + return -EINVAL; + + policy = NFT_SET_POL_PERFORMANCE; + if (nla[NFTA_SET_POLICY] != NULL) + policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + + if (nla[NFTA_SET_DESC] != NULL) { + err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); + if (err < 0) + return err; + } + + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + + afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + + set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); + if (IS_ERR(set)) { + if (PTR_ERR(set) != -ENOENT) + return PTR_ERR(set); + set = NULL; + } + + if (set != NULL) { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + return 0; + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -ENOENT; + + ops = nft_select_set_ops(nla, &desc, policy); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + size = 0; + if (ops->privsize != NULL) + size = ops->privsize(nla); + + err = -ENOMEM; + set = kzalloc(sizeof(*set) + size, GFP_KERNEL); + if (set == NULL) + goto err1; + + nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); + err = nf_tables_set_alloc_name(&ctx, set, name); + if (err < 0) + goto err2; + + INIT_LIST_HEAD(&set->bindings); + set->ops = ops; + set->ktype = ktype; + set->klen = desc.klen; + set->dtype = dtype; + set->dlen = desc.dlen; + set->flags = flags; + set->size = desc.size; + + err = ops->init(set, &desc, nla); + if (err < 0) + goto err2; + + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); + if (err < 0) + goto err2; + + list_add_tail_rcu(&set->list, &table->sets); + table->use++; + return 0; + +err2: + kfree(set); +err1: + module_put(ops->owner); + return err; +} + +static void nft_set_destroy(struct nft_set *set) +{ + set->ops->destroy(set); + module_put(set->ops->owner); + kfree(set); +} + +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ + list_del_rcu(&set->list); + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + nft_set_destroy(set); +} + +static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_set *set; + struct nft_ctx ctx; + int err; + + if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + return -EAFNOSUPPORT; + if (nla[NFTA_SET_TABLE] == NULL) + return -EINVAL; + + err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; + if (!list_empty(&set->bindings)) + return -EBUSY; + + err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del_rcu(&set->list); + ctx.table->use--; + return 0; +} + +static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_iter *iter, + const struct nft_set_elem *elem) +{ 
+ enum nft_registers dreg; + + dreg = nft_type_to_reg(set->dtype); + return nft_validate_data_load(ctx, dreg, &elem->data, + set->dtype == NFT_DATA_VERDICT ? + NFT_DATA_VERDICT : NFT_DATA_VALUE); +} + +int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_binding *binding) +{ + struct nft_set_binding *i; + struct nft_set_iter iter; + + if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) + return -EBUSY; + + if (set->flags & NFT_SET_MAP) { + /* If the set is already bound to the same chain all + * jumps are already validated for that chain. + */ + list_for_each_entry(i, &set->bindings, list) { + if (i->chain == binding->chain) + goto bind; + } + + iter.skip = 0; + iter.count = 0; + iter.err = 0; + iter.fn = nf_tables_bind_check_setelem; + + set->ops->walk(ctx, set, &iter); + if (iter.err < 0) { + /* Destroy anonymous sets if binding fails */ + if (set->flags & NFT_SET_ANONYMOUS) + nf_tables_set_destroy(ctx, set); + + return iter.err; + } + } +bind: + binding->chain = ctx->chain; + list_add_tail_rcu(&binding->list, &set->bindings); + return 0; +} + +void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_binding *binding) +{ + list_del_rcu(&binding->list); + + if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && + !(set->flags & NFT_SET_INACTIVE)) + nf_tables_set_destroy(ctx, set); +} + +/* + * Set elements + */ + +static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { + [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { + [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, +}; + +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, + const struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + bool trans) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct nft_table *table; + struct net *net = sock_net(skb->sk); + + afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (!trans && (table->flags & NFT_TABLE_INACTIVE)) + return -ENOENT; + + nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + return 0; +} + +static int nf_tables_fill_setelem(struct sk_buff *skb, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + nest = nla_nest_start(skb, NFTA_LIST_ELEM); + if (nest == NULL) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE, + set->klen) < 0) + goto nla_put_failure; + + if (set->flags & NFT_SET_MAP && + !(elem->flags & NFT_SET_ELEM_INTERVAL_END) && + nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data, + set->dtype == NFT_DATA_VERDICT ? 
NFT_DATA_VERDICT : NFT_DATA_VALUE, + set->dlen) < 0) + goto nla_put_failure; + + if (elem->flags != 0) + if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +struct nft_set_dump_args { + const struct netlink_callback *cb; + struct nft_set_iter iter; + struct sk_buff *skb; +}; + +static int nf_tables_dump_setelem(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_iter *iter, + const struct nft_set_elem *elem) +{ + struct nft_set_dump_args *args; + + args = container_of(iter, struct nft_set_dump_args, iter); + return nf_tables_fill_setelem(args->skb, set, elem); +} + +static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nft_set *set; + struct nft_set_dump_args args; + struct nft_ctx ctx; + struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *nest; + u32 portid, seq; + int event, err; + + err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, + NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy); + if (err < 0) + return err; + + err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, + false); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; + + event = NFT_MSG_NEWSETELEM; + event |= NFNL_SUBSYS_NFTABLES << 8; + portid = NETLINK_CB(cb->skb).portid; + seq = cb->nlh->nlmsg_seq; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + NLM_F_MULTI); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx.afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + args.cb = cb; + args.skb = skb; + args.iter.skip = cb->args[0]; + args.iter.count = 0; + args.iter.err = 0; + args.iter.fn = nf_tables_dump_setelem; + set->ops->walk(&ctx, set, &args.iter); + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + + if (args.iter.err && args.iter.err != -EMSGSIZE) + return args.iter.err; + if (args.iter.count == cb->args[0]) + return 0; + + cb->args[0] = args.iter.count; + return skb->len; + +nla_put_failure: + return -ENOSPC; +} + +static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nft_set *set; + struct nft_ctx ctx; + int err; + + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_set, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + return -EOPNOTSUPP; +} + +static int nf_tables_fill_setelem_info(struct sk_buff *skb, + const struct nft_ctx *ctx, u32 seq, + u32 portid, int event, u16 flags, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr 
*nlh; + struct nlattr *nest; + int err; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + err = nf_tables_fill_setelem(skb, set, elem); + if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) +{ + struct net *net = ctx->net; + u32 portid = ctx->portid; + struct sk_buff *skb; + int err; + + if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, + set, elem); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); +err: + if (err < 0) + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, + int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + if (trans == NULL) + return NULL; + + nft_trans_elem_set(trans) = set; + return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_data_desc d1, d2; + struct nft_set_elem elem; + struct nft_set_binding *binding; + enum nft_registers dreg; + struct nft_trans *trans; + int err; + + if (set->size && set->nelems == set->size) + return -ENFILE; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy); + if (err < 0) + return err; + + if (nla[NFTA_SET_ELEM_KEY] == NULL) + return -EINVAL; + + elem.flags = 0; + if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { + elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); + if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + } + + if (set->flags & NFT_SET_MAP) { + if (nla[NFTA_SET_ELEM_DATA] == NULL && + !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) + return -EINVAL; + if (nla[NFTA_SET_ELEM_DATA] != NULL && + elem.flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + } else { + if (nla[NFTA_SET_ELEM_DATA] != NULL) + return -EINVAL; + } + + err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err1; + err = -EINVAL; + if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) + goto err2; + + err = -EEXIST; + if (set->ops->get(set, &elem) == 0) + goto err2; + + if (nla[NFTA_SET_ELEM_DATA] != NULL) { + err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]); + if (err < 0) + goto err2; + + err = -EINVAL; + if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) + goto err3; + + dreg = nft_type_to_reg(set->dtype); + list_for_each_entry(binding, &set->bindings, list) { + struct nft_ctx bind_ctx = { + .afi = ctx->afi, + .table 
= ctx->table, + .chain = (struct nft_chain *)binding->chain, + }; + + err = nft_validate_data_load(&bind_ctx, dreg, + &elem.data, d2.type); + if (err < 0) + goto err3; + } + } + + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); + if (trans == NULL) + goto err3; + + err = set->ops->insert(set, &elem); + if (err < 0) + goto err4; + + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; + +err4: + kfree(trans); +err3: + if (nla[NFTA_SET_ELEM_DATA] != NULL) + nft_data_uninit(&elem.data, d2.type); +err2: + nft_data_uninit(&elem.key, d1.type); +err1: + return err; +} + +static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + const struct nlattr *attr; + struct nft_set *set; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) { + if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { + set = nf_tables_set_lookup_byid(net, + nla[NFTA_SET_ELEM_LIST_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_add_set_elem(&ctx, set, attr); + if (err < 0) + break; + + set->nelems++; + } + return err; +} + +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_data_desc desc; + struct nft_set_elem elem; + struct nft_trans *trans; + int err; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy); + if (err < 0) + goto err1; + + err = -EINVAL; + if (nla[NFTA_SET_ELEM_KEY] == NULL) + goto err1; + + err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err1; + + err = -EINVAL; + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) + goto err2; + + err = set->ops->get(set, &elem); + if (err < 0) + goto err2; + + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); + if (trans == NULL) + goto err2; + + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + nft_data_uninit(&elem.key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP) + nft_data_uninit(&elem.data, set->dtype); + +err2: + nft_data_uninit(&elem.key, desc.type); +err1: + return err; +} + +static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nlattr *attr; + struct nft_set *set; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_del_setelem(&ctx, set, attr); + if (err < 0) + break; + + set->nelems--; + } + return err; +} + +static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { + [NFT_MSG_NEWTABLE] = { + .call_batch = nf_tables_newtable, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, + [NFT_MSG_GETTABLE] = { + .call = nf_tables_gettable, + 
.attr_count = NFTA_TABLE_MAX,
+		.policy = nft_table_policy,
+	},
+	[NFT_MSG_DELTABLE] = {
+		.call_batch = nf_tables_deltable,
+		.attr_count = NFTA_TABLE_MAX,
+		.policy = nft_table_policy,
+	},
+	[NFT_MSG_NEWCHAIN] = {
+		.call_batch = nf_tables_newchain,
+		.attr_count = NFTA_CHAIN_MAX,
+		.policy = nft_chain_policy,
+	},
+	[NFT_MSG_GETCHAIN] = {
+		.call = nf_tables_getchain,
+		.attr_count = NFTA_CHAIN_MAX,
+		.policy = nft_chain_policy,
+	},
+	[NFT_MSG_DELCHAIN] = {
+		.call_batch = nf_tables_delchain,
+		.attr_count = NFTA_CHAIN_MAX,
+		.policy = nft_chain_policy,
+	},
+	[NFT_MSG_NEWRULE] = {
+		.call_batch = nf_tables_newrule,
+		.attr_count = NFTA_RULE_MAX,
+		.policy = nft_rule_policy,
+	},
+	[NFT_MSG_GETRULE] = {
+		.call = nf_tables_getrule,
+		.attr_count = NFTA_RULE_MAX,
+		.policy = nft_rule_policy,
+	},
+	[NFT_MSG_DELRULE] = {
+		.call_batch = nf_tables_delrule,
+		.attr_count = NFTA_RULE_MAX,
+		.policy = nft_rule_policy,
+	},
+	[NFT_MSG_NEWSET] = {
+		.call_batch = nf_tables_newset,
+		.attr_count = NFTA_SET_MAX,
+		.policy = nft_set_policy,
+	},
+	[NFT_MSG_GETSET] = {
+		.call = nf_tables_getset,
+		.attr_count = NFTA_SET_MAX,
+		.policy = nft_set_policy,
+	},
+	[NFT_MSG_DELSET] = {
+		.call_batch = nf_tables_delset,
+		.attr_count = NFTA_SET_MAX,
+		.policy = nft_set_policy,
+	},
+	[NFT_MSG_NEWSETELEM] = {
+		.call_batch = nf_tables_newsetelem,
+		.attr_count = NFTA_SET_ELEM_LIST_MAX,
+		.policy = nft_set_elem_list_policy,
+	},
+	[NFT_MSG_GETSETELEM] = {
+		.call = nf_tables_getsetelem,
+		.attr_count = NFTA_SET_ELEM_LIST_MAX,
+		.policy = nft_set_elem_list_policy,
+	},
+	[NFT_MSG_DELSETELEM] = {
+		.call_batch = nf_tables_delsetelem,
+		.attr_count = NFTA_SET_ELEM_LIST_MAX,
+		.policy = nft_set_elem_list_policy,
+	},
+};
+
+static void nft_chain_commit_update(struct nft_trans *trans)
+{
+	struct nft_base_chain *basechain;
+
+	if (nft_trans_chain_name(trans)[0])
+		strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+
+	if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN))
+		return;
+
+	basechain = nft_base_chain(trans->ctx.chain);
+	nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+
+	switch (nft_trans_chain_policy(trans)) {
+	case NF_DROP:
+	case NF_ACCEPT:
+		basechain->policy = nft_trans_chain_policy(trans);
+		break;
+	}
+}
+
+/* Schedule objects for release via rcu to make sure no packets are accessing
+ * removed rules.
+ */
+static void nf_tables_commit_release_rcu(struct rcu_head *rt)
+{
+	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+	switch (trans->msg_type) {
+	case NFT_MSG_DELTABLE:
+		nf_tables_table_destroy(&trans->ctx);
+		break;
+	case NFT_MSG_DELCHAIN:
+		nf_tables_chain_destroy(trans->ctx.chain);
+		break;
+	case NFT_MSG_DELRULE:
+		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+		break;
+	case NFT_MSG_DELSET:
+		nft_set_destroy(nft_trans_set(trans));
+		break;
+	}
+	kfree(trans);
+}
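nf_tables_commit() below relies on the same cursor flip, and its ordering is the whole point: publish the new generation first, wait one RCU grace period so no packet is still evaluating the old one, and only then clear masks and schedule the actual freeing through call_rcu(). A compressed user-space model of that ordering (RCU reduced to comments; the struct and function names here are illustrative, not the kernel API):

/* Ordering model of nf_tables_commit(); illustration only. */
#include <stdlib.h>

struct rule  { unsigned int genmask; };
struct trans { int del; struct rule *rule; struct trans *next; };

static unsigned int gencursor;	/* net->nft.gencursor stand-in */

int commit(struct trans *list)
{
	/* 1. Flip the cursor: rules staged with genmask = 1 << old cursor
	 *    become visible, rules marked for deletion become invisible.
	 */
	gencursor = gencursor + 1 == 1 ? 1 : 0;

	/* 2. The kernel calls synchronize_rcu() here: afterwards no CPU
	 *    is still walking the previous generation.
	 */

	/* 3. Only now is it safe to finalize: clear masks on new rules and
	 *    free deleted objects (the kernel defers each free via call_rcu).
	 */
	while (list) {
		struct trans *t = list;

		list = t->next;
		if (t->del)
			free(t->rule);
		else
			t->rule->genmask = 0;	/* nft_rule_clear() */
		free(t);
	}
	return 0;
}

Freeing before the grace period would let a CPU that began its chain walk in the old generation dereference a released rule, which is the race the comment above nf_tables_commit_release_rcu() is about.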
+static int nf_tables_commit(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nft_trans *trans, *next;
+	struct nft_set *set;
+
+	/* Bump generation counter, invalidate any dump in progress */
+	while (++net->nft.base_seq == 0);
+
+	/* A new generation has just started */
+	net->nft.gencursor = gencursor_next(net);
+
+	/* Make sure all packets have left the previous generation before
+	 * purging old rules.
+	 */
+	synchronize_rcu();
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWTABLE:
+			if (nft_trans_table_update(trans)) {
+				if (!nft_trans_table_enable(trans)) {
+					nf_tables_table_disable(trans->ctx.afi,
+								trans->ctx.table);
+					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+				}
+			} else {
+				trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+			}
+			nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELTABLE:
+			nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+			break;
+		case NFT_MSG_NEWCHAIN:
+			if (nft_trans_chain_update(trans))
+				nft_chain_commit_update(trans);
+			else
+				trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+
+			nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELCHAIN:
+			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
+			if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+			    trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+				nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+						    trans->ctx.afi->nops);
+			}
+			break;
+		case NFT_MSG_NEWRULE:
+			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+			nf_tables_rule_notify(&trans->ctx,
+					      nft_trans_rule(trans),
+					      NFT_MSG_NEWRULE);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELRULE:
+			list_del_rcu(&nft_trans_rule(trans)->list);
+			nf_tables_rule_notify(&trans->ctx,
+					      nft_trans_rule(trans),
+					      NFT_MSG_DELRULE);
+			break;
+		case NFT_MSG_NEWSET:
+			nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+			/* This avoids hitting -EBUSY when deleting the table
+			 * from the transaction.
+			 */
+			if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS &&
+			    !list_empty(&nft_trans_set(trans)->bindings))
+				trans->ctx.table->use--;
+
+			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+					     NFT_MSG_NEWSET, GFP_KERNEL);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELSET:
+			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+					     NFT_MSG_DELSET, GFP_KERNEL);
+			break;
+		case NFT_MSG_NEWSETELEM:
+			nf_tables_setelem_notify(&trans->ctx,
+						 nft_trans_elem_set(trans),
+						 &nft_trans_elem(trans),
+						 NFT_MSG_NEWSETELEM, 0);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELSETELEM:
+			nf_tables_setelem_notify(&trans->ctx,
+						 nft_trans_elem_set(trans),
+						 &nft_trans_elem(trans),
+						 NFT_MSG_DELSETELEM, 0);
+			set = nft_trans_elem_set(trans);
+			set->ops->get(set, &nft_trans_elem(trans));
+			set->ops->remove(set, &nft_trans_elem(trans));
+			nft_trans_destroy(trans);
+			break;
+		}
+	}
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		list_del(&trans->list);
+		trans->ctx.nla = NULL;
+		call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu);
+	}
+
+	return 0;
+}
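Commit's counterpart, nf_tables_abort() below, has to be able to undo every staged operation, because the nfnetlink batch code may abort for a reason as mundane as module autoloading: a handler returning -EAGAIN (as nft_expr_type_get() does after request_module() earlier in this file) causes the whole batch to be aborted and replayed from the start. A reduced control-flow model of that contract (hypothetical names, not the nfnetlink API; the in-kernel loop has additional safeguards):

/* Batch processing with abort-and-replay, illustration only. */
struct msg { int (*handler)(void); };

#define ERR_EAGAIN (-11)	/* stand-in for -EAGAIN */

int process_batch(const struct msg *msgs, int n,
		  int (*commit)(void), int (*abort_fn)(void))
{
	int i, err;

replay:
	for (i = 0; i < n; i++) {
		err = msgs[i].handler();
		if (err == ERR_EAGAIN) {
			abort_fn();	/* undo everything staged so far */
			goto replay;	/* the module is loaded now; redo */
		}
		if (err < 0) {
			abort_fn();
			return err;
		}
	}
	return commit();	/* make the whole batch visible at once */
}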
+/* Schedule objects for release via rcu to make sure no packets are accessing
+ * aborted rules.
+ */
+static void nf_tables_abort_release_rcu(struct rcu_head *rt)
+{
+	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+	switch (trans->msg_type) {
+	case NFT_MSG_NEWTABLE:
+		nf_tables_table_destroy(&trans->ctx);
+		break;
+	case NFT_MSG_NEWCHAIN:
+		nf_tables_chain_destroy(trans->ctx.chain);
+		break;
+	case NFT_MSG_NEWRULE:
+		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+		break;
+	case NFT_MSG_NEWSET:
+		nft_set_destroy(nft_trans_set(trans));
+		break;
+	}
+	kfree(trans);
+}
+
+static int nf_tables_abort(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nft_trans *trans, *next;
+	struct nft_set *set;
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWTABLE:
+			if (nft_trans_table_update(trans)) {
+				if (nft_trans_table_enable(trans)) {
+					nf_tables_table_disable(trans->ctx.afi,
+								trans->ctx.table);
+					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+				}
+				nft_trans_destroy(trans);
+			} else {
+				list_del_rcu(&trans->ctx.table->list);
+			}
+			break;
+		case NFT_MSG_DELTABLE:
+			list_add_tail_rcu(&trans->ctx.table->list,
+					  &trans->ctx.afi->tables);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWCHAIN:
+			if (nft_trans_chain_update(trans)) {
+				if (nft_trans_chain_stats(trans))
+					free_percpu(nft_trans_chain_stats(trans));
+
+				nft_trans_destroy(trans);
+			} else {
+				trans->ctx.table->use--;
+				list_del_rcu(&trans->ctx.chain->list);
+				if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+				    trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+					nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+							    trans->ctx.afi->nops);
+				}
+			}
+			break;
+		case NFT_MSG_DELCHAIN:
+			trans->ctx.table->use++;
+			list_add_tail_rcu(&trans->ctx.chain->list,
+					  &trans->ctx.table->chains);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWRULE:
+			trans->ctx.chain->use--;
+			list_del_rcu(&nft_trans_rule(trans)->list);
+			break;
+		case NFT_MSG_DELRULE:
+			trans->ctx.chain->use++;
+			nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWSET:
+			trans->ctx.table->use--;
+			list_del_rcu(&nft_trans_set(trans)->list);
+			break;
+		case NFT_MSG_DELSET:
+			trans->ctx.table->use++;
+			list_add_tail_rcu(&nft_trans_set(trans)->list,
+					  &trans->ctx.table->sets);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_NEWSETELEM:
+			nft_trans_elem_set(trans)->nelems--;
+			set = nft_trans_elem_set(trans);
+			set->ops->get(set, &nft_trans_elem(trans));
+			set->ops->remove(set, &nft_trans_elem(trans));
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELSETELEM:
+			nft_trans_elem_set(trans)->nelems++;
+			nft_trans_destroy(trans);
+			break;
+		}
+	}
+
+	list_for_each_entry_safe_reverse(trans, next,
+					 &net->nft.commit_list, list) {
+		list_del(&trans->list);
+		trans->ctx.nla = NULL;
+		call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu);
+	}
+
+	return 0;
+}
+
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+	.name		= "nf_tables",
+	.subsys_id	= NFNL_SUBSYS_NFTABLES,
+	.cb_count	= NFT_MSG_MAX,
+	.cb		= nf_tables_cb,
+	.commit		= nf_tables_commit,
+	.abort		= nf_tables_abort,
+};
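The final piece of rule validation is ruleset loop detection, introduced by the comment that follows. Stripped of the netlink plumbing, the walk is a depth-first search from the target of a new jump back toward the chain that issued it (a sketch: chains and their jump targets are stubbed into an array here, whereas the kernel iterates rule expressions and verdict-map bindings):

/* Reduced model of nf_tables_check_loops(); illustration only. */
struct chain {
	const struct chain **jumps;	/* chains reachable via jump/goto */
	int njumps;
};

/* returns -1 (-ELOOP in the kernel) when 'origin' can reach itself */
int check_loops(const struct chain *origin, const struct chain *c)
{
	int i, err;

	if (origin == c)
		return -1;

	for (i = 0; i < c->njumps; i++) {
		err = check_loops(origin, c->jumps[i]);
		if (err < 0)
			return err;
	}
	return 0;
}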
+/*
+ * Loop detection - walk through the ruleset beginning at the destination chain
+ * of a new jump until either the source chain is reached (loop) or all
+ * reachable chains have been traversed.
+ *
+ * The loop check is performed whenever a new jump verdict is added to an
+ * expression or verdict map, or when a verdict map is bound to a new chain.
+ */
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+				 const struct nft_chain *chain);
+
+static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+					const struct nft_set *set,
+					const struct nft_set_iter *iter,
+					const struct nft_set_elem *elem)
+{
+	if (elem->flags & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	switch (elem->data.verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		return nf_tables_check_loops(ctx, elem->data.chain);
+	default:
+		return 0;
+	}
+}
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+				 const struct nft_chain *chain)
+{
+	const struct nft_rule *rule;
+	const struct nft_expr *expr, *last;
+	const struct nft_set *set;
+	struct nft_set_binding *binding;
+	struct nft_set_iter iter;
+
+	if (ctx->chain == chain)
+		return -ELOOP;
+
+	list_for_each_entry(rule, &chain->rules, list) {
+		nft_rule_for_each_expr(expr, last, rule) {
+			const struct nft_data *data = NULL;
+			int err;
+
+			if (!expr->ops->validate)
+				continue;
+
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+
+			if (data == NULL)
+				continue;
+
+			switch (data->verdict) {
+			case NFT_JUMP:
+			case NFT_GOTO:
+				err = nf_tables_check_loops(ctx, data->chain);
+				if (err < 0)
+					return err;
+			default:
+				break;
+			}
+		}
+	}
+
+	list_for_each_entry(set, &ctx->table->sets, list) {
+		if (!(set->flags & NFT_SET_MAP) ||
+		    set->dtype != NFT_DATA_VERDICT)
+			continue;
+
+		list_for_each_entry(binding, &set->bindings, list) {
+			if (binding->chain != chain)
+				continue;
+
+			iter.skip = 0;
+			iter.count = 0;
+			iter.err = 0;
+			iter.fn = nf_tables_loop_check_setelem;
+
+			set->ops->walk(ctx, set, &iter);
+			if (iter.err < 0)
+				return iter.err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * nft_validate_input_register - validate an expression's input register
+ *
+ * @reg: the register number
+ *
+ * Validate that the input register is one of the general purpose
+ * registers.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+	if (reg <= NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ * nft_validate_output_register - validate an expression's output register
+ *
+ * @reg: the register number
+ *
+ * Validate that the output register is one of the general purpose
+ * registers or the verdict register.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+	if (reg < NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+/**
+ * nft_validate_data_load - validate an expression's data load
+ *
+ * @ctx: context of the expression performing the load
+ * @reg: the destination register number
+ * @data: the data to load
+ * @type: the data type
+ *
+ * Validate that a data load uses the appropriate data type for
+ * the destination register. A value of NULL for the data means
+ * that it is runtime-gathered data, which is always of type
+ * NFT_DATA_VALUE.
+ */ +int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type) +{ + int err; + + switch (reg) { + case NFT_REG_VERDICT: + if (data == NULL || type != NFT_DATA_VERDICT) + return -EINVAL; + + if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) { + err = nf_tables_check_loops(ctx, data->chain); + if (err < 0) + return err; + + if (ctx->chain->level + 1 > data->chain->level) { + if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE) + return -EMLINK; + data->chain->level = ctx->chain->level + 1; + } + } + + return 0; + default: + if (data != NULL && type != NFT_DATA_VALUE) + return -EINVAL; + return 0; + } +} +EXPORT_SYMBOL_GPL(nft_validate_data_load); + +static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { + [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, + [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, + .len = NFT_CHAIN_MAXNAMELEN - 1 }, +}; + +static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, + struct nft_data_desc *desc, const struct nlattr *nla) +{ + struct nlattr *tb[NFTA_VERDICT_MAX + 1]; + struct nft_chain *chain; + int err; + + err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy); + if (err < 0) + return err; + + if (!tb[NFTA_VERDICT_CODE]) + return -EINVAL; + data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); + + switch (data->verdict) { + default: + switch (data->verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; + default: + return -EINVAL; + } + /* fall through */ + case NFT_CONTINUE: + case NFT_BREAK: + case NFT_RETURN: + desc->len = sizeof(data->verdict); + break; + case NFT_JUMP: + case NFT_GOTO: + if (!tb[NFTA_VERDICT_CHAIN]) + return -EINVAL; + chain = nf_tables_chain_lookup(ctx->table, + tb[NFTA_VERDICT_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_BASE_CHAIN) + return -EOPNOTSUPP; + + chain->use++; + data->chain = chain; + desc->len = sizeof(data); + break; + } + + desc->type = NFT_DATA_VERDICT; + return 0; +} + +static void nft_verdict_uninit(const struct nft_data *data) +{ + switch (data->verdict) { + case NFT_JUMP: + case NFT_GOTO: + data->chain->use--; + break; + } +} + +static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, NFTA_DATA_VERDICT); + if (!nest) + goto nla_put_failure; + + if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict))) + goto nla_put_failure; + + switch (data->verdict) { + case NFT_JUMP: + case NFT_GOTO: + if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -1; +} + +static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, + struct nft_data_desc *desc, const struct nlattr *nla) +{ + unsigned int len; + + len = nla_len(nla); + if (len == 0) + return -EINVAL; + if (len > sizeof(data->data)) + return -EOVERFLOW; + + nla_memcpy(data->data, nla, sizeof(data->data)); + desc->type = NFT_DATA_VALUE; + desc->len = len; + return 0; +} + +static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data, + unsigned int len) +{ + return nla_put(skb, NFTA_DATA_VALUE, len, data->data); +} + +static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { + [NFTA_DATA_VALUE] = { .type = NLA_BINARY, + .len = FIELD_SIZEOF(struct nft_data, data) }, + [NFTA_DATA_VERDICT] = { .type = NLA_NESTED }, +}; + +/** + * 
nft_data_init - parse nf_tables data netlink attributes + * + * @ctx: context of the expression using the data + * @data: destination struct nft_data + * @desc: data description + * @nla: netlink attribute containing data + * + * Parse the netlink data attributes and initialize a struct nft_data. + * The type and length of data are returned in the data description. + * + * The caller can indicate that it only wants to accept data of type + * NFT_DATA_VALUE by passing NULL for the ctx argument. + */ +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, + struct nft_data_desc *desc, const struct nlattr *nla) +{ + struct nlattr *tb[NFTA_DATA_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy); + if (err < 0) + return err; + + if (tb[NFTA_DATA_VALUE]) + return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + if (tb[NFTA_DATA_VERDICT] && ctx != NULL) + return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(nft_data_init); + +/** + * nft_data_uninit - release a nft_data item + * + * @data: struct nft_data to release + * @type: type of data + * + * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, + * all others need to be released by calling this function. + */ +void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +{ + switch (type) { + case NFT_DATA_VALUE: + return; + case NFT_DATA_VERDICT: + return nft_verdict_uninit(data); + default: + WARN_ON(1); + } +} +EXPORT_SYMBOL_GPL(nft_data_uninit); + +int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, + enum nft_data_types type, unsigned int len) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start(skb, attr); + if (nest == NULL) + return -1; + + switch (type) { + case NFT_DATA_VALUE: + err = nft_value_dump(skb, data, len); + break; + case NFT_DATA_VERDICT: + err = nft_verdict_dump(skb, data); + break; + default: + err = -EINVAL; + WARN_ON(1); + } + + nla_nest_end(skb, nest); + return err; +} +EXPORT_SYMBOL_GPL(nft_data_dump); + +static int nf_tables_init_net(struct net *net) +{ + INIT_LIST_HEAD(&net->nft.af_info); + INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; + return 0; +} + +static struct pernet_operations nf_tables_net_ops = { + .init = nf_tables_init_net, +}; + +static int __init nf_tables_module_init(void) +{ + int err; + + info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, + GFP_KERNEL); + if (info == NULL) { + err = -ENOMEM; + goto err1; + } + + err = nf_tables_core_module_init(); + if (err < 0) + goto err2; + + err = nfnetlink_subsys_register(&nf_tables_subsys); + if (err < 0) + goto err3; + + pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n"); + return register_pernet_subsys(&nf_tables_net_ops); +err3: + nf_tables_core_module_exit(); +err2: + kfree(info); +err1: + return err; +} + +static void __exit nf_tables_module_exit(void) +{ + unregister_pernet_subsys(&nf_tables_net_ops); + nfnetlink_subsys_unregister(&nf_tables_subsys); + nf_tables_core_module_exit(); + kfree(info); +} + +module_init(nf_tables_module_init); +module_exit(nf_tables_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c new file mode 100644 index 00000000000..3b90eb2b2c5 --- /dev/null +++ b/net/netfilter/nf_tables_core.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008 
Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> + +static void nft_cmp_fast_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1]) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + u32 mask = nft_cmp_fast_mask(priv->len); + + if ((data[priv->sreg].data[0] & mask) == priv->data) + return; + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static bool nft_payload_fast_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + struct nft_data *dest = &data[priv->dreg]; + unsigned char *ptr; + + if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) + ptr = skb_network_header(skb); + else + ptr = skb_network_header(skb) + pkt->xt.thoff; + + ptr += priv->offset; + + if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) + return false; + + if (priv->len == 2) + *(u16 *)dest->data = *(u16 *)ptr; + else if (priv->len == 4) + *(u32 *)dest->data = *(u32 *)ptr; + else + *(u8 *)dest->data = *(u8 *)ptr; + return true; +} + +struct nft_jumpstack { + const struct nft_chain *chain; + const struct nft_rule *rule; + int rulenum; +}; + +enum nft_trace { + NFT_TRACE_RULE, + NFT_TRACE_RETURN, + NFT_TRACE_POLICY, +}; + +static const char *const comments[] = { + [NFT_TRACE_RULE] = "rule", + [NFT_TRACE_RETURN] = "return", + [NFT_TRACE_POLICY] = "policy", +}; + +static struct nf_loginfo trace_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 4, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static void nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) +{ + struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); + + nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", + chain->table->name, chain->name, comments[type], + rulenum); +} + +unsigned int +nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +{ + const struct nft_chain *chain = ops->priv, *basechain = chain; + const struct nft_rule *rule; + const struct nft_expr *expr, *last; + struct nft_data data[NFT_REG_MAX + 1]; + unsigned int stackptr = 0; + struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; + struct nft_stats *stats; + int rulenum; + /* + * Cache cursor to avoid problems in case that the cursor is updated + * while traversing the ruleset. + */ + unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); + +do_chain: + rulenum = 0; + rule = list_entry(&chain->rules, struct nft_rule, list); +next_rule: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + list_for_each_entry_continue_rcu(rule, &chain->rules, list) { + + /* This rule is not active, skip. 
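+		 * A set bit for the cached generation cursor means the rule + * is not active in the generation this packet walk runs in; + * this is how pending ruleset updates stay invisible until + * they are committed.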
*/ + if (unlikely(rule->genmask & (1 << gencursor))) + continue; + + rulenum++; + + nft_rule_for_each_expr(expr, last, rule) { + if (expr->ops == &nft_cmp_fast_ops) + nft_cmp_fast_eval(expr, data); + else if (expr->ops != &nft_payload_fast_ops || + !nft_payload_fast_eval(expr, data, pkt)) + expr->ops->eval(expr, data, pkt); + + if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE) + break; + } + + switch (data[NFT_REG_VERDICT].verdict) { + case NFT_BREAK: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + continue; + case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + continue; + } + break; + } + + switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + + return data[NFT_REG_VERDICT].verdict; + } + + switch (data[NFT_REG_VERDICT].verdict) { + case NFT_JUMP: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + + BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); + jumpstack[stackptr].chain = chain; + jumpstack[stackptr].rule = rule; + jumpstack[stackptr].rulenum = rulenum; + stackptr++; + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; + case NFT_GOTO: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; + case NFT_RETURN: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); + break; + case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) + nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); + break; + default: + WARN_ON(1); + } + + if (stackptr > 0) { + stackptr--; + chain = jumpstack[stackptr].chain; + rule = jumpstack[stackptr].rule; + rulenum = jumpstack[stackptr].rulenum; + goto next_rule; + } + + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + + rcu_read_lock_bh(); + stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); + rcu_read_unlock_bh(); + + return nft_base_chain(basechain)->policy; +} +EXPORT_SYMBOL_GPL(nft_do_chain); + +int __init nf_tables_core_module_init(void) +{ + int err; + + err = nft_immediate_module_init(); + if (err < 0) + goto err1; + + err = nft_cmp_module_init(); + if (err < 0) + goto err2; + + err = nft_lookup_module_init(); + if (err < 0) + goto err3; + + err = nft_bitwise_module_init(); + if (err < 0) + goto err4; + + err = nft_byteorder_module_init(); + if (err < 0) + goto err5; + + err = nft_payload_module_init(); + if (err < 0) + goto err6; + + return 0; + +err6: + nft_byteorder_module_exit(); +err5: + nft_bitwise_module_exit(); +err4: + nft_lookup_module_exit(); +err3: + nft_cmp_module_exit(); +err2: + nft_immediate_module_exit(); +err1: + return err; +} + +void nf_tables_core_module_exit(void) +{ + nft_payload_module_exit(); + nft_byteorder_module_exit(); + nft_bitwise_module_exit(); + nft_lookup_module_exit(); + nft_cmp_module_exit(); + nft_immediate_module_exit(); +} diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c new file mode 100644 index 00000000000..9dd2d216cfc --- /dev/null +++ b/net/netfilter/nf_tables_inet.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net> + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/ip.h> + +static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n) +{ + struct nft_af_info *afi; + + if (n == 1) + afi = &nft_af_ipv4; + else + afi = &nft_af_ipv6; + + ops->pf = afi->family; + if (afi->hooks[ops->hooknum]) + ops->hook = afi->hooks[ops->hooknum]; +} + +static struct nft_af_info nft_af_inet __read_mostly = { + .family = NFPROTO_INET, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 2, + .hook_ops_init = nft_inet_hook_ops_init, +}; + +static int __net_init nf_tables_inet_init_net(struct net *net) +{ + net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.inet == NULL) + return -ENOMEM; + memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); + + if (nft_register_afinfo(net, net->nft.inet) < 0) + goto err; + + return 0; + +err: + kfree(net->nft.inet); + return -ENOMEM; +} + +static void __net_exit nf_tables_inet_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.inet); + kfree(net->nft.inet); +} + +static struct pernet_operations nf_tables_inet_net_ops = { + .init = nf_tables_inet_init_net, + .exit = nf_tables_inet_exit_net, +}; + +static const struct nf_chain_type filter_inet = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_INET, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_inet_init(void) +{ + int ret; + + nft_register_chain_type(&filter_inet); + ret = register_pernet_subsys(&nf_tables_inet_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_inet); + + return ret; +} + +static void __exit nf_tables_inet_exit(void) +{ + unregister_pernet_subsys(&nf_tables_inet_net_ops); + nft_unregister_chain_type(&filter_inet); +} + +module_init(nf_tables_inet_init); +module_exit(nf_tables_inet_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(1); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 8797e6953ef..c138b8fbe28 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -18,20 +18,15 @@ #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> -#include <linux/major.h> -#include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> -#include <linux/fcntl.h> #include <linux/skbuff.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <net/sock.h> -#include <net/netlink.h> #include <linux/init.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); @@ -40,68 +35,76 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; -static struct sock *nfnl = NULL; -static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; -static DEFINE_MUTEX(nfnl_mutex); - -static void nfnl_lock(void) +static struct { + struct mutex mutex; + const struct nfnetlink_subsystem __rcu *subsys; +} 
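+/* one mutex and subsystem pointer per nfnetlink subsystem, replacing the + * old global nfnl_mutex: lookups run under RCU, updates take the + * per-subsystem mutex */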
table[NFNL_SUBSYS_COUNT]; + +static const int nfnl_group2type[NFNLGRP_MAX+1] = { + [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK, + [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK, + [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK, + [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP, + [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP, + [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, +}; + +void nfnl_lock(__u8 subsys_id) { - mutex_lock(&nfnl_mutex); + mutex_lock(&table[subsys_id].mutex); } +EXPORT_SYMBOL_GPL(nfnl_lock); -static int nfnl_trylock(void) +void nfnl_unlock(__u8 subsys_id) { - return !mutex_trylock(&nfnl_mutex); + mutex_unlock(&table[subsys_id].mutex); } +EXPORT_SYMBOL_GPL(nfnl_unlock); -static void __nfnl_unlock(void) +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(u8 subsys_id) { - mutex_unlock(&nfnl_mutex); -} - -static void nfnl_unlock(void) -{ - mutex_unlock(&nfnl_mutex); - if (nfnl->sk_receive_queue.qlen) - nfnl->sk_data_ready(nfnl, 0); + return lockdep_is_held(&table[subsys_id].mutex); } +EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); +#endif -int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) +int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { - nfnl_lock(); - if (subsys_table[n->subsys_id]) { - nfnl_unlock(); + nfnl_lock(n->subsys_id); + if (table[n->subsys_id].subsys) { + nfnl_unlock(n->subsys_id); return -EBUSY; } - subsys_table[n->subsys_id] = n; - nfnl_unlock(); + rcu_assign_pointer(table[n->subsys_id].subsys, n); + nfnl_unlock(n->subsys_id); return 0; } EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); -int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n) +int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) { - nfnl_lock(); - subsys_table[n->subsys_id] = NULL; - nfnl_unlock(); - + nfnl_lock(n->subsys_id); + table[n->subsys_id].subsys = NULL; + nfnl_unlock(n->subsys_id); + synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); -static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) { u_int8_t subsys_id = NFNL_SUBSYS_ID(type); if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; - return subsys_table[subsys_id]; + return rcu_dereference(table[subsys_id].subsys); } -static inline struct nfnl_callback * -nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) +static inline const struct nfnl_callback * +nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) { u_int8_t cb_id = NFNL_MSG_TYPE(type); @@ -111,168 +114,355 @@ nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) return &ss->cb[cb_id]; } -void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, - const void *data) +int nfnetlink_has_listeners(struct net *net, unsigned int group) { - struct nfattr *nfa; - int size = NFA_LENGTH(attrlen); - - nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); - nfa->nfa_type = attrtype; - nfa->nfa_len = size; - memcpy(NFA_DATA(nfa), data, attrlen); - memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); + return netlink_has_listeners(net->nfnl, group); } -EXPORT_SYMBOL_GPL(__nfa_fill); +EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) { - memset(tb, 0, sizeof(struct nfattr *) * maxattr); - - while (NFA_OK(nfa, len)) 
{ - unsigned flavor = NFA_TYPE(nfa); - if (flavor && flavor <= maxattr) - tb[flavor-1] = nfa; - nfa = NFA_NEXT(nfa, len); - } + return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); } -EXPORT_SYMBOL_GPL(nfattr_parse); +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); -/** - * nfnetlink_check_attributes - check and parse nfnetlink attributes - * - * subsys: nfnl subsystem for which this message is to be parsed - * nlmsghdr: netlink message to be checked/parsed - * cda: array of pointers, needs to be at least subsys->attr_count big - * - */ -static int -nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, - struct nlmsghdr *nlh, struct nfattr *cda[]) +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, + unsigned int group, int echo, gfp_t flags) { - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); - u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - u_int16_t attr_count = subsys->cb[cb_id].attr_count; - - /* check attribute lengths. */ - if (likely(nlh->nlmsg_len > min_len)) { - struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh)); - int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - nfattr_parse(cda, attr_count, attr, attrlen); - } - - /* implicit: if nlmsg_len == min_len, we return 0, and an empty - * (zeroed) cda[] array. The message is valid, but empty. */ - - return 0; + return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); } +EXPORT_SYMBOL_GPL(nfnetlink_send); -int nfnetlink_has_listeners(unsigned int group) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return netlink_has_listeners(nfnl, group); + return netlink_set_err(net->nfnl, portid, group, error); } -EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); +EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, + int flags) { - int err = 0; - - NETLINK_CB(skb).dst_group = group; - if (echo) - atomic_inc(&skb->users); - netlink_broadcast(nfnl, skb, pid, group, gfp_any()); - if (echo) - err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT); - - return err; -} -EXPORT_SYMBOL_GPL(nfnetlink_send); - -int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) -{ - return netlink_unicast(nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, portid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. 
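 * The subsystem is resolved under RCU; a handler that needs the * per-subsystem mutex re-validates the subsystem once the lock is * taken, and the message is replayed if it was swapped in between.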
*/ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct nfnl_callback *nc; - struct nfnetlink_subsystem *ss; + struct net *net = sock_net(skb->sk); + const struct nfnl_callback *nc; + const struct nfnetlink_subsystem *ss; int type, err; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) - return -EPERM; - /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) + if (nlmsg_len(nlh) < sizeof(struct nfgenmsg)) return 0; type = nlh->nlmsg_type; +replay: + rcu_read_lock(); ss = nfnetlink_get_subsys(type); if (!ss) { -#ifdef CONFIG_KMOD - /* don't call nfnl_unlock, since it would reenter - * with further packet processing */ - __nfnl_unlock(); +#ifdef CONFIG_MODULES + rcu_read_unlock(); request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); - nfnl_lock(); + rcu_read_lock(); ss = nfnetlink_get_subsys(type); if (!ss) #endif + { + rcu_read_unlock(); return -EINVAL; + } } nc = nfnetlink_find_client(type, ss); - if (!nc) + if (!nc) { + rcu_read_unlock(); return -EINVAL; + } { - u_int16_t attr_count = - ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; - struct nfattr *cda[attr_count]; + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *attr = (void *)nlh + min_len; + int attrlen = nlh->nlmsg_len - min_len; + __u8 subsys_id = NFNL_SUBSYS_ID(type); + + err = nla_parse(cda, ss->cb[cb_id].attr_count, + attr, attrlen, ss->cb[cb_id].policy); + if (err < 0) { + rcu_read_unlock(); + return err; + } + + if (nc->call_rcu) { + err = nc->call_rcu(net->nfnl, skb, nlh, + (const struct nlattr **)cda); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + nfnl_lock(subsys_id); + if (rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)) != ss || + nfnetlink_find_client(type, ss) != nc) + err = -EAGAIN; + else if (nc->call) + err = nc->call(net->nfnl, skb, nlh, + (const struct nlattr **)cda); + else + err = -EINVAL; + nfnl_unlock(subsys_id); + } + if (err == -EAGAIN) + goto replay; + return err; + } +} - memset(cda, 0, sizeof(struct nfattr *) * attr_count); +static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, + u_int16_t subsys_id) +{ + struct sk_buff *nskb, *oskb = skb; + struct net *net = sock_net(skb->sk); + const struct nfnetlink_subsystem *ss; + const struct nfnl_callback *nc; + bool success = true, done = false; + int err; - err = nfnetlink_check_attributes(ss, nlh, cda); - if (err < 0) - return err; - return nc->call(nfnl, skb, nlh, cda); + if (subsys_id >= NFNL_SUBSYS_COUNT) + return netlink_ack(skb, nlh, -EINVAL); +replay: + nskb = netlink_skb_clone(oskb, GFP_KERNEL); + if (!nskb) + return netlink_ack(oskb, nlh, -ENOMEM); + + nskb->sk = oskb->sk; + skb = nskb; + + nfnl_lock(subsys_id); + ss = rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)); + if (!ss) { +#ifdef CONFIG_MODULES + nfnl_unlock(subsys_id); + request_module("nfnetlink-subsys-%d", subsys_id); + nfnl_lock(subsys_id); + ss = rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)); + if (!ss) +#endif + { + nfnl_unlock(subsys_id); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(nskb); + } } + + if (!ss->commit || !ss->abort) { + nfnl_unlock(subsys_id); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(skb); + } + + while (skb->len >= nlmsg_total_size(0)) { + int msglen, 
type; + + nlh = nlmsg_hdr(skb); + err = 0; + + if (nlh->nlmsg_len < NLMSG_HDRLEN) { + err = -EINVAL; + goto ack; + } + + /* Only requests are handled by the kernel */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { + err = -EINVAL; + goto ack; + } + + type = nlh->nlmsg_type; + if (type == NFNL_MSG_BATCH_BEGIN) { + /* Malformed: Batch begin twice */ + success = false; + goto done; + } else if (type == NFNL_MSG_BATCH_END) { + done = true; + goto done; + } else if (type < NLMSG_MIN_TYPE) { + err = -EINVAL; + goto ack; + } + + /* We only accept a batch with messages for the same + * subsystem. + */ + if (NFNL_SUBSYS_ID(type) != subsys_id) { + err = -EINVAL; + goto ack; + } + + nc = nfnetlink_find_client(type, ss); + if (!nc) { + err = -EINVAL; + goto ack; + } + + { + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *attr = (void *)nlh + min_len; + int attrlen = nlh->nlmsg_len - min_len; + + err = nla_parse(cda, ss->cb[cb_id].attr_count, + attr, attrlen, ss->cb[cb_id].policy); + if (err < 0) + goto ack; + + if (nc->call_batch) { + err = nc->call_batch(net->nfnl, skb, nlh, + (const struct nlattr **)cda); + } + + /* The lock was released to autoload some module, we + * have to abort and start from scratch using the + * original skb. + */ + if (err == -EAGAIN) { + ss->abort(skb); + nfnl_unlock(subsys_id); + kfree_skb(nskb); + goto replay; + } + } +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) { + /* We don't stop processing the batch on errors, thus, + * userspace gets all the errors that the batch + * triggers. + */ + netlink_ack(skb, nlh, err); + if (err) + success = false; + } + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } +done: + if (success && done) + ss->commit(skb); + else + ss->abort(skb); + + nfnl_unlock(subsys_id); + kfree_skb(nskb); } -static void nfnetlink_rcv(struct sock *sk, int len) +static void nfnetlink_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; + struct nlmsghdr *nlh = nlmsg_hdr(skb); + int msglen; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len) + return; + + if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { + netlink_ack(skb, nlh, -EPERM); + return; + } - do { - if (nfnl_trylock()) + if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { + struct nfgenmsg *nfgenmsg; + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; - netlink_run_queue(sk, &qlen, nfnetlink_rcv_msg); - __nfnl_unlock(); - } while (qlen); + + nfgenmsg = nlmsg_data(nlh); + skb_pull(skb, msglen); + nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + } else { + netlink_rcv_skb(skb, &nfnetlink_rcv_msg); + } } -static void __exit nfnetlink_exit(void) +#ifdef CONFIG_MODULES +static int nfnetlink_bind(int group) +{ + const struct nfnetlink_subsystem *ss; + int type = nfnl_group2type[group]; + + rcu_read_lock(); + ss = nfnetlink_get_subsys(type); + rcu_read_unlock(); + if (!ss) + request_module("nfnetlink-subsys-%d", type); + return 0; +} +#endif + +static int __net_init nfnetlink_net_init(struct net *net) { - printk("Removing netfilter NETLINK layer.\n"); - sock_release(nfnl->sk_socket); - return; + struct sock *nfnl; + struct netlink_kernel_cfg cfg = { + .groups = NFNLGRP_MAX, + .input = nfnetlink_rcv, +#ifdef CONFIG_MODULES + .bind = nfnetlink_bind, +#endif + }; + + nfnl 
= netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); + if (!nfnl) + return -ENOMEM; + net->nfnl_stash = nfnl; + rcu_assign_pointer(net->nfnl, nfnl); + return 0; } +static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + RCU_INIT_POINTER(net->nfnl, NULL); + synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) + netlink_kernel_release(net->nfnl_stash); +} + +static struct pernet_operations nfnetlink_net_ops = { + .init = nfnetlink_net_init, + .exit_batch = nfnetlink_net_exit_batch, +}; + static int __init nfnetlink_init(void) { - printk("Netfilter messages via NETLINK v%s.\n", nfversion); + int i; - nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); - if (!nfnl) { - printk(KERN_ERR "cannot initialize nfnetlink!\n"); - return -1; - } + for (i=0; i<NFNL_SUBSYS_COUNT; i++) + mutex_init(&table[i].mutex); - return 0; + pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); + return register_pernet_subsys(&nfnetlink_net_ops); } +static void __exit nfnetlink_exit(void) +{ + pr_info("Removing netfilter NETLINK layer.\n"); + unregister_pernet_subsys(&nfnetlink_net_ops); +} module_init(nfnetlink_init); module_exit(nfnetlink_exit); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c new file mode 100644 index 00000000000..2baa125c2e8 --- /dev/null +++ b/net/netfilter/nfnetlink_acct.c @@ -0,0 +1,454 @@ +/* + * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2011 Intra2net AG <http://www.intra2net.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/atomic.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_acct.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); + +static LIST_HEAD(nfnl_acct_list); + +struct nf_acct { + atomic64_t pkts; + atomic64_t bytes; + unsigned long flags; + struct list_head head; + atomic_t refcnt; + char name[NFACCT_NAME_MAX]; + struct rcu_head rcu_head; + char data[0]; +}; + +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) + +static int +nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + struct nf_acct *nfacct, *matching = NULL; + char *acct_name; + unsigned int size = 0; + u32 flags = 0; + + if (!tb[NFACCT_NAME]) + return -EINVAL; + + acct_name = nla_data(tb[NFACCT_NAME]); + if (strlen(acct_name) == 0) + return -EINVAL; + + list_for_each_entry(nfacct, &nfnl_acct_list, head) { + if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) + continue; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + matching = nfacct; + break; + } + + if (matching) { + if (nlh->nlmsg_flags & NLM_F_REPLACE) { + /* reset counters if you request a replacement. 
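+			 * NLM_F_REPLACE keeps the object and its quota + * configuration; only the counters and the + * overquota flag below are cleared.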
*/ + atomic64_set(&matching->pkts, 0); + atomic64_set(&matching->bytes, 0); + smp_mb__before_atomic(); + /* reset overquota flag if quota is enabled. */ + if ((matching->flags & NFACCT_F_QUOTA)) + clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); + return 0; + } + return -EBUSY; + } + + if (tb[NFACCT_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); + if (flags & ~NFACCT_F_QUOTA) + return -EOPNOTSUPP; + if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) + return -EINVAL; + if (flags & NFACCT_F_OVERQUOTA) + return -EINVAL; + + size += sizeof(u64); + } + + nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); + if (nfacct == NULL) + return -ENOMEM; + + if (flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)nfacct->data; + + *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); + nfacct->flags = flags; + } + + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); + + if (tb[NFACCT_BYTES]) { + atomic64_set(&nfacct->bytes, + be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES]))); + } + if (tb[NFACCT_PKTS]) { + atomic64_set(&nfacct->pkts, + be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); + } + atomic_set(&nfacct->refcnt, 1); + list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); + return 0; +} + +static int +nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, struct nf_acct *acct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0; + u64 pkts, bytes; + + event |= NFNL_SUBSYS_ACCT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFACCT_NAME, acct->name)) + goto nla_put_failure; + + if (type == NFNL_MSG_ACCT_GET_CTRZERO) { + pkts = atomic64_xchg(&acct->pkts, 0); + bytes = atomic64_xchg(&acct->bytes, 0); + smp_mb__before_atomic(); + if (acct->flags & NFACCT_F_QUOTA) + clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); + } else { + pkts = atomic64_read(&acct->pkts); + bytes = atomic64_read(&acct->bytes); + } + if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) || + nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || + nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) + goto nla_put_failure; + if (acct->flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)acct->data; + + if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + goto nla_put_failure; + } + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_acct *cur, *last; + + if (cb->args[2]) + return 0; + + last = (struct nf_acct *)cb->args[1]; + if (cb->args[1]) + cb->args[1] = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + if (last) { + if (cur != last) + continue; + + last = NULL; + } + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + break; + } + } + if (!cb->args[1]) + cb->args[2] = 1; + rcu_read_unlock(); + return skb->len; +} + +static int +nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + int ret = -ENOENT; + struct nf_acct *cur; + char *acct_name; + + if (nlh->nlmsg_flags & 
NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nfnl_acct_dump, + }; + return netlink_dump_start(nfnl, skb, nlh, &c); + } + + if (!tb[NFACCT_NAME]) + return -EINVAL; + acct_name = nla_data(tb[NFACCT_NAME]); + + list_for_each_entry(cur, &nfnl_acct_list, head) { + struct sk_buff *skb2; + + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) + continue; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? -ENOBUFS : ret; + } + return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int nfnl_acct_try_del(struct nf_acct *cur) +{ + int ret = 0; + + /* we want to avoid races with nfnl_acct_find_get. */ + if (atomic_dec_and_test(&cur->refcnt)) { + /* We are protected by nfnl mutex. */ + list_del_rcu(&cur->head); + kfree_rcu(cur, rcu_head); + } else { + /* still in use, restore reference counter. */ + atomic_inc(&cur->refcnt); + ret = -EBUSY; + } + return ret; +} + +static int +nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + char *acct_name; + struct nf_acct *cur; + int ret = -ENOENT; + + if (!tb[NFACCT_NAME]) { + list_for_each_entry(cur, &nfnl_acct_list, head) + nfnl_acct_try_del(cur); + + return 0; + } + acct_name = nla_data(tb[NFACCT_NAME]); + + list_for_each_entry(cur, &nfnl_acct_list, head) { + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) + continue; + + ret = nfnl_acct_try_del(cur); + if (ret < 0) + return ret; + + break; + } + return ret; +} + +static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { + [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, + [NFACCT_BYTES] = { .type = NLA_U64 }, + [NFACCT_PKTS] = { .type = NLA_U64 }, + [NFACCT_FLAGS] = { .type = NLA_U32 }, + [NFACCT_QUOTA] = { .type = NLA_U64 }, +}; + +static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { + [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, +}; + +static const struct nfnetlink_subsystem nfnl_acct_subsys = { + .name = "acct", + .subsys_id = NFNL_SUBSYS_ACCT, + .cb_count = NFNL_MSG_ACCT_MAX, + .cb = nfnl_acct_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); + +struct nf_acct *nfnl_acct_find_get(const char *acct_name) +{ + struct nf_acct *cur, *acct = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) + continue; + + if (!try_module_get(THIS_MODULE)) + goto err; + + if (!atomic_inc_not_zero(&cur->refcnt)) { + module_put(THIS_MODULE); + goto err; + } + + acct = cur; + break; + } +err: + rcu_read_unlock(); + return acct; +} +EXPORT_SYMBOL_GPL(nfnl_acct_find_get); + +void nfnl_acct_put(struct nf_acct *acct) +{ + atomic_dec(&acct->refcnt); + module_put(THIS_MODULE); 
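+	/* pairs with the refcount and module references taken in + * nfnl_acct_find_get() */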
+} +EXPORT_SYMBOL_GPL(nfnl_acct_put); + +void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + atomic64_inc(&nfacct->pkts); + atomic64_add(skb->len, &nfacct->bytes); +} +EXPORT_SYMBOL_GPL(nfnl_acct_update); + +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ + int ret; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + return; + + ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, + nfacct); + if (ret <= 0) { + kfree_skb(skb); + return; + } + netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, + GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + u64 now; + u64 *quota; + int ret = NFACCT_UNDERQUOTA; + + /* no place here if we don't have a quota */ + if (!(nfacct->flags & NFACCT_F_QUOTA)) + return NFACCT_NO_QUOTA; + + quota = (u64 *)nfacct->data; + now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? + atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + + ret = now > *quota; + + if (now >= *quota && + !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { + nfnl_overquota_report(nfacct); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); + +static int __init nfnl_acct_init(void) +{ + int ret; + + pr_info("nfnl_acct: registering with nfnetlink.\n"); + ret = nfnetlink_subsys_register(&nfnl_acct_subsys); + if (ret < 0) { + pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); + goto err_out; + } + return 0; +err_out: + return ret; +} + +static void __exit nfnl_acct_exit(void) +{ + struct nf_acct *cur, *tmp; + + pr_info("nfnl_acct: unregistering from nfnetlink.\n"); + nfnetlink_subsys_unregister(&nfnl_acct_subsys); + + list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { + list_del_rcu(&cur->head); + /* We are sure that our objects have no clients at this point, + * it's safe to release them all without checking refcnt. */ + kfree_rcu(cur, rcu_head); + } +} + +module_init(nfnl_acct_init); +module_exit(nfnl_acct_exit); diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c new file mode 100644 index 00000000000..9e287cb56a0 --- /dev/null +++ b/net/netfilter/nfnetlink_cthelper.c @@ -0,0 +1,680 @@ +/* + * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + * + * This software has been sponsored by Vyatta Inc. 
<http://www.vyatta.com> */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/errno.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <linux/netfilter/nfnetlink_cthelper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); + +static int +nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + const struct nf_conn_help *help; + struct nf_conntrack_helper *helper; + + help = nfct_help(ct); + if (help == NULL) + return NF_DROP; + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + if (helper == NULL) + return NF_DROP; + + /* This is a user-space helper not yet configured, skip. */ + if ((helper->flags & + (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == + NF_CT_HELPER_F_USERSPACE) + return NF_ACCEPT; + + /* If the user-space helper is not available, don't block traffic. */ + return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS; +} + +static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = { + [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, }, + [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, }, +}; + +static int +nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, + const struct nlattr *attr) +{ + int err; + struct nlattr *tb[NFCTH_TUPLE_MAX+1]; + + err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol); + if (err < 0) + return err; + + if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) + return -EINVAL; + + tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); + tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); + + return 0; +} + +static int +nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) +{ + const struct nf_conn_help *help = nfct_help(ct); + + if (attr == NULL) + return -EINVAL; + + if (help->helper->data_len == 0) + return -EINVAL; + + memcpy(&help->data, nla_data(attr), help->helper->data_len); + return 0; +} + +static int +nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) +{ + const struct nf_conn_help *help = nfct_help(ct); + + if (help->helper->data_len && + nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = { + [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN-1 }, + [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, }, + [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, }, +}; + +static int +nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, + const struct nlattr *attr) +{ + int err; + struct nlattr *tb[NFCTH_POLICY_MAX+1]; + + err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol); + if (err < 0) + return err; + + if (!tb[NFCTH_POLICY_NAME] || + !tb[NFCTH_POLICY_EXPECT_MAX] || + !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) + return -EINVAL; 
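+ + /* the NLA_NUL_STRING policy above guarantees the name is + * NUL-terminated within NF_CT_HELPER_NAME_LEN, so the copy below + * cannot overrun */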
+ + strncpy(expect_policy->name, + nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); + expect_policy->max_expected = + ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); + expect_policy->timeout = + ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); + + return 0; +} + +static const struct nla_policy +nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = { + [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, }, +}; + +static int +nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, + const struct nlattr *attr) +{ + int i, ret; + struct nf_conntrack_expect_policy *expect_policy; + struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; + + ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, + nfnl_cthelper_expect_policy_set); + if (ret < 0) + return ret; + + if (!tb[NFCTH_POLICY_SET_NUM]) + return -EINVAL; + + helper->expect_class_max = + ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); + + if (helper->expect_class_max != 0 && + helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES) + return -EOVERFLOW; + + expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * + helper->expect_class_max, GFP_KERNEL); + if (expect_policy == NULL) + return -ENOMEM; + + for (i=0; i<helper->expect_class_max; i++) { + if (!tb[NFCTH_POLICY_SET+i]) + goto err; + + ret = nfnl_cthelper_expect_policy(&expect_policy[i], + tb[NFCTH_POLICY_SET+i]); + if (ret < 0) + goto err; + } + helper->expect_policy = expect_policy; + return 0; +err: + kfree(expect_policy); + return -EINVAL; +} + +static int +nfnl_cthelper_create(const struct nlattr * const tb[], + struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_helper *helper; + int ret; + + if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) + return -EINVAL; + + helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL); + if (helper == NULL) + return -ENOMEM; + + ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); + if (ret < 0) + goto err; + + strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); + helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + helper->flags |= NF_CT_HELPER_F_USERSPACE; + memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); + + helper->me = THIS_MODULE; + helper->help = nfnl_userspace_cthelper; + helper->from_nlattr = nfnl_cthelper_from_nlattr; + helper->to_nlattr = nfnl_cthelper_to_nlattr; + + /* Default to queue number zero, this can be updated at any time. 
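+	 * Once the helper is also flagged as configured, packets it + * handles are sent to this queue (with queue bypass) by + * nfnl_userspace_cthelper() above.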
*/ + if (tb[NFCTH_QUEUE_NUM]) + helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); + + if (tb[NFCTH_STATUS]) { + int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); + + switch(status) { + case NFCT_HELPER_STATUS_ENABLED: + helper->flags |= NF_CT_HELPER_F_CONFIGURED; + break; + case NFCT_HELPER_STATUS_DISABLED: + helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; + break; + } + } + + ret = nf_conntrack_helper_register(helper); + if (ret < 0) + goto err; + + return 0; +err: + kfree(helper); + return ret; +} + +static int +nfnl_cthelper_update(const struct nlattr * const tb[], + struct nf_conntrack_helper *helper) +{ + int ret; + + if (tb[NFCTH_PRIV_DATA_LEN]) + return -EBUSY; + + if (tb[NFCTH_POLICY]) { + ret = nfnl_cthelper_parse_expect_policy(helper, + tb[NFCTH_POLICY]); + if (ret < 0) + return ret; + } + if (tb[NFCTH_QUEUE_NUM]) + helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); + + if (tb[NFCTH_STATUS]) { + int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); + + switch(status) { + case NFCT_HELPER_STATUS_ENABLED: + helper->flags |= NF_CT_HELPER_F_CONFIGURED; + break; + case NFCT_HELPER_STATUS_DISABLED: + helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; + break; + } + } + return 0; +} + +static int +nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + const char *helper_name; + struct nf_conntrack_helper *cur, *helper = NULL; + struct nf_conntrack_tuple tuple; + int ret = 0, i; + + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) + return -EINVAL; + + helper_name = nla_data(tb[NFCTH_NAME]); + + ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); + if (ret < 0) + return ret; + + rcu_read_lock(); + for (i = 0; i < nf_ct_helper_hsize && !helper; i++) { + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { + + /* skip non-userspace conntrack helpers. 
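+			 * Kernel helpers share the same hash table but + * may not be created or updated over nfnetlink.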
*/ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + if (strncmp(cur->name, helper_name, + NF_CT_HELPER_NAME_LEN) != 0) + continue; + + if ((tuple.src.l3num != cur->tuple.src.l3num || + tuple.dst.protonum != cur->tuple.dst.protonum)) + continue; + + if (nlh->nlmsg_flags & NLM_F_EXCL) { + ret = -EEXIST; + goto err; + } + helper = cur; + break; + } + } + rcu_read_unlock(); + + if (helper == NULL) + ret = nfnl_cthelper_create(tb, &tuple); + else + ret = nfnl_cthelper_update(tb, helper); + + return ret; +err: + rcu_read_unlock(); + return ret; +} + +static int +nfnl_cthelper_dump_tuple(struct sk_buff *skb, + struct nf_conntrack_helper *helper) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED); + if (nest_parms == NULL) + goto nla_put_failure; + + if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM, + htons(helper->tuple.src.l3num))) + goto nla_put_failure; + + if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum)) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + return 0; + +nla_put_failure: + return -1; +} + +static int +nfnl_cthelper_dump_policy(struct sk_buff *skb, + struct nf_conntrack_helper *helper) +{ + int i; + struct nlattr *nest_parms1, *nest_parms2; + + nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED); + if (nest_parms1 == NULL) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, + htonl(helper->expect_class_max))) + goto nla_put_failure; + + for (i=0; i<helper->expect_class_max; i++) { + nest_parms2 = nla_nest_start(skb, + (NFCTH_POLICY_SET+i) | NLA_F_NESTED); + if (nest_parms2 == NULL) + goto nla_put_failure; + + if (nla_put_string(skb, NFCTH_POLICY_NAME, + helper->expect_policy[i].name)) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX, + htonl(helper->expect_policy[i].max_expected))) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT, + htonl(helper->expect_policy[i].timeout))) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms2); + } + nla_nest_end(skb, nest_parms1); + return 0; + +nla_put_failure: + return -1; +} + +static int +nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, struct nf_conntrack_helper *helper) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0; + int status; + + event |= NFNL_SUBSYS_CTHELPER << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFCTH_NAME, helper->name)) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num))) + goto nla_put_failure; + + if (nfnl_cthelper_dump_tuple(skb, helper) < 0) + goto nla_put_failure; + + if (nfnl_cthelper_dump_policy(skb, helper) < 0) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len))) + goto nla_put_failure; + + if (helper->flags & NF_CT_HELPER_F_CONFIGURED) + status = NFCT_HELPER_STATUS_ENABLED; + else + status = NFCT_HELPER_STATUS_DISABLED; + + if (nla_put_be32(skb, NFCTH_STATUS, htonl(status))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_helper *cur, *last; + + rcu_read_lock(); + last = (struct nf_conntrack_helper *)cb->args[1]; + for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { +restart: + hlist_for_each_entry_rcu(cur, + &nf_ct_helper_hash[cb->args[0]], hnode) { + + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + if (cb->args[1]) { + if (cur != last) + continue; + cb->args[1] = 0; + } + if (nfnl_cthelper_fill_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + NFNL_MSG_CTHELPER_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + goto out; + } + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } +out: + rcu_read_unlock(); + return skb->len; +} + +static int +nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + int ret = -ENOENT, i; + struct nf_conntrack_helper *cur; + struct sk_buff *skb2; + char *helper_name = NULL; + struct nf_conntrack_tuple tuple; + bool tuple_set = false; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nfnl_cthelper_dump_table, + }; + return netlink_dump_start(nfnl, skb, nlh, &c); + } + + if (tb[NFCTH_NAME]) + helper_name = nla_data(tb[NFCTH_NAME]); + + if (tb[NFCTH_TUPLE]) { + ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); + if (ret < 0) + return ret; + + tuple_set = true; + } + + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { + + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + if (helper_name && strncmp(cur->name, helper_name, + NF_CT_HELPER_NAME_LEN) != 0) { + continue; + } + if (tuple_set && + (tuple.src.l3num != cur->tuple.src.l3num || + tuple.dst.protonum != cur->tuple.dst.protonum)) + continue; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + NFNL_MSG_CTHELPER_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? 
-ENOBUFS : ret; + } + } + return ret; +} + +static int +nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + char *helper_name = NULL; + struct nf_conntrack_helper *cur; + struct hlist_node *tmp; + struct nf_conntrack_tuple tuple; + bool tuple_set = false, found = false; + int i, j = 0, ret; + + if (tb[NFCTH_NAME]) + helper_name = nla_data(tb[NFCTH_NAME]); + + if (tb[NFCTH_TUPLE]) { + ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); + if (ret < 0) + return ret; + + tuple_set = true; + } + + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], + hnode) { + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + j++; + + if (helper_name && strncmp(cur->name, helper_name, + NF_CT_HELPER_NAME_LEN) != 0) { + continue; + } + if (tuple_set && + (tuple.src.l3num != cur->tuple.src.l3num || + tuple.dst.protonum != cur->tuple.dst.protonum)) + continue; + + found = true; + nf_conntrack_helper_unregister(cur); + } + } + /* Make sure we return success if we flush and there are no helpers */ + return (found || j == 0) ? 0 : -ENOENT; +} + +static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { + [NFCTH_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN-1 }, + [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, +}; + +static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { + [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy }, +}; + +static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { + .name = "cthelper", + .subsys_id = NFNL_SUBSYS_CTHELPER, + .cb_count = NFNL_MSG_CTHELPER_MAX, + .cb = nfnl_cthelper_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER); + +static int __init nfnl_cthelper_init(void) +{ + int ret; + + ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys); + if (ret < 0) { + pr_err("nfnl_cthelper: cannot register with nfnetlink.\n"); + goto err_out; + } + return 0; +err_out: + return ret; +} + +static void __exit nfnl_cthelper_exit(void) +{ + struct nf_conntrack_helper *cur; + struct hlist_node *tmp; + int i; + + nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); + + for (i=0; i<nf_ct_helper_hsize; i++) { + hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], + hnode) { + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + nf_conntrack_helper_unregister(cur); + } + } +} + +module_init(nfnl_cthelper_init); +module_exit(nfnl_cthelper_exit); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c new file mode 100644 index 00000000000..476accd1714 --- /dev/null +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -0,0 +1,585 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
+#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/rculist.h> +#include <linux/rculist_nulls.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/security.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); + +static LIST_HEAD(cttimeout_list); + +static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { + [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, + .len = CTNL_TIMEOUT_NAME_MAX - 1}, + [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 }, + [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 }, + [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, +}; + +static int +ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, + struct net *net, const struct nlattr *attr) +{ + int ret = 0; + + if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) { + struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1]; + + ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, + attr, l4proto->ctnl_timeout.nla_policy); + if (ret < 0) + return ret; + + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + } + return ret; +} + +static int +cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + __u16 l3num; + __u8 l4num; + struct nf_conntrack_l4proto *l4proto; + struct ctnl_timeout *timeout, *matching = NULL; + struct net *net = sock_net(skb->sk); + char *name; + int ret; + + if (!cda[CTA_TIMEOUT_NAME] || + !cda[CTA_TIMEOUT_L3PROTO] || + !cda[CTA_TIMEOUT_L4PROTO] || + !cda[CTA_TIMEOUT_DATA]) + return -EINVAL; + + name = nla_data(cda[CTA_TIMEOUT_NAME]); + l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + + list_for_each_entry(timeout, &cttimeout_list, head) { + if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + matching = timeout; + break; + } + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supported, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; + goto err_proto_put; + } + + if (matching) { + if (nlh->nlmsg_flags & NLM_F_REPLACE) { + /* You cannot replace one timeout policy by another of + * a different kind, sorry.
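+ * Delete the old policy and create a fresh one instead when the + * layer 3/4 protocol pair has to change.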
+ */ + if (matching->l3num != l3num || + matching->l4proto->l4proto != l4num) { + ret = -EINVAL; + goto err_proto_put; + } + + ret = ctnl_timeout_parse_policy(&matching->data, + l4proto, net, + cda[CTA_TIMEOUT_DATA]); + return ret; + } + ret = -EBUSY; + goto err_proto_put; + } + + timeout = kzalloc(sizeof(struct ctnl_timeout) + + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); + if (timeout == NULL) { + ret = -ENOMEM; + goto err_proto_put; + } + + ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net, + cda[CTA_TIMEOUT_DATA]); + if (ret < 0) + goto err; + + strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); + timeout->l3num = l3num; + timeout->l4proto = l4proto; + atomic_set(&timeout->refcnt, 1); + list_add_tail_rcu(&timeout->head, &cttimeout_list); + + return 0; +err: + kfree(timeout); +err_proto_put: + nf_ct_l4proto_put(l4proto); + return ret; +} + +static int +ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, struct ctnl_timeout *timeout) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0; + struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + + event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || + nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || + nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || + nla_put_be32(skb, CTA_TIMEOUT_USE, + htonl(atomic_read(&timeout->refcnt)))) + goto nla_put_failure; + + if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { + struct nlattr *nest_parms; + int ret; + + nest_parms = nla_nest_start(skb, + CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); + if (ret < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + } + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ctnl_timeout *cur, *last; + + if (cb->args[2]) + return 0; + + last = (struct ctnl_timeout *)cb->args[1]; + if (cb->args[1]) + cb->args[1] = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &cttimeout_list, head) { + if (last) { + if (cur != last) + continue; + + last = NULL; + } + if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + break; + } + } + if (!cb->args[1]) + cb->args[2] = 1; + rcu_read_unlock(); + return skb->len; +} + +static int +cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int ret = -ENOENT; + char *name; + struct ctnl_timeout *cur; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnl_timeout_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + if (!cda[CTA_TIMEOUT_NAME]) + return -EINVAL; + name = nla_data(cda[CTA_TIMEOUT_NAME]); + + list_for_each_entry(cur, &cttimeout_list, head) { + struct sk_buff *skb2; + + if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + skb2 = 
nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? -ENOBUFS : ret; + } + return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) +{ + int ret = 0; + + /* we want to avoid races with nf_ct_timeout_find_get. */ + if (atomic_dec_and_test(&timeout->refcnt)) { + /* We are protected by nfnl mutex. */ + list_del_rcu(&timeout->head); + nf_ct_l4proto_put(timeout->l4proto); + kfree_rcu(timeout, rcu_head); + } else { + /* still in use, restore reference counter. */ + atomic_inc(&timeout->refcnt); + ret = -EBUSY; + } + return ret; +} + +static int +cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + char *name; + struct ctnl_timeout *cur; + int ret = -ENOENT; + + if (!cda[CTA_TIMEOUT_NAME]) { + list_for_each_entry(cur, &cttimeout_list, head) + ctnl_timeout_try_del(cur); + + return 0; + } + name = nla_data(cda[CTA_TIMEOUT_NAME]); + + list_for_each_entry(cur, &cttimeout_list, head) { + if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + ret = ctnl_timeout_try_del(cur); + if (ret < 0) + return ret; + + break; + } + return ret; +} + +static int +cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + __u16 l3num; + __u8 l4num; + struct nf_conntrack_l4proto *l4proto; + struct net *net = sock_net(skb->sk); + unsigned int *timeouts; + int ret; + + if (!cda[CTA_TIMEOUT_L3PROTO] || + !cda[CTA_TIMEOUT_L4PROTO] || + !cda[CTA_TIMEOUT_DATA]) + return -EINVAL; + + l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supported, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; + goto err; + } + + timeouts = l4proto->get_timeouts(net); + + ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, + cda[CTA_TIMEOUT_DATA]); + if (ret < 0) + goto err; + + nf_ct_l4proto_put(l4proto); + return 0; +err: + nf_ct_l4proto_put(l4proto); + return ret; +} + +static int +cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, + u32 seq, u32 type, int event, + struct nf_conntrack_l4proto *l4proto) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || + nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) + goto nla_put_failure; + + if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { + struct nlattr *nest_parms; + unsigned int *timeouts = l4proto->get_timeouts(net); + int ret; + + nest_parms = nla_nest_start(skb, + CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); + if (ret < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + } + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + __u16 l3num; + __u8 l4num; + struct nf_conntrack_l4proto *l4proto; + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int ret, err; + + if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) + return -EINVAL; + + l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supported, skip. */ + if (l4proto->l4proto != l4num) { + err = -EOPNOTSUPP; + goto err; + } + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + err = -ENOMEM; + goto err; + } + + ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_DEFAULT_SET, + l4proto); + if (ret <= 0) { + kfree_skb(skb2); + err = -ENOMEM; + goto err; + } + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? 
-ENOBUFS : ret; +err: + nf_ct_l4proto_put(l4proto); + return err; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static struct ctnl_timeout *ctnl_timeout_find_get(const char *name) +{ + struct ctnl_timeout *timeout, *matching = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(timeout, &cttimeout_list, head) { + if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + if (!try_module_get(THIS_MODULE)) + goto err; + + if (!atomic_inc_not_zero(&timeout->refcnt)) { + module_put(THIS_MODULE); + goto err; + } + matching = timeout; + break; + } +err: + rcu_read_unlock(); + return matching; +} + +static void ctnl_timeout_put(struct ctnl_timeout *timeout) +{ + atomic_dec(&timeout->refcnt); + module_put(THIS_MODULE); +} +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + +static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { + [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, +}; + +static const struct nfnetlink_subsystem cttimeout_subsys = { + .name = "conntrack_timeout", + .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, + .cb_count = IPCTNL_MSG_TIMEOUT_MAX, + .cb = cttimeout_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); + +static int __init cttimeout_init(void) +{ + int ret; + + ret = nfnetlink_subsys_register(&cttimeout_subsys); + if (ret < 0) { + pr_err("cttimeout_init: cannot register cttimeout with " + "nfnetlink.\n"); + goto err_out; + } +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); + RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + return 0; + +err_out: + return ret; +} + +static void __exit cttimeout_exit(void) +{ + struct ctnl_timeout *cur, *tmp; + + pr_info("cttimeout: unregistering from nfnetlink.\n"); + + nfnetlink_subsys_unregister(&cttimeout_subsys); + list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) { + list_del_rcu(&cur->head); + /* We are sure that our objects have no clients at this point, + * it's safe to release them all without checking refcnt. + */ + nf_ct_l4proto_put(cur->l4proto); + kfree_rcu(cur, rcu_head); + } +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); + RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ +} + +module_init(cttimeout_init); +module_exit(cttimeout_exit); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e32e30e7a17..d292c8d286e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -3,6 +3,7 @@ * nfetlink. 
* * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> @@ -13,12 +14,13 @@ */ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/spinlock.h> @@ -26,11 +28,13 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> -#include <linux/jhash.h> -#include <linux/random.h> +#include <linux/slab.h> #include <net/sock.h> +#include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> +#include <net/netfilter/nfnetlink_log.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #ifdef CONFIG_BRIDGE_NETFILTER #include "../bridge/br_private.h" @@ -39,18 +43,11 @@ #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ +#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */ #define PRINTR(x, args...) do { if (net_ratelimit()) \ printk(x, ## args); } while (0); -#if 0 -#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ - __FILE__, __LINE__, __FUNCTION__, \ - ## args) -#else -#define UDEBUG(x, ...) -#endif - struct nfulnl_instance { struct hlist_node hlist; /* global list of instances */ spinlock_t lock; @@ -58,9 +55,10 @@ struct nfulnl_instance { unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ struct timer_list timer; - int peer_pid; /* PID of the peer process */ + struct net *net; + struct user_namespace *peer_user_ns; /* User namespace of the peer process */ + int peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ @@ -71,14 +69,23 @@ struct nfulnl_instance { u_int16_t group_num; /* number of this queue */ u_int16_t flags; u_int8_t copy_mode; + struct rcu_head rcu; }; -static DEFINE_RWLOCK(instances_lock); -static atomic_t global_seq; - #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; -static unsigned int hash_init; + +static int nfnl_log_net_id __read_mostly; + +struct nfnl_log_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; + atomic_t global_seq; +}; + +static struct nfnl_log_net *nfnl_log_pernet(struct net *net) +{ + return net_generic(net, nfnl_log_net_id); +} static inline u_int8_t instance_hashfn(u_int16_t group_num) { @@ -86,16 +93,13 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num) } static struct nfulnl_instance * -__instance_lookup(u_int16_t group_num) +__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) { struct hlist_head *head; - struct hlist_node *pos; struct nfulnl_instance *inst; - UDEBUG("entering (group_num=%u)\n", group_num); - - head = &instance_table[instance_hashfn(group_num)]; - hlist_for_each_entry(inst, pos, head, hlist) { + head = &log->instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; } @@ -109,49 +113,63 @@ instance_get(struct nfulnl_instance *inst) 
} static struct nfulnl_instance * -instance_lookup_get(u_int16_t group_num) +instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) { struct nfulnl_instance *inst; - read_lock_bh(&instances_lock); - inst = __instance_lookup(group_num); - if (inst) - instance_get(inst); - read_unlock_bh(&instances_lock); + rcu_read_lock_bh(); + inst = __instance_lookup(log, group_num); + if (inst && !atomic_inc_not_zero(&inst->use)) + inst = NULL; + rcu_read_unlock_bh(); return inst; } +static void nfulnl_instance_free_rcu(struct rcu_head *head) +{ + struct nfulnl_instance *inst = + container_of(head, struct nfulnl_instance, rcu); + + put_net(inst->net); + kfree(inst); + module_put(THIS_MODULE); +} + static void instance_put(struct nfulnl_instance *inst) { - if (inst && atomic_dec_and_test(&inst->use)) { - UDEBUG("kfree(inst=%p)\n", inst); - kfree(inst); - module_put(THIS_MODULE); - } + if (inst && atomic_dec_and_test(&inst->use)) + call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu); } static void nfulnl_timer(unsigned long data); static struct nfulnl_instance * -instance_create(u_int16_t group_num, int pid) +instance_create(struct net *net, u_int16_t group_num, + int portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; + struct nfnl_log_net *log = nfnl_log_pernet(net); + int err; - UDEBUG("entering (group_num=%u, pid=%d)\n", group_num, - pid); - - write_lock_bh(&instances_lock); - if (__instance_lookup(group_num)) { - inst = NULL; - UDEBUG("aborting, instance already exists\n"); + spin_lock_bh(&log->instances_lock); + if (__instance_lookup(log, group_num)) { + err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + if (!try_module_get(THIS_MODULE)) { + kfree(inst); + err = -EAGAIN; goto out_unlock; + } INIT_HLIST_NODE(&inst->hlist); spin_lock_init(&inst->lock); @@ -160,72 +178,61 @@ instance_create(u_int16_t group_num, int pid) setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); - inst->peer_pid = pid; + inst->net = get_net(net); + inst->peer_user_ns = user_ns; + inst->peer_portid = portid; inst->group_num = group_num; inst->qthreshold = NFULNL_QTHRESH_DEFAULT; inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; inst->copy_mode = NFULNL_COPY_PACKET; - inst->copy_range = 0xffff; - - if (!try_module_get(THIS_MODULE)) - goto out_free; + inst->copy_range = NFULNL_COPY_RANGE_MAX; - hlist_add_head(&inst->hlist, - &instance_table[instance_hashfn(group_num)]); + hlist_add_head_rcu(&inst->hlist, + &log->instance_table[instance_hashfn(group_num)]); - UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, - inst->hlist.next); - write_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return inst; -out_free: - instance_put(inst); out_unlock: - write_unlock_bh(&instances_lock); - return NULL; + spin_unlock_bh(&log->instances_lock); + return ERR_PTR(err); } -static int __nfulnl_send(struct nfulnl_instance *inst); +static void __nfulnl_flush(struct nfulnl_instance *inst); +/* called with BH disabled */ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ - UDEBUG("removing instance %p (queuenum=%u) from hash\n", - inst, inst->group_num); - - hlist_del(&inst->hlist); + hlist_del_rcu(&inst->hlist); /* then flush all pending packets from skb */ - spin_lock_bh(&inst->lock); - if (inst->skb) { - /* timer "holds" one reference (we have one more) */ - if (del_timer(&inst->timer)) - 
instance_put(inst); - if (inst->qlen) - __nfulnl_send(inst); - if (inst->skb) { - kfree_skb(inst->skb); - inst->skb = NULL; - } - } - spin_unlock_bh(&inst->lock); + spin_lock(&inst->lock); + + /* lockless readers won't be able to use us */ + inst->copy_mode = NFULNL_COPY_DISABLED; + + if (inst->skb) + __nfulnl_flush(inst); + spin_unlock(&inst->lock); /* and finally put the refcount */ instance_put(inst); } static inline void -instance_destroy(struct nfulnl_instance *inst) +instance_destroy(struct nfnl_log_net *log, + struct nfulnl_instance *inst) { - write_lock_bh(&instances_lock); + spin_lock_bh(&log->instances_lock); __instance_destroy(inst); - write_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } static int @@ -245,11 +252,8 @@ nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, case NFULNL_COPY_PACKET: inst->copy_mode = mode; - /* we're using struct nfattr which has 16bit nfa_len */ - if (range > 0xffff) - inst->copy_range = 0xffff; - else - inst->copy_range = range; + inst->copy_range = min_t(unsigned int, + range, NFULNL_COPY_RANGE_MAX); break; default: @@ -311,31 +315,28 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) return 0; } -static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, - unsigned int pkt_size) +static struct sk_buff * +nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, + unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; - UDEBUG("entered (%u, %u)\n", inst_size, pkt_size); - /* alloc skb which should be big enough for a whole multipart * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = alloc_skb(n, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC); if (!skb) { - PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", - inst_size); - if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = alloc_skb(pkt_size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(net, pkt_size, + peer_portid, GFP_ATOMIC); if (!skb) - PRINTR("nfnetlink_log: can't even alloc %u " - "bytes\n", pkt_size); + pr_err("nfnetlink_log: can't even alloc %u bytes\n", + pkt_size); } } @@ -345,29 +346,39 @@ static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, static int __nfulnl_send(struct nfulnl_instance *inst) { - int status; - - if (inst->qlen > 1) - inst->lastnlh->nlmsg_type = NLMSG_DONE; - - status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); - if (status < 0) { - UDEBUG("netlink_unicast() failed\n"); - /* FIXME: statistics */ + int status = -1; + + if (inst->qlen > 1) { + struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, + NLMSG_DONE, + sizeof(struct nfgenmsg), + 0); + if (!nlh) + goto out; } + status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, + MSG_DONTWAIT); inst->qlen = 0; inst->skb = NULL; - inst->lastnlh = NULL; - +out: return status; } -static void nfulnl_timer(unsigned long data) +static void +__nfulnl_flush(struct nfulnl_instance *inst) { - struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + /* timer holds a reference */ + if (del_timer(&inst->timer)) + instance_put(inst); + if (inst->skb) + __nfulnl_send(inst); +} - UDEBUG("timer function called, flushing buffer\n"); +static void +nfulnl_timer(unsigned long data) +{ + struct nfulnl_instance *inst = (struct nfulnl_instance *)data; spin_lock_bh(&inst->lock); if (inst->skb) @@ -379,113 +390,139 @@ static void nfulnl_timer(unsigned long data) /* This is an inline function, we don't
really care about a long * list of arguments */ static inline int -__build_packet_message(struct nfulnl_instance *inst, +__build_packet_message(struct nfnl_log_net *log, + struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, - unsigned int pf, + u_int8_t pf, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, - const struct nf_loginfo *li, const char *prefix, unsigned int plen) { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - __be32 tmp_uint; sk_buff_data_t old_tail = inst->skb->tail; + struct sock *sk; + const unsigned char *hwhdrp; - UDEBUG("entered\n"); - - nlh = NLMSG_PUT(inst->skb, 0, 0, + nlh = nlmsg_put(inst->skb, 0, 0, NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, - sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + sizeof(struct nfgenmsg), 0); + if (!nlh) + return -1; + nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = pf; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(inst->group_num); + memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; - NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg)) + goto nla_put_failure; - if (prefix) - NFA_PUT(inst->skb, NFULA_PREFIX, plen, prefix); + if (prefix && + nla_put(inst->skb, NFULA_PREFIX, plen, prefix)) + goto nla_put_failure; if (indev) { - tmp_uint = htonl(indev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER - NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint), - &tmp_uint); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, - sizeof(tmp_uint), &tmp_uint); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)) || /* this is the bridge group "brX" */ - tmp_uint = htonl(indev->br_port->br->dev->ifindex); - NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, - sizeof(tmp_uint), &tmp_uint); + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex))) + goto nla_put_failure; } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ - NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, - sizeof(tmp_uint), &tmp_uint); - if (skb->nf_bridge && skb->nf_bridge->physindev) { - tmp_uint = - htonl(skb->nf_bridge->physindev->ifindex); - NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, - sizeof(tmp_uint), &tmp_uint); - } + if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; + if (skb->nf_bridge && skb->nf_bridge->physindev && + nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(skb->nf_bridge->physindev->ifindex))) + goto nla_put_failure; } #endif } if (outdev) { - tmp_uint = htonl(outdev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER - NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint), - &tmp_uint); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - sizeof(tmp_uint), &tmp_uint); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)) || 
/* this is the bridge group "brX" */ - tmp_uint = htonl(outdev->br_port->br->dev->ifindex); - NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, - sizeof(tmp_uint), &tmp_uint); + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) + goto nla_put_failure; } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ - NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, - sizeof(tmp_uint), &tmp_uint); - if (skb->nf_bridge && skb->nf_bridge->physoutdev) { - tmp_uint = - htonl(skb->nf_bridge->physoutdev->ifindex); - NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - sizeof(tmp_uint), &tmp_uint); - } + if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; + if (skb->nf_bridge && skb->nf_bridge->physoutdev && + nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(skb->nf_bridge->physoutdev->ifindex))) + goto nla_put_failure; } #endif } - if (skb->mark) { - tmp_uint = htonl(skb->mark); - NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint); - } + if (skb->mark && + nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark))) + goto nla_put_failure; - if (indev && skb->dev && skb->dev->hard_header_parse) { + if (indev && skb->dev && + skb->mac_header != skb->network_header) { struct nfulnl_msg_packet_hw phw; - int len = skb->dev->hard_header_parse((struct sk_buff *)skb, - phw.hw_addr); - phw.hw_addrlen = htons(len); - NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(skb, phw.hw_addr); + if (len > 0) { + phw.hw_addrlen = htons(len); + if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) + goto nla_put_failure; + } + } + + if (indev && skb_mac_header_was_set(skb)) { + if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || + nla_put_be16(inst->skb, NFULA_HWLEN, + htons(skb->dev->hard_header_len))) + goto nla_put_failure; + + hwhdrp = skb_mac_header(skb); + + if (skb->dev->type == ARPHRD_SIT) + hwhdrp -= ETH_HLEN; + + if (hwhdrp >= skb->head && + nla_put(inst->skb, NFULA_HWHEADER, + skb->dev->hard_header_len, hwhdrp)) + goto nla_put_failure; } if (skb->tstamp.tv64) { @@ -494,56 +531,60 @@ __build_packet_message(struct nfulnl_instance *inst, ts.sec = cpu_to_be64(tv.tv_sec); ts.usec = cpu_to_be64(tv.tv_usec); - NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; } /* UID */ - if (skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) { - __be32 uid = htonl(skb->sk->sk_socket->file->f_uid); - /* need to unlock here since NFA_PUT may goto */ - read_unlock_bh(&skb->sk->sk_callback_lock); - NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid); + sk = skb->sk; + if (sk && sk->sk_state != TCP_TIME_WAIT) { + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + struct file *file = sk->sk_socket->file; + const struct cred *cred = file->f_cred; + struct user_namespace *user_ns = inst->peer_user_ns; + __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); + __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); + read_unlock_bh(&sk->sk_callback_lock); + if (nla_put_be32(inst->skb, NFULA_UID, uid) || + nla_put_be32(inst->skb, NFULA_GID, gid)) + goto nla_put_failure; } else - read_unlock_bh(&skb->sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /* local 
sequence number */ - if (inst->flags & NFULNL_CFG_F_SEQ) { - tmp_uint = htonl(inst->seq++); - NFA_PUT(inst->skb, NFULA_SEQ, sizeof(tmp_uint), &tmp_uint); - } + if ((inst->flags & NFULNL_CFG_F_SEQ) && + nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++))) + goto nla_put_failure; + /* global sequence number */ - if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) { - tmp_uint = htonl(atomic_inc_return(&global_seq)); - NFA_PUT(inst->skb, NFULA_SEQ_GLOBAL, sizeof(tmp_uint), &tmp_uint); - } + if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && + nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, + htonl(atomic_inc_return(&log->global_seq)))) + goto nla_put_failure; if (data_len) { - struct nfattr *nfa; - int size = NFA_LENGTH(data_len); + struct nlattr *nla; + int size = nla_attr_size(data_len); - if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) { + if (skb_tailroom(inst->skb) < nla_total_size(data_len)) { printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); - goto nlmsg_failure; + return -1; } - nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size)); - nfa->nfa_type = NFULA_PAYLOAD; - nfa->nfa_len = size; + nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len)); + nla->nla_type = NFULA_PAYLOAD; + nla->nla_len = size; - if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len)) + if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) BUG(); } nlh->nlmsg_len = inst->skb->tail - old_tail; - inst->lastnlh = nlh; return 0; -nlmsg_failure: - UDEBUG("nlmsg_failure\n"); -nfattr_failure: +nla_put_failure: PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); return -1; } @@ -562,8 +603,9 @@ static struct nf_loginfo default_loginfo = { }; /* log handler for internal netfilter logging api */ -static void -nfulnl_log_packet(unsigned int pf, +void +nfulnl_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -576,13 +618,14 @@ nfulnl_log_packet(unsigned int pf, const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; + struct nfnl_log_net *log = nfnl_log_pernet(net); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; - inst = instance_lookup_get(li->u.ulog.group); + inst = instance_lookup_get(log, li->u.ulog.group); if (!inst) return; @@ -590,37 +633,43 @@ nfulnl_log_packet(unsigned int pf, if (prefix) plen = strlen(prefix) + 1; - /* all macros expand to constant values at compile time */ /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... 
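 * (at least the hardware header attributes are now sized conditionally * below, only when the packet actually carries a MAC header)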
*/ - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) - + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr)) - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #ifdef CONFIG_BRIDGE_NETFILTER - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif - + NFA_SPACE(sizeof(u_int32_t)) /* mark */ - + NFA_SPACE(sizeof(u_int32_t)) /* uid */ - + NFA_SPACE(plen) /* prefix */ - + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw)) - + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp)); - - UDEBUG("initial size=%u\n", size); + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t)) /* gid */ + + nla_total_size(plen) /* prefix */ + + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)); + + if (in && skb_mac_header_was_set(skb)) { + size += nla_total_size(skb->dev->hard_header_len) + + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + + nla_total_size(sizeof(u_int16_t)); /* hwlen */ + } spin_lock_bh(&inst->lock); if (inst->flags & NFULNL_CFG_F_SEQ) - size += NFA_SPACE(sizeof(u_int32_t)); + size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) - size += NFA_SPACE(sizeof(u_int32_t)); + size += nla_total_size(sizeof(u_int32_t)); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ - if (qthreshold > li->u.ulog.qthreshold) - qthreshold = li->u.ulog.qthreshold; + if (li->u.ulog.qthreshold) + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + switch (inst->copy_mode) { case NFULNL_COPY_META: @@ -635,41 +684,38 @@ nfulnl_log_packet(unsigned int pf, else data_len = inst->copy_range; - size += NFA_SPACE(data_len); - UDEBUG("copy_packet, therefore size now %u\n", size); + size += nla_total_size(data_len); break; + case NFULNL_COPY_DISABLED: default: goto unlock_and_release; } - if (inst->qlen >= qthreshold || - (inst->skb && size > skb_tailroom(inst->skb))) { + if (inst->skb && + size > skb_tailroom(inst->skb) - sizeof(struct nfgenmsg)) { /* either the queue len is too high or we don't have * enough room in the skb left. flush to userspace. 
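 * We also keep sizeof(struct nfgenmsg) of tailroom in reserve for the * NLMSG_DONE trailer that __nfulnl_send() appends to multipart batches.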
*/ - UDEBUG("flushing old skb\n"); - - /* timer "holds" one reference (we have another one) */ - if (del_timer(&inst->timer)) - instance_put(inst); - __nfulnl_send(inst); + __nfulnl_flush(inst); } if (!inst->skb) { - inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, + inst->nlbufsiz, size); if (!inst->skb) goto alloc_failure; } - UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold); inst->qlen++; - __build_packet_message(inst, skb, data_len, pf, - hooknum, in, out, li, prefix, plen); + __build_packet_message(log, inst, skb, data_len, pf, + hooknum, in, out, prefix, plen); + if (inst->qlen >= qthreshold) + __nfulnl_flush(inst); /* timer_pending always called within inst->lock, so there * is no chance of a race here */ - if (!timer_pending(&inst->timer)) { + else if (!timer_pending(&inst->timer)) { instance_get(inst); inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); add_timer(&inst->timer); @@ -681,35 +727,34 @@ unlock_and_release: return; alloc_failure: - UDEBUG("error allocating skb\n"); /* FIXME: statistics */ goto unlock_and_release; } +EXPORT_SYMBOL_GPL(nfulnl_log_packet); static int nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_log_net *log = nfnl_log_pernet(n->net); - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_NETFILTER && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; - /* destroy all instances for this pid */ - write_lock_bh(&instances_lock); + /* destroy all instances for this portid */ + spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp, *t2; + struct hlist_node *t2; struct nfulnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &log->instance_table[i]; - hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - UDEBUG("node = %p\n", inst); - if (n->pid == inst->peer_pid) + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - write_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } @@ -720,65 +765,61 @@ static struct notifier_block nfulnl_rtnl_notifier = { static int nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[]) + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) { return -ENOTSUPP; } -static struct nf_logger nfulnl_logger = { +static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", .logfn = &nfulnl_log_packet, .me = THIS_MODULE, }; -static const int nfula_min[NFULA_MAX] = { - [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr), - [NFULA_MARK-1] = sizeof(u_int32_t), - [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp), - [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t), - [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t), - [NFULA_IFINDEX_PHYSINDEV-1] = sizeof(u_int32_t), - [NFULA_IFINDEX_PHYSOUTDEV-1] = sizeof(u_int32_t), - [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw), - [NFULA_PAYLOAD-1] = 0, - [NFULA_PREFIX-1] = 0, - [NFULA_UID-1] = sizeof(u_int32_t), - [NFULA_SEQ-1] = sizeof(u_int32_t), - [NFULA_SEQ_GLOBAL-1] = sizeof(u_int32_t), -}; - -static const int nfula_cfg_min[NFULA_CFG_MAX] = { - [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd), - [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode), - [NFULA_CFG_TIMEOUT-1] = 
sizeof(u_int32_t), - [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t), - [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t), - [NFULA_CFG_FLAGS-1] = sizeof(u_int16_t), +static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { + [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) }, + [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) }, + [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 }, + [NFULA_CFG_QTHRESH] = { .type = NLA_U32 }, + [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 }, + [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; static int nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfula[]) + const struct nlmsghdr *nlh, + const struct nlattr * const nfula[]) { - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t group_num = ntohs(nfmsg->res_id); struct nfulnl_instance *inst; + struct nfulnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; - UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + if (nfula[NFULA_CFG_CMD]) { + u_int8_t pf = nfmsg->nfgen_family; + cmd = nla_data(nfula[NFULA_CFG_CMD]); - if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) { - UDEBUG("bad attribute size\n"); - return -EINVAL; + /* Commands without queue context */ + switch (cmd->command) { + case NFULNL_CFG_CMD_PF_BIND: + return nf_log_bind_pf(net, pf, &nfulnl_logger); + case NFULNL_CFG_CMD_PF_UNBIND: + nf_log_unbind_pf(net, pf); + return 0; + } } - inst = instance_lookup_get(group_num); - if (nfula[NFULA_CFG_CMD-1]) { - u_int8_t pf = nfmsg->nfgen_family; - struct nfulnl_msg_config_cmd *cmd; - cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]); - UDEBUG("found CFG_CMD for\n"); + inst = instance_lookup_get(log, group_num); + if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { + ret = -EPERM; + goto out_put; + } + if (cmd != NULL) { switch (cmd->command) { case NFULNL_CFG_CMD_BIND: if (inst) { @@ -786,10 +827,11 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; } - inst = instance_create(group_num, - NETLINK_CB(skb).pid); - if (!inst) { - ret = -EINVAL; + inst = instance_create(net, group_num, + NETLINK_CB(skb).portid, + sk_user_ns(NETLINK_CB(skb).sk)); + if (IS_ERR(inst)) { + ret = PTR_ERR(inst); goto out; } break; @@ -799,78 +841,63 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out; } - if (inst->peer_pid != NETLINK_CB(skb).pid) { - ret = -EPERM; - goto out_put; - } - - instance_destroy(inst); - goto out; - case NFULNL_CFG_CMD_PF_BIND: - UDEBUG("registering log handler for pf=%u\n", pf); - ret = nf_log_register(pf, &nfulnl_logger); - break; - case NFULNL_CFG_CMD_PF_UNBIND: - UDEBUG("unregistering log handler for pf=%u\n", pf); - /* This is a bug and a feature. 
We cannot unregister - * other handlers, like nfnetlink_inst can */ - nf_log_unregister_pf(pf); - break; + instance_destroy(log, inst); + goto out_put; default: - ret = -EINVAL; + ret = -ENOTSUPP; break; } - - if (!inst) - goto out; - } else { - if (!inst) { - UDEBUG("no config command, and no instance for " - "group=%u pid=%u =>ENOENT\n", - group_num, NETLINK_CB(skb).pid); - ret = -ENOENT; - goto out; - } - - if (inst->peer_pid != NETLINK_CB(skb).pid) { - UDEBUG("no config command, and wrong pid\n"); - ret = -EPERM; - goto out_put; - } } - if (nfula[NFULA_CFG_MODE-1]) { + if (nfula[NFULA_CFG_MODE]) { struct nfulnl_msg_config_mode *params; - params = NFA_DATA(nfula[NFULA_CFG_MODE-1]); + params = nla_data(nfula[NFULA_CFG_MODE]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } - if (nfula[NFULA_CFG_TIMEOUT-1]) { - __be32 timeout = - *(__be32 *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]); + if (nfula[NFULA_CFG_TIMEOUT]) { + __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_timeout(inst, ntohl(timeout)); } - if (nfula[NFULA_CFG_NLBUFSIZ-1]) { - __be32 nlbufsiz = - *(__be32 *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]); + if (nfula[NFULA_CFG_NLBUFSIZ]) { + __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } - if (nfula[NFULA_CFG_QTHRESH-1]) { - __be32 qthresh = - *(__be32 *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]); + if (nfula[NFULA_CFG_QTHRESH]) { + __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_qthresh(inst, ntohl(qthresh)); } - if (nfula[NFULA_CFG_FLAGS-1]) { - __be16 flags = - *(__be16 *)NFA_DATA(nfula[NFULA_CFG_FLAGS-1]); + if (nfula[NFULA_CFG_FLAGS]) { + __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]); + + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_flags(inst, ntohs(flags)); } @@ -880,14 +907,15 @@ out: return ret; } -static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { +static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, }, + .attr_count = NFULA_CFG_MAX, + .policy = nfula_cfg_policy }, }; -static struct nfnetlink_subsystem nfulnl_subsys = { +static const struct nfnetlink_subsystem nfulnl_subsys = { .name = "log", .subsys_id = NFNL_SUBSYS_ULOG, .cb_count = NFULNL_MSG_MAX, @@ -896,59 +924,74 @@ static struct nfnetlink_subsystem nfulnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; -static struct hlist_node *get_first(struct iter_state *st) +static struct hlist_node *get_first(struct net *net, struct iter_state *st) { + struct nfnl_log_net *log; if (!st) return NULL; + log = nfnl_log_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return instance_table[st->bucket].first; + struct hlist_head *head = &log->instance_table[st->bucket]; + + if (!hlist_empty(head)) + return rcu_dereference_bh(hlist_first_rcu(head)); } return NULL; } -static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +static struct hlist_node *get_next(struct net *net, struct iter_state *st, + struct hlist_node *h) { - h = h->next; + h = rcu_dereference_bh(hlist_next_rcu(h)); while 
(!h) { + struct nfnl_log_net *log; + struct hlist_head *head; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = instance_table[st->bucket].first; + log = nfnl_log_pernet(net); + head = &log->instance_table[st->bucket]; + h = rcu_dereference_bh(hlist_first_rcu(head)); } return h; } -static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +static struct hlist_node *get_idx(struct net *net, struct iter_state *st, + loff_t pos) { struct hlist_node *head; - head = get_first(st); + head = get_first(net, st); if (head) - while (pos && (head = get_next(st, head))) + while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(rcu_bh) { - read_lock_bh(&instances_lock); - return get_idx(seq->private, *pos); + rcu_read_lock_bh(); + return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - return get_next(s->private, v); + return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) + __releases(rcu_bh) { - read_unlock_bh(&instances_lock); + rcu_read_unlock_bh(); } static int seq_show(struct seq_file *s, void *v) @@ -957,12 +1000,12 @@ static int seq_show(struct seq_file *s, void *v) return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", inst->group_num, - inst->peer_pid, inst->qlen, + inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, inst->flushtimeout, atomic_read(&inst->use)); } -static struct seq_operations nful_seq_ops = { +static const struct seq_operations nful_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, @@ -971,22 +1014,8 @@ static struct seq_operations nful_seq_ops = { static int nful_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct iter_state *is; - int ret; - - is = kzalloc(sizeof(*is), GFP_KERNEL); - if (!is) - return -ENOMEM; - ret = seq_open(file, &nful_seq_ops); - if (ret < 0) - goto out_free; - seq = file->private_data; - seq->private = is; - return ret; -out_free: - kfree(is); - return ret; + return seq_open_net(inode, file, &nful_seq_ops, + sizeof(struct iter_state)); } static const struct file_operations nful_file_ops = { @@ -994,46 +1023,71 @@ static const struct file_operations nful_file_ops = { .open = nful_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_log_init(void) +static int __net_init nfnl_log_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_log_net *log = nfnl_log_pernet(net); + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&log->instance_table[i]); + spin_lock_init(&log->instances_lock); + #ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc_nful; + if (!proc_create("nfnetlink_log", 0440, + net->nf.proc_netfilter, &nful_file_ops)) + return -ENOMEM; #endif + return 0; +} - for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); +static void __net_exit nfnl_log_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +#endif + nf_log_unset(net, &nfulnl_logger); +} + +static struct pernet_operations nfnl_log_net_ops = { + .init = nfnl_log_net_init, + .exit = nfnl_log_net_exit, + .id = &nfnl_log_net_id, + .size = sizeof(struct nfnl_log_net), +}; - /* it's not really all that important to 
have a random value, so - * we can do this from the init function, even if there hasn't - * been that much entropy yet */ - get_random_bytes(&hash_init, sizeof(hash_init)); +static int __init nfnetlink_log_init(void) +{ + int status = -ENOMEM; netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { - printk(KERN_ERR "log: failed to create netlink socket\n"); + pr_err("log: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } -#ifdef CONFIG_PROC_FS - proc_nful = create_proc_entry("nfnetlink_log", 0440, - proc_net_netfilter); - if (!proc_nful) + status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); + if (status < 0) { + pr_err("log: failed to register logger\n"); goto cleanup_subsys; - proc_nful->proc_fops = &nful_file_ops; -#endif + } + + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("log: failed to register pernet ops\n"); + goto cleanup_logger; + } return status; -#ifdef CONFIG_PROC_FS +cleanup_logger: + nf_log_unregister(&nfulnl_logger); cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); -#endif cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); return status; @@ -1041,10 +1095,8 @@ cleanup_netlink_notifier: static void __exit nfnetlink_log_fini(void) { + unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_log", proc_net_netfilter); -#endif nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c deleted file mode 100644 index 7a97bec6772..00000000000 --- a/net/netfilter/nfnetlink_queue.c +++ /dev/null @@ -1,1144 +0,0 @@ -/* - * This is a module which is used for queueing packets and communicating with - * userspace via nfetlink. - * - * (C) 2005 by Harald Welte <laforge@netfilter.org> - * - * Based on the old ipv4-only ip_queue.c: - * (C) 2000-2002 James Morris <jmorris@intercode.com.au> - * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/proc_fs.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_queue.h> -#include <linux/list.h> -#include <net/sock.h> - -#include <asm/atomic.h> - -#ifdef CONFIG_BRIDGE_NETFILTER -#include "../bridge/br_private.h" -#endif - -#define NFQNL_QMAX_DEFAULT 1024 - -#if 0 -#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ - __FILE__, __LINE__, __FUNCTION__, \ - ## args) -#else -#define QDEBUG(x, ...) 
-#endif - -struct nfqnl_queue_entry { - struct list_head list; - struct nf_info *info; - struct sk_buff *skb; - unsigned int id; -}; - -struct nfqnl_instance { - struct hlist_node hlist; /* global list of queues */ - atomic_t use; - - int peer_pid; - unsigned int queue_maxlen; - unsigned int copy_range; - unsigned int queue_total; - unsigned int queue_dropped; - unsigned int queue_user_dropped; - - atomic_t id_sequence; /* 'sequence' of pkt ids */ - - u_int16_t queue_num; /* number of this queue */ - u_int8_t copy_mode; - - spinlock_t lock; - - struct list_head queue_list; /* packets in queue */ -}; - -typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long); - -static DEFINE_RWLOCK(instances_lock); - -#define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; - -static inline u_int8_t instance_hashfn(u_int16_t queue_num) -{ - return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; -} - -static struct nfqnl_instance * -__instance_lookup(u_int16_t queue_num) -{ - struct hlist_head *head; - struct hlist_node *pos; - struct nfqnl_instance *inst; - - head = &instance_table[instance_hashfn(queue_num)]; - hlist_for_each_entry(inst, pos, head, hlist) { - if (inst->queue_num == queue_num) - return inst; - } - return NULL; -} - -static struct nfqnl_instance * -instance_lookup_get(u_int16_t queue_num) -{ - struct nfqnl_instance *inst; - - read_lock_bh(&instances_lock); - inst = __instance_lookup(queue_num); - if (inst) - atomic_inc(&inst->use); - read_unlock_bh(&instances_lock); - - return inst; -} - -static void -instance_put(struct nfqnl_instance *inst) -{ - if (inst && atomic_dec_and_test(&inst->use)) { - QDEBUG("kfree(inst=%p)\n", inst); - kfree(inst); - } -} - -static struct nfqnl_instance * -instance_create(u_int16_t queue_num, int pid) -{ - struct nfqnl_instance *inst; - - QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); - - write_lock_bh(&instances_lock); - if (__instance_lookup(queue_num)) { - inst = NULL; - QDEBUG("aborting, instance already exists\n"); - goto out_unlock; - } - - inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) - goto out_unlock; - - inst->queue_num = queue_num; - inst->peer_pid = pid; - inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = 0xfffff; - inst->copy_mode = NFQNL_COPY_NONE; - atomic_set(&inst->id_sequence, 0); - /* needs to be two, since we _put() after creation */ - atomic_set(&inst->use, 2); - spin_lock_init(&inst->lock); - INIT_LIST_HEAD(&inst->queue_list); - - if (!try_module_get(THIS_MODULE)) - goto out_free; - - hlist_add_head(&inst->hlist, - &instance_table[instance_hashfn(queue_num)]); - - write_unlock_bh(&instances_lock); - - QDEBUG("successfully created new instance\n"); - - return inst; - -out_free: - kfree(inst); -out_unlock: - write_unlock_bh(&instances_lock); - return NULL; -} - -static void nfqnl_flush(struct nfqnl_instance *queue, int verdict); - -static void -_instance_destroy2(struct nfqnl_instance *inst, int lock) -{ - /* first pull it out of the global list */ - if (lock) - write_lock_bh(&instances_lock); - - QDEBUG("removing instance %p (queuenum=%u) from hash\n", - inst, inst->queue_num); - hlist_del(&inst->hlist); - - if (lock) - write_unlock_bh(&instances_lock); - - /* then flush all pending skbs from the queue */ - nfqnl_flush(inst, NF_DROP); - - /* and finally put the refcount */ - instance_put(inst); - - module_put(THIS_MODULE); -} - -static inline void -__instance_destroy(struct nfqnl_instance *inst) -{ - _instance_destroy2(inst, 0); -} - -static inline void 
-instance_destroy(struct nfqnl_instance *inst) -{ - _instance_destroy2(inst, 1); -} - - - -static void -issue_verdict(struct nfqnl_queue_entry *entry, int verdict) -{ - QDEBUG("entering for entry %p, verdict %u\n", entry, verdict); - - /* TCP input path (and probably other bits) assume to be called - * from softirq context, not from syscall, like issue_verdict is - * called. TCP input path deadlocks with locks taken from timer - * softirq, e.g. We therefore emulate this by local_bh_disable() */ - - local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); - local_bh_enable(); - - kfree(entry); -} - -static inline void -__enqueue_entry(struct nfqnl_instance *queue, - struct nfqnl_queue_entry *entry) -{ - list_add(&entry->list, &queue->queue_list); - queue->queue_total++; -} - -/* - * Find and return a queued entry matched by cmpfn, or return the last - * entry if cmpfn is NULL. - */ -static inline struct nfqnl_queue_entry * -__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, - unsigned long data) -{ - struct list_head *p; - - list_for_each_prev(p, &queue->queue_list) { - struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p; - - if (!cmpfn || cmpfn(entry, data)) - return entry; - } - return NULL; -} - -static inline void -__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry) -{ - list_del(&entry->list); - q->queue_total--; -} - -static inline struct nfqnl_queue_entry * -__find_dequeue_entry(struct nfqnl_instance *queue, - nfqnl_cmpfn cmpfn, unsigned long data) -{ - struct nfqnl_queue_entry *entry; - - entry = __find_entry(queue, cmpfn, data); - if (entry == NULL) - return NULL; - - __dequeue_entry(queue, entry); - return entry; -} - - -static inline void -__nfqnl_flush(struct nfqnl_instance *queue, int verdict) -{ - struct nfqnl_queue_entry *entry; - - while ((entry = __find_dequeue_entry(queue, NULL, 0))) - issue_verdict(entry, verdict); -} - -static inline int -__nfqnl_set_mode(struct nfqnl_instance *queue, - unsigned char mode, unsigned int range) -{ - int status = 0; - - switch (mode) { - case NFQNL_COPY_NONE: - case NFQNL_COPY_META: - queue->copy_mode = mode; - queue->copy_range = 0; - break; - - case NFQNL_COPY_PACKET: - queue->copy_mode = mode; - /* we're using struct nfattr which has 16bit nfa_len */ - if (range > 0xffff) - queue->copy_range = 0xffff; - else - queue->copy_range = range; - break; - - default: - status = -EINVAL; - - } - return status; -} - -static struct nfqnl_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, - nfqnl_cmpfn cmpfn, unsigned long data) -{ - struct nfqnl_queue_entry *entry; - - spin_lock_bh(&queue->lock); - entry = __find_dequeue_entry(queue, cmpfn, data); - spin_unlock_bh(&queue->lock); - - return entry; -} - -static void -nfqnl_flush(struct nfqnl_instance *queue, int verdict) -{ - spin_lock_bh(&queue->lock); - __nfqnl_flush(queue, verdict); - spin_unlock_bh(&queue->lock); -} - -static struct sk_buff * -nfqnl_build_packet_message(struct nfqnl_instance *queue, - struct nfqnl_queue_entry *entry, int *errp) -{ - sk_buff_data_t old_tail; - size_t size; - size_t data_len = 0; - struct sk_buff *skb; - struct nfqnl_msg_packet_hdr pmsg; - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct nf_info *entinf = entry->info; - struct sk_buff *entskb = entry->skb; - struct net_device *indev; - struct net_device *outdev; - __be32 tmp_uint; - - QDEBUG("entered\n"); - - /* all macros expand to constant values at compile time */ - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + - + 
NFA_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ -#ifdef CONFIG_BRIDGE_NETFILTER - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ - + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ -#endif - + NFA_SPACE(sizeof(u_int32_t)) /* mark */ - + NFA_SPACE(sizeof(struct nfqnl_msg_packet_hw)) - + NFA_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); - - outdev = entinf->outdev; - - spin_lock_bh(&queue->lock); - - switch (queue->copy_mode) { - case NFQNL_COPY_META: - case NFQNL_COPY_NONE: - data_len = 0; - break; - - case NFQNL_COPY_PACKET: - if ((entskb->ip_summed == CHECKSUM_PARTIAL || - entskb->ip_summed == CHECKSUM_COMPLETE) && - (*errp = skb_checksum_help(entskb))) { - spin_unlock_bh(&queue->lock); - return NULL; - } - if (queue->copy_range == 0 - || queue->copy_range > entskb->len) - data_len = entskb->len; - else - data_len = queue->copy_range; - - size += NFA_SPACE(data_len); - break; - - default: - *errp = -EINVAL; - spin_unlock_bh(&queue->lock); - return NULL; - } - - spin_unlock_bh(&queue->lock); - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - goto nlmsg_failure; - - old_tail = skb->tail; - nlh = NLMSG_PUT(skb, 0, 0, - NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, - sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - nfmsg->nfgen_family = entinf->pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); - - pmsg.packet_id = htonl(entry->id); - pmsg.hw_protocol = entskb->protocol; - pmsg.hook = entinf->hook; - - NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); - - indev = entinf->indev; - if (indev) { - tmp_uint = htonl(indev->ifindex); -#ifndef CONFIG_BRIDGE_NETFILTER - NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); -#else - if (entinf->pf == PF_BRIDGE) { - /* Case 1: indev is physical input device, we need to - * look for bridge group (when called from - * netfilter_bridge) */ - NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), - &tmp_uint); - /* this is the bridge group "brX" */ - tmp_uint = htonl(indev->br_port->br->dev->ifindex); - NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), - &tmp_uint); - } else { - /* Case 2: indev is bridge group, we need to look for - * physical device (when called from ipv4) */ - NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), - &tmp_uint); - if (entskb->nf_bridge - && entskb->nf_bridge->physindev) { - tmp_uint = htonl(entskb->nf_bridge->physindev->ifindex); - NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, - sizeof(tmp_uint), &tmp_uint); - } - } -#endif - } - - if (outdev) { - tmp_uint = htonl(outdev->ifindex); -#ifndef CONFIG_BRIDGE_NETFILTER - NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); -#else - if (entinf->pf == PF_BRIDGE) { - /* Case 1: outdev is physical output device, we need to - * look for bridge group (when called from - * netfilter_bridge) */ - NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), - &tmp_uint); - /* this is the bridge group "brX" */ - tmp_uint = htonl(outdev->br_port->br->dev->ifindex); - NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), - &tmp_uint); - } else { - /* Case 2: outdev is bridge group, we need to look for - * physical output device (when called from ipv4) */ - NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), - &tmp_uint); - if (entskb->nf_bridge - && entskb->nf_bridge->physoutdev) { - tmp_uint = htonl(entskb->nf_bridge->physoutdev->ifindex); - NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, - sizeof(tmp_uint), &tmp_uint); - } - } -#endif - } - - if (entskb->mark) { - 
tmp_uint = htonl(entskb->mark); - NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); - } - - if (indev && entskb->dev - && entskb->dev->hard_header_parse) { - struct nfqnl_msg_packet_hw phw; - - int len = entskb->dev->hard_header_parse(entskb, - phw.hw_addr); - phw.hw_addrlen = htons(len); - NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); - } - - if (entskb->tstamp.tv64) { - struct nfqnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(entskb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); - - NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); - } - - if (data_len) { - struct nfattr *nfa; - int size = NFA_LENGTH(data_len); - - if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) { - printk(KERN_WARNING "nf_queue: no tailroom!\n"); - goto nlmsg_failure; - } - - nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); - nfa->nfa_type = NFQA_PAYLOAD; - nfa->nfa_len = size; - - if (skb_copy_bits(entskb, 0, NFA_DATA(nfa), data_len)) - BUG(); - } - - nlh->nlmsg_len = skb->tail - old_tail; - return skb; - -nlmsg_failure: -nfattr_failure: - if (skb) - kfree_skb(skb); - *errp = -EINVAL; - if (net_ratelimit()) - printk(KERN_ERR "nf_queue: error creating packet message\n"); - return NULL; -} - -static int -nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, - unsigned int queuenum, void *data) -{ - int status = -EINVAL; - struct sk_buff *nskb; - struct nfqnl_instance *queue; - struct nfqnl_queue_entry *entry; - - QDEBUG("entered\n"); - - queue = instance_lookup_get(queuenum); - if (!queue) { - QDEBUG("no queue instance matching\n"); - return -EINVAL; - } - - if (queue->copy_mode == NFQNL_COPY_NONE) { - QDEBUG("mode COPY_NONE, aborting\n"); - status = -EAGAIN; - goto err_out_put; - } - - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) { - if (net_ratelimit()) - printk(KERN_ERR - "nf_queue: OOM in nfqnl_enqueue_packet()\n"); - status = -ENOMEM; - goto err_out_put; - } - - entry->info = info; - entry->skb = skb; - entry->id = atomic_inc_return(&queue->id_sequence); - - nskb = nfqnl_build_packet_message(queue, entry, &status); - if (nskb == NULL) - goto err_out_free; - - spin_lock_bh(&queue->lock); - - if (!queue->peer_pid) - goto err_out_free_nskb; - - if (queue->queue_total >= queue->queue_maxlen) { - queue->queue_dropped++; - status = -ENOSPC; - if (net_ratelimit()) - printk(KERN_WARNING "nf_queue: full at %d entries, " - "dropping packets(s). 
Dropped: %d\n", - queue->queue_total, queue->queue_dropped); - goto err_out_free_nskb; - } - - /* nfnetlink_unicast will either free the nskb or add it to a socket */ - status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); - if (status < 0) { - queue->queue_user_dropped++; - goto err_out_unlock; - } - - __enqueue_entry(queue, entry); - - spin_unlock_bh(&queue->lock); - instance_put(queue); - return status; - -err_out_free_nskb: - kfree_skb(nskb); - -err_out_unlock: - spin_unlock_bh(&queue->lock); - -err_out_free: - kfree(entry); -err_out_put: - instance_put(queue); - return status; -} - -static int -nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) -{ - int diff; - - diff = data_len - e->skb->len; - if (diff < 0) { - if (pskb_trim(e->skb, data_len)) - return -ENOMEM; - } else if (diff > 0) { - if (data_len > 0xFFFF) - return -EINVAL; - if (diff > skb_tailroom(e->skb)) { - struct sk_buff *newskb; - - newskb = skb_copy_expand(e->skb, - skb_headroom(e->skb), - diff, - GFP_ATOMIC); - if (newskb == NULL) { - printk(KERN_WARNING "nf_queue: OOM " - "in mangle, dropping packet\n"); - return -ENOMEM; - } - if (e->skb->sk) - skb_set_owner_w(newskb, e->skb->sk); - kfree_skb(e->skb); - e->skb = newskb; - } - skb_put(e->skb, diff); - } - if (!skb_make_writable(&e->skb, data_len)) - return -ENOMEM; - skb_copy_to_linear_data(e->skb, data, data_len); - e->skb->ip_summed = CHECKSUM_NONE; - return 0; -} - -static inline int -id_cmp(struct nfqnl_queue_entry *e, unsigned long id) -{ - return (id == e->id); -} - -static int -nfqnl_set_mode(struct nfqnl_instance *queue, - unsigned char mode, unsigned int range) -{ - int status; - - spin_lock_bh(&queue->lock); - status = __nfqnl_set_mode(queue, mode, range); - spin_unlock_bh(&queue->lock); - - return status; -} - -static int -dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) -{ - struct nf_info *entinf = entry->info; - - if (entinf->indev) - if (entinf->indev->ifindex == ifindex) - return 1; - if (entinf->outdev) - if (entinf->outdev->ifindex == ifindex) - return 1; -#ifdef CONFIG_BRIDGE_NETFILTER - if (entry->skb->nf_bridge) { - if (entry->skb->nf_bridge->physindev && - entry->skb->nf_bridge->physindev->ifindex == ifindex) - return 1; - if (entry->skb->nf_bridge->physoutdev && - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) - return 1; - } -#endif - return 0; -} - -/* drop all packets with either indev or outdev == ifindex from all queue - * instances */ -static void -nfqnl_dev_drop(int ifindex) -{ - int i; - - QDEBUG("entering for ifindex %u\n", ifindex); - - /* this only looks like we have to hold the readlock for a way too long - * time, issue_verdict(), nf_reinject(), ... 
- but we always only - * issue NF_DROP, which is processed directly in nf_reinject() */ - read_lock_bh(&instances_lock); - - for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp; - struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; - - hlist_for_each_entry(inst, tmp, head, hlist) { - struct nfqnl_queue_entry *entry; - while ((entry = find_dequeue_entry(inst, dev_cmp, - ifindex)) != NULL) - issue_verdict(entry, NF_DROP); - } - } - - read_unlock_bh(&instances_lock); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static int -nfqnl_rcv_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = ptr; - - /* Drop any packets associated with the downed device */ - if (event == NETDEV_DOWN) - nfqnl_dev_drop(dev->ifindex); - return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_dev_notifier = { - .notifier_call = nfqnl_rcv_dev_event, -}; - -static int -nfqnl_rcv_nl_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct netlink_notify *n = ptr; - - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_NETFILTER && n->pid) { - int i; - - /* destroy all instances for this pid */ - write_lock_bh(&instances_lock); - for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp, *t2; - struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; - - hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if (n->pid == inst->peer_pid) - __instance_destroy(inst); - } - } - write_unlock_bh(&instances_lock); - } - return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_rtnl_notifier = { - .notifier_call = nfqnl_rcv_nl_event, -}; - -static const int nfqa_verdict_min[NFQA_MAX] = { - [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr), - [NFQA_MARK-1] = sizeof(u_int32_t), - [NFQA_PAYLOAD-1] = 0, -}; - -static int -nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[]) -{ - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - - struct nfqnl_msg_verdict_hdr *vhdr; - struct nfqnl_instance *queue; - unsigned int verdict; - struct nfqnl_queue_entry *entry; - int err; - - if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) { - QDEBUG("bad attribute size\n"); - return -EINVAL; - } - - queue = instance_lookup_get(queue_num); - if (!queue) - return -ENODEV; - - if (queue->peer_pid != NETLINK_CB(skb).pid) { - err = -EPERM; - goto err_out_put; - } - - if (!nfqa[NFQA_VERDICT_HDR-1]) { - err = -EINVAL; - goto err_out_put; - } - - vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); - verdict = ntohl(vhdr->verdict); - - if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { - err = -EINVAL; - goto err_out_put; - } - - entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); - if (entry == NULL) { - err = -ENOENT; - goto err_out_put; - } - - if (nfqa[NFQA_PAYLOAD-1]) { - if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), - NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0) - verdict = NF_DROP; - } - - if (nfqa[NFQA_MARK-1]) - entry->skb->mark = ntohl(*(__be32 *) - NFA_DATA(nfqa[NFQA_MARK-1])); - - issue_verdict(entry, verdict); - instance_put(queue); - return 0; - -err_out_put: - instance_put(queue); - return err; -} - -static int -nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[]) -{ - return -ENOTSUPP; -} - -static const int nfqa_cfg_min[NFQA_CFG_MAX] = { - [NFQA_CFG_CMD-1] = sizeof(struct 
nfqnl_msg_config_cmd), - [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params), -}; - -static struct nf_queue_handler nfqh = { - .name = "nf_queue", - .outfn = &nfqnl_enqueue_packet, -}; - -static int -nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[]) -{ - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_instance *queue; - int ret = 0; - - QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); - - if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) { - QDEBUG("bad attribute size\n"); - return -EINVAL; - } - - queue = instance_lookup_get(queue_num); - if (nfqa[NFQA_CFG_CMD-1]) { - struct nfqnl_msg_config_cmd *cmd; - cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); - QDEBUG("found CFG_CMD\n"); - - switch (cmd->command) { - case NFQNL_CFG_CMD_BIND: - if (queue) - return -EBUSY; - - queue = instance_create(queue_num, NETLINK_CB(skb).pid); - if (!queue) - return -EINVAL; - break; - case NFQNL_CFG_CMD_UNBIND: - if (!queue) - return -ENODEV; - - if (queue->peer_pid != NETLINK_CB(skb).pid) { - ret = -EPERM; - goto out_put; - } - - instance_destroy(queue); - break; - case NFQNL_CFG_CMD_PF_BIND: - QDEBUG("registering queue handler for pf=%u\n", - ntohs(cmd->pf)); - ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh); - break; - case NFQNL_CFG_CMD_PF_UNBIND: - QDEBUG("unregistering queue handler for pf=%u\n", - ntohs(cmd->pf)); - /* This is a bug and a feature. We can unregister - * other handlers(!) */ - ret = nf_unregister_queue_handler(ntohs(cmd->pf)); - break; - default: - ret = -EINVAL; - break; - } - } else { - if (!queue) { - QDEBUG("no config command, and no instance ENOENT\n"); - ret = -ENOENT; - goto out_put; - } - - if (queue->peer_pid != NETLINK_CB(skb).pid) { - QDEBUG("no config command, and wrong pid\n"); - ret = -EPERM; - goto out_put; - } - } - - if (nfqa[NFQA_CFG_PARAMS-1]) { - struct nfqnl_msg_config_params *params; - - if (!queue) { - ret = -ENOENT; - goto out_put; - } - params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]); - nfqnl_set_mode(queue, params->copy_mode, - ntohl(params->copy_range)); - } - - if (nfqa[NFQA_CFG_QUEUE_MAXLEN-1]) { - __be32 *queue_maxlen; - queue_maxlen = NFA_DATA(nfqa[NFQA_CFG_QUEUE_MAXLEN-1]); - spin_lock_bh(&queue->lock); - queue->queue_maxlen = ntohl(*queue_maxlen); - spin_unlock_bh(&queue->lock); - } - -out_put: - instance_put(queue); - return ret; -} - -static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { - [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, }, -}; - -static struct nfnetlink_subsystem nfqnl_subsys = { - .name = "nf_queue", - .subsys_id = NFNL_SUBSYS_QUEUE, - .cb_count = NFQNL_MSG_MAX, - .cb = nfqnl_cb, -}; - -#ifdef CONFIG_PROC_FS -struct iter_state { - unsigned int bucket; -}; - -static struct hlist_node *get_first(struct seq_file *seq) -{ - struct iter_state *st = seq->private; - - if (!st) - return NULL; - - for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return instance_table[st->bucket].first; - } - return NULL; -} - -static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) -{ - struct iter_state *st = seq->private; - - h = h->next; - while (!h) { - if (++st->bucket >= INSTANCE_BUCKETS) - return NULL; - - h = 
instance_table[st->bucket].first; - } - return h; -} - -static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_node *head; - head = get_first(seq); - - if (head) - while (pos && (head = get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock_bh(&instances_lock); - return get_idx(seq, *pos); -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - return get_next(s, v); -} - -static void seq_stop(struct seq_file *s, void *v) -{ - read_unlock_bh(&instances_lock); -} - -static int seq_show(struct seq_file *s, void *v) -{ - const struct nfqnl_instance *inst = v; - - return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", - inst->queue_num, - inst->peer_pid, inst->queue_total, - inst->copy_mode, inst->copy_range, - inst->queue_dropped, inst->queue_user_dropped, - atomic_read(&inst->id_sequence), - atomic_read(&inst->use)); -} - -static struct seq_operations nfqnl_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nfqnl_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - struct iter_state *is; - int ret; - - is = kzalloc(sizeof(*is), GFP_KERNEL); - if (!is) - return -ENOMEM; - ret = seq_open(file, &nfqnl_seq_ops); - if (ret < 0) - goto out_free; - seq = file->private_data; - seq->private = is; - return ret; -out_free: - kfree(is); - return ret; -} - -static const struct file_operations nfqnl_file_ops = { - .owner = THIS_MODULE, - .open = nfqnl_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif /* PROC_FS */ - -static int __init nfnetlink_queue_init(void) -{ - int i, status = -ENOMEM; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc_nfqueue; -#endif - - for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); - - netlink_register_notifier(&nfqnl_rtnl_notifier); - status = nfnetlink_subsys_register(&nfqnl_subsys); - if (status < 0) { - printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } - -#ifdef CONFIG_PROC_FS - proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440, - proc_net_netfilter); - if (!proc_nfqueue) - goto cleanup_subsys; - proc_nfqueue->proc_fops = &nfqnl_file_ops; -#endif - - register_netdevice_notifier(&nfqnl_dev_notifier); - return status; - -#ifdef CONFIG_PROC_FS -cleanup_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); -#endif -cleanup_netlink_notifier: - netlink_unregister_notifier(&nfqnl_rtnl_notifier); - return status; -} - -static void __exit nfnetlink_queue_fini(void) -{ - nf_unregister_queue_handlers(&nfqh); - unregister_netdevice_notifier(&nfqnl_dev_notifier); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_queue", proc_net_netfilter); -#endif - nfnetlink_subsys_unregister(&nfqnl_subsys); - netlink_unregister_notifier(&nfqnl_rtnl_notifier); -} - -MODULE_DESCRIPTION("netfilter packet queue handler"); -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); - -module_init(nfnetlink_queue_init); -module_exit(nfnetlink_queue_fini); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c new file mode 100644 index 00000000000..108120f216b --- /dev/null +++ b/net/netfilter/nfnetlink_queue_core.c @@ -0,0 +1,1352 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via 
nfnetlink. + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2007 by Patrick McHardy <kaber@trash.net> + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris <jmorris@intercode.com.au> + * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/proc_fs.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <linux/list.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <net/netfilter/nf_queue.h> +#include <net/netns/generic.h> +#include <net/netfilter/nfnetlink_queue.h> + +#include <linux/atomic.h> + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +/* We're using struct nlattr which has 16bit nla_len. Note that nla_len + * includes the header length. Thus, the maximum packet length that we + * support is 65531 bytes. We send truncated packets if the specified length + * is larger than that. Userspace can check for presence of NFQA_CAP_LEN + * attribute to detect truncation. + */ +#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + struct rcu_head rcu; + + int peer_portid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; + u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. 
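
The NFQA_CAP_LEN convention described in the comment above is easy to consume from userspace: the kernel only emits the attribute when the payload was cut down to the copy range. A minimal sketch, assuming the per-packet attributes were already collected into attr[] with libmnl's mnl_attr_parse(); the helper name is illustrative:

        #include <stdint.h>
        #include <arpa/inet.h>
        #include <libmnl/libmnl.h>
        #include <linux/netfilter/nfnetlink_queue.h>

        /* Nonzero if the queued packet was truncated to the copy range.
         * attr[] holds NFQA_MAX+1 slots filled by mnl_attr_parse(). */
        static int payload_was_truncated(struct nlattr **attr)
        {
                uint32_t cap_len;

                if (!attr[NFQA_CAP_LEN] || !attr[NFQA_PAYLOAD])
                        return 0;
                /* presence already implies truncation; compare defensively */
                cap_len = ntohl(mnl_attr_get_u32(attr[NFQA_CAP_LEN]));
                return cap_len > mnl_attr_get_payload_len(attr[NFQA_PAYLOAD]);
        }
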
+ */ + spinlock_t lock; + unsigned int queue_total; + unsigned int id_sequence; /* 'sequence' of pkt ids */ + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); + +static int nfnl_queue_net_id __read_mostly; + +#define INSTANCE_BUCKETS 16 +struct nfnl_queue_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ + return net_generic(net, nfnl_queue_net_id); +} + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) +{ + struct hlist_head *head; + struct nfqnl_instance *inst; + + head = &q->instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, + int portid) +{ + struct nfqnl_instance *inst; + unsigned int h; + int err; + + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { + err = -EEXIST; + goto out_unlock; + } + + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + inst->queue_num = queue_num; + inst->peer_portid = portid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = NFQNL_MAX_COPY_RANGE; + inst->copy_mode = NFQNL_COPY_NONE; + spin_lock_init(&inst->lock); + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) { + err = -EAGAIN; + goto out_free; + } + + h = instance_hashfn(queue_num); + hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); + + spin_unlock(&q->instances_lock); + + return inst; + +out_free: + kfree(inst); +out_unlock: + spin_unlock(&q->instances_lock); + return ERR_PTR(err); +} + +static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data); + +static void +instance_destroy_rcu(struct rcu_head *head) +{ + struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, + rcu); + + nfqnl_flush(inst, NULL, 0); + kfree(inst); + module_put(THIS_MODULE); +} + +static void +__instance_destroy(struct nfqnl_instance *inst) +{ + hlist_del_rcu(&inst->hlist); + call_rcu(&inst->rcu, instance_destroy_rcu); +} + +static void +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) +{ + spin_lock(&q->instances_lock); + __instance_destroy(inst); + spin_unlock(&q->instances_lock); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +static void +__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_del(&entry->list); + queue->queue_total--; +} + +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue->lock); + + list_for_each_entry(i, &queue->queue_list, list) { + if (i->id == id) { + entry = i; + break; + } + } + + if (entry) + __dequeue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + spin_lock_bh(&queue->lock); + 
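
One subtle fix in the rewrite: instance_hashfn() now XORs the two bytes of queue_num where the old code ORed them. OR can only set bits, so it skews buckets toward the high indices (bucket 0 is reachable only when both low nibbles are zero); XOR spreads queue numbers evenly over the 16 buckets. A quick standalone check of the two variants:

        #include <stdio.h>

        int main(void)
        {
                unsigned int q = 0x0101;        /* queue number 257 */

                printf("old: %u\n", ((q >> 8) | q) % 16);       /* prints 1 */
                printf("new: %u\n", ((q >> 8) ^ q) % 16);       /* prints 0 */
                return 0;
        }
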
list_for_each_entry_safe(entry, next, &queue->queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue->queue_total--; + nf_reinject(entry, NF_DROP); + } + } + spin_unlock_bh(&queue->lock); +} + +static int +nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, + bool csum_verify) +{ + __u32 flags = 0; + + if (packet->ip_summed == CHECKSUM_PARTIAL) + flags = NFQA_SKB_CSUMNOTREADY; + else if (csum_verify) + flags = NFQA_SKB_CSUM_NOTVERIFIED; + + if (skb_is_gso(packet)) + flags |= NFQA_SKB_GSO; + + return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ + const struct cred *cred; + + if (sk->sk_state == TCP_TIME_WAIT) + return 0; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + cred = sk->sk_socket->file->f_cred; + if (nla_put_be32(skb, NFQA_UID, + htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) + goto nla_put_failure; + if (nla_put_be32(skb, NFQA_GID, + htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) + goto nla_put_failure; + } + read_unlock_bh(&sk->sk_callback_lock); + return 0; + +nla_put_failure: + read_unlock_bh(&sk->sk_callback_lock); + return -1; +} + +static struct sk_buff * +nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry, + __be32 **packet_id_ptr) +{ + size_t size; + size_t data_len = 0, cap_len = 0; + unsigned int hlen = 0; + struct sk_buff *skb; + struct nlattr *nla; + struct nfqnl_msg_packet_hdr *pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *entskb = entry->skb; + struct net_device *indev; + struct net_device *outdev; + struct nf_conn *ct = NULL; + enum ip_conntrack_info uninitialized_var(ctinfo); + bool csum_verify; + + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + + nla_total_size(sizeof(u_int32_t)); /* cap_len */ + + if (entskb->tstamp.tv64) + size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + if (entry->hook <= NF_INET_FORWARD || + (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) + csum_verify = !skb_csum_unnecessary(entskb); + else + csum_verify = false; + + outdev = entry->outdev; + + switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + break; + + case NFQNL_COPY_PACKET: + if (!(queue->flags & NFQA_CFG_F_GSO) && + entskb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(entskb)) + return NULL; + + data_len = ACCESS_ONCE(queue->copy_range); + if (data_len > entskb->len) + data_len = entskb->len; + + hlen = skb_zerocopy_headlen(entskb); + hlen = min_t(unsigned int, hlen, data_len); + size += sizeof(struct nlattr) + hlen; + cap_len = entskb->len; + break; + } + + if (queue->flags & NFQA_CFG_F_CONNTRACK) + ct = nfqnl_ct_get(entskb, &size, &ctinfo); + + if (queue->flags & NFQA_CFG_F_UID_GID) { + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t))); /* gid */ + } + + skb = nfnetlink_alloc_skb(net, size, 
queue->peer_portid, + GFP_ATOMIC); + if (!skb) { + skb_tx_error(entskb); + return NULL; + } + + nlh = nlmsg_put(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg), 0); + if (!nlh) { + skb_tx_error(entskb); + kfree_skb(skb); + return NULL; + } + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = entry->pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); + pmsg = nla_data(nla); + pmsg->hw_protocol = entskb->protocol; + pmsg->hook = entry->hook; + *packet_id_ptr = &pmsg->packet_id; + + indev = entry->indev; + if (indev) { +#ifndef CONFIG_BRIDGE_NETFILTER + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) + goto nla_put_failure; +#else + if (entry->pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)) || + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex))) + goto nla_put_failure; + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; + if (entskb->nf_bridge && entskb->nf_bridge->physindev && + nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(entskb->nf_bridge->physindev->ifindex))) + goto nla_put_failure; + } +#endif + } + + if (outdev) { +#ifndef CONFIG_BRIDGE_NETFILTER + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) + goto nla_put_failure; +#else + if (entry->pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)) || + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) + goto nla_put_failure; + } else { + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; + if (entskb->nf_bridge && entskb->nf_bridge->physoutdev && + nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(entskb->nf_bridge->physoutdev->ifindex))) + goto nla_put_failure; + } +#endif + } + + if (entskb->mark && + nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) + goto nla_put_failure; + + if (indev && entskb->dev && + entskb->mac_header != entskb->network_header) { + struct nfqnl_msg_packet_hw phw; + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(entskb, phw.hw_addr); + if (len) { + phw.hw_addrlen = htons(len); + if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) + goto nla_put_failure; + } + } + + if (entskb->tstamp.tv64) { + struct nfqnl_msg_packet_timestamp ts; + struct timeval tv = ktime_to_timeval(entskb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); + + if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; + } + + if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && + nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) + goto nla_put_failure; + + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) + goto nla_put_failure; + + if (cap_len > data_len && 
+ nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + goto nla_put_failure; + + if (nfqnl_put_packet_info(skb, entskb, csum_verify)) + goto nla_put_failure; + + if (data_len) { + struct nlattr *nla; + + if (skb_tailroom(skb) < sizeof(*nla) + hlen) + goto nla_put_failure; + + nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); + nla->nla_type = NFQA_PAYLOAD; + nla->nla_len = nla_attr_size(data_len); + + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; + } + + nlh->nlmsg_len = skb->len; + return skb; + +nla_put_failure: + skb_tx_error(entskb); + kfree_skb(skb); + net_err_ratelimited("nf_queue: error creating packet message\n"); + return NULL; +} + +static int +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry) +{ + struct sk_buff *nskb; + int err = -ENOBUFS; + __be32 *packet_id_ptr; + int failopen = 0; + + nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); + if (nskb == NULL) { + err = -ENOMEM; + goto err_out; + } + spin_lock_bh(&queue->lock); + + if (queue->queue_total >= queue->queue_maxlen) { + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_dropped++; + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + } + goto err_out_free_nskb; + } + entry->id = ++queue->id_sequence; + *packet_id_ptr = htonl(entry->id); + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + if (err < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + return 0; + +err_out_free_nskb: + kfree_skb(nskb); +err_out_unlock: + spin_unlock_bh(&queue->lock); + if (failopen) + nf_reinject(entry, NF_ACCEPT); +err_out: + return err; +} + +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ + struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); + if (entry) { + if (nf_queue_entry_get_refs(entry)) + return entry; + kfree(entry); + } + return NULL; +} + +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. 
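
The NFQA_CFG_F_FAIL_OPEN handling in __nfqnl_enqueue_packet() above inverts the default overflow policy: when the queue is full the packet is reinjected with NF_ACCEPT instead of being counted as dropped. Userspace opts in per queue; with a libnetfilter_queue recent enough to expose nfq_set_queue_flags(), that is a single call (sketch, error handling elided):

        #include <linux/netfilter/nfnetlink_queue.h>
        #include <libnetfilter_queue/libnetfilter_queue.h>

        /* Let a full queue ACCEPT rather than drop; qh comes from
         * nfq_create_queue(). Returns negative on error, like the library. */
        static int enable_fail_open(struct nfq_q_handle *qh)
        {
                return nfq_set_queue_flags(qh, NFQA_CFG_F_FAIL_OPEN,
                                           NFQA_CFG_F_FAIL_OPEN);
        }
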
+ */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, + struct sk_buff *skb, struct nf_queue_entry *entry) +{ + int ret = -ENOMEM; + struct nf_queue_entry *entry_seg; + + nf_bridge_adjust_segmented_data(skb); + + if (skb->next == NULL) { /* last packet, no need to copy entry */ + struct sk_buff *gso_skb = entry->skb; + entry->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry); + if (ret) + entry->skb = gso_skb; + return ret; + } + + skb->next = NULL; + + entry_seg = nf_queue_entry_dup(entry); + if (entry_seg) { + entry_seg->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry_seg); + if (ret) + free_entry(entry_seg); + } + return ret; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + unsigned int queued; + struct nfqnl_instance *queue; + struct sk_buff *skb, *segs; + int err = -ENOBUFS; + struct net *net = dev_net(entry->indev ? + entry->indev : entry->outdev); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(q, queuenum); + if (!queue) + return -ESRCH; + + if (queue->copy_mode == NFQNL_COPY_NONE) + return -EINVAL; + + skb = entry->skb; + + switch (entry->pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + return __nfqnl_enqueue_packet(net, queue, entry); + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to + * mean 'ignore this hook'. 
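
For orientation, the enqueue path above pairs with a userspace consumer such as the following minimal libnetfilter_queue loop (a sketch: queue 0, every packet accepted, error handling omitted). Note that this patch makes NFQNL_CFG_CMD_PF_BIND/PF_UNBIND no-ops in the config handler, so the classic nfq_bind_pf() preamble from older examples is compatibility-only and skipped here:

        #include <stdint.h>
        #include <sys/socket.h>
        #include <arpa/inet.h>
        #include <linux/netfilter.h>            /* NF_ACCEPT */
        #include <libnetfilter_queue/libnetfilter_queue.h>

        static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
                      struct nfq_data *nfa, void *data)
        {
                struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);
                uint32_t id = ph ? ntohl(ph->packet_id) : 0;

                return nfq_set_verdict(qh, id, NF_ACCEPT, 0, NULL);
        }

        int main(void)
        {
                struct nfq_handle *h = nfq_open();
                struct nfq_q_handle *qh = nfq_create_queue(h, 0, &cb, NULL);
                char buf[65536];
                int fd = nfq_fd(h), n;

                nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);
                while ((n = recv(fd, buf, sizeof(buf), 0)) >= 0)
                        nfq_handle_packet(h, buf, n);
                nfq_destroy_queue(qh);
                nfq_close(h);
                return 0;
        }
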
+ */ + if (IS_ERR(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + if (err == 0) + err = __nfqnl_enqueue_packet_gso(net, queue, + segs, entry); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + if (err) /* some segments are already queued */ + free_entry(entry); + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +{ + struct sk_buff *nskb; + + if (diff < 0) { + if (pskb_trim(e->skb, data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "nf_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, data, data_len); + e->skb->ip_summed = CHECKSUM_NONE; + return 0; +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + spin_lock_bh(&queue->lock); + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + if (range == 0 || range > NFQNL_MAX_COPY_RANGE) + queue->copy_range = NFQNL_MAX_COPY_RANGE; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->indev) + if (entry->indev->ifindex == ifindex) + return 1; + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) + return 1; +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(struct net *net, int ifindex) +{ + int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + rcu_read_lock(); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, dev_cmp, ifindex); + } + + rcu_read_unlock(); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev_net(dev), dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + 
/* destroy all instances for this portid */ + spin_lock(&q->instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) + __instance_destroy(inst); + } + } + spin_unlock(&q->instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, + [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, + [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, +}; + +static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, +}; + +static struct nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) +{ + struct nfqnl_instance *queue; + + queue = instance_lookup(q, queue_num); + if (!queue) + return ERR_PTR(-ENODEV); + + if (queue->peer_portid != nlportid) + return ERR_PTR(-EPERM); + + return queue; +} + +static struct nfqnl_msg_verdict_hdr* +verdicthdr_get(const struct nlattr * const nfqa[]) +{ + struct nfqnl_msg_verdict_hdr *vhdr; + unsigned int verdict; + + if (!nfqa[NFQA_VERDICT_HDR]) + return NULL; + + vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); + verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; + if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) + return NULL; + return vhdr; +} + +static int nfq_id_after(unsigned int id, unsigned int max) +{ + return (int)(id - max) > 0; +} + +static int +nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_queue_entry *entry, *tmp; + unsigned int verdict, maxid; + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + LIST_HEAD(batch_list); + u16 queue_num = ntohs(nfmsg->res_id); + + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + maxid = ntohl(vhdr->id); + + spin_lock_bh(&queue->lock); + + list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { + if (nfq_id_after(entry->id, maxid)) + break; + __dequeue_entry(queue, entry); + list_add_tail(&entry->list, &batch_list); + } + + spin_unlock_bh(&queue->lock); + + if (list_empty(&batch_list)) + return -ENOENT; + + list_for_each_entry_safe(entry, tmp, &batch_list, list) { + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + nf_reinject(entry, verdict); + } + return 0; +} + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nf_queue_entry *entry; + enum ip_conntrack_info uninitialized_var(ctinfo); + struct nf_conn *ct = NULL; + + struct net *net = sock_net(ctnl); + 
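
nfqnl_recv_verdict_batch() above applies one verdict to every queued entry whose id does not come after the given maximum, with nfq_id_after() doing the wraparound-safe comparison. From userspace this maps onto the batch helper that newer libnetfilter_queue versions ship; a hedged sketch, assuming that library support is present:

        #include <stdint.h>
        #include <linux/netfilter.h>
        #include <libnetfilter_queue/libnetfilter_queue.h>

        /* Accept everything queued up to and including packet id maxid. */
        static int accept_up_to(struct nfq_q_handle *qh, uint32_t maxid)
        {
                return nfq_set_verdict_batch(qh, maxid, NF_ACCEPT);
        }
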
struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = instance_lookup(q, queue_num); + if (!queue) + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + if (entry == NULL) + return -ENOENT; + + if (nfqa[NFQA_CT]) { + ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); + if (ct && nfqa[NFQA_EXP]) { + nfqnl_attach_expect(ct, nfqa[NFQA_EXP], + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } + } + + if (nfqa[NFQA_PAYLOAD]) { + u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); + int diff = payload_len - entry->skb->len; + + if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), + payload_len, entry, diff) < 0) + verdict = NF_DROP; + + if (ct) + nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff); + } + + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + + nf_reinject(entry, verdict); + return 0; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { + [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, + [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, +}; + +static const struct nf_queue_handler nfqh = { + .outfn = &nfqnl_enqueue_packet, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + struct nfqnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int ret = 0; + + if (nfqa[NFQA_CFG_CMD]) { + cmd = nla_data(nfqa[NFQA_CFG_CMD]); + + /* Obsolete commands without queue context */ + switch (cmd->command) { + case NFQNL_CFG_CMD_PF_BIND: return 0; + case NFQNL_CFG_CMD_PF_UNBIND: return 0; + } + } + + rcu_read_lock(); + queue = instance_lookup(q, queue_num); + if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { + ret = -EPERM; + goto err_out_unlock; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) { + ret = -EBUSY; + goto err_out_unlock; + } + queue = instance_create(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) { + ret = PTR_ERR(queue); + goto err_out_unlock; + } + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + instance_destroy(q, queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + case NFQNL_CFG_CMD_PF_UNBIND: + break; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfqa[NFQA_CFG_PARAMS]) { + struct nfqnl_msg_config_params *params; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + params = nla_data(nfqa[NFQA_CFG_PARAMS]); + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { + __be32 *queue_maxlen; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); + spin_lock_bh(&queue->lock); + queue->queue_maxlen = ntohl(*queue_maxlen); + spin_unlock_bh(&queue->lock); + } + + if (nfqa[NFQA_CFG_FLAGS]) { + __u32 flags, mask; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + + if 
(!nfqa[NFQA_CFG_MASK]) { + /* A mask is needed to specify which flags are being + * changed. + */ + ret = -EINVAL; + goto err_out_unlock; + } + + flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); + mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); + + if (flags >= NFQA_CFG_F_MAX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } + + spin_lock_bh(&queue->lock); + queue->flags &= ~mask; + queue->flags |= flags & mask; + spin_unlock_bh(&queue->lock); + } + +err_out_unlock: + rcu_read_unlock(); + return ret; +} + +static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, }, + [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy }, + [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy }, +}; + +static const struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + struct net *net; + struct nfnl_queue_net *q; + + if (!st) + return NULL; + + net = seq_file_net(seq); + q = nfnl_queue_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&q->instance_table[st->bucket])) + return q->instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + struct net *net = seq_file_net(seq); + + h = h->next; + while (!h) { + struct nfnl_queue_net *q; + + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + q = nfnl_queue_pernet(net); + h = q->instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? 
NULL : head; +} + +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ + spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); + return get_idx(s, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ + spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + inst->queue_num, + inst->peer_portid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + inst->id_sequence, 1); +} + +static const struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &nfqnl_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif /* PROC_FS */ + +static int __net_init nfnl_queue_net_init(struct net *net) +{ + unsigned int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&q->instance_table[i]); + + spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + net->nf.proc_netfilter, &nfqnl_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nfnl_queue_net_ops = { + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ + int status = -ENOMEM; + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + pr_err("nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); + goto cleanup_subsys; + } + register_netdevice_notifier(&nfqnl_dev_notifier); + nf_register_queue_handler(&nfqh); + return status; + +cleanup_subsys: + nfnetlink_subsys_unregister(&nfqnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + return status; +} + +static void __exit nfnetlink_queue_fini(void) +{ + nf_unregister_queue_handler(); + unregister_netdevice_notifier(&nfqnl_dev_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); + nfnetlink_subsys_unregister(&nfqnl_subsys); + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(nfnetlink_queue_init); +module_exit(nfnetlink_queue_fini); diff --git a/net/netfilter/nfnetlink_queue_ct.c 
b/net/netfilter/nfnetlink_queue_ct.c new file mode 100644 index 00000000000..96cac50e0d1 --- /dev/null +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -0,0 +1,113 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nfnetlink_queue.h> + +struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size, + enum ip_conntrack_info *ctinfo) +{ + struct nfq_ct_hook *nfq_ct; + struct nf_conn *ct; + + /* rcu_read_lock()ed by __nf_queue already. */ + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return NULL; + + ct = nf_ct_get(entskb, ctinfo); + if (ct) { + if (!nf_ct_is_untracked(ct)) + *size += nfq_ct->build_size(ct); + else + ct = NULL; + } + return ct; +} + +struct nf_conn * +nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr, + enum ip_conntrack_info *ctinfo) +{ + struct nfq_ct_hook *nfq_ct; + struct nf_conn *ct; + + /* rcu_read_lock()ed by __nf_queue already. */ + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return NULL; + + ct = nf_ct_get(skb, ctinfo); + if (ct && !nf_ct_is_untracked(ct)) + nfq_ct->parse(attr, ct); + + return ct; +} + +int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct nfq_ct_hook *nfq_ct; + struct nlattr *nest_parms; + u_int32_t tmp; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return 0; + + nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nfq_ct->build(skb, ct) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + tmp = ctinfo; + if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int diff) +{ + struct nfq_ct_hook *nfq_ct; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return; + + if ((ct->status & IPS_NAT_MASK) && diff) + nfq_ct->seq_adjust(skb, ct, ctinfo, diff); +} + +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + struct nfq_ct_hook *nfq_ct; + + if (nf_ct_is_untracked(ct)) + return 0; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return -EOPNOTSUPP; + + return nfq_ct->attach_expect(attr, ct, portid, report); +} diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c new file mode 100644 index 00000000000..4fb6ee2c110 --- /dev/null +++ b/net/netfilter/nft_bitwise.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
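
nft_bitwise, added below, evaluates dst = (src & mask) ^ xor over 32-bit words. That single form is enough to encode the usual boolean operations against a constant, which is how nft userspace is expected to compile them; a self-contained check:

        #include <assert.h>
        #include <stdint.h>

        /* (src & mask) ^ xor encodes AND, OR and NOT against a constant c:
         *   AND c: mask = c,   xor = 0
         *   OR  c: mask = ~c,  xor = c
         *   NOT:   mask = ~0,  xor = ~0  */
        int main(void)
        {
                uint32_t src = 0x12345678, c = 0x0000ffff;

                assert(((src & c) ^ 0) == (src & c));           /* AND c */
                assert(((src & ~c) ^ c) == (src | c));          /* OR c  */
                assert(((src & ~0u) ^ ~0u) == ~src);            /* NOT   */
                return 0;
        }
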
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_bitwise { + enum nft_registers sreg:8; + enum nft_registers dreg:8; + u8 len; + struct nft_data mask; + struct nft_data xor; +}; + +static void nft_bitwise_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + const struct nft_data *src = &data[priv->sreg]; + struct nft_data *dst = &data[priv->dreg]; + unsigned int i; + + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) { + dst->data[i] = (src->data[i] & priv->mask.data[i]) ^ + priv->xor.data[i]; + } +} + +static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { + [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, + [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, + [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, +}; + +static int nft_bitwise_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_bitwise *priv = nft_expr_priv(expr); + struct nft_data_desc d1, d2; + int err; + + if (tb[NFTA_BITWISE_SREG] == NULL || + tb[NFTA_BITWISE_DREG] == NULL || + tb[NFTA_BITWISE_LEN] == NULL || + tb[NFTA_BITWISE_MASK] == NULL || + tb[NFTA_BITWISE_XOR] == NULL) + return -EINVAL; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + + err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]); + if (err < 0) + return err; + if (d1.len != priv->len) + return -EINVAL; + + err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]); + if (err < 0) + return err; + if (d2.len != priv->len) + return -EINVAL; + + return 0; +} + +static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_bitwise_type; +static const struct nft_expr_ops nft_bitwise_ops = { + .type = &nft_bitwise_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), + .eval = nft_bitwise_eval, + .init = nft_bitwise_init, + .dump = nft_bitwise_dump, +}; + +static struct nft_expr_type nft_bitwise_type __read_mostly = { + .name = "bitwise", + .ops = &nft_bitwise_ops, + .policy = nft_bitwise_policy, + .maxattr = NFTA_BITWISE_MAX, + .owner = 
THIS_MODULE, +}; + +int __init nft_bitwise_module_init(void) +{ + return nft_register_expr(&nft_bitwise_type); +} + +void nft_bitwise_module_exit(void) +{ + nft_unregister_expr(&nft_bitwise_type); +} diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c new file mode 100644 index 00000000000..c39ed8d29df --- /dev/null +++ b/net/netfilter/nft_byteorder.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_byteorder { + enum nft_registers sreg:8; + enum nft_registers dreg:8; + enum nft_byteorder_ops op:8; + u8 len; + u8 size; +}; + +static void nft_byteorder_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_byteorder *priv = nft_expr_priv(expr); + struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg]; + union { u32 u32; u16 u16; } *s, *d; + unsigned int i; + + s = (void *)src->data; + d = (void *)dst->data; + + switch (priv->size) { + case 4: + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + for (i = 0; i < priv->len / 4; i++) + d[i].u32 = ntohl((__force __be32)s[i].u32); + break; + case NFT_BYTEORDER_HTON: + for (i = 0; i < priv->len / 4; i++) + d[i].u32 = (__force __u32)htonl(s[i].u32); + break; + } + break; + case 2: + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + for (i = 0; i < priv->len / 2; i++) + d[i].u16 = ntohs((__force __be16)s[i].u16); + break; + case NFT_BYTEORDER_HTON: + for (i = 0; i < priv->len / 2; i++) + d[i].u16 = (__force __u16)htons(s[i].u16); + break; + } + break; + } +} + +static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { + [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_OP] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 }, +}; + +static int nft_byteorder_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_byteorder *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_BYTEORDER_SREG] == NULL || + tb[NFTA_BYTEORDER_DREG] == NULL || + tb[NFTA_BYTEORDER_LEN] == NULL || + tb[NFTA_BYTEORDER_SIZE] == NULL || + tb[NFTA_BYTEORDER_OP] == NULL) + return -EINVAL; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + case NFT_BYTEORDER_HTON: + break; + default: + return -EINVAL; + } + + priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); + if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data)) + return -EINVAL; + + 
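+ /* NFTA_BYTEORDER_LEN is a byte count and was capped against the + * nft_data payload just above, so the u16/u32 lanes converted by + * nft_byteorder_eval() cannot step outside the source or + * destination register. */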
priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); + switch (priv->size) { + case 2: + case 4: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_byteorder *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_byteorder_type; +static const struct nft_expr_ops nft_byteorder_ops = { + .type = &nft_byteorder_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), + .eval = nft_byteorder_eval, + .init = nft_byteorder_init, + .dump = nft_byteorder_dump, +}; + +static struct nft_expr_type nft_byteorder_type __read_mostly = { + .name = "byteorder", + .ops = &nft_byteorder_ops, + .policy = nft_byteorder_policy, + .maxattr = NFTA_BYTEORDER_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_byteorder_module_init(void) +{ + return nft_register_expr(&nft_byteorder_type); +} + +void nft_byteorder_module_exit(void) +{ + nft_unregister_expr(&nft_byteorder_type); +} diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c new file mode 100644 index 00000000000..e2b3f51c81f --- /dev/null +++ b/net/netfilter/nft_cmp.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_cmp_expr { + struct nft_data data; + enum nft_registers sreg:8; + u8 len; + enum nft_cmp_ops op:8; +}; + +static void nft_cmp_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + int d; + + d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len); + switch (priv->op) { + case NFT_CMP_EQ: + if (d != 0) + goto mismatch; + break; + case NFT_CMP_NEQ: + if (d == 0) + goto mismatch; + break; + case NFT_CMP_LT: + if (d == 0) + goto mismatch; + case NFT_CMP_LTE: + if (d > 0) + goto mismatch; + break; + case NFT_CMP_GT: + if (d == 0) + goto mismatch; + case NFT_CMP_GTE: + if (d < 0) + goto mismatch; + break; + } + return; + +mismatch: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = { + [NFTA_CMP_SREG] = { .type = NLA_U32 }, + [NFTA_CMP_OP] = { .type = NLA_U32 }, + [NFTA_CMP_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_cmp_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc; + int err; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); + + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); + BUG_ON(err < 0); + + priv->len = desc.len; + return 0; +} + +static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_cmp_type; +static const struct nft_expr_ops nft_cmp_ops = { + .type = &nft_cmp_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), + .eval = nft_cmp_eval, + .init = nft_cmp_init, + .dump = nft_cmp_dump, +}; + +static int nft_cmp_fast_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc; + struct nft_data data; + u32 mask; + int err; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); + BUG_ON(err < 0); + desc.len *= BITS_PER_BYTE; + + mask = nft_cmp_fast_mask(desc.len); + priv->data = data.data[0] & mask; + priv->len = desc.len; + return 0; +} + +static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_data data; + + if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ))) + goto nla_put_failure; + + data.data[0] = priv->data; + if (nft_data_dump(skb, NFTA_CMP_DATA, &data, + NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0) + goto nla_put_failure; 
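+ /* The fast path keeps only a masked u32, so a full NFTA_CMP_DATA + * attribute is rebuilt above; to userspace this dump is + * indistinguishable from a generic EQ comparison. */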
+ return 0; + +nla_put_failure: + return -1; +} + +const struct nft_expr_ops nft_cmp_fast_ops = { + .type = &nft_cmp_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)), + .eval = NULL, /* inlined */ + .init = nft_cmp_fast_init, + .dump = nft_cmp_fast_dump, +}; + +static const struct nft_expr_ops * +nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) +{ + struct nft_data_desc desc; + struct nft_data data; + enum nft_registers sreg; + enum nft_cmp_ops op; + int err; + + if (tb[NFTA_CMP_SREG] == NULL || + tb[NFTA_CMP_OP] == NULL || + tb[NFTA_CMP_DATA] == NULL) + return ERR_PTR(-EINVAL); + + sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + err = nft_validate_input_register(sreg); + if (err < 0) + return ERR_PTR(err); + + op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); + switch (op) { + case NFT_CMP_EQ: + case NFT_CMP_NEQ: + case NFT_CMP_LT: + case NFT_CMP_LTE: + case NFT_CMP_GT: + case NFT_CMP_GTE: + break; + default: + return ERR_PTR(-EINVAL); + } + + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); + if (err < 0) + return ERR_PTR(err); + + if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) + return &nft_cmp_fast_ops; + else + return &nft_cmp_ops; +} + +static struct nft_expr_type nft_cmp_type __read_mostly = { + .name = "cmp", + .select_ops = nft_cmp_select_ops, + .policy = nft_cmp_policy, + .maxattr = NFTA_CMP_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_cmp_module_init(void) +{ + return nft_register_expr(&nft_cmp_type); +} + +void nft_cmp_module_exit(void) +{ + nft_unregister_expr(&nft_cmp_type); +} diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c new file mode 100644 index 00000000000..1840989092e --- /dev/null +++ b/net/netfilter/nft_compat.c @@ -0,0 +1,793 @@ +/* + * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This software has been sponsored by Sophos Astaro <http://www.sophos.com> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nf_tables_compat.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <asm/uaccess.h> /* for set_fs */ +#include <net/netfilter/nf_tables.h> + +union nft_entry { + struct ipt_entry e4; + struct ip6t_entry e6; +}; + +static inline void +nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +{ + par->target = xt; + par->targinfo = xt_info; + par->hotdrop = false; +} + +static void nft_target_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + void *info = nft_expr_priv(expr); + struct xt_target *target = expr->ops->data; + struct sk_buff *skb = pkt->skb; + int ret; + + nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + + ret = target->target(skb, &pkt->xt); + + if (pkt->xt.hotdrop) + ret = NF_DROP; + + switch(ret) { + case XT_CONTINUE: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + break; + default: + data[NFT_REG_VERDICT].verdict = ret; + break; + } + return; +} + +static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { + [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_TARGET_REV] = { .type = NLA_U32 }, + [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, +}; + +static void +nft_target_set_tgchk_param(struct xt_tgchk_param *par, + const struct nft_ctx *ctx, + struct xt_target *target, void *info, + union nft_entry *entry, u8 proto, bool inv) +{ + par->net = &init_net; + par->table = ctx->table->name; + switch (ctx->afi->family) { + case AF_INET: + entry->e4.ip.proto = proto; + entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; + break; + case AF_INET6: + entry->e6.ipv6.proto = proto; + entry->e6.ipv6.invflags = inv ? 
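+ /* replay the rule's protocol negation into the dummy entry */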
IP6T_INV_PROTO : 0; + break; + } + par->entryinfo = entry; + par->target = target; + par->targinfo = info; + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + par->hook_mask = 1 << ops->hooknum; + } + par->family = ctx->afi->family; +} + +static void target_compat_from_user(struct xt_target *t, void *in, void *out) +{ +#ifdef CONFIG_COMPAT + if (t->compat_from_user) { + int pad; + + t->compat_from_user(out, in); + pad = XT_ALIGN(t->targetsize) - t->targetsize; + if (pad > 0) + memset(out + t->targetsize, 0, pad); + } else +#endif + memcpy(out, in, XT_ALIGN(t->targetsize)); +} + +static inline int nft_compat_target_offset(struct xt_target *target) +{ +#ifdef CONFIG_COMPAT + return xt_compat_target_offset(target); +#else + return 0; +#endif +} + +static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { + [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 }, + [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, +}; + +static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +{ + struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 flags; + int err; + + err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr, + nft_rule_compat_policy); + if (err < 0) + return err; + + if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); + if (flags & ~NFT_RULE_COMPAT_F_MASK) + return -EINVAL; + if (flags & NFT_RULE_COMPAT_F_INV) + *inv = true; + + *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + return 0; +} + +static int +nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + void *info = nft_expr_priv(expr); + struct xt_target *target = expr->ops->data; + struct xt_tgchk_param par; + size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); + u8 proto = 0; + bool inv = false; + union nft_entry e = {}; + int ret; + + target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); + + if (ctx->nla[NFTA_RULE_COMPAT]) { + ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); + if (ret < 0) + goto err; + } + + nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + + ret = xt_check_target(&par, size, proto, inv); + if (ret < 0) + goto err; + + /* The standard target cannot be used */ + if (target->target == NULL) { + ret = -EINVAL; + goto err; + } + + return 0; +err: + module_put(target->me); + return ret; +} + +static void +nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgdtor_param par; + + par.net = ctx->net; + par.target = target; + par.targinfo = info; + par.family = ctx->afi->family; + if (par.target->destroy != NULL) + par.target->destroy(&par); + + module_put(target->me); +} + +static int +target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in) +{ + int ret; + +#ifdef CONFIG_COMPAT + if (t->compat_to_user) { + mm_segment_t old_fs; + void *out; + + out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC); + if (out == NULL) + return -ENOMEM; + + /* We want to reuse existing compat_to_user */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + t->compat_to_user(out, in); + set_fs(old_fs); + ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out); + kfree(out); + } else +#endif + ret = nla_put(skb, NFTA_TARGET_INFO, 
XT_ALIGN(t->targetsize), in); + + return ret; +} + +static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + + if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || + nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || + target_dump_info(skb, target, info)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_target_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + struct xt_target *target = expr->ops->data; + unsigned int hook_mask = 0; + + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + hook_mask = 1 << ops->hooknum; + if (hook_mask & target->hooks) + return 0; + + /* This target is being called from an invalid chain */ + return -EINVAL; + } + return 0; +} + +static void nft_match_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + void *info = nft_expr_priv(expr); + struct xt_match *match = expr->ops->data; + struct sk_buff *skb = pkt->skb; + bool ret; + + nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + + ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + + if (pkt->xt.hotdrop) { + data[NFT_REG_VERDICT].verdict = NF_DROP; + return; + } + + switch(ret) { + case true: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + break; + case false: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; + break; + } +} + +static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { + [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_MATCH_REV] = { .type = NLA_U32 }, + [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, +}; + +/* struct xt_mtchk_param and xt_tgchk_param look very similar */ +static void +nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, + struct xt_match *match, void *info, + union nft_entry *entry, u8 proto, bool inv) +{ + par->net = &init_net; + par->table = ctx->table->name; + switch (ctx->afi->family) { + case AF_INET: + entry->e4.ip.proto = proto; + entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; + break; + case AF_INET6: + entry->e6.ipv6.proto = proto; + entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; + break; + } + par->entryinfo = entry; + par->match = match; + par->matchinfo = info; + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + par->hook_mask = 1 << ops->hooknum; + } + par->family = ctx->afi->family; +} + +static void match_compat_from_user(struct xt_match *m, void *in, void *out) +{ +#ifdef CONFIG_COMPAT + if (m->compat_from_user) { + int pad; + + m->compat_from_user(out, in); + pad = XT_ALIGN(m->matchsize) - m->matchsize; + if (pad > 0) + memset(out + m->matchsize, 0, pad); + } else +#endif + memcpy(out, in, XT_ALIGN(m->matchsize)); +} + +static int +nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + void *info = nft_expr_priv(expr); + struct xt_match *match = expr->ops->data; + struct xt_mtchk_param par; + size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); + u8 proto = 0; + bool inv = false; + union nft_entry e = {}; + int ret; + + match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); + + if (ctx->nla[NFTA_RULE_COMPAT]) { + ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); + if (ret < 0) + goto err; + } + + nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + + ret = xt_check_match(&par, size, proto, inv); + if (ret < 0) + goto err; + + return 0; +err: + module_put(match->me); + return ret; +} + +static void +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + struct xt_match *match = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_mtdtor_param par; + + par.net = ctx->net; + par.match = match; + par.matchinfo = info; + par.family = ctx->afi->family; + if (par.match->destroy != NULL) + par.match->destroy(&par); + + module_put(match->me); +} + +static int +match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in) +{ + int ret; + +#ifdef CONFIG_COMPAT + if (m->compat_to_user) { + mm_segment_t old_fs; + void *out; + + out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC); + if (out == NULL) + return -ENOMEM; + + /* We want to reuse existing compat_to_user */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + m->compat_to_user(out, in); + set_fs(old_fs); + ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out); + kfree(out); + } else +#endif + ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in); + + return ret; +} + +static inline int nft_compat_match_offset(struct xt_match *match) +{ +#ifdef CONFIG_COMPAT + return xt_compat_match_offset(match); +#else + return 0; +#endif +} + +static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + void *info = nft_expr_priv(expr); + struct xt_match *match = expr->ops->data; + + if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || + nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || + match_dump_info(skb, match, info)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_match_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + struct xt_match *match = expr->ops->data; + unsigned int hook_mask = 0; + + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + hook_mask = 1 << ops->hooknum; + if (hook_mask & match->hooks) + return 0; + + /* This match is being called from an invalid 
chain */ + return -EINVAL; + } + return 0; +} + +static int +nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, u16 family, const char *name, + int rev, int target) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_NFT_COMPAT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || + nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || + nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + int ret = 0, target; + struct nfgenmsg *nfmsg; + const char *fmt; + const char *name; + u32 rev; + struct sk_buff *skb2; + + if (tb[NFTA_COMPAT_NAME] == NULL || + tb[NFTA_COMPAT_REV] == NULL || + tb[NFTA_COMPAT_TYPE] == NULL) + return -EINVAL; + + name = nla_data(tb[NFTA_COMPAT_NAME]); + rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); + target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); + + nfmsg = nlmsg_data(nlh); + + switch(nfmsg->nfgen_family) { + case AF_INET: + fmt = "ipt_%s"; + break; + case AF_INET6: + fmt = "ip6t_%s"; + break; + default: + pr_err("nft_compat: unsupported protocol %d\n", + nfmsg->nfgen_family); + return -EINVAL; + } + + try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, + rev, target, &ret), + fmt, name); + + if (ret < 0) + return ret; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + /* include the best revision for this extension in the message */ + if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + NFNL_MSG_COMPAT_GET, + nfmsg->nfgen_family, + name, ret, target) <= 0) { + kfree_skb(skb2); + return -ENOSPC; + } + + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + return ret == -EAGAIN ? 
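+ /* netlink_unicast() returns -EAGAIN if the peer's receive buffer + * is full; report that to the requester as -ENOBUFS */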
-ENOBUFS : ret; +} + +static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { + [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, + .len = NFT_COMPAT_NAME_MAX-1 }, + [NFTA_COMPAT_REV] = { .type = NLA_U32 }, + [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, +}; + +static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { + [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get, + .attr_count = NFTA_COMPAT_MAX, + .policy = nfnl_compat_policy_get }, +}; + +static const struct nfnetlink_subsystem nfnl_compat_subsys = { + .name = "nft-compat", + .subsys_id = NFNL_SUBSYS_NFT_COMPAT, + .cb_count = NFNL_MSG_COMPAT_MAX, + .cb = nfnl_nft_compat_cb, +}; + +static LIST_HEAD(nft_match_list); + +struct nft_xt { + struct list_head head; + struct nft_expr_ops ops; +}; + +static struct nft_expr_type nft_match_type; + +static const struct nft_expr_ops * +nft_match_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + struct nft_xt *nft_match; + struct xt_match *match; + char *mt_name; + __u32 rev, family; + + if (tb[NFTA_MATCH_NAME] == NULL || + tb[NFTA_MATCH_REV] == NULL || + tb[NFTA_MATCH_INFO] == NULL) + return ERR_PTR(-EINVAL); + + mt_name = nla_data(tb[NFTA_MATCH_NAME]); + rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); + family = ctx->afi->family; + + /* Re-use the existing match if it's already loaded. */ + list_for_each_entry(nft_match, &nft_match_list, head) { + struct xt_match *match = nft_match->ops.data; + + if (strcmp(match->name, mt_name) == 0 && + match->revision == rev && match->family == family) + return &nft_match->ops; + } + + match = xt_request_find_match(family, mt_name, rev); + if (IS_ERR(match)) + return ERR_PTR(-ENOENT); + + /* This is the first time we use this match, allocate operations */ + nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); + if (nft_match == NULL) + return ERR_PTR(-ENOMEM); + + nft_match->ops.type = &nft_match_type; + nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) + + nft_compat_match_offset(match)); + nft_match->ops.eval = nft_match_eval; + nft_match->ops.init = nft_match_init; + nft_match->ops.destroy = nft_match_destroy; + nft_match->ops.dump = nft_match_dump; + nft_match->ops.validate = nft_match_validate; + nft_match->ops.data = match; + + list_add(&nft_match->head, &nft_match_list); + + return &nft_match->ops; +} + +static void nft_match_release(void) +{ + struct nft_xt *nft_match, *tmp; + + list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head) + kfree(nft_match); +} + +static struct nft_expr_type nft_match_type __read_mostly = { + .name = "match", + .select_ops = nft_match_select_ops, + .policy = nft_match_policy, + .maxattr = NFTA_MATCH_MAX, + .owner = THIS_MODULE, +}; + +static LIST_HEAD(nft_target_list); + +static struct nft_expr_type nft_target_type; + +static const struct nft_expr_ops * +nft_target_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + struct nft_xt *nft_target; + struct xt_target *target; + char *tg_name; + __u32 rev, family; + + if (tb[NFTA_TARGET_NAME] == NULL || + tb[NFTA_TARGET_REV] == NULL || + tb[NFTA_TARGET_INFO] == NULL) + return ERR_PTR(-EINVAL); + + tg_name = nla_data(tb[NFTA_TARGET_NAME]); + rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); + family = ctx->afi->family; + + /* Re-use the existing target if it's already loaded. 
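+ * Targets live on their own cache list: each target/revision pair is + * wrapped once in a struct nft_xt and its ops are shared by every + * rule that uses it.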
*/ + list_for_each_entry(nft_target, &nft_target_list, head) { + struct xt_target *target = nft_target->ops.data; + + if (strcmp(target->name, tg_name) == 0 && + target->revision == rev && target->family == family) + return &nft_target->ops; + } + + target = xt_request_find_target(family, tg_name, rev); + if (IS_ERR(target)) + return ERR_PTR(-ENOENT); + + /* This is the first time we use this target, allocate operations */ + nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); + if (nft_target == NULL) + return ERR_PTR(-ENOMEM); + + nft_target->ops.type = &nft_target_type; + nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) + + nft_compat_target_offset(target)); + nft_target->ops.eval = nft_target_eval; + nft_target->ops.init = nft_target_init; + nft_target->ops.destroy = nft_target_destroy; + nft_target->ops.dump = nft_target_dump; + nft_target->ops.validate = nft_target_validate; + nft_target->ops.data = target; + + list_add(&nft_target->head, &nft_target_list); + + return &nft_target->ops; +} + +static void nft_target_release(void) +{ + struct nft_xt *nft_target, *tmp; + + list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head) + kfree(nft_target); +} + +static struct nft_expr_type nft_target_type __read_mostly = { + .name = "target", + .select_ops = nft_target_select_ops, + .policy = nft_target_policy, + .maxattr = NFTA_TARGET_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_compat_module_init(void) +{ + int ret; + + ret = nft_register_expr(&nft_match_type); + if (ret < 0) + return ret; + + ret = nft_register_expr(&nft_target_type); + if (ret < 0) + goto err_match; + + ret = nfnetlink_subsys_register(&nfnl_compat_subsys); + if (ret < 0) { + pr_err("nft_compat: cannot register with nfnetlink.\n"); + goto err_target; + } + + pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n"); + + return ret; + +err_target: + nft_unregister_expr(&nft_target_type); +err_match: + nft_unregister_expr(&nft_match_type); + return ret; +} + +static void __exit nft_compat_module_exit(void) +{ + nfnetlink_subsys_unregister(&nfnl_compat_subsys); + nft_unregister_expr(&nft_target_type); + nft_unregister_expr(&nft_match_type); + nft_match_release(); + nft_target_release(); +} + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); + +module_init(nft_compat_module_init); +module_exit(nft_compat_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_EXPR("match"); +MODULE_ALIAS_NFT_EXPR("target"); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c new file mode 100644 index 00000000000..c89ee486ce5 --- /dev/null +++ b/net/netfilter/nft_counter.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/seqlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +struct nft_counter { + seqlock_t lock; + u64 bytes; + u64 packets; +}; + +static void nft_counter_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_counter *priv = nft_expr_priv(expr); + + write_seqlock_bh(&priv->lock); + priv->bytes += pkt->skb->len; + priv->packets++; + write_sequnlock_bh(&priv->lock); +} + +static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_counter *priv = nft_expr_priv(expr); + unsigned int seq; + u64 bytes; + u64 packets; + + do { + seq = read_seqbegin(&priv->lock); + bytes = priv->bytes; + packets = priv->packets; + } while (read_seqretry(&priv->lock, seq)); + + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes))) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { + [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, + [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, +}; + +static int nft_counter_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_counter *priv = nft_expr_priv(expr); + + if (tb[NFTA_COUNTER_PACKETS]) + priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + if (tb[NFTA_COUNTER_BYTES]) + priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + + seqlock_init(&priv->lock); + return 0; +} + +static struct nft_expr_type nft_counter_type; +static const struct nft_expr_ops nft_counter_ops = { + .type = &nft_counter_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)), + .eval = nft_counter_eval, + .init = nft_counter_init, + .dump = nft_counter_dump, +}; + +static struct nft_expr_type nft_counter_type __read_mostly = { + .name = "counter", + .ops = &nft_counter_ops, + .policy = nft_counter_policy, + .maxattr = NFTA_COUNTER_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_counter_module_init(void) +{ + return nft_register_expr(&nft_counter_type); +} + +static void __exit nft_counter_module_exit(void) +{ + nft_unregister_expr(&nft_counter_type); +} + +module_init(nft_counter_module_init); +module_exit(nft_counter_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("counter"); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c new file mode 100644 index 00000000000..cc560301624 --- /dev/null +++ b/net/netfilter/nft_ct.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +struct nft_ct { + enum nft_ct_keys key:8; + enum ip_conntrack_dir dir:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + +static void nft_ct_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + struct nft_data *dest = &data[priv->dreg]; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_tuple *tuple; + const struct nf_conntrack_helper *helper; + long diff; + unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + + switch (priv->key) { + case NFT_CT_STATE: + if (ct == NULL) + state = NF_CT_STATE_INVALID_BIT; + else if (nf_ct_is_untracked(ct)) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_BIT(ctinfo); + dest->data[0] = state; + return; + } + + if (ct == NULL) + goto err; + + switch (priv->key) { + case NFT_CT_DIRECTION: + dest->data[0] = CTINFO2DIR(ctinfo); + return; + case NFT_CT_STATUS: + dest->data[0] = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + dest->data[0] = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + dest->data[0] = ct->secmark; + return; +#endif + case NFT_CT_EXPIRATION: + diff = (long)jiffies - (long)ct->timeout.expires; + if (diff < 0) + diff = 0; + dest->data[0] = jiffies_to_msecs(diff); + return; + case NFT_CT_HELPER: + if (ct->master == NULL) + goto err; + help = nfct_help(ct->master); + if (help == NULL) + goto err; + helper = rcu_dereference(help->helper); + if (helper == NULL) + goto err; + if (strlen(helper->name) >= sizeof(dest->data)) + goto err; + strncpy((char *)dest->data, helper->name, sizeof(dest->data)); + return; +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: { + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int size; + + if (!labels) { + memset(dest->data, 0, sizeof(dest->data)); + return; + } + + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); + size = labels->words * sizeof(long); + + memcpy(dest->data, labels->bits, size); + if (size < sizeof(dest->data)) + memset(((char *) dest->data) + size, 0, + sizeof(dest->data) - size); + return; + } +#endif + } + + tuple = &ct->tuplehash[priv->dir].tuple; + switch (priv->key) { + case NFT_CT_L3PROTOCOL: + dest->data[0] = nf_ct_l3num(ct); + return; + case NFT_CT_SRC: + memcpy(dest->data, tuple->src.u3.all, + nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); + return; + case NFT_CT_DST: + memcpy(dest->data, tuple->dst.u3.all, + nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
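+ /* 4 bytes of u3 hold an IPv4 address, 16 an IPv6 one */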
4 : 16); + return; + case NFT_CT_PROTOCOL: + dest->data[0] = nf_ct_protonum(ct); + return; + case NFT_CT_PROTO_SRC: + dest->data[0] = (__force __u16)tuple->src.u.all; + return; + case NFT_CT_PROTO_DST: + dest->data[0] = (__force __u16)tuple->dst.u.all; + return; + } + return; +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static void nft_ct_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; +#ifdef CONFIG_NF_CONNTRACK_MARK + u32 value = data[priv->sreg].data[0]; +#endif + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return; + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + if (ct->mark != value) { + ct->mark = value; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; +#endif + } +} + +static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { + [NFTA_CT_DREG] = { .type = NLA_U32 }, + [NFTA_CT_KEY] = { .type = NLA_U32 }, + [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, + [NFTA_CT_SREG] = { .type = NLA_U32 }, +}; + +static int nft_ct_l3proto_try_module_get(uint8_t family) +{ + int err; + + if (family == NFPROTO_INET) { + err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_l3proto_try_module_get(family); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_l3proto_module_put(NFPROTO_IPV4); +err1: + return err; +} + +static void nft_ct_l3proto_module_put(uint8_t family) +{ + if (family == NFPROTO_INET) { + nf_ct_l3proto_module_put(NFPROTO_IPV4); + nf_ct_l3proto_module_put(NFPROTO_IPV6); + } else + nf_ct_l3proto_module_put(family); +} + +static int nft_ct_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ct *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + switch (priv->key) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: +#endif + case NFT_CT_EXPIRATION: + case NFT_CT_HELPER: + if (tb[NFTA_CT_DIRECTION] != NULL) + return -EINVAL; + break; + case NFT_CT_L3PROTOCOL: + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + break; + default: + return -EOPNOTSUPP; + } + + if (tb[NFTA_CT_DIRECTION] != NULL) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } + } + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + + return 0; +} + +static int nft_ct_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ct *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + switch (priv->key) { +#ifdef 
CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + break; +#endif + default: + return -EOPNOTSUPP; + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + + return 0; +} + +static void nft_ct_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_ct_l3proto_module_put(ctx->afi->family); +} + +static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) + goto nla_put_failure; + + switch (priv->key) { + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + default: + break; + } + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_ct_type; +static const struct nft_expr_ops nft_ct_get_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_destroy, + .dump = nft_ct_get_dump, +}; + +static const struct nft_expr_ops nft_ct_set_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_set_eval, + .init = nft_ct_set_init, + .destroy = nft_ct_destroy, + .dump = nft_ct_set_dump, +}; + +static const struct nft_expr_ops * +nft_ct_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_CT_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_CT_DREG]) + return &nft_ct_get_ops; + + if (tb[NFTA_CT_SREG]) + return &nft_ct_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_ct_type __read_mostly = { + .name = "ct", + .select_ops = &nft_ct_select_ops, + .policy = nft_ct_policy, + .maxattr = NFTA_CT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_ct_module_init(void) +{ + return nft_register_expr(&nft_ct_type); +} + +static void __exit nft_ct_module_exit(void) +{ + nft_unregister_expr(&nft_ct_type); +} + +module_init(nft_ct_module_init); +module_exit(nft_ct_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("ct"); diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c new file mode 100644 index 00000000000..b6eed4d5a09 --- /dev/null +++ b/net/netfilter/nft_expr_template.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +struct nft_template { + +}; + +static void nft_template_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_template *priv = nft_expr_priv(expr); + +} + +static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = { + [NFTA_TEMPLATE_ATTR] = { .type = NLA_U32 }, +}; + +static int nft_template_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_template *priv = nft_expr_priv(expr); + + return 0; +} + +static void nft_template_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_template *priv = nft_expr_priv(expr); + +} + +static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_template *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_TEMPLATE_ATTR, htonl(priv->field))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_template_type; +static const struct nft_expr_ops nft_template_ops = { + .type = &nft_template_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_template)), + .eval = nft_template_eval, + .init = nft_template_init, + .destroy = nft_template_destroy, + .dump = nft_template_dump, +}; + +static struct nft_expr_type nft_template_type __read_mostly = { + .name = "template", + .ops = &nft_template_ops, + .policy = nft_template_policy, + .maxattr = NFTA_TEMPLATE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_template_module_init(void) +{ + return nft_register_expr(&nft_template_type); +} + +static void __exit nft_template_module_exit(void) +{ + nft_unregister_expr(&nft_template_type); +} + +module_init(nft_template_module_init); +module_exit(nft_template_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("template"); diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c new file mode 100644 index 00000000000..55c939f5371 --- /dev/null +++ b/net/netfilter/nft_exthdr.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +// FIXME: +#include <net/ipv6.h> + +struct nft_exthdr { + u8 type; + u8 offset; + u8 len; + enum nft_registers dreg:8; +}; + +static void nft_exthdr_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + struct nft_data *dest = &data[priv->dreg]; + unsigned int offset = 0; + int err; + + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); + if (err < 0) + goto err; + offset += priv->offset; + + if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0) + goto err; + return; +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { + [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, + [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, + [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, + [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, +}; + +static int nft_exthdr_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_EXTHDR_DREG] == NULL || + tb[NFTA_EXTHDR_TYPE] == NULL || + tb[NFTA_EXTHDR_OFFSET] == NULL || + tb[NFTA_EXTHDR_LEN] == NULL) + return -EINVAL; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + if (priv->len == 0 || + priv->len > FIELD_SIZEOF(struct nft_data, data)) + return -EINVAL; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_exthdr_type; +static const struct nft_expr_ops nft_exthdr_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + +static struct nft_expr_type nft_exthdr_type __read_mostly = { + .name = "exthdr", + .ops = &nft_exthdr_ops, + .policy = nft_exthdr_policy, + .maxattr = NFTA_EXTHDR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_exthdr_module_init(void) +{ + return nft_register_expr(&nft_exthdr_type); +} + +static void __exit nft_exthdr_module_exit(void) +{ + nft_unregister_expr(&nft_exthdr_type); +} + +module_init(nft_exthdr_module_init); +module_exit(nft_exthdr_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("exthdr"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c new file mode 100644 index 00000000000..4080ed6a072 --- /dev/null +++ 
b/net/netfilter/nft_hash.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/log2.h> +#include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/vmalloc.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +#define NFT_HASH_MIN_SIZE 4UL + +struct nft_hash { + struct nft_hash_table __rcu *tbl; +}; + +struct nft_hash_table { + unsigned int size; + struct nft_hash_elem __rcu *buckets[]; +}; + +struct nft_hash_elem { + struct nft_hash_elem __rcu *next; + struct nft_data key; + struct nft_data data[]; +}; + +#define nft_hash_for_each_entry(i, head) \ + for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) +#define nft_hash_for_each_entry_rcu(i, head) \ + for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) + +static u32 nft_hash_rnd __read_mostly; +static bool nft_hash_rnd_initted __read_mostly; + +static unsigned int nft_hash_data(const struct nft_data *data, + unsigned int hsize, unsigned int len) +{ + unsigned int h; + + h = jhash(data->data, len, nft_hash_rnd); + return h & (hsize - 1); +} + +static bool nft_hash_lookup(const struct nft_set *set, + const struct nft_data *key, + struct nft_data *data) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); + const struct nft_hash_elem *he; + unsigned int h; + + h = nft_hash_data(key, tbl->size, set->klen); + nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { + if (nft_data_cmp(&he->key, key, set->klen)) + continue; + if (set->flags & NFT_SET_MAP) + nft_data_copy(data, he->data); + return true; + } + return false; +} + +static void nft_hash_tbl_free(const struct nft_hash_table *tbl) +{ + kvfree(tbl); +} + +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ + return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + +static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) +{ + struct nft_hash_table *tbl; + size_t size; + + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); + if (tbl == NULL) + tbl = vzalloc(size); + if (tbl == NULL) + return NULL; + tbl->size = nbuckets; + + return tbl; +} + +static void nft_hash_chain_unzip(const struct nft_set *set, + const struct nft_hash_table *ntbl, + struct nft_hash_table *tbl, unsigned int n) +{ + struct nft_hash_elem *he, *last, *next; + unsigned int h; + + he = nft_dereference(tbl->buckets[n]); + if (he == NULL) + return; + h = nft_hash_data(&he->key, ntbl->size, set->klen); + + /* Find last element of first chain hashing to bucket h */ + last = he; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) + break; + last = he; + } + + /* Unlink first chain from the old table */ + RCU_INIT_POINTER(tbl->buckets[n], last->next); + + /* If end of chain reached, done */ + if (he == NULL) + return; + + /* Find first element of second chain hashing to bucket h */ + next = NULL; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, 
set->klen) != h) + continue; + next = he; + break; + } + + /* Link the two chains */ + RCU_INIT_POINTER(last->next, next); +} + +static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem *he; + unsigned int i, h; + bool complete; + + ntbl = nft_hash_tbl_alloc(tbl->size * 2); + if (ntbl == NULL) + return -ENOMEM; + + /* Link new table's buckets to first element in the old table + * hashing to the new bucket. + */ + for (i = 0; i < ntbl->size; i++) { + h = i < tbl->size ? i : i - tbl->size; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) + continue; + RCU_INIT_POINTER(ntbl->buckets[i], he); + break; + } + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + + /* Unzip interleaved hash chains */ + do { + /* Wait for readers to use new table/unzipped chains */ + synchronize_rcu(); + + complete = true; + for (i = 0; i < tbl->size; i++) { + nft_hash_chain_unzip(set, ntbl, tbl, i); + if (tbl->buckets[i] != NULL) + complete = false; + } + } while (!complete); + + nft_hash_tbl_free(tbl); + return 0; +} + +static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem __rcu **pprev; + unsigned int i; + + ntbl = nft_hash_tbl_alloc(tbl->size / 2); + if (ntbl == NULL) + return -ENOMEM; + + for (i = 0; i < ntbl->size; i++) { + ntbl->buckets[i] = tbl->buckets[i]; + + for (pprev = &ntbl->buckets[i]; *pprev != NULL; + pprev = &nft_dereference(*pprev)->next) + ; + RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + synchronize_rcu(); + + nft_hash_tbl_free(tbl); + return 0; +} + +static int nft_hash_insert(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he; + unsigned int size, h; + + if (elem->flags != 0) + return -EINVAL; + + size = sizeof(*he); + if (set->flags & NFT_SET_MAP) + size += sizeof(he->data[0]); + + he = kzalloc(size, GFP_KERNEL); + if (he == NULL) + return -ENOMEM; + + nft_data_copy(&he->key, &elem->key); + if (set->flags & NFT_SET_MAP) + nft_data_copy(he->data, &elem->data); + + h = nft_hash_data(&he->key, tbl->size, set->klen); + RCU_INIT_POINTER(he->next, tbl->buckets[h]); + rcu_assign_pointer(tbl->buckets[h], he); + + /* Expand table when exceeding 75% load */ + if (set->nelems + 1 > tbl->size / 4 * 3) + nft_hash_tbl_expand(set, priv); + + return 0; +} + +static void nft_hash_elem_destroy(const struct nft_set *set, + struct nft_hash_elem *he) +{ + nft_data_uninit(&he->key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP) + nft_data_uninit(he->data, set->dtype); + kfree(he); +} + +static void nft_hash_remove(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, __rcu **pprev; + + pprev = elem->cookie; + he = nft_dereference((*pprev)); + + RCU_INIT_POINTER(*pprev, he->next); + synchronize_rcu(); + kfree(he); + + /* Shrink table beneath 30% load */ + if (set->nelems - 1 < tbl->size * 3 / 10 && + tbl->size > NFT_HASH_MIN_SIZE) + nft_hash_tbl_shrink(set, priv); +} + +static int nft_hash_get(const struct nft_set *set, struct nft_set_elem 
*elem) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem __rcu * const *pprev; + struct nft_hash_elem *he; + unsigned int h; + + h = nft_hash_data(&elem->key, tbl->size, set->klen); + pprev = &tbl->buckets[h]; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_data_cmp(&he->key, &elem->key, set->klen)) { + pprev = &he->next; + continue; + } + + elem->cookie = (void *)pprev; + elem->flags = 0; + if (set->flags & NFT_SET_MAP) + nft_data_copy(&elem->data, he->data); + return 0; + } + return -ENOENT; +} + +static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, + struct nft_set_iter *iter) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + const struct nft_hash_elem *he; + struct nft_set_elem elem; + unsigned int i; + + for (i = 0; i < tbl->size; i++) { + nft_hash_for_each_entry(he, tbl->buckets[i]) { + if (iter->count < iter->skip) + goto cont; + + memcpy(&elem.key, &he->key, sizeof(elem.key)); + if (set->flags & NFT_SET_MAP) + memcpy(&elem.data, he->data, sizeof(elem.data)); + elem.flags = 0; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + return; +cont: + iter->count++; + } + } +} + +static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +{ + return sizeof(struct nft_hash); +} + +static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const tb[]) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl; + unsigned int size; + + if (unlikely(!nft_hash_rnd_initted)) { + get_random_bytes(&nft_hash_rnd, 4); + nft_hash_rnd_initted = true; + } + + size = NFT_HASH_MIN_SIZE; + if (desc->size) + size = nft_hash_tbl_size(desc->size); + + tbl = nft_hash_tbl_alloc(size); + if (tbl == NULL) + return -ENOMEM; + RCU_INIT_POINTER(priv->tbl, tbl); + return 0; +} + +static void nft_hash_destroy(const struct nft_set *set) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, *next; + unsigned int i; + + for (i = 0; i < tbl->size; i++) { + for (he = nft_dereference(tbl->buckets[i]); he != NULL; + he = next) { + next = nft_dereference(he->next); + nft_hash_elem_destroy(set, he); + } + } + kfree(tbl); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int esize; + + esize = sizeof(struct nft_hash_elem); + if (features & NFT_SET_MAP) + esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + + if (desc->size) { + est->size = sizeof(struct nft_hash) + + nft_hash_tbl_size(desc->size) * + sizeof(struct nft_hash_elem *) + + desc->size * esize; + } else { + /* Resizing happens when the load drops below 30% or goes + * above 75%. The average of 52.5% load (approximated by 50%) + * is used for the size estimation of the hash buckets, + * meaning we calculate two buckets per element. 
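+	 *
+	 * At ~50% load each element therefore accounts for one
+	 * struct nft_hash_elem plus two bucket pointers, which is
+	 * exactly the per-element estimate used below:
+	 *
+	 *   esize + 2 * sizeof(struct nft_hash_elem *)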
+ */ + est->size = esize + 2 * sizeof(struct nft_hash_elem *); + } + + est->class = NFT_SET_CLASS_O_1; + + return true; +} + +static struct nft_set_ops nft_hash_ops __read_mostly = { + .privsize = nft_hash_privsize, + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .get = nft_hash_get, + .insert = nft_hash_insert, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .features = NFT_SET_MAP, + .owner = THIS_MODULE, +}; + +static int __init nft_hash_module_init(void) +{ + return nft_register_set(&nft_hash_ops); +} + +static void __exit nft_hash_module_exit(void) +{ + nft_unregister_set(&nft_hash_ops); +} + +module_init(nft_hash_module_init); +module_exit(nft_hash_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c new file mode 100644 index 00000000000..810385eb724 --- /dev/null +++ b/net/netfilter/nft_immediate.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_immediate_expr { + struct nft_data data; + enum nft_registers dreg:8; + u8 dlen; +}; + +static void nft_immediate_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + nft_data_copy(&data[priv->dreg], &priv->data); +} + +static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { + [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 }, + [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_immediate_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc; + int err; + + if (tb[NFTA_IMMEDIATE_DREG] == NULL || + tb[NFTA_IMMEDIATE_DATA] == NULL) + return -EINVAL; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); + if (err < 0) + return err; + priv->dlen = desc.len; + + err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type); + if (err < 0) + goto err1; + + return 0; + +err1: + nft_data_uninit(&priv->data, desc.type); + return err; +} + +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); +} + +static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg))) + goto nla_put_failure; + + return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data, + nft_dreg_to_type(priv->dreg), 
priv->dlen); + +nla_put_failure: + return -1; +} + +static int nft_immediate_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + *data = &priv->data; + + return 0; +} + +static struct nft_expr_type nft_imm_type; +static const struct nft_expr_ops nft_imm_ops = { + .type = &nft_imm_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), + .eval = nft_immediate_eval, + .init = nft_immediate_init, + .destroy = nft_immediate_destroy, + .dump = nft_immediate_dump, + .validate = nft_immediate_validate, +}; + +static struct nft_expr_type nft_imm_type __read_mostly = { + .name = "immediate", + .ops = &nft_imm_ops, + .policy = nft_immediate_policy, + .maxattr = NFTA_IMMEDIATE_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_immediate_module_init(void) +{ + return nft_register_expr(&nft_imm_type); +} + +void nft_immediate_module_exit(void) +{ + nft_unregister_expr(&nft_imm_type); +} diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c new file mode 100644 index 00000000000..85da5bd02f6 --- /dev/null +++ b/net/netfilter/nft_limit.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(limit_lock); + +struct nft_limit { + u64 tokens; + u64 rate; + u64 unit; + unsigned long stamp; +}; + +static void nft_limit_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + spin_lock_bh(&limit_lock); + if (time_after_eq(jiffies, priv->stamp)) { + priv->tokens = priv->rate; + priv->stamp = jiffies + priv->unit * HZ; + } + + if (priv->tokens >= 1) { + priv->tokens--; + spin_unlock_bh(&limit_lock); + return; + } + spin_unlock_bh(&limit_lock); + + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { + [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, + [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, +}; + +static int nft_limit_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + if (tb[NFTA_LIMIT_RATE] == NULL || + tb[NFTA_LIMIT_UNIT] == NULL) + return -EINVAL; + + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); + priv->stamp = jiffies + priv->unit * HZ; + priv->tokens = priv->rate; + return 0; +} + +static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_limit_type; +static const struct nft_expr_ops 
nft_limit_ops = { + .type = &nft_limit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), + .eval = nft_limit_eval, + .init = nft_limit_init, + .dump = nft_limit_dump, +}; + +static struct nft_expr_type nft_limit_type __read_mostly = { + .name = "limit", + .ops = &nft_limit_ops, + .policy = nft_limit_policy, + .maxattr = NFTA_LIMIT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_limit_module_init(void) +{ + return nft_register_expr(&nft_limit_type); +} + +static void __exit nft_limit_module_exit(void) +{ + nft_unregister_expr(&nft_limit_type); +} + +module_init(nft_limit_module_init); +module_exit(nft_limit_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("limit"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c new file mode 100644 index 00000000000..10cfb156cdf --- /dev/null +++ b/net/netfilter/nft_log.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netdevice.h> + +static const char *nft_log_null_prefix = ""; + +struct nft_log { + struct nf_loginfo loginfo; + char *prefix; +}; + +static void nft_log_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_log *priv = nft_expr_priv(expr); + struct net *net = dev_net(pkt->in ? 
pkt->in : pkt->out); + + nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &priv->loginfo, "%s", priv->prefix); +} + +static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { + [NFTA_LOG_GROUP] = { .type = NLA_U16 }, + [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, + [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, + [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, +}; + +static int nft_log_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_log *priv = nft_expr_priv(expr); + struct nf_loginfo *li = &priv->loginfo; + const struct nlattr *nla; + + nla = tb[NFTA_LOG_PREFIX]; + if (nla != NULL) { + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); + if (priv->prefix == NULL) + return -ENOMEM; + nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + } else + priv->prefix = (char *)nft_log_null_prefix; + + li->type = NF_LOG_TYPE_ULOG; + if (tb[NFTA_LOG_GROUP] != NULL) + li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); + + if (tb[NFTA_LOG_SNAPLEN] != NULL) + li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); + if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { + li->u.ulog.qthreshold = + ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + } + + return 0; +} + +static void nft_log_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_log *priv = nft_expr_priv(expr); + + if (priv->prefix != nft_log_null_prefix) + kfree(priv->prefix); +} + +static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_log *priv = nft_expr_priv(expr); + const struct nf_loginfo *li = &priv->loginfo; + + if (priv->prefix != nft_log_null_prefix) + if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) + goto nla_put_failure; + if (li->u.ulog.group) + if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) + goto nla_put_failure; + if (li->u.ulog.copy_len) + if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, + htonl(li->u.ulog.copy_len))) + goto nla_put_failure; + if (li->u.ulog.qthreshold) + if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, + htons(li->u.ulog.qthreshold))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_log_type; +static const struct nft_expr_ops nft_log_ops = { + .type = &nft_log_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_log)), + .eval = nft_log_eval, + .init = nft_log_init, + .destroy = nft_log_destroy, + .dump = nft_log_dump, +}; + +static struct nft_expr_type nft_log_type __read_mostly = { + .name = "log", + .ops = &nft_log_ops, + .policy = nft_log_policy, + .maxattr = NFTA_LOG_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_log_module_init(void) +{ + return nft_register_expr(&nft_log_type); +} + +static void __exit nft_log_module_exit(void) +{ + nft_unregister_expr(&nft_log_type); +} + +module_init(nft_log_module_init); +module_exit(nft_log_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("log"); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c new file mode 100644 index 00000000000..6404a726d17 --- /dev/null +++ b/net/netfilter/nft_lookup.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
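+ *
+ * The "lookup" expression matches the key held in a source register
+ * against a named set; for map-type sets the bound data is copied into
+ * a destination register, and a miss sets the NFT_BREAK verdict.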
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> + +struct nft_lookup { + struct nft_set *set; + enum nft_registers sreg:8; + enum nft_registers dreg:8; + struct nft_set_binding binding; +}; + +static void nft_lookup_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + const struct nft_set *set = priv->set; + + if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg])) + return; + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { + [NFTA_LOOKUP_SET] = { .type = NLA_STRING }, + [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, + [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, +}; + +static int nft_lookup_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + struct nft_set *set; + int err; + + if (tb[NFTA_LOOKUP_SET] == NULL || + tb[NFTA_LOOKUP_SREG] == NULL) + return -EINVAL; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); + if (IS_ERR(set)) { + if (tb[NFTA_LOOKUP_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_LOOKUP_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + if (tb[NFTA_LOOKUP_DREG] != NULL) { + if (!(set->flags & NFT_SET_MAP)) + return -EINVAL; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + if (priv->dreg == NFT_REG_VERDICT) { + if (set->dtype != NFT_DATA_VERDICT) + return -EINVAL; + } else if (set->dtype == NFT_DATA_VERDICT) + return -EINVAL; + } else if (set->flags & NFT_SET_MAP) + return -EINVAL; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + return err; + + priv->set = set; + return 0; +} + +static void nft_lookup_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + + if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (priv->set->flags & NFT_SET_MAP) + if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_lookup_type; +static const struct nft_expr_ops nft_lookup_ops = { + .type = &nft_lookup_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), + .eval = nft_lookup_eval, + .init = nft_lookup_init, + .destroy = nft_lookup_destroy, + .dump = nft_lookup_dump, +}; + +static struct nft_expr_type nft_lookup_type __read_mostly = { + .name = "lookup", + .ops = &nft_lookup_ops, + .policy = nft_lookup_policy, + .maxattr = NFTA_LOOKUP_MAX, + .owner = THIS_MODULE, +}; + +int __init 
nft_lookup_module_init(void) +{ + return nft_register_expr(&nft_lookup_type); +} + +void nft_lookup_module_exit(void) +{ + nft_unregister_expr(&nft_lookup_type); +} diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c new file mode 100644 index 00000000000..852b178c6ae --- /dev/null +++ b/net/netfilter/nft_meta.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/tcp_states.h> /* for TCP_TIME_WAIT */ +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> + +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + const struct net_device *in = pkt->in, *out = pkt->out; + struct nft_data *dest = &data[priv->dreg]; + + switch (priv->key) { + case NFT_META_LEN: + dest->data[0] = skb->len; + break; + case NFT_META_PROTOCOL: + *(__be16 *)dest->data = skb->protocol; + break; + case NFT_META_NFPROTO: + dest->data[0] = pkt->ops->pf; + break; + case NFT_META_L4PROTO: + dest->data[0] = pkt->tprot; + break; + case NFT_META_PRIORITY: + dest->data[0] = skb->priority; + break; + case NFT_META_MARK: + dest->data[0] = skb->mark; + break; + case NFT_META_IIF: + if (in == NULL) + goto err; + dest->data[0] = in->ifindex; + break; + case NFT_META_OIF: + if (out == NULL) + goto err; + dest->data[0] = out->ifindex; + break; + case NFT_META_IIFNAME: + if (in == NULL) + goto err; + strncpy((char *)dest->data, in->name, sizeof(dest->data)); + break; + case NFT_META_OIFNAME: + if (out == NULL) + goto err; + strncpy((char *)dest->data, out->name, sizeof(dest->data)); + break; + case NFT_META_IIFTYPE: + if (in == NULL) + goto err; + *(u16 *)dest->data = in->type; + break; + case NFT_META_OIFTYPE: + if (out == NULL) + goto err; + *(u16 *)dest->data = out->type; + break; + case NFT_META_SKUID: + if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + goto err; + + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket == NULL || + skb->sk->sk_socket->file == NULL) { + read_unlock_bh(&skb->sk->sk_callback_lock); + goto err; + } + + dest->data[0] = + from_kuid_munged(&init_user_ns, + skb->sk->sk_socket->file->f_cred->fsuid); + read_unlock_bh(&skb->sk->sk_callback_lock); + break; + case NFT_META_SKGID: + if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + goto err; + + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket == NULL || + skb->sk->sk_socket->file == NULL) { + read_unlock_bh(&skb->sk->sk_callback_lock); + goto err; + } + dest->data[0] = + from_kgid_munged(&init_user_ns, + skb->sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&skb->sk->sk_callback_lock); + break; +#ifdef CONFIG_IP_ROUTE_CLASSID + case NFT_META_RTCLASSID: { + const struct dst_entry *dst = skb_dst(skb); + + if (dst == NULL) + goto err; + dest->data[0] = dst->tclassid; + break; + } +#endif +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: + dest->data[0] = 
skb->secmark; + break; +#endif + default: + WARN_ON(1); + goto err; + } + return; + +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_get_eval); + +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *meta = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + u32 value = data[meta->sreg].data[0]; + + switch (meta->key) { + case NFT_META_MARK: + skb->mark = value; + break; + case NFT_META_PRIORITY: + skb->priority = value; + break; + case NFT_META_NFTRACE: + skb->nf_trace = 1; + break; + default: + WARN_ON(1); + } +} +EXPORT_SYMBOL_GPL(nft_meta_set_eval); + +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { + [NFTA_META_DREG] = { .type = NLA_U32 }, + [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_SREG] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_meta_policy); + +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_LEN: + case NFT_META_PROTOCOL: + case NFT_META_NFPROTO: + case NFT_META_L4PROTO: + case NFT_META_PRIORITY: + case NFT_META_MARK: + case NFT_META_IIF: + case NFT_META_OIF: + case NFT_META_IIFNAME: + case NFT_META_OIFNAME: + case NFT_META_IIFTYPE: + case NFT_META_OIFTYPE: + case NFT_META_SKUID: + case NFT_META_SKGID: +#ifdef CONFIG_IP_ROUTE_CLASSID + case NFT_META_RTCLASSID: +#endif +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: +#endif + break; + default: + return -EOPNOTSUPP; + } + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_get_init); + +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_MARK: + case NFT_META_PRIORITY: + case NFT_META_NFTRACE: + break; + default: + return -EOPNOTSUPP; + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_set_init); + +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_get_dump); + +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_set_dump); + +static struct nft_expr_type nft_meta_type; +static const struct nft_expr_ops nft_meta_get_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_get_eval, + 
.init = nft_meta_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_set_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .dump = nft_meta_set_dump, +}; + +static const struct nft_expr_ops * +nft_meta_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_type __read_mostly = { + .name = "meta", + .select_ops = &nft_meta_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_module_init(void) +{ + return nft_register_expr(&nft_meta_type); +} + +static void __exit nft_meta_module_exit(void) +{ + nft_unregister_expr(&nft_meta_type); +} + +module_init(nft_meta_module_init); +module_exit(nft_meta_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c new file mode 100644 index 00000000000..79ff58cd36d --- /dev/null +++ b/net/netfilter/nft_nat.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
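+ *
+ * The "nat" expression loads a NAT range (addresses and, optionally,
+ * layer 4 protocol values) from source registers and hands it to
+ * nf_nat_setup_info() to configure source or destination NAT for the
+ * tracked connection.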
+ * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/string.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +struct nft_nat { + enum nft_registers sreg_addr_min:8; + enum nft_registers sreg_addr_max:8; + enum nft_registers sreg_proto_min:8; + enum nft_registers sreg_proto_max:8; + enum nf_nat_manip_type type:8; + u8 family; +}; + +static void nft_nat_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); + struct nf_nat_range range; + + memset(&range, 0, sizeof(range)); + if (priv->sreg_addr_min) { + if (priv->family == AF_INET) { + range.min_addr.ip = (__force __be32) + data[priv->sreg_addr_min].data[0]; + range.max_addr.ip = (__force __be32) + data[priv->sreg_addr_max].data[0]; + + } else { + memcpy(range.min_addr.ip6, + data[priv->sreg_addr_min].data, + sizeof(struct nft_data)); + memcpy(range.max_addr.ip6, + data[priv->sreg_addr_max].data, + sizeof(struct nft_data)); + } + range.flags |= NF_NAT_RANGE_MAP_IPS; + } + + if (priv->sreg_proto_min) { + range.min_proto.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + range.max_proto.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + data[NFT_REG_VERDICT].verdict = + nf_nat_setup_info(ct, &range, priv->type); +} + +static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { + [NFTA_NAT_TYPE] = { .type = NLA_U32 }, + [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, + [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, + [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, + [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, +}; + +static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_nat *priv = nft_expr_priv(expr); + u32 family; + int err; + + if (tb[NFTA_NAT_TYPE] == NULL) + return -EINVAL; + + switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { + case NFT_NAT_SNAT: + priv->type = NF_NAT_MANIP_SRC; + break; + case NFT_NAT_DNAT: + priv->type = NF_NAT_MANIP_DST; + break; + default: + return -EINVAL; + } + + if (tb[NFTA_NAT_FAMILY] == NULL) + return -EINVAL; + + family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + if (family != ctx->afi->family) + return -EOPNOTSUPP; + priv->family = family; + + if (tb[NFTA_NAT_REG_ADDR_MIN]) { + priv->sreg_addr_min = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_ADDR_MIN])); + err = nft_validate_input_register(priv->sreg_addr_min); + if (err < 0) + return err; + } + + if (tb[NFTA_NAT_REG_ADDR_MAX]) { + priv->sreg_addr_max = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_ADDR_MAX])); + err = nft_validate_input_register(priv->sreg_addr_max); + if (err < 0) + return err; + } else + priv->sreg_addr_max = priv->sreg_addr_min; + + if (tb[NFTA_NAT_REG_PROTO_MIN]) { + priv->sreg_proto_min = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_PROTO_MIN])); + err = 
nft_validate_input_register(priv->sreg_proto_min); + if (err < 0) + return err; + } + + if (tb[NFTA_NAT_REG_PROTO_MAX]) { + priv->sreg_proto_max = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_PROTO_MAX])); + err = nft_validate_input_register(priv->sreg_proto_max); + if (err < 0) + return err; + } else + priv->sreg_proto_max = priv->sreg_proto_min; + + return 0; +} + +static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NF_NAT_MANIP_SRC: + if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) + goto nla_put_failure; + break; + case NF_NAT_MANIP_DST: + if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) + goto nla_put_failure; + break; + } + + if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) + goto nla_put_failure; + if (nla_put_be32(skb, + NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min))) + goto nla_put_failure; + if (nla_put_be32(skb, + NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) + goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_nat_type; +static const struct nft_expr_ops nft_nat_ops = { + .type = &nft_nat_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), + .eval = nft_nat_eval, + .init = nft_nat_init, + .dump = nft_nat_dump, +}; + +static struct nft_expr_type nft_nat_type __read_mostly = { + .name = "nat", + .ops = &nft_nat_ops, + .policy = nft_nat_policy, + .maxattr = NFTA_NAT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_nat_module_init(void) +{ + return nft_register_expr(&nft_nat_type); +} + +static void __exit nft_nat_module_exit(void) +{ + nft_unregister_expr(&nft_nat_type); +} + +module_init(nft_nat_module_init); +module_exit(nft_nat_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_EXPR("nat"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c new file mode 100644 index 00000000000..85daa84bfdf --- /dev/null +++ b/net/netfilter/nft_payload.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
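+ *
+ * The "payload" expression copies a run of bytes at a configurable
+ * offset from the link layer, network or transport header into a
+ * destination register.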
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +static void nft_payload_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + struct nft_data *dest = &data[priv->dreg]; + int offset; + + switch (priv->base) { + case NFT_PAYLOAD_LL_HEADER: + if (!skb_mac_header_was_set(skb)) + goto err; + offset = skb_mac_header(skb) - skb->data; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + offset = skb_network_offset(skb); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + offset = pkt->xt.thoff; + break; + default: + BUG(); + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0) + goto err; + return; +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { + [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, +}; + +static int nft_payload_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + int err; + + priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) || + nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || + nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || + nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_payload_type; +static const struct nft_expr_ops nft_payload_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .eval = nft_payload_eval, + .init = nft_payload_init, + .dump = nft_payload_dump, +}; + +const struct nft_expr_ops nft_payload_fast_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .eval = nft_payload_eval, + .init = nft_payload_init, + .dump = nft_payload_dump, +}; + +static const struct nft_expr_ops * +nft_payload_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + enum nft_payload_bases base; + unsigned int offset, len; + + if (tb[NFTA_PAYLOAD_DREG] == NULL || + tb[NFTA_PAYLOAD_BASE] == NULL || + tb[NFTA_PAYLOAD_OFFSET] == NULL || + tb[NFTA_PAYLOAD_LEN] == NULL) + return ERR_PTR(-EINVAL); + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + offset = 
ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) + return ERR_PTR(-EINVAL); + + if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && + base != NFT_PAYLOAD_LL_HEADER) + return &nft_payload_fast_ops; + else + return &nft_payload_ops; +} + +static struct nft_expr_type nft_payload_type __read_mostly = { + .name = "payload", + .select_ops = nft_payload_select_ops, + .policy = nft_payload_policy, + .maxattr = NFTA_PAYLOAD_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_payload_module_init(void) +{ + return nft_register_expr(&nft_payload_type); +} + +void nft_payload_module_exit(void) +{ + nft_unregister_expr(&nft_payload_type); +} diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c new file mode 100644 index 00000000000..e8ae2f6bf23 --- /dev/null +++ b/net/netfilter/nft_queue.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code partly funded by OISF + * (http://www.openinfosecfoundation.org/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/jhash.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_queue.h> + +static u32 jhash_initval __read_mostly; + +struct nft_queue { + u16 queuenum; + u16 queues_total; + u16 flags; +}; + +static void nft_queue_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_queue *priv = nft_expr_priv(expr); + u32 queue = priv->queuenum; + u32 ret; + + if (priv->queues_total > 1) { + if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = priv->queuenum + cpu % priv->queues_total; + } else { + queue = nfqueue_hash(pkt->skb, queue, + priv->queues_total, pkt->ops->pf, + jhash_initval); + } + } + + ret = NF_QUEUE_NR(queue); + if (priv->flags & NFT_QUEUE_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + data[NFT_REG_VERDICT].verdict = ret; +} + +static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { + [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, + [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, + [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, +}; + +static int nft_queue_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_queue *priv = nft_expr_priv(expr); + + if (tb[NFTA_QUEUE_NUM] == NULL) + return -EINVAL; + + init_hashrandom(&jhash_initval); + priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); + + if (tb[NFTA_QUEUE_TOTAL] != NULL) + priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); + if (tb[NFTA_QUEUE_FLAGS] != NULL) { + priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); + if (priv->flags & ~NFT_QUEUE_FLAG_MASK) + return -EINVAL; + } + return 0; +} + +static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_queue *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || + nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || + nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) + goto nla_put_failure; + + return 0; + 
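+	/* one of the nla_put_be16() calls above found no room left in
+	 * the message; -1 lets the caller discard the partly built
+	 * message.
+	 */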
+nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_queue_type; +static const struct nft_expr_ops nft_queue_ops = { + .type = &nft_queue_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), + .eval = nft_queue_eval, + .init = nft_queue_init, + .dump = nft_queue_dump, +}; + +static struct nft_expr_type nft_queue_type __read_mostly = { + .name = "queue", + .ops = &nft_queue_ops, + .policy = nft_queue_policy, + .maxattr = NFTA_QUEUE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_queue_module_init(void) +{ + return nft_register_expr(&nft_queue_type); +} + +static void __exit nft_queue_module_exit(void) +{ + nft_unregister_expr(&nft_queue_type); +} + +module_init(nft_queue_module_init); +module_exit(nft_queue_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); +MODULE_ALIAS_NFT_EXPR("queue"); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c new file mode 100644 index 00000000000..e1836ff8819 --- /dev/null +++ b/net/netfilter/nft_rbtree.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(nft_rbtree_lock); + +struct nft_rbtree { + struct rb_root root; +}; + +struct nft_rbtree_elem { + struct rb_node node; + u16 flags; + struct nft_data key; + struct nft_data data[]; +}; + +static bool nft_rbtree_lookup(const struct nft_set *set, + const struct nft_data *key, + struct nft_data *data) +{ + const struct nft_rbtree *priv = nft_set_priv(set); + const struct nft_rbtree_elem *rbe, *interval = NULL; + const struct rb_node *parent = priv->root.rb_node; + int d; + + spin_lock_bh(&nft_rbtree_lock); + while (parent != NULL) { + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + d = nft_data_cmp(&rbe->key, key, set->klen); + if (d < 0) { + parent = parent->rb_left; + interval = rbe; + } else if (d > 0) + parent = parent->rb_right; + else { +found: + if (rbe->flags & NFT_SET_ELEM_INTERVAL_END) + goto out; + if (set->flags & NFT_SET_MAP) + nft_data_copy(data, rbe->data); + + spin_unlock_bh(&nft_rbtree_lock); + return true; + } + } + + if (set->flags & NFT_SET_INTERVAL && interval != NULL) { + rbe = interval; + goto found; + } +out: + spin_unlock_bh(&nft_rbtree_lock); + return false; +} + +static void nft_rbtree_elem_destroy(const struct nft_set *set, + struct nft_rbtree_elem *rbe) +{ + nft_data_uninit(&rbe->key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(rbe->data, set->dtype); + + kfree(rbe); +} + +static int __nft_rbtree_insert(const struct nft_set *set, + struct nft_rbtree_elem *new) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *parent, **p; + int d; + + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_data_cmp(&rbe->key, &new->key, set->klen); + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p 
= &parent->rb_right; + else + return -EEXIST; + } + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &priv->root); + return 0; +} + +static int nft_rbtree_insert(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_rbtree_elem *rbe; + unsigned int size; + int err; + + size = sizeof(*rbe); + if (set->flags & NFT_SET_MAP && + !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) + size += sizeof(rbe->data[0]); + + rbe = kzalloc(size, GFP_KERNEL); + if (rbe == NULL) + return -ENOMEM; + + rbe->flags = elem->flags; + nft_data_copy(&rbe->key, &elem->key); + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_copy(rbe->data, &elem->data); + + spin_lock_bh(&nft_rbtree_lock); + err = __nft_rbtree_insert(set, rbe); + if (err < 0) + kfree(rbe); + + spin_unlock_bh(&nft_rbtree_lock); + return err; +} + +static void nft_rbtree_remove(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe = elem->cookie; + + spin_lock_bh(&nft_rbtree_lock); + rb_erase(&rbe->node, &priv->root); + spin_unlock_bh(&nft_rbtree_lock); + kfree(rbe); +} + +static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) +{ + const struct nft_rbtree *priv = nft_set_priv(set); + const struct rb_node *parent = priv->root.rb_node; + struct nft_rbtree_elem *rbe; + int d; + + spin_lock_bh(&nft_rbtree_lock); + while (parent != NULL) { + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + d = nft_data_cmp(&rbe->key, &elem->key, set->klen); + if (d < 0) + parent = parent->rb_left; + else if (d > 0) + parent = parent->rb_right; + else { + elem->cookie = rbe; + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_copy(&elem->data, rbe->data); + elem->flags = rbe->flags; + spin_unlock_bh(&nft_rbtree_lock); + return 0; + } + } + spin_unlock_bh(&nft_rbtree_lock); + return -ENOENT; +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + const struct nft_set *set, + struct nft_set_iter *iter) +{ + const struct nft_rbtree *priv = nft_set_priv(set); + const struct nft_rbtree_elem *rbe; + struct nft_set_elem elem; + struct rb_node *node; + + spin_lock_bh(&nft_rbtree_lock); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + if (iter->count < iter->skip) + goto cont; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + nft_data_copy(&elem.key, &rbe->key); + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_copy(&elem.data, rbe->data); + elem.flags = rbe->flags; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) { + spin_unlock_bh(&nft_rbtree_lock); + return; + } +cont: + iter->count++; + } + spin_unlock_bh(&nft_rbtree_lock); +} + +static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) +{ + return sizeof(struct nft_rbtree); +} + +static int nft_rbtree_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + priv->root = RB_ROOT; + return 0; +} + +static void nft_rbtree_destroy(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + + spin_lock_bh(&nft_rbtree_lock); + while ((node = priv->root.rb_node) != NULL) { + rb_erase(node, &priv->root); + rbe = rb_entry(node, struct nft_rbtree_elem, node); + nft_rbtree_elem_destroy(set, rbe); + } + 
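+	/* tree fully drained; concurrent lookups were excluded by
+	 * nft_rbtree_lock, which every reader takes as well.
+	 */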
spin_unlock_bh(&nft_rbtree_lock); +} + +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int nsize; + + nsize = sizeof(struct nft_rbtree_elem); + if (features & NFT_SET_MAP) + nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + + if (desc->size) + est->size = sizeof(struct nft_rbtree) + desc->size * nsize; + else + est->size = nsize; + + est->class = NFT_SET_CLASS_O_LOG_N; + + return true; +} + +static struct nft_set_ops nft_rbtree_ops __read_mostly = { + .privsize = nft_rbtree_privsize, + .estimate = nft_rbtree_estimate, + .init = nft_rbtree_init, + .destroy = nft_rbtree_destroy, + .insert = nft_rbtree_insert, + .remove = nft_rbtree_remove, + .get = nft_rbtree_get, + .lookup = nft_rbtree_lookup, + .walk = nft_rbtree_walk, + .features = NFT_SET_INTERVAL | NFT_SET_MAP, + .owner = THIS_MODULE, +}; + +static int __init nft_rbtree_module_init(void) +{ + return nft_register_set(&nft_rbtree_ops); +} + +static void __exit nft_rbtree_module_exit(void) +{ + nft_unregister_set(&nft_rbtree_ops); +} + +module_init(nft_rbtree_module_init); +module_exit(nft_rbtree_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c new file mode 100644 index 00000000000..f3448c29644 --- /dev/null +++ b/net/netfilter/nft_reject.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { + [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, +}; +EXPORT_SYMBOL_GPL(nft_reject_policy); + +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_reject_init); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_reject_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 
00000000000..b718a52a465 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + return nft_reject_ipv4_eval(expr, data, pkt); + case NFPROTO_IPV6: + return nft_reject_ipv6_eval(expr, data, pkt); + } +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { + .type = &nft_reject_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_inet_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "reject", + .ops = &nft_reject_inet_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ + return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ + nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 0eb2504b89b..227aa11e840 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -2,6 +2,7 @@ * x_tables core - Backend for {ip,ip6,arp}_tables * * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on existing ip_tables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling @@ -12,8 +13,9 @@ * published by the Free Software Foundation. 
* */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> +#include <linux/module.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/proc_fs.h> @@ -22,50 +24,58 @@ #include <linux/vmalloc.h> #include <linux/mutex.h> #include <linux/mm.h> +#include <linux/slab.h> +#include <linux/audit.h> +#include <net/net_namespace.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp.h> - +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_arp/arp_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("[ip,ip6,arp]_tables backend module"); +MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +struct compat_delta { + unsigned int offset; /* offset in kernel */ + int delta; /* delta in 32bit user land */ +}; + struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; - struct list_head tables; +#ifdef CONFIG_COMPAT struct mutex compat_mutex; + struct compat_delta *compat_tab; + unsigned int number; /* number of slots in compat_tab[] */ + unsigned int cur; /* number of used slots in compat_tab[] */ +#endif }; static struct xt_af *xt; -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - -enum { - TABLE, - TARGET, - MATCH, +static const char *const xt_prefix[NFPROTO_NUMPROTO] = { + [NFPROTO_UNSPEC] = "x", + [NFPROTO_IPV4] = "ip", + [NFPROTO_ARP] = "arp", + [NFPROTO_BRIDGE] = "eb", + [NFPROTO_IPV6] = "ip6", }; -static const char *xt_prefix[NPROTO] = { - [AF_INET] = "ip", - [AF_INET6] = "ip6", - [NF_ARP] = "arp", -}; +/* Allow this many total (re)entries. */ +static const unsigned int xt_jumpstack_multiplier = 2; /* Registration hooks for targets. 
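 *
 * (A note on the registry layout above, as a reading aid: there is one
 * xt_af slot per NFPROTO_* family, so every lookup simply indexes by
 * family, e.g.
 *
 *	list_for_each_entry(m, &xt[NFPROTO_IPV4].match, list)
 *		...;
 *
 * NFPROTO_UNSPEC ("x") holds the family-independent extensions and acts
 * as the shared fallback list that the find functions below retry when a
 * family-specific search misses.)
 *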
*/ int xt_register_target(struct xt_target *target) { - int ret, af = target->family; + u_int8_t af = target->family; + int ret; ret = mutex_lock_interruptible(&xt[af].mutex); if (ret != 0) @@ -79,7 +89,7 @@ EXPORT_SYMBOL(xt_register_target); void xt_unregister_target(struct xt_target *target) { - int af = target->family; + u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_del(&target->list); @@ -110,17 +120,16 @@ EXPORT_SYMBOL(xt_register_targets); void xt_unregister_targets(struct xt_target *target, unsigned int n) { - unsigned int i; - - for (i = 0; i < n; i++) - xt_unregister_target(&target[i]); + while (n-- > 0) + xt_unregister_target(&target[n]); } EXPORT_SYMBOL(xt_unregister_targets); int xt_register_match(struct xt_match *match) { - int ret, af = match->family; + u_int8_t af = match->family; + int ret; ret = mutex_lock_interruptible(&xt[af].mutex); if (ret != 0) @@ -136,7 +145,7 @@ EXPORT_SYMBOL(xt_register_match); void xt_unregister_match(struct xt_match *match) { - int af = match->family; + u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_del(&match->list); @@ -167,10 +176,8 @@ EXPORT_SYMBOL(xt_register_matches); void xt_unregister_matches(struct xt_match *match, unsigned int n) { - unsigned int i; - - for (i = 0; i < n; i++) - xt_unregister_match(&match[i]); + while (n-- > 0) + xt_unregister_match(&match[n]); } EXPORT_SYMBOL(xt_unregister_matches); @@ -178,14 +185,14 @@ EXPORT_SYMBOL(xt_unregister_matches); /* * These are weird, but module loading must not be done with mutex * held (since they will register), and we have to have a single - * function to use try_then_request_module(). + * function to use. */ /* Find match, grabs ref. Returns ERR_PTR() on error. */ -struct xt_match *xt_find_match(int af, const char *name, u8 revision) +struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) { struct xt_match *m; - int err = 0; + int err = -ENOENT; if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); @@ -202,15 +209,35 @@ struct xt_match *xt_find_match(int af, const char *name, u8 revision) } } mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_match(NFPROTO_UNSPEC, name, revision); + return ERR_PTR(err); } EXPORT_SYMBOL(xt_find_match); +struct xt_match * +xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) +{ + struct xt_match *match; + + match = xt_find_match(nfproto, name, revision); + if (IS_ERR(match)) { + request_module("%st_%s", xt_prefix[nfproto], name); + match = xt_find_match(nfproto, name, revision); + } + + return match; +} +EXPORT_SYMBOL_GPL(xt_request_find_match); + /* Find target, grabs ref. Returns ERR_PTR() on error. 
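 *
 * (Usage sketch for the autoloading variant above -- hypothetical caller,
 * error handling only. The "%st_%s" request maps e.g. NFPROTO_IPV4 plus
 * the name "limit" to the module name "ipt_limit":
 *
 *	struct xt_match *m;
 *
 *	m = xt_request_find_match(NFPROTO_IPV4, "limit", 0);
 *	if (IS_ERR(m))
 *		return PTR_ERR(m);	// -ENOENT if nothing registered
 *	// ... use m ...
 *	module_put(m->me);		// drop the ref the find took
 *
 * The same retry-after-request_module pattern is used for targets below.)
 *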
*/ -struct xt_target *xt_find_target(int af, const char *name, u8 revision) +struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; - int err = 0; + int err = -ENOENT; if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); @@ -227,25 +254,32 @@ struct xt_target *xt_find_target(int af, const char *name, u8 revision) } } mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_target(NFPROTO_UNSPEC, name, revision); + return ERR_PTR(err); } EXPORT_SYMBOL(xt_find_target); -struct xt_target *xt_request_find_target(int af, const char *name, u8 revision) +struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; - target = try_then_request_module(xt_find_target(af, name, revision), - "%st_%s", xt_prefix[af], name); - if (IS_ERR(target) || !target) - return NULL; + target = xt_find_target(af, name, revision); + if (IS_ERR(target)) { + request_module("%st_%s", xt_prefix[af], name); + target = xt_find_target(af, name, revision); + } + return target; } EXPORT_SYMBOL_GPL(xt_request_find_target); -static int match_revfn(int af, const char *name, u8 revision, int *bestp) +static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { - struct xt_match *m; + const struct xt_match *m; int have_rev = 0; list_for_each_entry(m, &xt[af].match, list) { @@ -256,12 +290,16 @@ static int match_revfn(int af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + + if (af != NFPROTO_UNSPEC && !have_rev) + return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); + return have_rev; } -static int target_revfn(int af, const char *name, u8 revision, int *bestp) +static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) { - struct xt_target *t; + const struct xt_target *t; int have_rev = 0; list_for_each_entry(t, &xt[af].target, list) { @@ -272,11 +310,15 @@ static int target_revfn(int af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + + if (af != NFPROTO_UNSPEC && !have_rev) + return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); + return have_rev; } /* Returns true or false (if no such extension at all) */ -int xt_find_revision(int af, const char *name, u8 revision, int target, +int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err) { int have_rev, best = -1; @@ -304,47 +346,168 @@ int xt_find_revision(int af, const char *name, u8 revision, int target, } EXPORT_SYMBOL_GPL(xt_find_revision); -int xt_check_match(const struct xt_match *match, unsigned short family, - unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto) +static char * +textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) { - if (XT_ALIGN(match->matchsize) != size) { - printk("%s_tables: %s match: invalid size %Zu != %u\n", - xt_prefix[family], match->name, - XT_ALIGN(match->matchsize), size); + static const char *const inetbr_names[] = { + "PREROUTING", "INPUT", "FORWARD", + "OUTPUT", "POSTROUTING", "BROUTING", + }; + static const char *const arp_names[] = { + "INPUT", "FORWARD", "OUTPUT", + }; + const char *const *names; + unsigned int i, max; + char *p = buf; + bool np = false; + int res; + + names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names; + max = (nfproto == NFPROTO_ARP) ? 
ARRAY_SIZE(arp_names) : + ARRAY_SIZE(inetbr_names); + *p = '\0'; + for (i = 0; i < max; ++i) { + if (!(mask & (1 << i))) + continue; + res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]); + if (res > 0) { + size -= res; + p += res; + } + np = true; + } + + return buf; +} + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u_int8_t proto, bool inv_proto) +{ + int ret; + + if (XT_ALIGN(par->match->matchsize) != size && + par->match->matchsize != -1) { + /* + * ebt_among is exempt from centralized matchsize checking + * because it uses a dynamic-size data set. + */ + pr_err("%s_tables: %s.%u match: invalid size " + "%u (kernel) != (user) %u\n", + xt_prefix[par->family], par->match->name, + par->match->revision, + XT_ALIGN(par->match->matchsize), size); return -EINVAL; } - if (match->table && strcmp(match->table, table)) { - printk("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[family], match->name, match->table, table); + if (par->match->table != NULL && + strcmp(par->match->table, par->table) != 0) { + pr_err("%s_tables: %s match: only valid in %s table, not %s\n", + xt_prefix[par->family], par->match->name, + par->match->table, par->table); return -EINVAL; } - if (match->hooks && (hook_mask & ~match->hooks) != 0) { - printk("%s_tables: %s match: bad hook_mask %u\n", - xt_prefix[family], match->name, hook_mask); + if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { + char used[64], allow[64]; + + pr_err("%s_tables: %s match: used from hooks %s, but only " + "valid from %s\n", + xt_prefix[par->family], par->match->name, + textify_hooks(used, sizeof(used), par->hook_mask, + par->family), + textify_hooks(allow, sizeof(allow), par->match->hooks, + par->family)); return -EINVAL; } - if (match->proto && (match->proto != proto || inv_proto)) { - printk("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[family], match->name, match->proto); + if (par->match->proto && (par->match->proto != proto || inv_proto)) { + pr_err("%s_tables: %s match: only valid for protocol %u\n", + xt_prefix[par->family], par->match->name, + par->match->proto); return -EINVAL; } + if (par->match->checkentry != NULL) { + ret = par->match->checkentry(par); + if (ret < 0) + return ret; + else if (ret > 0) + /* Flag up potential errors. 
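 *
 * (Worked example for the hook diagnostic above: a match used from
 * PREROUTING and POSTROUTING but only valid in OUTPUT has
 * hook_mask = (1 << 0) | (1 << 4), so textify_hooks() renders both name
 * tables and the error comes out as
 *
 *	ip_tables: foo match: used from hooks PREROUTING/POSTROUTING,
 *	but only valid from OUTPUT
 *
 * where the names come from inetbr_names[], one bit per hook number;
 * "foo" is a made-up extension name for illustration.)
 *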
*/ + return -EIO; + } return 0; } EXPORT_SYMBOL_GPL(xt_check_match); #ifdef CONFIG_COMPAT -int xt_compat_match_offset(struct xt_match *match) +int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) +{ + struct xt_af *xp = &xt[af]; + + if (!xp->compat_tab) { + if (!xp->number) + return -EINVAL; + xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); + if (!xp->compat_tab) + return -ENOMEM; + xp->cur = 0; + } + + if (xp->cur >= xp->number) + return -EINVAL; + + if (xp->cur) + delta += xp->compat_tab[xp->cur - 1].delta; + xp->compat_tab[xp->cur].offset = offset; + xp->compat_tab[xp->cur].delta = delta; + xp->cur++; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_add_offset); + +void xt_compat_flush_offsets(u_int8_t af) +{ + if (xt[af].compat_tab) { + vfree(xt[af].compat_tab); + xt[af].compat_tab = NULL; + xt[af].number = 0; + xt[af].cur = 0; + } +} +EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); + +int xt_compat_calc_jump(u_int8_t af, unsigned int offset) +{ + struct compat_delta *tmp = xt[af].compat_tab; + int mid, left = 0, right = xt[af].cur - 1; + + while (left <= right) { + mid = (left + right) >> 1; + if (offset > tmp[mid].offset) + left = mid + 1; + else if (offset < tmp[mid].offset) + right = mid - 1; + else + return mid ? tmp[mid - 1].delta : 0; + } + return left ? tmp[left - 1].delta : 0; +} +EXPORT_SYMBOL_GPL(xt_compat_calc_jump); + +void xt_compat_init_offsets(u_int8_t af, unsigned int number) +{ + xt[af].number = number; + xt[af].cur = 0; +} +EXPORT_SYMBOL(xt_compat_init_offsets); + +int xt_compat_match_offset(const struct xt_match *match) { u_int16_t csize = match->compatsize ? : match->matchsize; return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize); } EXPORT_SYMBOL_GPL(xt_compat_match_offset); -void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, - int *size) +int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, + unsigned int *size) { - struct xt_match *match = m->u.kernel.match; + const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; int pad, off = xt_compat_match_offset(match); u_int16_t msize = cm->u.user.match_size; @@ -364,19 +527,22 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, *size += off; *dstptr += msize; + return 0; } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); -int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr, - int *size) +int xt_compat_match_to_user(const struct xt_entry_match *m, + void __user **dstptr, unsigned int *size) { - struct xt_match *match = m->u.kernel.match; + const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match __user *cm = *dstptr; int off = xt_compat_match_offset(match); u_int16_t msize = m->u.user.match_size - off; if (copy_to_user(cm, m, sizeof(*cm)) || - put_user(msize, &cm->u.user.match_size)) + put_user(msize, &cm->u.user.match_size) || + copy_to_user(cm->u.user.name, m->u.kernel.match->name, + strlen(m->u.kernel.match->name) + 1)) return -EFAULT; if (match->compat_to_user) { @@ -394,37 +560,58 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr, EXPORT_SYMBOL_GPL(xt_compat_match_to_user); #endif /* CONFIG_COMPAT */ -int xt_check_target(const struct xt_target *target, unsigned short family, - unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto) +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u_int8_t proto, bool inv_proto) { - if 
(XT_ALIGN(target->targetsize) != size) { - printk("%s_tables: %s target: invalid size %Zu != %u\n", - xt_prefix[family], target->name, - XT_ALIGN(target->targetsize), size); + int ret; + + if (XT_ALIGN(par->target->targetsize) != size) { + pr_err("%s_tables: %s.%u target: invalid size " + "%u (kernel) != (user) %u\n", + xt_prefix[par->family], par->target->name, + par->target->revision, + XT_ALIGN(par->target->targetsize), size); return -EINVAL; } - if (target->table && strcmp(target->table, table)) { - printk("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[family], target->name, target->table, table); + if (par->target->table != NULL && + strcmp(par->target->table, par->table) != 0) { + pr_err("%s_tables: %s target: only valid in %s table, not %s\n", + xt_prefix[par->family], par->target->name, + par->target->table, par->table); return -EINVAL; } - if (target->hooks && (hook_mask & ~target->hooks) != 0) { - printk("%s_tables: %s target: bad hook_mask %u\n", - xt_prefix[family], target->name, hook_mask); + if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { + char used[64], allow[64]; + + pr_err("%s_tables: %s target: used from hooks %s, but only " + "usable from %s\n", + xt_prefix[par->family], par->target->name, + textify_hooks(used, sizeof(used), par->hook_mask, + par->family), + textify_hooks(allow, sizeof(allow), par->target->hooks, + par->family)); return -EINVAL; } - if (target->proto && (target->proto != proto || inv_proto)) { - printk("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[family], target->name, target->proto); + if (par->target->proto && (par->target->proto != proto || inv_proto)) { + pr_err("%s_tables: %s target: only valid for protocol %u\n", + xt_prefix[par->family], par->target->name, + par->target->proto); return -EINVAL; } + if (par->target->checkentry != NULL) { + ret = par->target->checkentry(par); + if (ret < 0) + return ret; + else if (ret > 0) + /* Flag up potential errors. */ + return -EIO; + } return 0; } EXPORT_SYMBOL_GPL(xt_check_target); #ifdef CONFIG_COMPAT -int xt_compat_target_offset(struct xt_target *target) +int xt_compat_target_offset(const struct xt_target *target) { u_int16_t csize = target->compatsize ? 
: target->targetsize; return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize); @@ -432,9 +619,9 @@ int xt_compat_target_offset(struct xt_target *target) EXPORT_SYMBOL_GPL(xt_compat_target_offset); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, - int *size) + unsigned int *size) { - struct xt_target *target = t->u.kernel.target; + const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; int pad, off = xt_compat_target_offset(target); u_int16_t tsize = ct->u.user.target_size; @@ -457,16 +644,18 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, } EXPORT_SYMBOL_GPL(xt_compat_target_from_user); -int xt_compat_target_to_user(struct xt_entry_target *t, void __user **dstptr, - int *size) +int xt_compat_target_to_user(const struct xt_entry_target *t, + void __user **dstptr, unsigned int *size) { - struct xt_target *target = t->u.kernel.target; + const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target __user *ct = *dstptr; int off = xt_compat_target_offset(target); u_int16_t tsize = t->u.user.target_size - off; if (copy_to_user(ct, t, sizeof(*ct)) || - put_user(tsize, &ct->u.user.target_size)) + put_user(tsize, &ct->u.user.target_size) || + copy_to_user(ct->u.user.name, t->u.kernel.target->name, + strlen(t->u.kernel.target->name) + 1)) return -EFAULT; if (target->compat_to_user) { @@ -490,10 +679,10 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) int cpu; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); + newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); if (!newinfo) return NULL; @@ -528,19 +717,38 @@ void xt_free_table_info(struct xt_table_info *info) else vfree(info->entries[cpu]); } + + if (info->jumpstack != NULL) { + if (sizeof(void *) * info->stacksize > PAGE_SIZE) { + for_each_possible_cpu(cpu) + vfree(info->jumpstack[cpu]); + } else { + for_each_possible_cpu(cpu) + kfree(info->jumpstack[cpu]); + } + } + + if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE) + vfree(info->jumpstack); + else + kfree(info->jumpstack); + + free_percpu(info->stackptr); + kfree(info); } EXPORT_SYMBOL(xt_free_table_info); /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. 
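 *
 * (Sketch of the compat_tab machinery above, assuming three rule entries
 * whose kernel layouts are 8, 8 and 16 bytes larger than their 32-bit
 * user layouts; note xt_compat_add_offset() stores *cumulative* deltas:
 *
 *	xt_compat_lock(NFPROTO_IPV4);
 *	xt_compat_init_offsets(NFPROTO_IPV4, 3);
 *	xt_compat_add_offset(NFPROTO_IPV4, 0x000, 8);	// stored: 8
 *	xt_compat_add_offset(NFPROTO_IPV4, 0x100, 8);	// stored: 16
 *	xt_compat_add_offset(NFPROTO_IPV4, 0x200, 16);	// stored: 32
 *	// a rule jumping to kernel offset 0x180 falls between the second
 *	// and third entries, so the binary search yields delta 16:
 *	int delta = xt_compat_calc_jump(NFPROTO_IPV4, 0x180);
 *	xt_compat_flush_offsets(NFPROTO_IPV4);
 *	xt_compat_unlock(NFPROTO_IPV4);
 *
 * All of this runs under compat_mutex via xt_compat_lock() below.)
 *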
*/ -struct xt_table *xt_find_table_lock(int af, const char *name) +struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, + const char *name) { struct xt_table *t; if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); - list_for_each_entry(t, &xt[af].tables, list) + list_for_each_entry(t, &net->xt.tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; mutex_unlock(&xt[af].mutex); @@ -555,61 +763,145 @@ void xt_table_unlock(struct xt_table *table) EXPORT_SYMBOL_GPL(xt_table_unlock); #ifdef CONFIG_COMPAT -void xt_compat_lock(int af) +void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_lock); -void xt_compat_unlock(int af) +void xt_compat_unlock(u_int8_t af) { mutex_unlock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); + +static int xt_jumpstack_alloc(struct xt_table_info *i) +{ + unsigned int size; + int cpu; + + i->stackptr = alloc_percpu(unsigned int); + if (i->stackptr == NULL) + return -ENOMEM; + + size = sizeof(void **) * nr_cpu_ids; + if (size > PAGE_SIZE) + i->jumpstack = vzalloc(size); + else + i->jumpstack = kzalloc(size, GFP_KERNEL); + if (i->jumpstack == NULL) + return -ENOMEM; + + i->stacksize *= xt_jumpstack_multiplier; + size = sizeof(void *) * i->stacksize; + for_each_possible_cpu(cpu) { + if (size > PAGE_SIZE) + i->jumpstack[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + else + i->jumpstack[cpu] = kmalloc_node(size, + GFP_KERNEL, cpu_to_node(cpu)); + if (i->jumpstack[cpu] == NULL) + /* + * Freeing will be done later on by the callers. The + * chain is: xt_replace_table -> __do_replace -> + * do_replace -> xt_free_table_info. + */ + return -ENOMEM; + } + + return 0; +} + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { - struct xt_table_info *oldinfo, *private; + struct xt_table_info *private; + int ret; + + ret = xt_jumpstack_alloc(newinfo); + if (ret < 0) { + *error = ret; + return NULL; + } /* Do the substitution. */ - write_lock_bh(&table->lock); + local_bh_disable(); private = table->private; + /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { - duprintf("num_counters != table->private->number (%u/%u)\n", + pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - write_unlock_bh(&table->lock); + local_bh_enable(); *error = -EAGAIN; return NULL; } - oldinfo = private; + + newinfo->initial_entries = private->initial_entries; + /* + * Ensure contents of newinfo are visible before assigning to + * private. + */ + smp_wmb(); table->private = newinfo; - newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); - return oldinfo; + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. 
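 *
 * (Reader-side sketch of that scheme, as the table traversers use it;
 * the xt_write_recseq_begin/end helpers around xt_recseq are assumed
 * from x_tables.h and are not part of this hunk:
 *
 *	unsigned int addend;
 *
 *	local_bh_disable();
 *	addend = xt_write_recseq_begin();
 *	private = table->private;	// old or new info, both valid
 *	// ... traverse private->entries[smp_processor_id()] ...
 *	xt_write_recseq_end(addend);
 *	local_bh_enable();
 *
 * get_counters() then waits until every CPU's sequence shows no
 * traversal still in flight before the old table_info is reused.)
 *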
+ */ + local_bh_enable(); + +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + table->name, table->af, + private->number); + audit_log_end(ab); + } + } +#endif + + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); -int xt_register_table(struct xt_table *table, - struct xt_table_info *bootstrap, - struct xt_table_info *newinfo) +struct xt_table *xt_register_table(struct net *net, + const struct xt_table *input_table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo) { int ret; struct xt_table_info *private; - struct xt_table *t; + struct xt_table *t, *table; + + /* Don't add one object to multiple lists. */ + table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); + if (!table) { + ret = -ENOMEM; + goto out; + } ret = mutex_lock_interruptible(&xt[table->af].mutex); if (ret != 0) - return ret; + goto out_free; /* Don't autoload: we'd eat our tail... */ - list_for_each_entry(t, &xt[table->af].tables, list) { + list_for_each_entry(t, &net->xt.tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto unlock; @@ -618,22 +910,26 @@ int xt_register_table(struct xt_table *table, /* Simplifies replace_table code. */ table->private = bootstrap; - rwlock_init(&table->lock); + if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; - duprintf("table->private->number = %u\n", private->number); + pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ private->initial_entries = private->number; - list_add(&table->list, &xt[table->af].tables); + list_add(&table->list, &net->xt.tables[table->af]); + mutex_unlock(&xt[table->af].mutex); + return table; - ret = 0; unlock: mutex_unlock(&xt[table->af].mutex); - return ret; +out_free: + kfree(table); +out: + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); @@ -645,133 +941,292 @@ void *xt_unregister_table(struct xt_table *table) private = table->private; list_del(&table->list); mutex_unlock(&xt[table->af].mutex); + kfree(table); return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS -static struct list_head *xt_get_idx(struct list_head *list, struct seq_file *seq, loff_t pos) +struct xt_names_priv { + struct seq_net_private p; + u_int8_t af; +}; +static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { - struct list_head *head = list->next; + struct xt_names_priv *priv = seq->private; + struct net *net = seq_file_net(seq); + u_int8_t af = priv->af; - if (!head || list_empty(list)) - return NULL; + mutex_lock(&xt[af].mutex); + return seq_list_start(&net->xt.tables[af], *pos); +} - while (pos && (head = head->next)) { - if (head == list) - return NULL; - pos--; - } - return pos ? 
NULL : head; +static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct xt_names_priv *priv = seq->private; + struct net *net = seq_file_net(seq); + u_int8_t af = priv->af; + + return seq_list_next(v, &net->xt.tables[af], pos); } -static struct list_head *type2list(u_int16_t af, u_int16_t type) +static void xt_table_seq_stop(struct seq_file *seq, void *v) { - struct list_head *list; + struct xt_names_priv *priv = seq->private; + u_int8_t af = priv->af; - switch (type) { - case TARGET: - list = &xt[af].target; - break; - case MATCH: - list = &xt[af].match; - break; - case TABLE: - list = &xt[af].tables; + mutex_unlock(&xt[af].mutex); +} + +static int xt_table_seq_show(struct seq_file *seq, void *v) +{ + struct xt_table *table = list_entry(v, struct xt_table, list); + + if (strlen(table->name)) + return seq_printf(seq, "%s\n", table->name); + else + return 0; +} + +static const struct seq_operations xt_table_seq_ops = { + .start = xt_table_seq_start, + .next = xt_table_seq_next, + .stop = xt_table_seq_stop, + .show = xt_table_seq_show, +}; + +static int xt_table_open(struct inode *inode, struct file *file) +{ + int ret; + struct xt_names_priv *priv; + + ret = seq_open_net(inode, file, &xt_table_seq_ops, + sizeof(struct xt_names_priv)); + if (!ret) { + priv = ((struct seq_file *)file->private_data)->private; + priv->af = (unsigned long)PDE_DATA(inode); + } + return ret; +} + +static const struct file_operations xt_table_ops = { + .owner = THIS_MODULE, + .open = xt_table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +/* + * Traverse state for ip{,6}_{tables,matches} for helping crossing + * the multi-AF mutexes. + */ +struct nf_mttg_trav { + struct list_head *head, *curr; + uint8_t class, nfproto; +}; + +enum { + MTTG_TRAV_INIT, + MTTG_TRAV_NFP_UNSPEC, + MTTG_TRAV_NFP_SPEC, + MTTG_TRAV_DONE, +}; + +static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, + bool is_target) +{ + static const uint8_t next_class[] = { + [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, + [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, + }; + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_INIT: + trav->class = MTTG_TRAV_NFP_UNSPEC; + mutex_lock(&xt[NFPROTO_UNSPEC].mutex); + trav->head = trav->curr = is_target ? + &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match; + break; + case MTTG_TRAV_NFP_UNSPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + mutex_lock(&xt[trav->nfproto].mutex); + trav->head = trav->curr = is_target ? 
+ &xt[trav->nfproto].target : &xt[trav->nfproto].match; + trav->class = next_class[trav->class]; break; + case MTTG_TRAV_NFP_SPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + /* fallthru, _stop will unlock */ default: - list = NULL; - break; + return NULL; } - return list; + if (ppos != NULL) + ++*ppos; + return trav; } -static void *xt_tgt_seq_start(struct seq_file *seq, loff_t *pos) +static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, + bool is_target) { - struct proc_dir_entry *pde = (struct proc_dir_entry *) seq->private; - u_int16_t af = (unsigned long)pde->data & 0xffff; - u_int16_t type = (unsigned long)pde->data >> 16; - struct list_head *list; + struct nf_mttg_trav *trav = seq->private; + unsigned int j; - if (af >= NPROTO) - return NULL; + trav->class = MTTG_TRAV_INIT; + for (j = 0; j < *pos; ++j) + if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL) + return NULL; + return trav; +} - list = type2list(af, type); - if (!list) - return NULL; +static void xt_mttg_seq_stop(struct seq_file *seq, void *v) +{ + struct nf_mttg_trav *trav = seq->private; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return NULL; + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + break; + case MTTG_TRAV_NFP_SPEC: + mutex_unlock(&xt[trav->nfproto].mutex); + break; + } +} - return xt_get_idx(list, seq, *pos); +static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, false); } -static void *xt_tgt_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct proc_dir_entry *pde = seq->private; - u_int16_t af = (unsigned long)pde->data & 0xffff; - u_int16_t type = (unsigned long)pde->data >> 16; - struct list_head *list; + return xt_mttg_seq_next(seq, v, ppos, false); +} - if (af >= NPROTO) - return NULL; +static int xt_match_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_match *match; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + match = list_entry(trav->curr, struct xt_match, list); + return (*match->name == '\0') ? 
0 : + seq_printf(seq, "%s\n", match->name); + } + return 0; +} - list = type2list(af, type); - if (!list) - return NULL; +static const struct seq_operations xt_match_seq_ops = { + .start = xt_match_seq_start, + .next = xt_match_seq_next, + .stop = xt_mttg_seq_stop, + .show = xt_match_seq_show, +}; + +static int xt_match_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct nf_mttg_trav *trav; + int ret; + + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; - (*pos)++; - return xt_get_idx(list, seq, *pos); + ret = seq_open(file, &xt_match_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; + } + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE_DATA(inode); + return 0; } -static void xt_tgt_seq_stop(struct seq_file *seq, void *v) -{ - struct proc_dir_entry *pde = seq->private; - u_int16_t af = (unsigned long)pde->data & 0xffff; +static const struct file_operations xt_match_ops = { + .owner = THIS_MODULE, + .open = xt_match_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; - mutex_unlock(&xt[af].mutex); +static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, true); } -static int xt_name_seq_show(struct seq_file *seq, void *v) +static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { - char *name = (char *)v + sizeof(struct list_head); + return xt_mttg_seq_next(seq, v, ppos, true); +} - if (strlen(name)) - return seq_printf(seq, "%s\n", name); - else - return 0; +static int xt_target_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_target *target; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + target = list_entry(trav->curr, struct xt_target, list); + return (*target->name == '\0') ? 
0 : + seq_printf(seq, "%s\n", target->name); + } + return 0; } -static struct seq_operations xt_tgt_seq_ops = { - .start = xt_tgt_seq_start, - .next = xt_tgt_seq_next, - .stop = xt_tgt_seq_stop, - .show = xt_name_seq_show, +static const struct seq_operations xt_target_seq_ops = { + .start = xt_target_seq_start, + .next = xt_target_seq_next, + .stop = xt_mttg_seq_stop, + .show = xt_target_seq_show, }; -static int xt_tgt_open(struct inode *inode, struct file *file) +static int xt_target_open(struct inode *inode, struct file *file) { + struct seq_file *seq; + struct nf_mttg_trav *trav; int ret; - ret = seq_open(file, &xt_tgt_seq_ops); - if (!ret) { - struct seq_file *seq = file->private_data; - struct proc_dir_entry *pde = PDE(inode); + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; - seq->private = pde; + ret = seq_open(file, &xt_target_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; } - return ret; + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE_DATA(inode); + return 0; } -static const struct file_operations xt_file_ops = { +static const struct file_operations xt_target_ops = { .owner = THIS_MODULE, - .open = xt_tgt_open, + .open = xt_target_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #define FORMAT_TABLES "_tables_names" @@ -780,39 +1235,92 @@ static const struct file_operations xt_file_ops = { #endif /* CONFIG_PROC_FS */ -int xt_proto_init(int af) +/** + * xt_hook_link - set up hooks for a new table + * @table: table with metadata needed to set up hooks + * @fn: Hook function + * + * This function will take care of creating and registering the necessary + * Netfilter hooks for XT tables. + */ +struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) +{ + unsigned int hook_mask = table->valid_hooks; + uint8_t i, num_hooks = hweight32(hook_mask); + uint8_t hooknum; + struct nf_hook_ops *ops; + int ret; + + ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0; + hook_mask >>= 1, ++hooknum) { + if (!(hook_mask & 1)) + continue; + ops[i].hook = fn; + ops[i].owner = table->me; + ops[i].pf = table->af; + ops[i].hooknum = hooknum; + ops[i].priority = table->priority; + ++i; + } + + ret = nf_register_hooks(ops, num_hooks); + if (ret < 0) { + kfree(ops); + return ERR_PTR(ret); + } + + return ops; +} +EXPORT_SYMBOL_GPL(xt_hook_link); + +/** + * xt_hook_unlink - remove hooks for a table + * @ops: nf_hook_ops array as returned by nf_hook_link + * @hook_mask: the very same mask that was passed to nf_hook_link + */ +void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops) +{ + nf_unregister_hooks(ops, hweight32(table->valid_hooks)); + kfree(ops); +} +EXPORT_SYMBOL_GPL(xt_hook_unlink); + +int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; struct proc_dir_entry *proc; #endif - if (af >= NPROTO) + if (af >= ARRAY_SIZE(xt_prefix)) return -EINVAL; #ifdef CONFIG_PROC_FS strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc = proc_net_fops_create(buf, 0440, &xt_file_ops); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops, + (void *)(unsigned long)af); if (!proc) goto out; - proc->data = (void *) ((unsigned long) af | (TABLE << 16)); - strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); 
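
/*
 * (Naming sketch for the per-netns proc entries built in this function:
 * for af == NFPROTO_IPV6 the three names expand to "ip6_tables_names",
 * "ip6_tables_matches" and "ip6_tables_targets", i.e.
 *
 *	strlcpy(buf, xt_prefix[af], sizeof(buf));	// "ip6"
 *	strlcat(buf, FORMAT_TABLES, sizeof(buf));	// + "_tables_names"
 *
 * and the family is handed to the seq handlers through the proc entry's
 * data pointer, recovered in the open functions via PDE_DATA(inode).)
 */
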
- proc = proc_net_fops_create(buf, 0440, &xt_file_ops); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops, + (void *)(unsigned long)af); if (!proc) goto out_remove_tables; - proc->data = (void *) ((unsigned long) af | (MATCH << 16)); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); - proc = proc_net_fops_create(buf, 0440, &xt_file_ops); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops, + (void *)(unsigned long)af); if (!proc) goto out_remove_matches; - proc->data = (void *) ((unsigned long) af | (TARGET << 16)); #endif return 0; @@ -821,61 +1329,82 @@ int xt_proto_init(int af) out_remove_matches: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); - proc_net_remove(buf); + remove_proc_entry(buf, net->proc_net); out_remove_tables: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc_net_remove(buf); + remove_proc_entry(buf, net->proc_net); out: return -1; #endif } EXPORT_SYMBOL_GPL(xt_proto_init); -void xt_proto_fini(int af) +void xt_proto_fini(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc_net_remove(buf); + remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); - proc_net_remove(buf); + remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); - proc_net_remove(buf); + remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ } EXPORT_SYMBOL_GPL(xt_proto_fini); +static int __net_init xt_net_init(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&net->xt.tables[i]); + return 0; +} + +static struct pernet_operations xt_net_ops = { + .init = xt_net_init, +}; static int __init xt_init(void) { - int i; + unsigned int i; + int rv; - xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL); + for_each_possible_cpu(i) { + seqcount_init(&per_cpu(xt_recseq, i)); + } + + xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) return -ENOMEM; - for (i = 0; i < NPROTO; i++) { + for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); #ifdef CONFIG_COMPAT mutex_init(&xt[i].compat_mutex); + xt[i].compat_tab = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); - INIT_LIST_HEAD(&xt[i].tables); } - return 0; + rv = register_pernet_subsys(&xt_net_ops); + if (rv < 0) + kfree(xt); + return rv; } static void __exit xt_fini(void) { + unregister_pernet_subsys(&xt_net_ops); kfree(xt); } diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c new file mode 100644 index 00000000000..4973cbddc44 --- /dev/null +++ b/net/netfilter/xt_AUDIT.c @@ -0,0 +1,231 @@ +/* + * Creates audit record for dropped/accepted packets + * + * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
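 *
 * A sample NETFILTER_PKT record as emitted by audit_tg() below (field
 * names follow the audit_log_format() calls; the concrete values are
 * illustrative only -- here an ICMP echo request, proto=1, icmptype=8):
 *
 *	action=0 hook=1 len=84 inif=eth0 outif=?
 *	saddr=192.168.0.1 daddr=192.168.0.2 ipid=42 proto=1
 *	icmptype=8 icmpcode=0
 *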
+*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/audit.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_AUDIT.h> +#include <linux/netfilter_bridge/ebtables.h> +#include <net/ipv6.h> +#include <net/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>"); +MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); +MODULE_ALIAS("ipt_AUDIT"); +MODULE_ALIAS("ip6t_AUDIT"); +MODULE_ALIAS("ebt_AUDIT"); +MODULE_ALIAS("arpt_AUDIT"); + +static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, + unsigned int proto, unsigned int offset) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + const __be16 *pptr; + __be16 _ports[2]; + + pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); + if (pptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " sport=%hu dport=%hu", + ntohs(pptr[0]), ntohs(pptr[1])); + } + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: { + const u8 *iptr; + u8 _ih[2]; + + iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); + if (iptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", + iptr[0], iptr[1]); + + } + break; + } +} + +static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu", + &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + + if (ntohs(ih->frag_off) & IP_OFFSET) { + audit_log_format(ab, " frag=1"); + return; + } + + audit_proto(ab, skb, ih->protocol, ih->ihl * 4); +} + +static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + __be16 frag_off; + int offset; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + nexthdr = ih->nexthdr; + offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), + &nexthdr, &frag_off); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + if (offset) + audit_proto(ab, skb, nexthdr, offset); +} + +static unsigned int +audit_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + struct audit_buffer *ab; + + if (audit_enabled == 0) + goto errout; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (ab == NULL) + goto errout; + + audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", + info->type, par->hooknum, skb->len, + par->in ? par->in->name : "?", + par->out ? 
par->out->name : "?"); + + if (skb->mark) + audit_log_format(ab, " mark=%#x", skb->mark); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + + if (par->family == NFPROTO_BRIDGE) { + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + audit_ip4(ab, skb); + break; + + case htons(ETH_P_IPV6): + audit_ip6(ab, skb); + break; + } + } + } + + switch (par->family) { + case NFPROTO_IPV4: + audit_ip4(ab, skb); + break; + + case NFPROTO_IPV6: + audit_ip6(ab, skb); + break; + } + +#ifdef CONFIG_NETWORK_SECMARK + if (skb->secmark) + audit_log_secctx(ab, skb->secmark); +#endif + + audit_log_end(ab); + +errout: + return XT_CONTINUE; +} + +static unsigned int +audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par) +{ + audit_tg(skb, par); + return EBT_CONTINUE; +} + +static int audit_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + + if (info->type > XT_AUDIT_TYPE_MAX) { + pr_info("Audit type out of range (valid range: 0..%hhu)\n", + XT_AUDIT_TYPE_MAX); + return -ERANGE; + } + + return 0; +} + +static struct xt_target audit_tg_reg[] __read_mostly = { + { + .name = "AUDIT", + .family = NFPROTO_UNSPEC, + .target = audit_tg, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, + }, + { + .name = "AUDIT", + .family = NFPROTO_BRIDGE, + .target = audit_tg_ebt, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, + }, +}; + +static int __init audit_tg_init(void) +{ + return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +static void __exit audit_tg_exit(void) +{ + xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +module_init(audit_tg_init); +module_exit(audit_tg_exit); diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c new file mode 100644 index 00000000000..0f642ef8cd2 --- /dev/null +++ b/net/netfilter/xt_CHECKSUM.c @@ -0,0 +1,70 @@ +/* iptables module for the packet checksum mangling + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * (C) 2010 Red Hat, Inc. + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CHECKSUM.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michael S. 
Tsirkin <mst@redhat.com>"); +MODULE_DESCRIPTION("Xtables: checksum modification"); +MODULE_ALIAS("ipt_CHECKSUM"); +MODULE_ALIAS("ip6t_CHECKSUM"); + +static unsigned int +checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb_checksum_help(skb); + + return XT_CONTINUE; +} + +static int checksum_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_CHECKSUM_info *einfo = par->targinfo; + + if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { + pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); + return -EINVAL; + } + if (!einfo->operation) { + pr_info("no CHECKSUM operation enabled\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target checksum_tg_reg __read_mostly = { + .name = "CHECKSUM", + .family = NFPROTO_UNSPEC, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, +}; + +static int __init checksum_tg_init(void) +{ + return xt_register_target(&checksum_tg_reg); +} + +static void __exit checksum_tg_exit(void) +{ + xt_unregister_target(&checksum_tg_reg); +} + +module_init(checksum_tg_init); +module_exit(checksum_tg_exit); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 30884833e66..af9c4dadf81 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -19,62 +19,55 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CLASSIFY.h> +#include <linux/netfilter_arp.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("iptables qdisc classification target module"); +MODULE_DESCRIPTION("Xtables: Qdisc classification"); MODULE_ALIAS("ipt_CLASSIFY"); +MODULE_ALIAS("ip6t_CLASSIFY"); +MODULE_ALIAS("arpt_CLASSIFY"); static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +classify_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_classify_target_info *clinfo = targinfo; + const struct xt_classify_target_info *clinfo = par->targinfo; - (*pskb)->priority = clinfo->priority; + skb->priority = clinfo->priority; return XT_CONTINUE; } -static struct xt_target xt_classify_target[] = { +static struct xt_target classify_tg_reg[] __read_mostly = { { - .family = AF_INET, - .name = "CLASSIFY", - .target = target, - .targetsize = sizeof(struct xt_classify_target_info), - .table = "mangle", - .hooks = (1 << NF_IP_LOCAL_OUT) | - (1 << NF_IP_FORWARD) | - (1 << NF_IP_POST_ROUTING), - .me = THIS_MODULE, + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_UNSPEC, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, }, { - .name = "CLASSIFY", - .family = AF_INET6, - .target = target, - .targetsize = sizeof(struct xt_classify_target_info), - .table = "mangle", - .hooks = (1 << NF_IP6_LOCAL_OUT) | - (1 << NF_IP6_FORWARD) | - (1 << NF_IP6_POST_ROUTING), - .me = THIS_MODULE, + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_ARP, + .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, }, }; -static int __init xt_classify_init(void) +static int __init classify_tg_init(void) 
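/*
 * (Usage sketch for the reworked CLASSIFY target above: the
 * NFPROTO_UNSPEC entry now serves iptables and ip6tables, with the
 * separate NFPROTO_ARP entry for arptables. A typical rule, e.g.
 *
 *	iptables -t mangle -A POSTROUTING -o eth0 -p tcp --dport 80 \
 *		-j CLASSIFY --set-class 1:10
 *
 * stores the major:minor qdisc handle in skb->priority for the egress
 * scheduler to consume.)
 */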
{ - return xt_register_targets(xt_classify_target, - ARRAY_SIZE(xt_classify_target)); + return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } -static void __exit xt_classify_fini(void) +static void __exit classify_tg_exit(void) { - xt_unregister_targets(xt_classify_target, - ARRAY_SIZE(xt_classify_target)); + xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } -module_init(xt_classify_init); -module_exit(xt_classify_fini); +module_init(classify_tg_init); +module_exit(classify_tg_exit); diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c deleted file mode 100644 index b03ce009d0b..00000000000 --- a/net/netfilter/xt_CONNMARK.c +++ /dev/null @@ -1,184 +0,0 @@ -/* This kernel module is used to modify the connection mark values, or - * to optionally restore the skb nfmark from the connection mark - * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> - * by Henrik Nordstrom <hno@marasystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/checksum.h> - -MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); -MODULE_DESCRIPTION("IP tables CONNMARK matching module"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ipt_CONNMARK"); - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_CONNMARK.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct xt_connmark_target_info *markinfo = targinfo; - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - u_int32_t diff; - u_int32_t mark; - u_int32_t newmark; - - ct = nf_ct_get(*pskb, &ctinfo); - if (ct) { - switch(markinfo->mode) { - case XT_CONNMARK_SET: - newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; - if (newmark != ct->mark) { - ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, *pskb); - } - break; - case XT_CONNMARK_SAVE: - newmark = (ct->mark & ~markinfo->mask) | - ((*pskb)->mark & markinfo->mask); - if (ct->mark != newmark) { - ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, *pskb); - } - break; - case XT_CONNMARK_RESTORE: - mark = (*pskb)->mark; - diff = (ct->mark ^ mark) & markinfo->mask; - (*pskb)->mark = mark ^ diff; - break; - } - } - - return XT_CONTINUE; -} - -static int -checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct xt_connmark_target_info *matchinfo = targinfo; - - if (nf_ct_l3proto_try_module_get(target->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", target->family); - return 0; - } - if (matchinfo->mode == 
XT_CONNMARK_RESTORE) { - if (strcmp(tablename, "mangle") != 0) { - printk(KERN_WARNING "CONNMARK: restore can only be " - "called from \"mangle\" table, not \"%s\"\n", - tablename); - return 0; - } - } - if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { - printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); - return 0; - } - return 1; -} - -static void -destroy(const struct xt_target *target, void *targinfo) -{ - nf_ct_l3proto_module_put(target->family); -} - -#ifdef CONFIG_COMPAT -struct compat_xt_connmark_target_info { - compat_ulong_t mark, mask; - u_int8_t mode; - u_int8_t __pad1; - u_int16_t __pad2; -}; - -static void compat_from_user(void *dst, void *src) -{ - struct compat_xt_connmark_target_info *cm = src; - struct xt_connmark_target_info m = { - .mark = cm->mark, - .mask = cm->mask, - .mode = cm->mode, - }; - memcpy(dst, &m, sizeof(m)); -} - -static int compat_to_user(void __user *dst, void *src) -{ - struct xt_connmark_target_info *m = src; - struct compat_xt_connmark_target_info cm = { - .mark = m->mark, - .mask = m->mask, - .mode = m->mode, - }; - return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target xt_connmark_target[] = { - { - .name = "CONNMARK", - .family = AF_INET, - .checkentry = checkentry, - .destroy = destroy, - .target = target, - .targetsize = sizeof(struct xt_connmark_target_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_connmark_target_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, -#endif - .me = THIS_MODULE - }, - { - .name = "CONNMARK", - .family = AF_INET6, - .checkentry = checkentry, - .destroy = destroy, - .target = target, - .targetsize = sizeof(struct xt_connmark_target_info), - .me = THIS_MODULE - }, -}; - -static int __init xt_connmark_init(void) -{ - return xt_register_targets(xt_connmark_target, - ARRAY_SIZE(xt_connmark_target)); -} - -static void __exit xt_connmark_fini(void) -{ - xt_unregister_targets(xt_connmark_target, - ARRAY_SIZE(xt_connmark_target)); -} - -module_init(xt_connmark_init); -module_exit(xt_connmark_fini); diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 81c0c58bab4..e04dc282e3b 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -8,24 +8,24 @@ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * - * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com> + * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CONNSECMARK.h> #include <net/netfilter/nf_conntrack.h> - -#define PFX "CONNSECMARK: " +#include <net/netfilter/nf_conntrack_ecache.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); -MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module"); +MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark"); MODULE_ALIAS("ipt_CONNSECMARK"); MODULE_ALIAS("ip6t_CONNSECMARK"); @@ -33,15 +33,17 @@ MODULE_ALIAS("ip6t_CONNSECMARK"); * If the packet has a security mark and the connection does not, copy * the security mark from the packet to the connection. 
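 *
 * (Typical ruleset sketch, assuming a SECMARK rule sets the per-packet
 * mark first; CONNSECMARK then persists it on the conntrack entry so
 * replies can be re-marked on the way back -- the context string is
 * illustrative only:
 *
 *	iptables -t security -A INPUT -m state --state NEW \
 *		-j SECMARK --selctx system_u:object_r:httpd_packet_t:s0
 *	iptables -t security -A INPUT -j CONNSECMARK --save
 *	iptables -t security -A OUTPUT -j CONNSECMARK --restore
 *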
*/ -static void secmark_save(struct sk_buff *skb) +static void secmark_save(const struct sk_buff *skb) { if (skb->secmark) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); - if (ct && !ct->secmark) + if (ct && !ct->secmark) { ct->secmark = skb->secmark; + nf_conntrack_event_cache(IPCT_SECMARK, ct); + } } } @@ -52,7 +54,7 @@ static void secmark_save(struct sk_buff *skb) static void secmark_restore(struct sk_buff *skb) { if (!skb->secmark) { - struct nf_conn *ct; + const struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); @@ -61,13 +63,10 @@ static void secmark_restore(struct sk_buff *skb) } } -static unsigned int target(struct sk_buff **pskb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +connsecmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct sk_buff *skb = *pskb; - const struct xt_connsecmark_target_info *info = targinfo; + const struct xt_connsecmark_target_info *info = par->targinfo; switch (info->mode) { case CONNSECMARK_SAVE: @@ -85,70 +84,60 @@ static unsigned int target(struct sk_buff **pskb, const struct net_device *in, return XT_CONTINUE; } -static int checkentry(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static int connsecmark_tg_check(const struct xt_tgchk_param *par) { - struct xt_connsecmark_target_info *info = targinfo; - - if (nf_ct_l3proto_try_module_get(target->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", target->family); - return 0; + const struct xt_connsecmark_target_info *info = par->targinfo; + int ret; + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { + pr_info("target only valid in the \'mangle\' " + "or \'security\' tables, not \'%s\'.\n", par->table); + return -EINVAL; } + switch (info->mode) { case CONNSECMARK_SAVE: case CONNSECMARK_RESTORE: break; default: - printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); - return 0; + pr_info("invalid mode: %hu\n", info->mode); + return -EINVAL; } - return 1; + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; } -static void -destroy(const struct xt_target *target, void *targinfo) +static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_target xt_connsecmark_target[] = { - { - .name = "CONNSECMARK", - .family = AF_INET, - .checkentry = checkentry, - .destroy = destroy, - .target = target, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "CONNSECMARK", - .family = AF_INET6, - .checkentry = checkentry, - .destroy = destroy, - .target = target, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, +static struct xt_target connsecmark_tg_reg __read_mostly = { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, }; -static int __init xt_connsecmark_init(void) +static int __init connsecmark_tg_init(void) { - return 
xt_register_targets(xt_connsecmark_target, - ARRAY_SIZE(xt_connsecmark_target));
+ return xt_register_target(&connsecmark_tg_reg);
}
-static void __exit xt_connsecmark_fini(void)
+static void __exit connsecmark_tg_exit(void)
{
- xt_unregister_targets(xt_connsecmark_target,
- ARRAY_SIZE(xt_connsecmark_target));
+ xt_unregister_target(&connsecmark_tg_reg);
}
-module_init(xt_connsecmark_init);
-module_exit(xt_connsecmark_fini);
+module_init(connsecmark_tg_init);
+module_exit(connsecmark_tg_exit);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c new file mode 100644 index 00000000000..75747aecdeb --- /dev/null +++ b/net/netfilter/xt_CT.c @@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CT.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->nfct != NULL)
+ return XT_CONTINUE;
+
+ /* special case the untracked ct: we want the percpu object */
+ if (!ct)
+ ct = nf_ct_untracked_get();
+ atomic_inc(&ct->ct_general.use);
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+
+ return XT_CONTINUE;
+}
+
+static unsigned int xt_ct_target_v0(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct xt_ct_target_info *info = par->targinfo;
+ struct nf_conn *ct = info->ct;
+
+ return xt_ct_target(skb, ct);
+}
+
+static unsigned int xt_ct_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct xt_ct_target_info_v1 *info = par->targinfo;
+ struct nf_conn *ct = info->ct;
+
+ return xt_ct_target(skb, ct);
+}
+
+static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)
+{
+ if (par->family == NFPROTO_IPV4) {
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (e->ip.invflags & IPT_INV_PROTO)
+ return 0;
+ return e->ip.proto;
+ } else if (par->family == NFPROTO_IPV6) {
+ const struct ip6t_entry *e = par->entryinfo;
+
+ if (e->ipv6.invflags & IP6T_INV_PROTO)
+ return 0;
+ return e->ipv6.proto;
+ } else
+ return 0;
+}
+
+static int
+xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
+ const struct xt_tgchk_param *par)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ u8 proto;
+
+ proto = xt_ct_find_proto(par);
+ if (!proto) {
+ pr_info("You must specify an L4 protocol, and not use "
+ "inversions on it.\n");
+ return -ENOENT;
+ }
+
+ helper = nf_conntrack_helper_try_module_get(helper_name, par->family,
+ proto);
+ if (helper == NULL) {
+ pr_info("No such helper \"%s\"\n", helper_name);
+ return -ENOENT;
+ }
+
+ help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+ if (help == NULL) {
+ module_put(helper->me);
+ return -ENOMEM;
+ }
+
+ help->helper = helper;
+ return 0;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static void
__xt_ct_tg_timeout_put(struct ctnl_timeout *timeout)
+{
+ typeof(nf_ct_timeout_put_hook) timeout_put;
+
+ timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ if (timeout_put)
+ timeout_put(timeout);
+}
+#endif
+
+static int
+xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
+ const char *timeout_name)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ struct ctnl_timeout *timeout;
+ struct nf_conn_timeout *timeout_ext;
+ struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
+ u8 proto;
+
+ rcu_read_lock();
+ timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
+ if (timeout_find_get == NULL) {
+ ret = -ENOENT;
+ pr_info("Timeout policy base is empty\n");
+ goto out;
+ }
+
+ proto = xt_ct_find_proto(par);
+ if (!proto) {
+ ret = -EINVAL;
+ pr_info("You must specify an L4 protocol, and not use "
+ "inversions on it.\n");
+ goto out;
+ }
+
+ timeout = timeout_find_get(timeout_name);
+ if (timeout == NULL) {
+ ret = -ENOENT;
+ pr_info("No such timeout policy \"%s\"\n", timeout_name);
+ goto out;
+ }
+
+ if (timeout->l3num != par->family) {
+ ret = -EINVAL;
+ pr_info("Timeout policy `%s' can only be used by L3 protocol "
+ "number %d\n", timeout_name, timeout->l3num);
+ goto err_put_timeout;
+ }
+ /* Make sure the timeout policy matches any existing protocol tracker,
+ * otherwise default to generic.
+ */
+ l4proto = __nf_ct_l4proto_find(par->family, proto);
+ if (timeout->l4proto->l4proto != l4proto->l4proto) {
+ ret = -EINVAL;
+ pr_info("Timeout policy `%s' can only be used by L4 protocol "
+ "number %d\n",
+ timeout_name, timeout->l4proto->l4proto);
+ goto err_put_timeout;
+ }
+ timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
+ if (timeout_ext == NULL)
+ ret = -ENOMEM;
+
+err_put_timeout:
+ __xt_ct_tg_timeout_put(timeout);
+out:
+ rcu_read_unlock();
+ return ret;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int xt_ct_tg_check(const struct xt_tgchk_param *par,
+ struct xt_ct_target_info_v1 *info)
+{
+ struct nf_conntrack_tuple t;
+ struct nf_conn *ct;
+ int ret = -EOPNOTSUPP;
+
+ if (info->flags & XT_CT_NOTRACK) {
+ ct = NULL;
+ goto out;
+ }
+
+#ifndef CONFIG_NF_CONNTRACK_ZONES
+ if (info->zone)
+ goto err1;
+#endif
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ goto err1;
+
+ memset(&t, 0, sizeof(t));
+ ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
+ ret = PTR_ERR(ct);
+ if (IS_ERR(ct))
+ goto err2;
+
+ ret = 0;
+ if ((info->ct_events || info->exp_events) &&
+ !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
+ GFP_KERNEL)) {
+ ret = -EINVAL;
+ goto err3;
+ }
+
+ if (info->helper[0]) {
+ ret = xt_ct_set_helper(ct, info->helper, par);
+ if (ret < 0)
+ goto err3;
+ }
+
+ if (info->timeout[0]) {
+ ret = xt_ct_set_timeout(ct, par, info->timeout);
+ if (ret < 0)
+ goto err3;
+ }
+
+ nf_conntrack_tmpl_insert(par->net, ct);
+out:
+ info->ct = ct;
+ return 0;
+
+err3:
+ nf_conntrack_free(ct);
+err2:
+ nf_ct_l3proto_module_put(par->family);
+err1:
+ return ret;
+}
+
+static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ struct xt_ct_target_info *info = par->targinfo;
+ struct xt_ct_target_info_v1 info_v1 = {
+ .flags = info->flags,
+ .zone = info->zone,
+ .ct_events = info->ct_events,
+ .exp_events = info->exp_events,
+ };
+ int ret;
+
+ if (info->flags & ~XT_CT_NOTRACK)
+ return -EINVAL;
+
+ memcpy(info_v1.helper, info->helper, sizeof(info->helper));
+
+ ret = xt_ct_tg_check(par, &info_v1);
+ if (ret < 0)
+ return ret;
+
info->ct = info_v1.ct; + + return ret; +} + +static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + return xt_ct_tg_check(par, par->targinfo); +} + +static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + + if (info->flags & ~XT_CT_MASK) + return -EINVAL; + + return xt_ct_tg_check(par, par->targinfo); +} + +static void xt_ct_destroy_timeout(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct nf_conn_timeout *timeout_ext; + typeof(nf_ct_timeout_put_hook) timeout_put; + + rcu_read_lock(); + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + + if (timeout_put) { + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) + timeout_put(timeout_ext->timeout); + } + rcu_read_unlock(); +#endif +} + +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, + struct xt_ct_target_info_v1 *info) +{ + struct nf_conn *ct = info->ct; + struct nf_conn_help *help; + + if (ct && !nf_ct_is_untracked(ct)) { + help = nfct_help(ct); + if (help) + module_put(help->helper->me); + + nf_ct_l3proto_module_put(par->family); + + xt_ct_destroy_timeout(ct); + nf_ct_put(info->ct); + } +} + +static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct xt_ct_target_info_v1 info_v1 = { + .flags = info->flags, + .zone = info->zone, + .ct_events = info->ct_events, + .exp_events = info->exp_events, + .ct = info->ct, + }; + memcpy(info_v1.helper, info->helper, sizeof(info->helper)); + + xt_ct_tg_destroy(par, &info_v1); +} + +static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ + xt_ct_tg_destroy(par, par->targinfo); +} + +static struct xt_target xt_ct_tg_reg[] __read_mostly = { + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .targetsize = sizeof(struct xt_ct_target_info), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +}; + +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. 
*/ + if (skb->nfct != NULL) + return XT_CONTINUE; + + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + + return XT_CONTINUE; +} + +static int notrack_chk(const struct xt_tgchk_param *par) +{ + if (!par->net->xt.notrack_deprecated_warning) { + pr_info("netfilter: NOTRACK target is deprecated, " + "use CT instead or upgrade iptables\n"); + par->net->xt.notrack_deprecated_warning = true; + } + return 0; +} + +static struct xt_target notrack_tg_reg __read_mostly = { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = notrack_chk, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, +}; + +static int __init xt_ct_tg_init(void) +{ + int ret; + + ret = xt_register_target(¬rack_tg_reg); + if (ret < 0) + return ret; + + ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); + if (ret < 0) { + xt_unregister_target(¬rack_tg_reg); + return ret; + } + return 0; +} + +static void __exit xt_ct_tg_exit(void) +{ + xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); + xt_unregister_target(¬rack_tg_reg); +} + +module_init(xt_ct_tg_init); +module_exit(xt_ct_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: connection tracking target"); +MODULE_ALIAS("ipt_CT"); +MODULE_ALIAS("ip6t_CT"); +MODULE_ALIAS("ipt_NOTRACK"); +MODULE_ALIAS("ip6t_NOTRACK"); diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 9f2f2201f6a..ae8271652ef 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -9,7 +9,7 @@ * * See RFC2474 for a description of the DSCP field within the IP Header. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> @@ -20,97 +20,145 @@ #include <linux/netfilter/xt_DSCP.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("x_tables DSCP modification module"); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_DSCP"); MODULE_ALIAS("ip6t_DSCP"); +MODULE_ALIAS("ipt_TOS"); +MODULE_ALIAS("ip6t_TOS"); -static unsigned int target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_DSCP_info *dinfo = targinfo; - u_int8_t dscp = ipv4_get_dsfield(ip_hdr(*pskb)) >> XT_DSCP_SHIFT; + const struct xt_DSCP_info *dinfo = par->targinfo; + u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(skb, sizeof(struct iphdr))) return NF_DROP; - ipv4_change_dsfield(ip_hdr(*pskb), (__u8)(~XT_DSCP_MASK), + ipv4_change_dsfield(ip_hdr(skb), (__u8)(~XT_DSCP_MASK), dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } -static unsigned int target6(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_DSCP_info *dinfo = targinfo; - u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(*pskb)) >> XT_DSCP_SHIFT; + const struct xt_DSCP_info *dinfo = par->targinfo; + u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(pskb, 
sizeof(struct ipv6hdr)))
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
 return NF_DROP;
- ipv6_change_dsfield(ipv6_hdr(*pskb), (__u8)(~XT_DSCP_MASK),
+ ipv6_change_dsfield(ipv6_hdr(skb), (__u8)(~XT_DSCP_MASK),
 dinfo->dscp << XT_DSCP_SHIFT);
 }
 return XT_CONTINUE;
}
-static int checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static int dscp_tg_check(const struct xt_tgchk_param *par)
{
- const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp;
+ const struct xt_DSCP_info *info = par->targinfo;
+
+ if (info->dscp > XT_DSCP_MAX) {
+ pr_info("dscp %x out of range\n", info->dscp);
+ return -EDOM;
+ }
+ return 0;
+}
+
+static unsigned int
+tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tos_target_info *info = par->targinfo;
+ struct iphdr *iph = ip_hdr(skb);
+ u_int8_t orig, nv;
+
+ orig = ipv4_get_dsfield(iph);
+ nv = (orig & ~info->tos_mask) ^ info->tos_value;
- if ((dscp > XT_DSCP_MAX)) {
- printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
- return 0;
+ if (orig != nv) {
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return NF_DROP;
+ iph = ip_hdr(skb);
+ ipv4_change_dsfield(iph, 0, nv);
 }
- return 1;
+
+ return XT_CONTINUE;
}
-static struct xt_target xt_dscp_target[] = {
+static unsigned int
+tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tos_target_info *info = par->targinfo;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ u_int8_t orig, nv;
+
+ orig = ipv6_get_dsfield(iph);
+ nv = (orig & ~info->tos_mask) ^ info->tos_value;
+
+ if (orig != nv) {
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ return NF_DROP;
+ iph = ipv6_hdr(skb);
+ ipv6_change_dsfield(iph, 0, nv);
 }
+
+ return XT_CONTINUE;
+}
+
+static struct xt_target dscp_tg_reg[] __read_mostly = {
 {
 .name = "DSCP",
- .family = AF_INET,
- .checkentry = checkentry,
- .target = target,
+ .family = NFPROTO_IPV4,
+ .checkentry = dscp_tg_check,
+ .target = dscp_tg,
 .targetsize = sizeof(struct xt_DSCP_info),
 .table = "mangle",
 .me = THIS_MODULE,
 },
 {
 .name = "DSCP",
- .family = AF_INET6,
- .checkentry = checkentry,
- .target = target6,
+ .family = NFPROTO_IPV6,
+ .checkentry = dscp_tg_check,
+ .target = dscp_tg6,
 .targetsize = sizeof(struct xt_DSCP_info),
 .table = "mangle",
 .me = THIS_MODULE,
 },
+ {
+ .name = "TOS",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tos_tg,
+ .targetsize = sizeof(struct xt_tos_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TOS",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .target = tos_tg6,
+ .targetsize = sizeof(struct xt_tos_target_info),
+ .me = THIS_MODULE,
+ },
};
-static int __init xt_dscp_target_init(void)
+static int __init dscp_tg_init(void)
{
- return xt_register_targets(xt_dscp_target, ARRAY_SIZE(xt_dscp_target));
+ return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg));
}
-static void __exit xt_dscp_target_fini(void)
+static void __exit dscp_tg_exit(void)
{
- xt_unregister_targets(xt_dscp_target, ARRAY_SIZE(xt_dscp_target));
+ xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg));
}
-module_init(xt_dscp_target_init);
-module_exit(xt_dscp_target_fini);
+module_init(dscp_tg_init);
+module_exit(dscp_tg_exit);
diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c new file mode 100644 index 00000000000..1535e87ed9b --- /dev/null +++ b/net/netfilter/xt_HL.c @@ -0,0 +1,169 @@
+/*
+ * TTL modification target for IP tables
+ * (C) 2000,2005
by Harald Welte <laforge@netfilter.org> + * + * Hop Limit modification target for ip6tables + * Maciej Soltysiak <solt@dns.toxicfilms.tv> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/checksum.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_TTL.h> +#include <linux/netfilter_ipv6/ip6t_HL.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target"); +MODULE_LICENSE("GPL"); + +static unsigned int +ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct iphdr *iph; + const struct ipt_TTL_info *info = par->targinfo; + int new_ttl; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + iph = ip_hdr(skb); + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + csum_replace2(&iph->check, htons(iph->ttl << 8), + htons(new_ttl << 8)); + iph->ttl = new_ttl; + } + + return XT_CONTINUE; +} + +static unsigned int +hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct ipv6hdr *ip6h; + const struct ip6t_HL_info *info = par->targinfo; + int new_hl; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_SET: + new_hl = info->hop_limit; + break; + case IP6T_HL_INC: + new_hl = ip6h->hop_limit + info->hop_limit; + if (new_hl > 255) + new_hl = 255; + break; + case IP6T_HL_DEC: + new_hl = ip6h->hop_limit - info->hop_limit; + if (new_hl < 0) + new_hl = 0; + break; + default: + new_hl = ip6h->hop_limit; + break; + } + + ip6h->hop_limit = new_hl; + + return XT_CONTINUE; +} + +static int ttl_tg_check(const struct xt_tgchk_param *par) +{ + const struct ipt_TTL_info *info = par->targinfo; + + if (info->mode > IPT_TTL_MAXMODE) { + pr_info("TTL: invalid or unknown mode %u\n", info->mode); + return -EINVAL; + } + if (info->mode != IPT_TTL_SET && info->ttl == 0) + return -EINVAL; + return 0; +} + +static int hl_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_HL_info *info = par->targinfo; + + if (info->mode > IP6T_HL_MAXMODE) { + pr_info("invalid or unknown mode %u\n", info->mode); + return -EINVAL; + } + if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { + pr_info("increment/decrement does not " + "make sense with value 0\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target hl_tg_reg[] __read_mostly = { + { + .name = "TTL", + .revision = 0, + .family = NFPROTO_IPV4, + .target = ttl_tg, + .targetsize = sizeof(struct ipt_TTL_info), + .table = "mangle", + .checkentry = ttl_tg_check, + .me = THIS_MODULE, + }, + { + .name = "HL", + .revision = 0, + .family = NFPROTO_IPV6, + .target = hl_tg6, + .targetsize = sizeof(struct ip6t_HL_info), + .table = "mangle", + .checkentry = hl_tg6_check, + .me = THIS_MODULE, + }, +}; + +static int __init hl_tg_init(void) +{ + return xt_register_targets(hl_tg_reg, 
ARRAY_SIZE(hl_tg_reg)); +} + +static void __exit hl_tg_exit(void) +{ + xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); +} + +module_init(hl_tg_init); +module_exit(hl_tg_exit); +MODULE_ALIAS("ipt_TTL"); +MODULE_ALIAS("ip6t_HL"); diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c new file mode 100644 index 00000000000..73b73f687c5 --- /dev/null +++ b/net/netfilter/xt_HMARK.c @@ -0,0 +1,372 @@ +/* + * xt_HMARK - Netfilter module to set mark by means of hashing + * + * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com> + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_HMARK.h> + +#include <net/ip.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <net/ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>"); +MODULE_DESCRIPTION("Xtables: packet marking using hash calculation"); +MODULE_ALIAS("ipt_HMARK"); +MODULE_ALIAS("ip6t_HMARK"); + +struct hmark_tuple { + __be32 src; + __be32 dst; + union hmark_ports uports; + u8 proto; +}; + +static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask) +{ + return (addr32[0] & mask[0]) ^ + (addr32[1] & mask[1]) ^ + (addr32[2] & mask[2]) ^ + (addr32[3] & mask[3]); +} + +static inline __be32 +hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask) +{ + switch (l3num) { + case AF_INET: + return *addr32 & *mask; + case AF_INET6: + return hmark_addr6_mask(addr32, mask); + } + return 0; +} + +static inline void hmark_swap_ports(union hmark_ports *uports, + const struct xt_hmark_info *info) +{ + union hmark_ports hp; + u16 src, dst; + + hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32; + src = ntohs(hp.b16.src); + dst = ntohs(hp.b16.dst); + + if (dst > src) + uports->v32 = (dst << 16) | src; + else + uports->v32 = (src << 16) | dst; +} + +static int +hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, + const struct xt_hmark_info *info) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple *otuple; + struct nf_conntrack_tuple *rtuple; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return -1; + + otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6, + info->src_mask.ip6); + t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6, + info->dst_mask.ip6); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) + return 0; + + t->proto = nf_ct_protonum(ct); + if (t->proto != IPPROTO_ICMP) { + t->uports.b16.src = otuple->src.u.all; + t->uports.b16.dst = rtuple->src.u.all; + hmark_swap_ports(&t->uports, info); + } + + return 0; +#else + return -1; +#endif +} + +/* This hash function is endian independent, to ensure consistent hashing if + * the cluster is composed of big and little endian systems. 
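+ * All fields are converted to host byte order before jhash_3words(), and
+ * src/dst are ordered below (the ports already were, in hmark_swap_ports()),
+ * so both directions of a flow also yield the same hash. The final
+ * multiply-shift scales the 32-bit hash into [hoffset, hoffset + hmodulus)
+ * without needing a division.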
*/ +static inline u32 +hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) +{ + u32 hash; + u32 src = ntohl(t->src); + u32 dst = ntohl(t->dst); + + if (dst < src) + swap(src, dst); + + hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); + hash = hash ^ (t->proto & info->proto_mask); + + return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; +} + +static void +hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff, + struct hmark_tuple *t, const struct xt_hmark_info *info) +{ + int protoff; + + protoff = proto_ports_offset(t->proto); + if (protoff < 0) + return; + + nhoff += protoff; + if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0) + return; + + hmark_swap_ports(&t->uports, info); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int get_inner6_hdr(const struct sk_buff *skb, int *offset) +{ + struct icmp6hdr *icmp6h, _ih6; + + icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6); + if (icmp6h == NULL) + return 0; + + if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { + *offset += sizeof(struct icmp6hdr); + return 1; + } + return 0; +} + +static int +hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t, + const struct xt_hmark_info *info) +{ + struct ipv6hdr *ip6, _ip6; + int flag = IP6_FH_F_AUTH; + unsigned int nhoff = 0; + u16 fragoff = 0; + int nexthdr; + + ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb)); + nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); + if (nexthdr < 0) + return 0; + /* No need to check for icmp errors on fragments */ + if ((flag & IP6_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6)) + goto noicmp; + /* Use inner header in case of ICMP errors */ + if (get_inner6_hdr(skb, &nhoff)) { + ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6); + if (ip6 == NULL) + return -1; + /* If AH present, use SPI like in ESP. */ + flag = IP6_FH_F_AUTH; + nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); + if (nexthdr < 0) + return -1; + } +noicmp: + t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6); + t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) + return 0; + + t->proto = nexthdr; + if (t->proto == IPPROTO_ICMPV6) + return 0; + + if (flag & IP6_FH_F_FRAG) + return 0; + + hmark_set_tuple_ports(skb, nhoff, t, info); + return 0; +} + +static unsigned int +hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_hmark_info *info = par->targinfo; + struct hmark_tuple t; + + memset(&t, 0, sizeof(struct hmark_tuple)); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { + if (hmark_ct_set_htuple(skb, &t, info) < 0) + return XT_CONTINUE; + } else { + if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0) + return XT_CONTINUE; + } + + skb->mark = hmark_hash(&t, info); + return XT_CONTINUE; +} +#endif + +static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) +{ + const struct icmphdr *icmph; + struct icmphdr _ih; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih); + if (icmph == NULL || icmph->type > NR_ICMP_TYPES) + return 0; + + /* Error message? 
*/ + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) + return 0; + + *nhoff += iphsz + sizeof(_ih); + return 1; +} + +static int +hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, + const struct xt_hmark_info *info) +{ + struct iphdr *ip, _ip; + int nhoff = skb_network_offset(skb); + + ip = (struct iphdr *) (skb->data + nhoff); + if (ip->protocol == IPPROTO_ICMP) { + /* Use inner header in case of ICMP errors */ + if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) { + ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip); + if (ip == NULL) + return -1; + } + } + + t->src = ip->saddr & info->src_mask.ip; + t->dst = ip->daddr & info->dst_mask.ip; + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) + return 0; + + t->proto = ip->protocol; + + /* ICMP has no ports, skip */ + if (t->proto == IPPROTO_ICMP) + return 0; + + /* follow-up fragments don't contain ports, skip all fragments */ + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + return 0; + + hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info); + + return 0; +} + +static unsigned int +hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_hmark_info *info = par->targinfo; + struct hmark_tuple t; + + memset(&t, 0, sizeof(struct hmark_tuple)); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { + if (hmark_ct_set_htuple(skb, &t, info) < 0) + return XT_CONTINUE; + } else { + if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0) + return XT_CONTINUE; + } + + skb->mark = hmark_hash(&t, info); + return XT_CONTINUE; +} + +static int hmark_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_hmark_info *info = par->targinfo; + + if (!info->hmodulus) { + pr_info("xt_HMARK: hash modulus can't be zero\n"); + return -EINVAL; + } + if (info->proto_mask && + (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) { + pr_info("xt_HMARK: proto mask must be zero with L3 mode\n"); + return -EINVAL; + } + if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) && + (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) | + XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) { + pr_info("xt_HMARK: spi-mask and port-mask can't be combined\n"); + return -EINVAL; + } + if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) && + (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) | + XT_HMARK_FLAG(XT_HMARK_DPORT)))) { + pr_info("xt_HMARK: spi-set and port-set can't be combined\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target hmark_tg_reg[] __read_mostly = { + { + .name = "HMARK", + .family = NFPROTO_IPV4, + .target = hmark_tg_v4, + .targetsize = sizeof(struct xt_hmark_info), + .checkentry = hmark_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "HMARK", + .family = NFPROTO_IPV6, + .target = hmark_tg_v6, + .targetsize = sizeof(struct xt_hmark_info), + .checkentry = hmark_tg_check, + .me = THIS_MODULE, + }, +#endif +}; + +static int __init hmark_tg_init(void) +{ + return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +static void __exit hmark_tg_exit(void) +{ + xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +module_init(hmark_tg_init); +module_exit(hmark_tg_exit); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c new file mode 100644 index 00000000000..f407ebc1348 --- /dev/null +++ b/net/netfilter/xt_IDLETIMER.c @@ -0,0 +1,315 @@ +/* + * linux/net/netfilter/xt_IDLETIMER.c + 
* + * Netfilter module to trigger a timer when packet matches. + * After timer expires a kevent will be sent. + * + * Copyright (C) 2004, 2010 Nokia Corporation + * Written by Timo Teras <ext-timo.teras@nokia.com> + * + * Converted to x_tables and reworked for upstream inclusion + * by Luciano Coelho <luciano.coelho@nokia.com> + * + * Contact: Luciano Coelho <luciano.coelho@nokia.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_IDLETIMER.h> +#include <linux/kdev_t.h> +#include <linux/kobject.h> +#include <linux/workqueue.h> +#include <linux/sysfs.h> + +struct idletimer_tg_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); +}; + +struct idletimer_tg { + struct list_head entry; + struct timer_list timer; + struct work_struct work; + + struct kobject *kobj; + struct idletimer_tg_attr attr; + + unsigned int refcnt; +}; + +static LIST_HEAD(idletimer_tg_list); +static DEFINE_MUTEX(list_mutex); + +static struct kobject *idletimer_tg_kobj; + +static +struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) +{ + struct idletimer_tg *entry; + + BUG_ON(!label); + + list_for_each_entry(entry, &idletimer_tg_list, entry) { + if (!strcmp(label, entry->attr.attr.name)) + return entry; + } + + return NULL; +} + +static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct idletimer_tg *timer; + unsigned long expires = 0; + + mutex_lock(&list_mutex); + + timer = __idletimer_tg_find_by_label(attr->name); + if (timer) + expires = timer->timer.expires; + + mutex_unlock(&list_mutex); + + if (time_after(expires, jiffies)) + return sprintf(buf, "%u\n", + jiffies_to_msecs(expires - jiffies) / 1000); + + return sprintf(buf, "0\n"); +} + +static void idletimer_tg_work(struct work_struct *work) +{ + struct idletimer_tg *timer = container_of(work, struct idletimer_tg, + work); + + sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); +} + +static void idletimer_tg_expired(unsigned long data) +{ + struct idletimer_tg *timer = (struct idletimer_tg *) data; + + pr_debug("timer %s expired\n", timer->attr.attr.name); + + schedule_work(&timer->work); +} + +static int idletimer_tg_create(struct idletimer_tg_info *info) +{ + int ret; + + info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + if (!info->timer) { + ret = -ENOMEM; + goto out; + } + + info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); + if (!info->timer->attr.attr.name) { + ret = -ENOMEM; + goto out_free_timer; + } + info->timer->attr.attr.mode = S_IRUGO; + info->timer->attr.show = idletimer_tg_show; + + ret = sysfs_create_file(idletimer_tg_kobj, 
&info->timer->attr.attr);
+ if (ret < 0) {
+ pr_debug("couldn't add file to sysfs\n");
+ goto out_free_attr;
+ }
+
+ list_add(&info->timer->entry, &idletimer_tg_list);
+
+ setup_timer(&info->timer->timer, idletimer_tg_expired,
+ (unsigned long) info->timer);
+ info->timer->refcnt = 1;
+
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+ return 0;
+
+out_free_attr:
+ kfree(info->timer->attr.attr.name);
+out_free_timer:
+ kfree(info->timer);
+out:
+ return ret;
+}
+
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct idletimer_tg_info *info = par->targinfo;
+
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+
+ BUG_ON(!info->timer);
+
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo %s\n", info->label);
+
+ if (info->timeout == 0) {
+ pr_debug("timeout value is zero\n");
+ return -EINVAL;
+ }
+
+ if (info->label[0] == '\0' ||
+ strnlen(info->label,
+ MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) {
+ pr_debug("label is empty or not nul-terminated\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&list_mutex);
+
+ info->timer = __idletimer_tg_find_by_label(info->label);
+ if (info->timer) {
+ info->timer->refcnt++;
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ pr_debug("increased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ } else {
+ ret = idletimer_tg_create(info);
+ if (ret < 0) {
+ pr_debug("failed to create timer\n");
+ mutex_unlock(&list_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&list_mutex);
+ return 0;
+}
+
+static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct idletimer_tg_info *info = par->targinfo;
+
+ pr_debug("destroy targinfo %s\n", info->label);
+
+ mutex_lock(&list_mutex);
+
+ if (--info->timer->refcnt == 0) {
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
+ del_timer_sync(&info->timer->timer);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
+ } else {
+ pr_debug("decreased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ }
+
+ mutex_unlock(&list_mutex);
+}
+
+static struct xt_target idletimer_tg __read_mostly = {
+ .name = "IDLETIMER",
+ .family = NFPROTO_UNSPEC,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct class *idletimer_tg_class;
+
+static struct device *idletimer_tg_device;
+
+static int __init idletimer_tg_init(void)
+{
+ int err;
+
+ idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+ err = PTR_ERR(idletimer_tg_class);
+ if (IS_ERR(idletimer_tg_class)) {
+ pr_debug("couldn't register device class\n");
+ goto out;
+ }
+
+ idletimer_tg_device = device_create(idletimer_tg_class, NULL,
+ MKDEV(0, 0), NULL, "timers");
+ err = PTR_ERR(idletimer_tg_device);
+ if (IS_ERR(idletimer_tg_device)) {
+ pr_debug("couldn't register system device\n");
+ goto out_class;
+ }
+
+ idletimer_tg_kobj = &idletimer_tg_device->kobj;
+
err = xt_register_target(&idletimer_tg); + if (err < 0) { + pr_debug("couldn't register xt target\n"); + goto out_dev; + } + + return 0; +out_dev: + device_destroy(idletimer_tg_class, MKDEV(0, 0)); +out_class: + class_destroy(idletimer_tg_class); +out: + return err; +} + +static void __exit idletimer_tg_exit(void) +{ + xt_unregister_target(&idletimer_tg); + + device_destroy(idletimer_tg_class, MKDEV(0, 0)); + class_destroy(idletimer_tg_class); +} + +module_init(idletimer_tg_init); +module_exit(idletimer_tg_exit); + +MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); +MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); +MODULE_DESCRIPTION("Xtables: idle time monitor"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ipt_IDLETIMER"); +MODULE_ALIAS("ip6t_IDLETIMER"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c new file mode 100644 index 00000000000..993de2ba89d --- /dev/null +++ b/net/netfilter/xt_LED.c @@ -0,0 +1,215 @@ +/* + * xt_LED.c - netfilter target to make LEDs blink upon packet matches + * + * Copyright (C) 2008 Adam Nielsen <a.nielsen@shikadi.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/slab.h> +#include <linux/leds.h> +#include <linux/mutex.h> + +#include <linux/netfilter/xt_LED.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>"); +MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); +MODULE_ALIAS("ipt_LED"); +MODULE_ALIAS("ip6t_LED"); + +static LIST_HEAD(xt_led_triggers); +static DEFINE_MUTEX(xt_led_mutex); + +/* + * This is declared in here (the kernel module) only, to avoid having these + * dependencies in userspace code. This is what xt_led_info.internal_data + * points to. + */ +struct xt_led_info_internal { + struct list_head list; + int refcnt; + char *trigger_id; + struct led_trigger netfilter_led_trigger; + struct timer_list timer; +}; + +static unsigned int +led_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + /* + * If "always blink" is enabled, and there's still some time until the + * LED will switch off, briefly switch it off now. 
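+ * Without that brief off period, a steady packet stream would keep
+ * re-arming the off-timer and the LED would simply appear to stay lit
+ * rather than blinking once per matched packet.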
+ */ + if ((ledinfo->delay > 0) && ledinfo->always_blink && + timer_pending(&ledinternal->timer)) + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); + + /* If there's a positive delay, start/update the timer */ + if (ledinfo->delay > 0) { + mod_timer(&ledinternal->timer, + jiffies + msecs_to_jiffies(ledinfo->delay)); + + /* Otherwise if there was no delay given, blink as fast as possible */ + } else if (ledinfo->delay == 0) { + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + } + + /* else the delay is negative, which means switch on and stay on */ + + return XT_CONTINUE; +} + +static void led_timeout_callback(unsigned long data) +{ + struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data; + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); +} + +static struct xt_led_info_internal *led_trigger_lookup(const char *name) +{ + struct xt_led_info_internal *ledinternal; + + list_for_each_entry(ledinternal, &xt_led_triggers, list) { + if (!strcmp(name, ledinternal->netfilter_led_trigger.name)) { + return ledinternal; + } + } + return NULL; +} + +static int led_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal; + int err; + + if (ledinfo->id[0] == '\0') { + pr_info("No 'id' parameter given.\n"); + return -EINVAL; + } + + mutex_lock(&xt_led_mutex); + + ledinternal = led_trigger_lookup(ledinfo->id); + if (ledinternal) { + ledinternal->refcnt++; + goto out; + } + + err = -ENOMEM; + ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL); + if (!ledinternal) + goto exit_mutex_only; + + ledinternal->trigger_id = kstrdup(ledinfo->id, GFP_KERNEL); + if (!ledinternal->trigger_id) + goto exit_internal_alloc; + + ledinternal->refcnt = 1; + ledinternal->netfilter_led_trigger.name = ledinternal->trigger_id; + + err = led_trigger_register(&ledinternal->netfilter_led_trigger); + if (err) { + pr_warning("led_trigger_register() failed\n"); + if (err == -EEXIST) + pr_warning("Trigger name is already in use.\n"); + goto exit_alloc; + } + + /* See if we need to set up a timer */ + if (ledinfo->delay > 0) + setup_timer(&ledinternal->timer, led_timeout_callback, + (unsigned long)ledinternal); + + list_add_tail(&ledinternal->list, &xt_led_triggers); + +out: + mutex_unlock(&xt_led_mutex); + + ledinfo->internal_data = ledinternal; + + return 0; + +exit_alloc: + kfree(ledinternal->trigger_id); + +exit_internal_alloc: + kfree(ledinternal); + +exit_mutex_only: + mutex_unlock(&xt_led_mutex); + + return err; +} + +static void led_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + mutex_lock(&xt_led_mutex); + + if (--ledinternal->refcnt) { + mutex_unlock(&xt_led_mutex); + return; + } + + list_del(&ledinternal->list); + + if (ledinfo->delay > 0) + del_timer_sync(&ledinternal->timer); + + led_trigger_unregister(&ledinternal->netfilter_led_trigger); + + mutex_unlock(&xt_led_mutex); + + kfree(ledinternal->trigger_id); + kfree(ledinternal); +} + +static struct xt_target led_tg_reg __read_mostly = { + .name = "LED", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, +}; + +static int __init led_tg_init(void) +{ + return 
xt_register_target(&led_tg_reg); +} + +static void __exit led_tg_exit(void) +{ + xt_unregister_target(&led_tg_reg); +} + +module_init(led_tg_init); +module_exit(led_tg_exit); diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c new file mode 100644 index 00000000000..5ab24843370 --- /dev/null +++ b/net/netfilter/xt_LOG.c @@ -0,0 +1,975 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_LOG.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/xt_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + sb_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + sb_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + +out: + return 0; +} + +static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + sb_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & XT_LOG_TCPSEQ) + sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); + + /* Max length: 13 "WINDOW=65535 " */ + sb_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + sb_add(m, "CWR "); + if (th->ece) + sb_add(m, "ECE "); + if (th->urg) + sb_add(m, "URG "); + if (th->ack) + sb_add(m, "ACK "); + if (th->psh) + sb_add(m, "PSH "); + if (th->rst) + sb_add(m, "RST "); + if (th->syn) + sb_add(m, "SYN "); + if (th->fin) + sb_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + 
if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + const u_int8_t *op; + unsigned int i; + unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + sb_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + + sb_add(m, ") "); + } + + return 0; +} + +static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) +{ + if (!sk || sk->sk_state == TCP_TIME_WAIT) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + sb_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + sb_add(m, "SRC=%pI4 DST=%pI4 ", + &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + sb_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + sb_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + sb_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & XT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4)) + return; + break; + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + 
[ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + sb_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + sb_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + sb_add(m, "["); + dump_ipv4_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) + sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + sb_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + sb_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + sb_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && !iphoff) + dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + 
max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + sb_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + sb_add(m, ":%02x", *p); + } + sb_add(m, " "); +} + +static void +log_packet_common(struct sbuff *m, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + const struct net_device *physindev; + const struct net_device *physoutdev; + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) + sb_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) + sb_add(m, "PHYSOUT=%s ", physoutdev->name); + } +#endif +} + + +static void +ipt_log_packet(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + + if (in != NULL) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(m, loginfo, skb, 0); + + sb_close(m); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + 
struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (logflags & XT_LOG_IPOPT) + sb_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + sb_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + sb_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + sb_add(m, "INCOMPLETE "); + + sb_add(m, "ID:%08x ", ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & XT_LOG_IPOPT) + sb_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & XT_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + sb_add(m, "AH "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & XT_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + sb_add(m, "ESP "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + sb_add(m, "SPI=0x%x )", ntohl(eh->spi)); + + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + sb_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & XT_LOG_IPOPT) + sb_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (dump_tcp_header(m, skb, currenthdr, fragment, ptr, + logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + sb_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + sb_add(m, "POINTER=%08x ", 
ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + sb_add(m, "["); + dump_ipv6_packet(m, info, skb, + ptr + sizeof(_icmp6h), 0); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) + sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + sb_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && recurse) + dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p != NULL) { + sb_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + sb_add(m, ":%02x", *p++); + } + sb_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else + sb_add(m, " "); +} + +static void +ip6t_log_packet(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + + if (in != NULL) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + sb_close(m); +} +#endif + +static unsigned int +log_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_log_info *loginfo = par->targinfo; + struct nf_loginfo li; + struct net *net = dev_net(par->in ? 
par->in : par->out); + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; + + if (par->family == NFPROTO_IPV4) + ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, + par->out, &li, loginfo->prefix); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + else if (par->family == NFPROTO_IPV6) + ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, + par->out, &li, loginfo->prefix); +#endif + else + WARN_ON_ONCE(1); + + return XT_CONTINUE; +} + +static int log_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_log_info *loginfo = par->targinfo; + + if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) + return -EINVAL; + + if (loginfo->level >= 8) { + pr_debug("level %u >= 8\n", loginfo->level); + return -EINVAL; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + pr_debug("prefix is not null-terminated\n"); + return -EINVAL; + } + + return 0; +} + +static struct xt_target log_tg_regs[] __read_mostly = { + { + .name = "LOG", + .family = NFPROTO_IPV4, + .target = log_tg, + .targetsize = sizeof(struct xt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LOG", + .family = NFPROTO_IPV6, + .target = log_tg, + .targetsize = sizeof(struct xt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, + }, +#endif +}; + +static struct nf_logger ipt_log_logger __read_mostly = { + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static struct nf_logger ip6t_log_logger __read_mostly = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; +#endif + +static int __net_init log_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; +} + +static void __net_exit log_net_exit(struct net *net) +{ + nf_log_unset(net, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unset(net, &ip6t_log_logger); +#endif +} + +static struct pernet_operations log_net_ops = { + .init = log_net_init, + .exit = log_net_exit, +}; + +static int __init log_tg_init(void) +{ + int ret; + + ret = register_pernet_subsys(&log_net_ops); + if (ret < 0) + goto err_pernet; + + ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); + if (ret < 0) + goto err_target; + + nf_log_register(NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; + +err_target: + unregister_pernet_subsys(&log_net_ops); +err_pernet: + return ret; +} + +static void __exit log_tg_exit(void) +{ + unregister_pernet_subsys(&log_net_ops); + nf_log_unregister(&ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unregister(&ip6t_log_logger); +#endif + xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); +} + +module_init(log_tg_init); +module_exit(log_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); +MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); +MODULE_ALIAS("ipt_LOG"); +MODULE_ALIAS("ip6t_LOG"); diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c deleted file mode 100644 index 43817808d86..00000000000 --- a/net/netfilter/xt_MARK.c +++ /dev/null @@ -1,185 +0,0 @@ -/* This is a module which is used for setting the NFMARK field 
of an skb. */ - -/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/checksum.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_MARK.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("ip[6]tables MARK modification module"); -MODULE_ALIAS("ipt_MARK"); -MODULE_ALIAS("ip6t_MARK"); - -static unsigned int -target_v0(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct xt_mark_target_info *markinfo = targinfo; - - (*pskb)->mark = markinfo->mark; - return XT_CONTINUE; -} - -static unsigned int -target_v1(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct xt_mark_target_info_v1 *markinfo = targinfo; - int mark = 0; - - switch (markinfo->mode) { - case XT_MARK_SET: - mark = markinfo->mark; - break; - - case XT_MARK_AND: - mark = (*pskb)->mark & markinfo->mark; - break; - - case XT_MARK_OR: - mark = (*pskb)->mark | markinfo->mark; - break; - } - - (*pskb)->mark = mark; - return XT_CONTINUE; -} - - -static int -checkentry_v0(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct xt_mark_target_info *markinfo = targinfo; - - if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); - return 0; - } - return 1; -} - -static int -checkentry_v1(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct xt_mark_target_info_v1 *markinfo = targinfo; - - if (markinfo->mode != XT_MARK_SET - && markinfo->mode != XT_MARK_AND - && markinfo->mode != XT_MARK_OR) { - printk(KERN_WARNING "MARK: unknown mode %u\n", - markinfo->mode); - return 0; - } - if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); - return 0; - } - return 1; -} - -#ifdef CONFIG_COMPAT -struct compat_xt_mark_target_info_v1 { - compat_ulong_t mark; - u_int8_t mode; - u_int8_t __pad1; - u_int16_t __pad2; -}; - -static void compat_from_user_v1(void *dst, void *src) -{ - struct compat_xt_mark_target_info_v1 *cm = src; - struct xt_mark_target_info_v1 m = { - .mark = cm->mark, - .mode = cm->mode, - }; - memcpy(dst, &m, sizeof(m)); -} - -static int compat_to_user_v1(void __user *dst, void *src) -{ - struct xt_mark_target_info_v1 *m = src; - struct compat_xt_mark_target_info_v1 cm = { - .mark = m->mark, - .mode = m->mode, - }; - return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target xt_mark_target[] = { - { - .name = "MARK", - .family = AF_INET, - .revision = 0, - .checkentry = checkentry_v0, - .target = target_v0, - .targetsize = sizeof(struct xt_mark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "MARK", - .family = AF_INET, - .revision = 1, - .checkentry = checkentry_v1, - .target = target_v1, - .targetsize = sizeof(struct xt_mark_target_info_v1), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_mark_target_info_v1), - .compat_from_user = compat_from_user_v1, - .compat_to_user = compat_to_user_v1, -#endif - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "MARK", - .family = AF_INET6, - .revision = 0, - .checkentry = checkentry_v0, - .target = target_v0, - .targetsize = sizeof(struct xt_mark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, -}; - -static int __init xt_mark_init(void) -{ - return xt_register_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); -} - -static void __exit xt_mark_fini(void) -{ - xt_unregister_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); -} - -module_init(xt_mark_init); -module_exit(xt_mark_fini); diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c new file mode 100644 index 00000000000..b253e07cb1c --- /dev/null +++ b/net/netfilter/xt_NETMAP.c @@ -0,0 +1,165 @@ +/* + * (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> + +static unsigned int +netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + struct nf_nat_range newrange; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + union nf_inet_addr new_addr, netmask; + unsigned int i; + + ct = nf_ct_get(skb, &ctinfo); + for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++) + netmask.ip6[i] = ~(range->min_addr.ip6[i] ^ + range->max_addr.ip6[i]); + + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) + new_addr.in6 = ipv6_hdr(skb)->daddr; + else + new_addr.in6 = ipv6_hdr(skb)->saddr; + + for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) { + new_addr.ip6[i] &= ~netmask.ip6[i]; + new_addr.ip6[i] |= range->min_addr.ip6[i] & + netmask.ip6[i]; + } + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr = new_addr; + newrange.max_addr = new_addr; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + + if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) + return -EINVAL; + return 0; +} + +static unsigned int +netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 new_ip, netmask; + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range 
newrange; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT || + par->hooknum == NF_INET_LOCAL_IN); + ct = nf_ct_get(skb, &ctinfo); + + netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); + + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) + new_ip = ip_hdr(skb)->daddr & ~netmask; + else + new_ip = ip_hdr(skb)->saddr & ~netmask; + new_ip |= mr->range[0].min_ip & netmask; + + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = new_ip; + newrange.max_addr.ip = new_ip; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + /* Hand modified range to generic setup. */ + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static int netmap_tg4_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + + if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static struct xt_target netmap_tg_reg[] __read_mostly = { + { + .name = "NETMAP", + .family = NFPROTO_IPV6, + .revision = 0, + .target = netmap_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .checkentry = netmap_tg6_checkentry, + .me = THIS_MODULE, + }, + { + .name = "NETMAP", + .family = NFPROTO_IPV4, + .revision = 0, + .target = netmap_tg4, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .checkentry = netmap_tg4_check, + .me = THIS_MODULE, + }, +}; + +static int __init netmap_tg_init(void) +{ + return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); +} + +static void netmap_tg_exit(void) +{ + xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); +} + +module_init(netmap_tg_init); +module_exit(netmap_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ip6t_NETMAP"); +MODULE_ALIAS("ipt_NETMAP"); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 901ed7abaa1..fb7497c928a 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -12,75 +12,62 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFLOG.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/nfnetlink_log.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("x_tables NFLOG target"); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFLOG"); MODULE_ALIAS("ip6t_NFLOG"); static unsigned int -nflog_target(struct sk_buff **pskb, - const struct net_device *in, const struct net_device *out, - unsigned int hooknum, const struct xt_target *target, - const void *targinfo) +nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_nflog_info *info = targinfo; + const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; + struct net *net 
= dev_net(par->in ? par->in : par->out); li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nf_log_packet(target->family, hooknum, *pskb, in, out, &li, - "%s", info->prefix); + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, + par->out, &li, info->prefix); return XT_CONTINUE; } -static int -nflog_checkentry(const char *tablename, const void *entry, - const struct xt_target *target, void *targetinfo, - unsigned int hookmask) +static int nflog_tg_check(const struct xt_tgchk_param *par) { - struct xt_nflog_info *info = targetinfo; + const struct xt_nflog_info *info = par->targinfo; if (info->flags & ~XT_NFLOG_MASK) - return 0; + return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') - return 0; - return 1; + return -EINVAL; + return 0; } -static struct xt_target xt_nflog_target[] = { - { - .name = "NFLOG", - .family = AF_INET, - .checkentry = nflog_checkentry, - .target = nflog_target, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, - }, - { - .name = "NFLOG", - .family = AF_INET6, - .checkentry = nflog_checkentry, - .target = nflog_target, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, - }, +static struct xt_target nflog_tg_reg __read_mostly = { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nflog_tg_check, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, }; -static int __init xt_nflog_init(void) +static int __init nflog_tg_init(void) { - return xt_register_targets(xt_nflog_target, - ARRAY_SIZE(xt_nflog_target)); + return xt_register_target(&nflog_tg_reg); } -static void __exit xt_nflog_fini(void) +static void __exit nflog_tg_exit(void) { - xt_unregister_targets(xt_nflog_target, ARRAY_SIZE(xt_nflog_target)); + xt_unregister_target(&nflog_tg_reg); } -module_init(xt_nflog_init); -module_exit(xt_nflog_fini); +module_init(nflog_tg_init); +module_exit(nflog_tg_exit); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 201155b316e..8f1779ff7e3 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -16,60 +16,145 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFQUEUE.h> +#include <net/netfilter/nf_queue.h> + MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("[ip,ip6,arp]_tables NFQUEUE target"); +MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFQUEUE"); MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); +static u32 jhash_initval __read_mostly; + static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_NFQ_info *tinfo = targinfo; + const struct xt_NFQ_info *tinfo = par->targinfo; return NF_QUEUE_NR(tinfo->queuenum); } -static struct xt_target xt_nfqueue_target[] = { +static unsigned int +nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) { + queue = nfqueue_hash(skb, queue, info->queues_total, + par->family, jhash_initval); + } + return NF_QUEUE_NR(queue); +} + +static unsigned int +nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct 
xt_NFQ_info_v2 *info = par->targinfo; + unsigned int ret = nfqueue_tg_v1(skb, par); + + if (info->bypass) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + return ret; +} + +static int nfqueue_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_NFQ_info_v3 *info = par->targinfo; + u32 maxid; + + init_hashrandom(&jhash_initval); + + if (info->queues_total == 0) { + pr_err("NFQUEUE: number of total queues is 0\n"); + return -EINVAL; + } + maxid = info->queues_total - 1 + info->queuenum; + if (maxid > 0xffff) { + pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n", + info->queues_total, maxid); + return -ERANGE; + } + if (par->target->revision == 2 && info->flags > 1) + return -EINVAL; + if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) + return -EINVAL; + + return 0; +} + +static unsigned int +nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v3 *info = par->targinfo; + u32 queue = info->queuenum; + int ret; + + if (info->queues_total > 1) { + if (info->flags & NFQ_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = info->queuenum + cpu % info->queues_total; + } else { + queue = nfqueue_hash(skb, queue, info->queues_total, + par->family, jhash_initval); + } + } + + ret = NF_QUEUE_NR(queue); + if (info->flags & NFQ_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + return ret; +} + +static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", - .family = AF_INET, - .target = target, + .family = NFPROTO_UNSPEC, + .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", - .family = AF_INET6, - .target = target, - .targetsize = sizeof(struct xt_NFQ_info), + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v1, + .targetsize = sizeof(struct xt_NFQ_info_v1), .me = THIS_MODULE, }, { .name = "NFQUEUE", - .family = NF_ARP, - .target = target, - .targetsize = sizeof(struct xt_NFQ_info), + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v2, + .targetsize = sizeof(struct xt_NFQ_info_v2), + .me = THIS_MODULE, + }, + { + .name = "NFQUEUE", + .revision = 3, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v3, + .targetsize = sizeof(struct xt_NFQ_info_v3), .me = THIS_MODULE, }, }; -static int __init xt_nfqueue_init(void) +static int __init nfqueue_tg_init(void) { - return xt_register_targets(xt_nfqueue_target, - ARRAY_SIZE(xt_nfqueue_target)); + return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } -static void __exit xt_nfqueue_fini(void) +static void __exit nfqueue_tg_exit(void) { - xt_unregister_targets(xt_nfqueue_target, ARRAY_SIZE(xt_nfqueue_target)); + xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } -module_init(xt_nfqueue_init); -module_exit(xt_nfqueue_fini); +module_init(nfqueue_tg_init); +module_exit(nfqueue_tg_exit); diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c deleted file mode 100644 index 5085fb3d1e2..00000000000 --- a/net/netfilter/xt_NOTRACK.c +++ /dev/null @@ -1,65 +0,0 @@ -/* This is a module which is used for setting up fake conntracks - * on packets so that they are not seen by the conntrack/NAT code. 
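 * (Illustrative usage of the target being removed here — standard
 * iptables syntax, not part of this patch:
 *
 *   iptables -t raw -A PREROUTING -p udp --dport 53 -j NOTRACK
 *
 * Its role is taken over by the CT target's --notrack option.)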
- */ -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_conntrack.h> - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ipt_NOTRACK"); - -static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - /* Previously seen (loopback)? Ignore. */ - if ((*pskb)->nfct != NULL) - return XT_CONTINUE; - - /* Attach fake conntrack entry. - If there is a real ct entry correspondig to this packet, - it'll hang aroun till timing out. We don't deal with it - for performance reasons. JK */ - (*pskb)->nfct = &nf_conntrack_untracked.ct_general; - (*pskb)->nfctinfo = IP_CT_NEW; - nf_conntrack_get((*pskb)->nfct); - - return XT_CONTINUE; -} - -static struct xt_target xt_notrack_target[] = { - { - .name = "NOTRACK", - .family = AF_INET, - .target = target, - .table = "raw", - .me = THIS_MODULE, - }, - { - .name = "NOTRACK", - .family = AF_INET6, - .target = target, - .table = "raw", - .me = THIS_MODULE, - }, -}; - -static int __init xt_notrack_init(void) -{ - return xt_register_targets(xt_notrack_target, - ARRAY_SIZE(xt_notrack_target)); -} - -static void __exit xt_notrack_fini(void) -{ - xt_unregister_targets(xt_notrack_target, ARRAY_SIZE(xt_notrack_target)); -} - -module_init(xt_notrack_init); -module_exit(xt_notrack_fini); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c new file mode 100644 index 00000000000..370adf622ce --- /dev/null +++ b/net/netfilter/xt_RATEEST.c @@ -0,0 +1,194 @@ +/* + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
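 * (Illustrative usage — standard iptables syntax, not part of this
 * patch:
 *
 *   iptables -t mangle -A POSTROUTING -o eth0 -j RATEEST \
 *            --rateest-name uplink --rateest-interval 250ms \
 *            --rateest-ewmalog 2
 *
 * The collected estimate can later be tested with the rateest match.)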
+ */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/gen_stats.h> +#include <linux/jhash.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <net/gen_stats.h> +#include <net/netlink.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_RATEEST.h> +#include <net/netfilter/xt_rateest.h> + +static DEFINE_MUTEX(xt_rateest_mutex); + +#define RATEEST_HSIZE 16 +static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; +static unsigned int jhash_rnd __read_mostly; +static bool rnd_inited __read_mostly; + +static unsigned int xt_rateest_hash(const char *name) +{ + return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) & + (RATEEST_HSIZE - 1); +} + +static void xt_rateest_hash_insert(struct xt_rateest *est) +{ + unsigned int h; + + h = xt_rateest_hash(est->name); + hlist_add_head(&est->list, &rateest_hash[h]); +} + +struct xt_rateest *xt_rateest_lookup(const char *name) +{ + struct xt_rateest *est; + unsigned int h; + + h = xt_rateest_hash(name); + mutex_lock(&xt_rateest_mutex); + hlist_for_each_entry(est, &rateest_hash[h], list) { + if (strcmp(est->name, name) == 0) { + est->refcnt++; + mutex_unlock(&xt_rateest_mutex); + return est; + } + } + mutex_unlock(&xt_rateest_mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(xt_rateest_lookup); + +void xt_rateest_put(struct xt_rateest *est) +{ + mutex_lock(&xt_rateest_mutex); + if (--est->refcnt == 0) { + hlist_del(&est->list); + gen_kill_estimator(&est->bstats, &est->rstats); + /* + * gen_estimator est_timer() might access est->lock or bstats, + * wait a RCU grace period before freeing 'est' + */ + kfree_rcu(est, rcu); + } + mutex_unlock(&xt_rateest_mutex); +} +EXPORT_SYMBOL_GPL(xt_rateest_put); + +static unsigned int +xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_rateest_target_info *info = par->targinfo; + struct gnet_stats_basic_packed *stats = &info->est->bstats; + + spin_lock_bh(&info->est->lock); + stats->bytes += skb->len; + stats->packets++; + spin_unlock_bh(&info->est->lock); + + return XT_CONTINUE; +} + +static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) +{ + struct xt_rateest_target_info *info = par->targinfo; + struct xt_rateest *est; + struct { + struct nlattr opt; + struct gnet_estimator est; + } cfg; + int ret; + + if (unlikely(!rnd_inited)) { + get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); + rnd_inited = true; + } + + est = xt_rateest_lookup(info->name); + if (est) { + /* + * If estimator parameters are specified, they must match the + * existing estimator. 
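 * (As the lookup above shows, estimators are shared by name: a second
 * rule naming an existing estimator takes a reference on it rather
 * than allocating a new one, and xt_rateest_put() drops that
 * reference when the rule is removed.)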
+ */ + if ((!info->interval && !info->ewma_log) || + (info->interval != est->params.interval || + info->ewma_log != est->params.ewma_log)) { + xt_rateest_put(est); + return -EINVAL; + } + info->est = est; + return 0; + } + + ret = -ENOMEM; + est = kzalloc(sizeof(*est), GFP_KERNEL); + if (!est) + goto err1; + + strlcpy(est->name, info->name, sizeof(est->name)); + spin_lock_init(&est->lock); + est->refcnt = 1; + est->params.interval = info->interval; + est->params.ewma_log = info->ewma_log; + + cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); + cfg.opt.nla_type = TCA_STATS_RATE_EST; + cfg.est.interval = info->interval; + cfg.est.ewma_log = info->ewma_log; + + ret = gen_new_estimator(&est->bstats, &est->rstats, + &est->lock, &cfg.opt); + if (ret < 0) + goto err2; + + info->est = est; + xt_rateest_hash_insert(est); + return 0; + +err2: + kfree(est); +err1: + return ret; +} + +static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_rateest_target_info *info = par->targinfo; + + xt_rateest_put(info->est); +} + +static struct xt_target xt_rateest_tg_reg __read_mostly = { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .me = THIS_MODULE, +}; + +static int __init xt_rateest_tg_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(rateest_hash); i++) + INIT_HLIST_HEAD(&rateest_hash[i]); + + return xt_register_target(&xt_rateest_tg_reg); +} + +static void __exit xt_rateest_tg_fini(void) +{ + xt_unregister_target(&xt_rateest_tg_reg); +} + + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: packet rate estimator"); +MODULE_ALIAS("ipt_RATEEST"); +MODULE_ALIAS("ip6t_RATEEST"); +module_init(xt_rateest_tg_init); +module_exit(xt_rateest_tg_fini); diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c new file mode 100644 index 00000000000..22a10309297 --- /dev/null +++ b/net/netfilter/xt_REDIRECT.c @@ -0,0 +1,190 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 + * NAT funded by Astaro. 
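 * (Illustrative usage — standard iptables syntax, not part of this
 * patch:
 *
 *   iptables -t nat -A PREROUTING -p tcp --dport 80 \
 *            -j REDIRECT --to-ports 3128
 *
 * i.e. transparently divert inbound HTTP to a local proxy port.)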
+ */ + +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/types.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/addrconf.h> +#include <net/checksum.h> +#include <net/protocol.h> +#include <net/netfilter/nf_nat.h> + +static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; + +static unsigned int +redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + struct nf_nat_range newrange; + struct in6_addr newdst; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (par->hooknum == NF_INET_LOCAL_OUT) + newdst = loopback_addr; + else { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + bool addr = false; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + if (idev != NULL) { + list_for_each_entry(ifa, &idev->addr_list, if_list) { + newdst = ifa->addr; + addr = true; + break; + } + } + rcu_read_unlock(); + + if (!addr) + return NF_DROP; + } + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = newdst; + newrange.max_addr.in6 = newdst; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + +static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) + return -EINVAL; + return 0; +} + +/* FIXME: Take multiple ranges --RR */ +static int redirect_tg4_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + + if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static unsigned int +redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 newdst; + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range newrange; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (par->hooknum == NF_INET_LOCAL_OUT) + newdst = htonl(0x7F000001); + else { + struct in_device *indev; + struct in_ifaddr *ifa; + + newdst = 0; + + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + if (indev && (ifa = indev->ifa_list)) + newdst = ifa->ifa_local; + rcu_read_unlock(); + + if (!newdst) + return NF_DROP; + } + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newdst; + newrange.max_addr.ip = newdst; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + /* Hand modified range to generic setup. 
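 * (NF_NAT_MANIP_DST rewrites only the destination tuple of the
 * connection; conntrack reverses the translation for reply packets
 * automatically.)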
*/ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + +static struct xt_target redirect_tg_reg[] __read_mostly = { + { + .name = "REDIRECT", + .family = NFPROTO_IPV6, + .revision = 0, + .table = "nat", + .checkentry = redirect_tg6_checkentry, + .target = redirect_tg6, + .targetsize = sizeof(struct nf_nat_range), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, + { + .name = "REDIRECT", + .family = NFPROTO_IPV4, + .revision = 0, + .table = "nat", + .target = redirect_tg4, + .checkentry = redirect_tg4_check, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, +}; + +static int __init redirect_tg_init(void) +{ + return xt_register_targets(redirect_tg_reg, + ARRAY_SIZE(redirect_tg_reg)); +} + +static void __exit redirect_tg_exit(void) +{ + xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); +} + +module_init(redirect_tg_init); +module_exit(redirect_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); +MODULE_ALIAS("ip6t_REDIRECT"); +MODULE_ALIAS("ipt_REDIRECT"); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 705f0e830a7..9faf5e050b7 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -5,22 +5,23 @@ * Based on the nfmark match by: * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> * - * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com> + * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
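 * (Illustrative usage — standard iptables syntax, not part of this
 * patch:
 *
 *   iptables -t security -A INPUT -p tcp --dport 22 \
 *            -j SECMARK --selctx system_u:object_r:ssh_server_packet_t:s0
 *
 * The mark is then available to the LSM for per-packet policy.)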
* */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> +#include <linux/security.h> #include <linux/skbuff.h> -#include <linux/selinux.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SECMARK.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); -MODULE_DESCRIPTION("ip[6]tables SECMARK modification module"); +MODULE_DESCRIPTION("Xtables: packet security mark modification"); MODULE_ALIAS("ipt_SECMARK"); MODULE_ALIAS("ip6t_SECMARK"); @@ -28,118 +29,119 @@ MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; -static unsigned int target(struct sk_buff **pskb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { u32 secmark = 0; - const struct xt_secmark_target_info *info = targinfo; + const struct xt_secmark_target_info *info = par->targinfo; BUG_ON(info->mode != mode); switch (mode) { case SECMARK_MODE_SEL: - secmark = info->u.sel.selsid; + secmark = info->secid; break; - default: BUG(); } - (*pskb)->secmark = secmark; + skb->secmark = secmark; return XT_CONTINUE; } -static int checkentry_selinux(struct xt_secmark_target_info *info) +static int checkentry_lsm(struct xt_secmark_target_info *info) { int err; - struct xt_secmark_target_selinux_info *sel = &info->u.sel; - sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; + info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; + info->secid = 0; - err = selinux_string_to_sid(sel->selctx, &sel->selsid); + err = security_secctx_to_secid(info->secctx, strlen(info->secctx), + &info->secid); if (err) { if (err == -EINVAL) - printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n", - sel->selctx); - return 0; + pr_info("invalid security context \'%s\'\n", info->secctx); + return err; } - if (!sel->selsid) { - printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n", - sel->selctx); - return 0; + if (!info->secid) { + pr_info("unable to map security context \'%s\'\n", info->secctx); + return -ENOENT; } - err = selinux_relabel_packet_permission(sel->selsid); + err = security_secmark_relabel_packet(info->secid); if (err) { - printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); - return 0; + pr_info("unable to obtain relabeling permission\n"); + return err; } - return 1; + security_secmark_refcount_inc(); + return 0; } -static int checkentry(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static int secmark_tg_check(const struct xt_tgchk_param *par) { - struct xt_secmark_target_info *info = targinfo; + struct xt_secmark_target_info *info = par->targinfo; + int err; + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { + pr_info("target only valid in the \'mangle\' " + "or \'security\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } if (mode && mode != info->mode) { - printk(KERN_INFO PFX "mode already set to %hu cannot mix with " - "rules for mode %hu\n", mode, info->mode); - return 0; + pr_info("mode already set to %hu cannot mix with " + "rules for mode %hu\n", mode, info->mode); + return -EINVAL; } switch (info->mode) { case SECMARK_MODE_SEL: - if (!checkentry_selinux(info)) - return 0; break; - default: - printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); - return 0; + pr_info("invalid mode: %hu\n", info->mode); + return -EINVAL; } + err = checkentry_lsm(info); + if (err) + return err; + if (!mode) 
mode = info->mode; - return 1; + return 0; +} + +static void secmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + switch (mode) { + case SECMARK_MODE_SEL: + security_secmark_refcount_dec(); + } } -static struct xt_target xt_secmark_target[] = { - { - .name = "SECMARK", - .family = AF_INET, - .checkentry = checkentry, - .target = target, - .targetsize = sizeof(struct xt_secmark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "SECMARK", - .family = AF_INET6, - .checkentry = checkentry, - .target = target, - .targetsize = sizeof(struct xt_secmark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, +static struct xt_target secmark_tg_reg __read_mostly = { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = secmark_tg_check, + .destroy = secmark_tg_destroy, + .target = secmark_tg, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, }; -static int __init xt_secmark_init(void) +static int __init secmark_tg_init(void) { - return xt_register_targets(xt_secmark_target, - ARRAY_SIZE(xt_secmark_target)); + return xt_register_target(&secmark_tg_reg); } -static void __exit xt_secmark_fini(void) +static void __exit secmark_tg_exit(void) { - xt_unregister_targets(xt_secmark_target, ARRAY_SIZE(xt_secmark_target)); + xt_unregister_target(&secmark_tg_reg); } -module_init(xt_secmark_init); -module_exit(xt_secmark_fini); +module_init(secmark_tg_init); +module_exit(secmark_tg_exit); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 15fe8f64951..e762de5ee89 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -2,18 +2,23 @@ * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
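 * (Illustrative usage — standard iptables syntax, not part of this
 * patch; the classic PPPoE/tunnel MSS clamp:
 *
 *   iptables -t mangle -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
 *            -j TCPMSS --clamp-mss-to-pmtu)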
*/ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <linux/gfp.h> #include <linux/ipv6.h> #include <linux/tcp.h> +#include <net/dst.h> +#include <net/flow.h> #include <net/ipv6.h> +#include <net/route.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -24,7 +29,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("x_tables TCP MSS modification module"); +MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); @@ -38,243 +43,286 @@ optlen(const u_int8_t *opt, unsigned int offset) return opt[offset+1]; } +static u_int32_t tcpmss_reverse_mtu(struct net *net, + const struct sk_buff *skb, + unsigned int family) +{ + struct flowi fl; + const struct nf_afinfo *ai; + struct rtable *rt = NULL; + u_int32_t mtu = ~0U; + + if (family == PF_INET) { + struct flowi4 *fl4 = &fl.u.ip4; + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = ip_hdr(skb)->saddr; + } else { + struct flowi6 *fl6 = &fl.u.ip6; + + memset(fl6, 0, sizeof(*fl6)); + fl6->daddr = ipv6_hdr(skb)->saddr; + } + rcu_read_lock(); + ai = nf_get_afinfo(family); + if (ai != NULL) + ai->route(net, (struct dst_entry **)&rt, &fl, false); + rcu_read_unlock(); + + if (rt != NULL) { + mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); + } + return mtu; +} + static int -tcpmss_mangle_packet(struct sk_buff **pskb, - const struct xt_tcpmss_info *info, +tcpmss_mangle_packet(struct sk_buff *skb, + const struct xt_action_param *par, + unsigned int family, unsigned int tcphoff, unsigned int minlen) { + const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; - unsigned int tcplen, i; + int len, tcp_hdrlen; + unsigned int i; __be16 oldval; u16 newmss; u8 *opt; - if (!skb_make_writable(pskb, (*pskb)->len)) + /* This is a fragment, no TCP header is available */ + if (par->fragoff != 0) + return 0; + + if (!skb_make_writable(skb, skb->len)) return -1; - tcplen = (*pskb)->len - tcphoff; - tcph = (struct tcphdr *)(skb_network_header(*pskb) + tcphoff); - - /* Since it passed flags test in tcp match, we know it is is - not a fragment, and has data >= tcp header length. SYN - packets should not contain data: if they did, then we risk - running over MTU, sending Frag Needed and breaking things - badly. --RR */ - if (tcplen != tcph->doff*4) { - if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", - (*pskb)->len); + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr)) + return -1; + + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + tcp_hdrlen = tcph->doff * 4; + + if (len < tcp_hdrlen) return -1; - } if (info->mss == XT_TCPMSS_CLAMP_PMTU) { - if (dst_mtu((*pskb)->dst) <= minlen) { - if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: " - "unknown or invalid path-MTU (%u)\n", - dst_mtu((*pskb)->dst)); + struct net *net = dev_net(par->in ? 
par->in : par->out); + unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); + + if (dst_mtu(skb_dst(skb)) <= minlen) { + net_err_ratelimited("unknown or invalid path-MTU (%u)\n", + dst_mtu(skb_dst(skb))); return -1; } - newmss = dst_mtu((*pskb)->dst) - minlen; + if (in_mtu <= minlen) { + net_err_ratelimited("unknown or invalid path-MTU (%u)\n", + in_mtu); + return -1; + } + newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; } else newmss = info->mss; opt = (u_int8_t *)tcph; - for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) { - if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS && - opt[i+1] == TCPOLEN_MSS) { + for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { + if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - oldmss <= newmss) + /* Never increase MSS, even when setting it, as + * doing so results in problems for hosts that rely + * on MSS being set correctly. + */ + if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; - opt[i+3] = (newmss & 0x00ff); + opt[i+3] = newmss & 0x00ff; - nf_proto_csum_replace2(&tcph->check, *pskb, - htons(oldmss), htons(newmss), 0); + inet_proto_csum_replace2(&tcph->check, skb, + htons(oldmss), htons(newmss), + 0); return 0; } } + /* There is data after the header so the option can't be added + * without moving it, and doing so may make the SYN packet + * itself too large. Accept the packet unmodified instead. + */ + if (len > tcp_hdrlen) + return 0; + /* * MSS Option not found ?! add it.. */ - if (skb_tailroom((*pskb)) < TCPOLEN_MSS) { - struct sk_buff *newskb; - - newskb = skb_copy_expand(*pskb, skb_headroom(*pskb), - TCPOLEN_MSS, GFP_ATOMIC); - if (!newskb) + if (skb_tailroom(skb) < TCPOLEN_MSS) { + if (pskb_expand_head(skb, 0, + TCPOLEN_MSS - skb_tailroom(skb), + GFP_ATOMIC)) return -1; - kfree_skb(*pskb); - *pskb = newskb; - tcph = (struct tcphdr *)(skb_network_header(*pskb) + tcphoff); + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); } - skb_put((*pskb), TCPOLEN_MSS); + skb_put(skb, TCPOLEN_MSS); + + /* + * IPv4: RFC 1122 states "If an MSS option is not received at + * connection setup, TCP MUST assume a default send MSS of 536". 
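 * (For reference: 536 is the 576-octet minimum IPv4 reassembly
 * buffer minus 40 octets of IP and TCP headers.)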
+ * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum + * length IPv6 header of 60, ergo the default MSS value is 1220 + * Since no MSS was provided, we must use the default values + */ + if (par->family == NFPROTO_IPV4) + newmss = min(newmss, (u16)536); + else + newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); - memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); - nf_proto_csum_replace2(&tcph->check, *pskb, - htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); + inet_proto_csum_replace2(&tcph->check, skb, + htons(len), htons(len + TCPOLEN_MSS), 1); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; - opt[3] = (newmss & 0x00ff); + opt[3] = newmss & 0x00ff; - nf_proto_csum_replace4(&tcph->check, *pskb, 0, *((__be32 *)opt), 0); + inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; - nf_proto_csum_replace2(&tcph->check, *pskb, - oldval, ((__be16 *)tcph)[6], 0); + inet_proto_csum_replace2(&tcph->check, skb, + oldval, ((__be16 *)tcph)[6], 0); return TCPOLEN_MSS; } static unsigned int -xt_tcpmss_target4(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - struct iphdr *iph = ip_hdr(*pskb); + struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; - ret = tcpmss_mangle_packet(pskb, targinfo, iph->ihl * 4, + ret = tcpmss_mangle_packet(skb, par, + PF_INET, + iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { - iph = ip_hdr(*pskb); + iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); - nf_csum_replace2(&iph->check, iph->tot_len, newlen); + csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; } -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int -xt_tcpmss_target6(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - struct ipv6hdr *ipv6h = ipv6_hdr(*pskb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; + __be16 frag_off; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; - tcphoff = ipv6_skip_exthdr(*pskb, sizeof(*ipv6h), &nexthdr); - if (tcphoff < 0) { - WARN_ON(1); + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); + if (tcphoff < 0) return NF_DROP; - } - ret = tcpmss_mangle_packet(pskb, targinfo, tcphoff, + ret = tcpmss_mangle_packet(skb, par, + PF_INET6, + tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { - ipv6h = ipv6_hdr(*pskb); + ipv6h = ipv6_hdr(skb); ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret); } return XT_CONTINUE; } #endif -#define TH_SYN 0x02 - /* Must specify -p tcp --syn */ -static inline int find_syn_match(const struct xt_entry_match *m) +static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && - tcpinfo->flg_cmp & TH_SYN && + tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) - return 
1; + return true; - return 0; + return false; } -static int -xt_tcpmss_checkentry4(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = targinfo; - const struct ipt_entry *e = entry; + const struct xt_tcpmss_info *info = par->targinfo; + const struct ipt_entry *e = par->entryinfo; + const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT) | - (1 << NF_IP_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " - "FORWARD, OUTPUT and POSTROUTING hooks\n"); - return 0; + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info("path-MTU clamping only supported in " + "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; } - if (IPT_MATCH_ITERATE(e, find_syn_match)) - return 1; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); - return 0; + xt_ematch_foreach(ematch, e) + if (find_syn_match(ematch)) + return 0; + pr_info("Only works on TCP SYN packets\n"); + return -EINVAL; } -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -static int -xt_tcpmss_checkentry6(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = targinfo; - const struct ip6t_entry *e = entry; + const struct xt_tcpmss_info *info = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; + const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_IP6_FORWARD) | - (1 << NF_IP6_LOCAL_OUT) | - (1 << NF_IP6_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " - "FORWARD, OUTPUT and POSTROUTING hooks\n"); - return 0; + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info("path-MTU clamping only supported in " + "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; } - if (IP6T_MATCH_ITERATE(e, find_syn_match)) - return 1; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); - return 0; + xt_ematch_foreach(ematch, e) + if (find_syn_match(ematch)) + return 0; + pr_info("Only works on TCP SYN packets\n"); + return -EINVAL; } #endif -static struct xt_target xt_tcpmss_reg[] = { +static struct xt_target tcpmss_tg_reg[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "TCPMSS", - .checkentry = xt_tcpmss_checkentry4, - .target = xt_tcpmss_target4, + .checkentry = tcpmss_tg4_check, + .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "TCPMSS", - .checkentry = xt_tcpmss_checkentry6, - .target = xt_tcpmss_target6, + .checkentry = tcpmss_tg6_check, + .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, @@ -282,15 +330,15 @@ static struct xt_target xt_tcpmss_reg[] = { #endif }; -static int __init xt_tcpmss_init(void) +static int __init tcpmss_tg_init(void) { - return 
xt_register_targets(xt_tcpmss_reg, ARRAY_SIZE(xt_tcpmss_reg)); + return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } -static void __exit xt_tcpmss_fini(void) +static void __exit tcpmss_tg_exit(void) { - xt_unregister_targets(xt_tcpmss_reg, ARRAY_SIZE(xt_tcpmss_reg)); + xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } -module_init(xt_tcpmss_init); -module_exit(xt_tcpmss_fini); +module_init(tcpmss_tg_init); +module_exit(tcpmss_tg_exit); diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c new file mode 100644 index 00000000000..625fa1d636a --- /dev/null +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -0,0 +1,158 @@ +/* + * A module for stripping a specific TCP option from TCP packets. + * + * Copyright (C) 2007 Sven Schnelle <svens@bitebene.org> + * Copyright © CC Computer Consultants GmbH, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_TCPOPTSTRIP.h> + +static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) + return 1; + else + return opt[offset+1]; +} + +static unsigned int +tcpoptstrip_mangle_packet(struct sk_buff *skb, + const struct xt_action_param *par, + unsigned int tcphoff, unsigned int minlen) +{ + const struct xt_tcpoptstrip_target_info *info = par->targinfo; + unsigned int optl, i, j; + struct tcphdr *tcph; + u_int16_t n, o; + u_int8_t *opt; + int len, tcp_hdrlen; + + /* This is a fragment, no TCP header is available */ + if (par->fragoff != 0) + return XT_CONTINUE; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr)) + return NF_DROP; + + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + tcp_hdrlen = tcph->doff * 4; + + if (len < tcp_hdrlen) + return NF_DROP; + + opt = (u_int8_t *)tcph; + + /* + * Walk through all TCP options - if we find some option to remove, + * set all octets to %TCPOPT_NOP and adjust checksum. 
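+ * + * (A sketch of the arithmetic: each stripped byte is replaced by + * TCPOPT_NOP, the old/new values being shifted into the high or low + * half of a 16-bit word depending on byte parity, so an even-aligned + * timestamp option word 0x080a becomes 0x0101 and every change is + * folded into tcph->check via inet_proto_csum_replace2().)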
+ */ + for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) { + optl = optlen(opt, i); + + if (i + optl > tcp_hdrlen) + break; + + if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) + continue; + + for (j = 0; j < optl; ++j) { + o = opt[i+j]; + n = TCPOPT_NOP; + if ((i + j) % 2 == 0) { + o <<= 8; + n <<= 8; + } + inet_proto_csum_replace2(&tcph->check, skb, htons(o), + htons(n), 0); + } + memset(opt + i, TCPOPT_NOP, optl); + } + + return XT_CONTINUE; +} + +static unsigned int +tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), + sizeof(struct iphdr) + sizeof(struct tcphdr)); +} + +#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +static unsigned int +tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int tcphoff; + u_int8_t nexthdr; + __be16 frag_off; + + nexthdr = ipv6h->nexthdr; + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); + if (tcphoff < 0) + return NF_DROP; + + return tcpoptstrip_mangle_packet(skb, par, tcphoff, + sizeof(*ipv6h) + sizeof(struct tcphdr)); +} +#endif + +static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { + { + .name = "TCPOPTSTRIP", + .family = NFPROTO_IPV4, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg4, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) + { + .name = "TCPOPTSTRIP", + .family = NFPROTO_IPV6, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg6, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tcpoptstrip_tg_init(void) +{ + return xt_register_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +static void __exit tcpoptstrip_tg_exit(void) +{ + xt_unregister_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +module_init(tcpoptstrip_tg_init); +module_exit(tcpoptstrip_tg_exit); +MODULE_AUTHOR("Sven Schnelle <svens@bitebene.org>, Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: TCP option stripping"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TCPOPTSTRIP"); +MODULE_ALIAS("ip6t_TCPOPTSTRIP"); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c new file mode 100644 index 00000000000..292934d2348 --- /dev/null +++ b/net/netfilter/xt_TEE.c @@ -0,0 +1,309 @@ +/* + * "TEE" target extension for Xtables + * Copyright © Sebastian Claßen, 2007 + * Jan Engelhardt, 2007-2010 + * + * based on ipt_ROUTE.c from Cédric de Launois + * <delaunois@info.ucl.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 or later, as published by the Free Software Foundation. 
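+ * + * Typical use is duplicating traffic to a passive monitoring host, + * e.g. with a rule along the lines of (illustrative only): + * + * iptables -t mangle -A PREROUTING -i eth0 -j TEE --gateway 192.168.0.2 + * + * The clone is routed to the given gateway while the original packet + * continues through the ruleset unmodified.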
+ */ +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/route.h> +#include <linux/skbuff.h> +#include <linux/notifier.h> +#include <net/checksum.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/route.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_TEE.h> + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +# define WITH_CONNTRACK 1 +# include <net/netfilter/nf_conntrack.h> +#endif + +struct xt_tee_priv { + struct notifier_block notifier; + struct xt_tee_tginfo *tginfo; + int oif; +}; + +static const union nf_inet_addr tee_zero_address; +static DEFINE_PER_CPU(bool, tee_active); + +static struct net *pick_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS + const struct dst_entry *dst; + + if (skb->dev != NULL) + return dev_net(skb->dev); + dst = skb_dst(skb); + if (dst != NULL && dst->dev != NULL) + return dev_net(dst->dev); +#endif + return &init_net; +} + +static bool +tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) +{ + const struct iphdr *iph = ip_hdr(skb); + struct net *net = pick_net(skb); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + if (info->priv) { + if (info->priv->oif == -1) + return false; + fl4.flowi4_oif = info->priv->oif; + } + fl4.daddr = info->gw.ip; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; + skb->protocol = htons(ETH_P_IP); + return true; +} + +static unsigned int +tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tee_tginfo *info = par->targinfo; + struct iphdr *iph; + + if (__this_cpu_read(tee_active)) + return XT_CONTINUE; + /* + * Copy the skb, and route the copy. Will later return %XT_CONTINUE for + * the original skb, which should continue on its way as if nothing has + * happened. The copy should be independently delivered to the TEE + * --gateway. + */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return XT_CONTINUE; + +#ifdef WITH_CONNTRACK + /* Avoid counting cloned packets towards the original connection. */ + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + /* + * If we are in PREROUTING/INPUT, the checksum must be recalculated + * since the length could have changed as a result of defragmentation. + * + * We also decrease the TTL to mitigate potential TEE loops + * between two hosts. + * + * Set %IP_DF so that the original source is notified of a potentially + * decreased MTU on the clone route. IPv6 does this too. 
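+ * + * (Concretely: a clone picked up in PREROUTING that arrived with a + * TTL of 64 leaves with 63, and the ip_send_check() call below + * recomputes the header checksum over the modified ttl and frag_off + * fields.)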
+ */ + iph = ip_hdr(skb); + iph->frag_off |= htons(IP_DF); + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_IN) + --iph->ttl; + ip_send_check(iph); + + if (tee_tg_route4(skb, info)) { + __this_cpu_write(tee_active, true); + ip_local_out(skb); + __this_cpu_write(tee_active, false); + } else { + kfree_skb(skb); + } + return XT_CONTINUE; +} + +#if IS_ENABLED(CONFIG_IPV6) +static bool +tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = pick_net(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + if (info->priv) { + if (info->priv->oif == -1) + return false; + fl6.flowi6_oif = info->priv->oif; + } + fl6.daddr = info->gw.in6; + fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + return false; + } + skb_dst_drop(skb); + skb_dst_set(skb, dst); + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + return true; +} + +static unsigned int +tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tee_tginfo *info = par->targinfo; + + if (__this_cpu_read(tee_active)) + return XT_CONTINUE; + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return XT_CONTINUE; + +#ifdef WITH_CONNTRACK + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_IN) { + struct ipv6hdr *iph = ipv6_hdr(skb); + --iph->hop_limit; + } + if (tee_tg_route6(skb, info)) { + __this_cpu_write(tee_active, true); + ip6_local_out(skb); + __this_cpu_write(tee_active, false); + } else { + kfree_skb(skb); + } + return XT_CONTINUE; +} +#endif + +static int tee_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct xt_tee_priv *priv; + + priv = container_of(this, struct xt_tee_priv, notifier); + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + else if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + } + + return NOTIFY_DONE; +} + +static int tee_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_tee_tginfo *info = par->targinfo; + struct xt_tee_priv *priv; + + /* 0.0.0.0 and :: not allowed */ + if (memcmp(&info->gw, &tee_zero_address, + sizeof(tee_zero_address)) == 0) + return -EINVAL; + + if (info->oif[0]) { + if (info->oif[sizeof(info->oif)-1] != '\0') + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + priv->tginfo = info; + priv->oif = -1; + priv->notifier.notifier_call = tee_netdev_event; + info->priv = priv; + + register_netdevice_notifier(&priv->notifier); + } else + info->priv = NULL; + + return 0; +} + +static void tee_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_tee_tginfo *info = par->targinfo; + + if (info->priv) { + unregister_netdevice_notifier(&info->priv->notifier); + kfree(info->priv); + } +} + +static struct xt_target tee_tg_reg[] __read_mostly = { + { + .name = "TEE", + .revision = 1, + .family = NFPROTO_IPV4, + 
.target = tee_tg4, + .targetsize = sizeof(struct xt_tee_tginfo), + .checkentry = tee_tg_check, + .destroy = tee_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .name = "TEE", + .revision = 1, + .family = NFPROTO_IPV6, + .target = tee_tg6, + .targetsize = sizeof(struct xt_tee_tginfo), + .checkentry = tee_tg_check, + .destroy = tee_tg_destroy, + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tee_tg_init(void) +{ + return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +} + +static void __exit tee_tg_exit(void) +{ + xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +} + +module_init(tee_tg_init); +module_exit(tee_tg_exit); +MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: Reroute packet copy"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TEE"); +MODULE_ALIAS("ip6t_TEE"); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c new file mode 100644 index 00000000000..ef8a926752a --- /dev/null +++ b/net/netfilter/xt_TPROXY.c @@ -0,0 +1,599 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2010 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/inet_sock.h> +#include <net/inet_hashtables.h> +#include <linux/inetdevice.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#define XT_TPROXY_HAVE_IPV6 1 +#include <net/if_inet6.h> +#include <net/addrconf.h> +#include <net/inet6_hashtables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#endif + +#include <linux/netfilter/xt_TPROXY.h> + +enum nf_tproxy_lookup_t { + NFT_LOOKUP_LISTENER, + NFT_LOOKUP_ESTABLISHED, +}; + +static bool tproxy_sk_is_transparent(struct sock *sk) +{ + if (sk->sk_state != TCP_TIME_WAIT) { + if (inet_sk(sk)->transparent) + return true; + sock_put(sk); + } else { + if (inet_twsk(sk)->tw_transparent) + return true; + inet_twsk_put(inet_twsk(sk)); + } + return false; +} + +static inline __be32 +tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) +{ + struct in_device *indev; + __be32 laddr; + + if (user_laddr) + return user_laddr; + + laddr = 0; + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + for_primary_ifa(indev) { + laddr = ifa->ifa_local; + break; + } endfor_ifa(indev); + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/* + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are assumed + * matching in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. 
on-port & on-ip of the connection), it is returned, + * regardless of whether it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that he wants redirection + * (since he added the rule). + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally, if you have both rules, the + * "socket" match will be the first one, effectively routing all packets + * belonging to established connections through that one. + */ +static inline struct sock * +nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, dport, + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + + return sk; +} + +#ifdef XT_TPROXY_HAVE_IPV6 +static inline struct sock * +nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet6_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, ntohs(dport), + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = __inet6_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, ntohs(dport), + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER &&
connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", + protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + + return sk; +} +#endif + +/** + * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @laddr: IPv4 address to redirect to or zero. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait4() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, + struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, laddr ? laddr : iph->daddr, + hp->source, lport ? lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +/* assign a socket to the skb -- consumes sk */ +static void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + +static unsigned int +tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, + u_int32_t mark_mask, u_int32_t mark_value) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp; + struct sock *sk; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) + return NF_DROP; + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, iph->daddr, + hp->source, hp->dest, + skb->dev, NFT_LOOKUP_ESTABLISHED); + + laddr = tproxy_laddr4(skb, laddr, iph->daddr); + if (!lport) + lport = hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); + else if (!sk) + /* no, there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, laddr, + hp->source, lport, + skb->dev, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && tproxy_sk_is_transparent(sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~mark_mask) ^ mark_value; + + 
pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->daddr, ntohs(hp->dest), + &laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + return NF_DROP; +} + +static unsigned int +tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +static unsigned int +tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +#ifdef XT_TPROXY_HAVE_IPV6 + +static inline const struct in6_addr * +tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr) +{ + struct inet6_dev *indev; + struct inet6_ifaddr *ifa; + struct in6_addr *laddr; + + if (!ipv6_addr_any(user_laddr)) + return user_laddr; + laddr = NULL; + + rcu_read_lock(); + indev = __in6_dev_get(skb->dev); + if (indev) + list_for_each_entry(ifa, &indev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + + laddr = &ifa->addr; + break; + } + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/** + * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + const struct xt_action_param *par, + struct sock *sk) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr _hdr, *hp; + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, + tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), + hp->source, + tgi->lport ? 
tgi->lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +static unsigned int +tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + struct udphdr _hdr, *hp; + struct sock *sk; + const struct in6_addr *laddr; + __be16 lport; + int thoff = 0; + int tproto; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n"); + return NF_DROP; + } + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, &iph->daddr, + hp->source, hp->dest, + par->in, NFT_LOOKUP_ESTABLISHED); + + laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); + lport = tgi->lport ? tgi->lport : hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk); + else if (!sk) + /* no there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, laddr, + hp->source, lport, + par->in, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && tproxy_sk_is_transparent(sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; + + pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->source), + laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + + return NF_DROP; +} + +static int tproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_ip6 *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->flags & IP6T_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} +#endif + +static int tproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_ip *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->invflags & IPT_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} + +static struct xt_target tproxy_tg_reg[] __read_mostly = { + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v0, + .revision = 0, + .targetsize = sizeof(struct xt_tproxy_target_info), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target 
= tproxy_tg4_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#ifdef XT_TPROXY_HAVE_IPV6 + { + .name = "TPROXY", + .family = NFPROTO_IPV6, + .table = "mangle", + .target = tproxy_tg6_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg6_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#endif + +}; + +static int __init tproxy_tg_init(void) +{ + nf_defrag_ipv4_enable(); +#ifdef XT_TPROXY_HAVE_IPV6 + nf_defrag_ipv6_enable(); +#endif + + return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); +} + +static void __exit tproxy_tg_exit(void) +{ + xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); +} + +module_init(tproxy_tg_init); +module_exit(tproxy_tg_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); +MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); +MODULE_ALIAS("ipt_TPROXY"); +MODULE_ALIAS("ip6t_TPROXY"); diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c new file mode 100644 index 00000000000..df48967af38 --- /dev/null +++ b/net/netfilter/xt_TRACE.c @@ -0,0 +1,40 @@ +/* This is a module which is used to mark packets for tracing. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> + +MODULE_DESCRIPTION("Xtables: packet flow tracing"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TRACE"); +MODULE_ALIAS("ip6t_TRACE"); + +static unsigned int +trace_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + skb->nf_trace = 1; + return XT_CONTINUE; +} + +static struct xt_target trace_tg_reg __read_mostly = { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "raw", + .target = trace_tg, + .me = THIS_MODULE, +}; + +static int __init trace_tg_init(void) +{ + return xt_register_target(&trace_tg_reg); +} + +static void __exit trace_tg_exit(void) +{ + xt_unregister_target(&trace_tg_reg); +} + +module_init(trace_tg_init); +module_exit(trace_tg_exit); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c new file mode 100644 index 00000000000..fab6eea1bf3 --- /dev/null +++ b/net/netfilter/xt_addrtype.c @@ -0,0 +1,248 @@ +/* + * iptables module to match inet_addr_type() of an ip. + * + * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> + * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
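+ * + * Example (illustrative): "-m addrtype --dst-type LOCAL" matches + * packets addressed to a locally configured address, i.e. packets for + * which inet_dev_addr_type() returns RTN_LOCAL.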
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/ip.h> +#include <net/route.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/ip6_fib.h> +#endif + +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/xt_addrtype.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: address type match"); +MODULE_ALIAS("ipt_addrtype"); +MODULE_ALIAS("ip6t_addrtype"); + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr, u16 mask) +{ + const struct nf_afinfo *afinfo; + struct flowi6 flow; + struct rt6_info *rt; + u32 ret = 0; + int route_err; + + memset(&flow, 0, sizeof(flow)); + flow.daddr = *addr; + if (dev) + flow.flowi6_oif = dev->ifindex; + + rcu_read_lock(); + + afinfo = nf_get_afinfo(NFPROTO_IPV6); + if (afinfo != NULL) { + const struct nf_ipv6_ops *v6ops; + + if (dev && (mask & XT_ADDRTYPE_LOCAL)) { + v6ops = nf_get_ipv6_ops(); + if (v6ops && v6ops->chk_addr(net, addr, dev, true)) + ret = XT_ADDRTYPE_LOCAL; + } + route_err = afinfo->route(net, (struct dst_entry **)&rt, + flowi6_to_flowi(&flow), false); + } else { + route_err = 1; + } + rcu_read_unlock(); + + if (route_err) + return XT_ADDRTYPE_UNREACHABLE; + + if (rt->rt6i_flags & RTF_REJECT) + ret = XT_ADDRTYPE_UNREACHABLE; + + if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) + ret |= XT_ADDRTYPE_LOCAL; + if (rt->rt6i_flags & RTF_ANYCAST) + ret |= XT_ADDRTYPE_ANYCAST; + + dst_release(&rt->dst); + return ret; +} + +static bool match_type6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr, u16 mask) +{ + int addr_type = ipv6_addr_type(addr); + + if ((mask & XT_ADDRTYPE_MULTICAST) && + !(addr_type & IPV6_ADDR_MULTICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) + return false; + + if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | + XT_ADDRTYPE_UNREACHABLE) & mask) + return !!(mask & match_lookup_rt6(net, dev, addr, mask)); + return true; +} + +static bool +addrtype_mt6(struct net *net, const struct net_device *dev, + const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type6(net, dev, &iph->saddr, info->source) ^ + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} +#endif + +static inline bool match_type(struct net *net, const struct net_device *dev, + __be32 addr, u_int16_t mask) +{ + return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); +} + +static bool +addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? 
par->in : par->out); + const struct xt_addrtype_info *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type(net, NULL, iph->saddr, info->source) ^ + info->invert_source; + if (info->dest) + ret &= match_type(net, NULL, iph->daddr, info->dest) ^ + info->invert_dest; + + return ret; +} + +static bool +addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_addrtype_info_v1 *info = par->matchinfo; + const struct iphdr *iph; + const struct net_device *dev = NULL; + bool ret = true; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) + dev = par->in; + else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + dev = par->out; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + if (par->family == NFPROTO_IPV6) + return addrtype_mt6(net, dev, skb, info); +#endif + iph = ip_hdr(skb); + if (info->source) + ret &= match_type(net, dev, iph->saddr, info->source) ^ + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type(net, dev, iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { + pr_info("both incoming and outgoing " + "interface limitation cannot be selected\n"); + return -EINVAL; + } + + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { + pr_info("output interface limitation " + "not valid in PREROUTING and INPUT\n"); + return -EINVAL; + } + + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { + pr_info("input interface limitation " + "not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; + } + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + if (par->family == NFPROTO_IPV6) { + if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { + pr_err("ipv6 BLACKHOLE matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { + pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { + pr_err("ipv6 does not support BROADCAST matching\n"); + return -EINVAL; + } + } +#endif + return 0; +} + +static struct xt_match addrtype_mt_reg[] __read_mostly = { + { + .name = "addrtype", + .family = NFPROTO_IPV4, + .match = addrtype_mt_v0, + .matchsize = sizeof(struct xt_addrtype_info), + .me = THIS_MODULE + }, + { + .name = "addrtype", + .family = NFPROTO_UNSPEC, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + } +}; + +static int __init addrtype_mt_init(void) +{ + return xt_register_matches(addrtype_mt_reg, + ARRAY_SIZE(addrtype_mt_reg)); +} + +static void __exit addrtype_mt_exit(void) +{ + xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); +} + +module_init(addrtype_mt_init); +module_exit(addrtype_mt_exit); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c new file mode 100644 index 00000000000..bbffdbdaf60 --- /dev/null +++ b/net/netfilter/xt_bpf.c @@ -0,0 +1,74 @@ +/* Xtables module to match packets using a BPF filter. 
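+ * + * The filter is a classic BPF program supplied as an array of + * struct sock_filter instructions; as a sketch, the two-instruction + * program { BPF_LD|BPF_W|BPF_LEN, 0, 0, 0 }, { BPF_RET|BPF_A, 0, 0, 0 } + * returns the packet length and therefore matches every packet, while + * a program returning zero matches nothing. + *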
+ * Copyright 2013 Google Inc. + * Written by Willem de Bruijn <willemb@google.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/filter.h> + +#include <linux/netfilter/xt_bpf.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Willem de Bruijn <willemb@google.com>"); +MODULE_DESCRIPTION("Xtables: BPF filter match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_bpf"); +MODULE_ALIAS("ip6t_bpf"); + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info *info = par->matchinfo; + struct sock_fprog_kern program; + + program.len = info->bpf_program_num_elem; + program.filter = info->bpf_program; + + if (sk_unattached_filter_create(&info->filter, &program)) { + pr_info("bpf: check failed: parse error\n"); + return -EINVAL; + } + + return 0; +} + +static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_bpf_info *info = par->matchinfo; + + return SK_RUN_FILTER(info->filter, skb); +} + +static void bpf_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_bpf_info *info = par->matchinfo; + sk_unattached_filter_destroy(info->filter); +} + +static struct xt_match bpf_mt_reg __read_mostly = { + .name = "bpf", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check, + .match = bpf_mt, + .destroy = bpf_mt_destroy, + .matchsize = sizeof(struct xt_bpf_info), + .me = THIS_MODULE, +}; + +static int __init bpf_mt_init(void) +{ + return xt_register_match(&bpf_mt_reg); +} + +static void __exit bpf_mt_exit(void) +{ + xt_unregister_match(&bpf_mt_reg); +} + +module_init(bpf_mt_init); +module_exit(bpf_mt_exit); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c new file mode 100644 index 00000000000..f4e83300532 --- /dev/null +++ b/net/netfilter/xt_cgroup.c @@ -0,0 +1,72 @@ +/* + * Xtables module to match the process control group. + * + * Might be used to implement individual "per-application" firewall + * policies in contrast to global policies based on control groups. + * Matching is based upon processes tagged to net_cls' classid marker. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_cgroup.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("Xtables: process control group matching"); +MODULE_ALIAS("ipt_cgroup"); +MODULE_ALIAS("ip6t_cgroup"); + +static int cgroup_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + + return info->id ? 
0 : -EINVAL; +} + +static bool +cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info *info = par->matchinfo; + + if (skb->sk == NULL) + return false; + + return (info->id == skb->sk->sk_classid) ^ info->invert; +} + +static struct xt_match cgroup_mt_reg __read_mostly = { + .name = "cgroup", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check, + .match = cgroup_mt, + .matchsize = sizeof(struct xt_cgroup_info), + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), +}; + +static int __init cgroup_mt_init(void) +{ + return xt_register_match(&cgroup_mt_reg); +} + +static void __exit cgroup_mt_exit(void) +{ + xt_unregister_match(&cgroup_mt_reg); +} + +module_init(cgroup_mt_init); +module_exit(cgroup_mt_exit); diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c new file mode 100644 index 00000000000..f4af1bfafb1 --- /dev/null +++ b/net/netfilter/xt_cluster.c @@ -0,0 +1,178 @@ +/* + * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/ip.h> +#include <net/ipv6.h> + +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/xt_cluster.h> + +static inline u32 nf_ct_orig_ipv4_src(const struct nf_conn *ct) +{ + return (__force u32)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; +} + +static inline const u32 *nf_ct_orig_ipv6_src(const struct nf_conn *ct) +{ + return (__force u32 *)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6; +} + +static inline u_int32_t +xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info) +{ + return jhash_1word(ip, info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info) +{ + return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash(const struct nf_conn *ct, + const struct xt_cluster_match_info *info) +{ + u_int32_t hash = 0; + + switch(nf_ct_l3num(ct)) { + case AF_INET: + hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info); + break; + case AF_INET6: + hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info); + break; + default: + WARN_ON(1); + break; + } + return (((u64)hash * info->total_nodes) >> 32); +} + +static inline bool +xt_cluster_ipv6_is_multicast(const struct in6_addr *addr) +{ + __be32 st = addr->s6_addr32[0]; + return ((st & htonl(0xFF000000)) == htonl(0xFF000000)); +} + +static inline bool +xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) +{ + bool is_multicast = false; + + switch(family) { + case NFPROTO_IPV4: + is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); + break; + case NFPROTO_IPV6: + is_multicast = + xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr); + break; + default: + WARN_ON(1); + break; + } + return is_multicast; +} + +static bool +xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct sk_buff *pskb = (struct sk_buff *)skb; + const struct xt_cluster_match_info *info = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned long hash; + + /* This match assumes 
that all nodes see the same packets. This can be + * achieved if the switch that connects the cluster nodes supports some + * sort of 'port mirroring'. However, if your switch does not support + * this, your cluster nodes can reply to ARP requests using a multicast MAC + * address. Thus, your switch will flood the same packets to the + * cluster nodes with the same multicast MAC address. Using a multicast + * link address is an RFC 1812 (section 3.3.2) violation, but this works + * fine in practice. + * + * Unfortunately, if you use the multicast MAC address, the link layer + * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted + * by TCP and others for packets coming to this node. For that reason, + * this match mangles skbuff's pkt_type if it detects a packet + * addressed to a unicast address but using PACKET_MULTICAST. Yes, I + * know, matches should not alter packets, but we are doing this here + * because we would need to add a PKTTYPE target for this sole purpose. + */ + if (!xt_cluster_is_multicast_addr(skb, par->family) && + skb->pkt_type == PACKET_MULTICAST) { + pskb->pkt_type = PACKET_HOST; + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + if (nf_ct_is_untracked(ct)) + return false; + + if (ct->master) + hash = xt_cluster_hash(ct->master, info); + else + hash = xt_cluster_hash(ct, info); + + return !!((1 << hash) & info->node_mask) ^ + !!(info->flags & XT_CLUSTER_F_INV); +} + +static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_cluster_match_info *info = par->matchinfo; + + if (info->total_nodes > XT_CLUSTER_NODES_MAX) { + pr_info("you have exceeded the maximum " + "number of cluster nodes (%u > %u)\n", + info->total_nodes, XT_CLUSTER_NODES_MAX); + return -EINVAL; + } + if (info->node_mask >= (1ULL << info->total_nodes)) { + pr_info("this node mask cannot be " + "higher than the total number of nodes\n"); + return -EDOM; + } + return 0; +} + +static struct xt_match xt_cluster_match __read_mostly = { + .name = "cluster", + .family = NFPROTO_UNSPEC, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_cluster_mt_init(void) +{ + return xt_register_match(&xt_cluster_match); +} + +static void __exit xt_cluster_mt_fini(void) +{ + xt_unregister_match(&xt_cluster_match); +} + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: hash-based cluster match"); +MODULE_ALIAS("ipt_cluster"); +MODULE_ALIAS("ip6t_cluster"); +module_init(xt_cluster_mt_init); +module_exit(xt_cluster_mt_fini); diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index 7db492d6522..5c861d2f21c 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -10,52 +10,36 @@ #include <linux/netfilter/xt_comment.h> MODULE_AUTHOR("Brad Fisher <brad@info-link.net>"); -MODULE_DESCRIPTION("iptables comment match module"); +MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a comment"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_comment"); MODULE_ALIAS("ip6t_comment"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protooff, - int *hotdrop) +static bool +comment_mt(const struct sk_buff *skb, struct xt_action_param *par) { /* We always match */ - return 1; + return true; } -static struct xt_match
xt_comment_match[] = { - { - .name = "comment", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct xt_comment_info), - .me = THIS_MODULE - }, - { - .name = "comment", - .family = AF_INET6, - .match = match, - .matchsize = sizeof(struct xt_comment_info), - .me = THIS_MODULE - }, +static struct xt_match comment_mt_reg __read_mostly = { + .name = "comment", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = comment_mt, + .matchsize = sizeof(struct xt_comment_info), + .me = THIS_MODULE, }; -static int __init xt_comment_init(void) +static int __init comment_mt_init(void) { - return xt_register_matches(xt_comment_match, - ARRAY_SIZE(xt_comment_match)); + return xt_register_match(&comment_mt_reg); } -static void __exit xt_comment_fini(void) +static void __exit comment_mt_exit(void) { - xt_unregister_matches(xt_comment_match, ARRAY_SIZE(xt_comment_match)); + xt_unregister_match(&comment_mt_reg); } -module_init(xt_comment_init); -module_exit(xt_comment_fini); +module_init(comment_mt_init); +module_exit(comment_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 804afe55e14..1e634615ab9 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -1,165 +1,157 @@ /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> +#include <linux/bitops.h> #include <linux/skbuff.h> +#include <linux/math64.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> - -#include <asm/div64.h> -#include <asm/bitops.h> +#include <net/netfilter/nf_conntrack_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); +MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); +MODULE_ALIAS("ip6t_connbytes"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_connbytes_info *sinfo = matchinfo; - struct nf_conn *ct; + const struct xt_connbytes_info *sinfo = par->matchinfo; + const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; - const struct ip_conntrack_counter *counters; + const struct nf_conn_acct *acct; + const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) - return 0; - counters = ct->counters; + return false; + + acct = nf_conn_acct_find(ct); + if (!acct) + return false; + counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: - what = counters[IP_CT_DIR_ORIGINAL].packets; + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: - what = counters[IP_CT_DIR_REPLY].packets; + what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: - what = counters[IP_CT_DIR_ORIGINAL].packets; - what += counters[IP_CT_DIR_REPLY].packets; + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); + what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; 
} break; case XT_CONNBYTES_BYTES: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: - what = counters[IP_CT_DIR_ORIGINAL].bytes; + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); break; case XT_CONNBYTES_DIR_REPLY: - what = counters[IP_CT_DIR_REPLY].bytes; + what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; case XT_CONNBYTES_DIR_BOTH: - what = counters[IP_CT_DIR_ORIGINAL].bytes; - what += counters[IP_CT_DIR_REPLY].bytes; + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); + what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; } break; case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: - bytes = counters[IP_CT_DIR_ORIGINAL].bytes; - pkts = counters[IP_CT_DIR_ORIGINAL].packets; + bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); + pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: - bytes = counters[IP_CT_DIR_REPLY].bytes; - pkts = counters[IP_CT_DIR_REPLY].packets; + bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); + pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: - bytes = counters[IP_CT_DIR_ORIGINAL].bytes + - counters[IP_CT_DIR_REPLY].bytes; - pkts = counters[IP_CT_DIR_ORIGINAL].packets + - counters[IP_CT_DIR_REPLY].packets; + bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); + pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } if (pkts != 0) - what = div64_64(bytes, pkts); + what = div64_u64(bytes, pkts); break; } - if (sinfo->count.to) - return (what <= sinfo->count.to && what >= sinfo->count.from); - else - return (what >= sinfo->count.from); + if (sinfo->count.to >= sinfo->count.from) + return what <= sinfo->count.to && what >= sinfo->count.from; + else /* inverted */ + return what < sinfo->count.to || what > sinfo->count.from; } -static int check(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int connbytes_mt_check(const struct xt_mtchk_param *par) { - const struct xt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = par->matchinfo; + int ret; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && sinfo->what != XT_CONNBYTES_AVGPKT) - return 0; + return -EINVAL; if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && sinfo->direction != XT_CONNBYTES_DIR_REPLY && sinfo->direction != XT_CONNBYTES_DIR_BOTH) - return 0; - - if (nf_ct_l3proto_try_module_get(match->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); - return 0; + return -EINVAL; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + + /* + * This filter cannot function correctly unless connection tracking + * accounting is enabled, so complain in the hope that someone notices. 
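+ * + * (Accounting can also be switched on beforehand through the + * net.netfilter.nf_conntrack_acct sysctl; forcing it here is only a + * fallback so that connbytes rules still count on systems where it + * was left disabled.)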
+ */ + if (!nf_ct_acct_enabled(par->net)) { + pr_warning("Forcing CT accounting to be enabled\n"); + nf_ct_set_acct(par->net, true); } - return 1; + return ret; } -static void -destroy(const struct xt_match *match, void *matchinfo) +static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_match xt_connbytes_match[] = { - { - .name = "connbytes", - .family = AF_INET, - .checkentry = check, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_connbytes_info), - .me = THIS_MODULE - }, - { - .name = "connbytes", - .family = AF_INET6, - .checkentry = check, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_connbytes_info), - .me = THIS_MODULE - }, +static struct xt_match connbytes_mt_reg __read_mostly = { + .name = "connbytes", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connbytes_mt_check, + .match = connbytes_mt, + .destroy = connbytes_mt_destroy, + .matchsize = sizeof(struct xt_connbytes_info), + .me = THIS_MODULE, }; -static int __init xt_connbytes_init(void) +static int __init connbytes_mt_init(void) { - return xt_register_matches(xt_connbytes_match, - ARRAY_SIZE(xt_connbytes_match)); + return xt_register_match(&connbytes_mt_reg); } -static void __exit xt_connbytes_fini(void) +static void __exit connbytes_mt_exit(void) { - xt_unregister_matches(xt_connbytes_match, - ARRAY_SIZE(xt_connbytes_match)); + xt_unregister_match(&connbytes_mt_reg); } -module_init(xt_connbytes_init); -module_exit(xt_connbytes_fini); +module_init(connbytes_mt_init); +module_exit(connbytes_mt_exit); diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c new file mode 100644 index 00000000000..9f8719df200 --- /dev/null +++ b/net/netfilter/xt_connlabel.c @@ -0,0 +1,99 @@ +/* + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
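+ * + * Example (illustrative): "-m connlabel --label ftp --set" sets the + * bit that userspace maps to the name "ftp" (conventionally via + * /etc/xtables/connlabel.conf) on the packet's conntrack entry.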
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("Xtables: add/match connection tracking labels"); +MODULE_ALIAS("ipt_connlabel"); +MODULE_ALIAS("ip6t_connlabel"); + +static bool +connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connlabel_mtinfo *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + bool invert = info->options & XT_CONNLABEL_OP_INVERT; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL || nf_ct_is_untracked(ct)) + return invert; + + if (info->options & XT_CONNLABEL_OP_SET) + return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; + + return nf_connlabel_match(ct, info->bit) ^ invert; +} + +static int connlabel_mt_check(const struct xt_mtchk_param *par) +{ + const int options = XT_CONNLABEL_OP_INVERT | + XT_CONNLABEL_OP_SET; + struct xt_connlabel_mtinfo *info = par->matchinfo; + int ret; + size_t words; + + if (info->bit > XT_CONNLABEL_MAXBIT) + return -ERANGE; + + if (info->options & ~options) { + pr_err("Unknown options in mask %x\n", info->options); + return -EINVAL; + } + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + + par->net->ct.labels_used++; + words = BITS_TO_LONGS(info->bit+1); + if (words > par->net->ct.label_words) + par->net->ct.label_words = words; + + return ret; +} + +static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) +{ + par->net->ct.labels_used--; + if (par->net->ct.labels_used == 0) + par->net->ct.label_words = 0; + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match connlabels_mt_reg __read_mostly = { + .name = "connlabel", + .family = NFPROTO_UNSPEC, + .checkentry = connlabel_mt_check, + .match = connlabel_mt, + .matchsize = sizeof(struct xt_connlabel_mtinfo), + .destroy = connlabel_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connlabel_mt_init(void) +{ + return xt_register_match(&connlabels_mt_reg); +} + +static void __exit connlabel_mt_exit(void) +{ + xt_unregister_match(&connlabels_mt_reg); +} + +module_init(connlabel_mt_init); +module_exit(connlabel_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c new file mode 100644 index 00000000000..fbc66bb250d --- /dev/null +++ b/net/netfilter/xt_connlimit.c @@ -0,0 +1,484 @@ +/* + * netfilter module to limit the number of parallel tcp + * connections per IP address. + * (c) 2000 Gerd Knorr <kraxel@bytesex.org> + * Nov 2002: Martin Bene <martin.bene@icomedias.com>: + * only ignore TIME_WAIT or gone connections + * (C) CC Computer Consultants GmbH, 2007 + * + * based on ... + * + * Kernel module to match connection tracking information. + * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au).
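+ *
+ * Typical use is limiting parallel connections per client, e.g.
+ * (usage sketch) at most 16 concurrent HTTP connections per host:
+ *	iptables -A INPUT -p tcp --syn --dport 80 \
+ *	         -m connlimit --connlimit-above 16 -j REJECT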
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_tcp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connlimit.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_zones.h> + +#define CONNLIMIT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS 8U +#else +#define CONNLIMIT_LOCK_SLOTS 256U +#endif + +#define CONNLIMIT_GC_MAX_NODES 8 + +/* we will save the tuples of all connections we care about */ +struct xt_connlimit_conn { + struct hlist_node node; + struct nf_conntrack_tuple tuple; + union nf_inet_addr addr; +}; + +struct xt_connlimit_rb { + struct rb_node node; + struct hlist_head hhead; /* connections/hosts in same subnet */ + union nf_inet_addr addr; /* search key */ +}; + +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + +struct xt_connlimit_data { + struct rb_root climit_root4[CONNLIMIT_SLOTS]; + struct rb_root climit_root6[CONNLIMIT_SLOTS]; +}; + +static u_int32_t connlimit_rnd __read_mostly; +static struct kmem_cache *connlimit_rb_cachep __read_mostly; +static struct kmem_cache *connlimit_conn_cachep __read_mostly; + +static inline unsigned int connlimit_iphash(__be32 addr) +{ + return jhash_1word((__force __u32)addr, + connlimit_rnd) % CONNLIMIT_SLOTS; +} + +static inline unsigned int +connlimit_iphash6(const union nf_inet_addr *addr, + const union nf_inet_addr *mask) +{ + union nf_inet_addr res; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) + res.ip6[i] = addr->ip6[i] & mask->ip6[i]; + + return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), + connlimit_rnd) % CONNLIMIT_SLOTS; +} + +static inline bool already_closed(const struct nf_conn *conn) +{ + if (nf_ct_protonum(conn) == IPPROTO_TCP) + return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || + conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; + else + return 0; +} + +static int +same_source_net(const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + const union nf_inet_addr *u3, u_int8_t family) +{ + if (family == NFPROTO_IPV4) { + return ntohl(addr->ip & mask->ip) - + ntohl(u3->ip & mask->ip); + } else { + union nf_inet_addr lh, rh; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) { + lh.ip6[i] = addr->ip6[i] & mask->ip6[i]; + rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; + } + + return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)); + } +} + +static bool add_hlist(struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr) +{ + struct xt_connlimit_conn *conn; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return false; + conn->tuple = *tuple; + conn->addr = *addr; + hlist_add_head(&conn->node, head); + return true; +} + +static unsigned int check_hlist(struct net *net, + struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + bool *addit) +{ + const struct nf_conntrack_tuple_hash *found; + struct xt_connlimit_conn *conn; + struct hlist_node *n; + struct nf_conn *found_ct; + unsigned int length = 0; + + *addit = true; + rcu_read_lock(); + + /* check the saved 
connections */ + hlist_for_each_entry_safe(conn, n, head, node) { + found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, + &conn->tuple); + if (found == NULL) { + hlist_del(&conn->node); + kmem_cache_free(connlimit_conn_cachep, conn); + continue; + } + + found_ct = nf_ct_tuplehash_to_ctrack(found); + + if (nf_ct_tuple_equal(&conn->tuple, tuple)) { + /* + * Just to be sure we have it only once in the list. + * We should not see tuples twice unless someone hooks + * this into a table without "-p tcp --syn". + */ + *addit = false; + } else if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + hlist_del(&conn->node); + kmem_cache_free(connlimit_conn_cachep, conn); + continue; + } + + nf_ct_put(found_ct); + length++; + } + + rcu_read_unlock(); + + return length; +} + +static void tree_nodes_free(struct rb_root *root, + struct xt_connlimit_rb *gc_nodes[], + unsigned int gc_count) +{ + struct xt_connlimit_rb *rbconn; + + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + rb_erase(&rbconn->node, root); + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, const union nf_inet_addr *mask, + u8 family) +{ + struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct xt_connlimit_rb *rbconn; + struct xt_connlimit_conn *conn; + unsigned int gc_count; + bool no_gc = false; + + restart: + gc_count = 0; + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + bool addit; + + rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + + parent = *rbnode; + diff = same_source_net(addr, mask, &rbconn->addr, family); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* same source network -> be counted! */ + unsigned int count; + count = check_hlist(net, &rbconn->hhead, tuple, &addit); + + tree_nodes_free(root, gc_nodes, gc_count); + if (!addit) + return count; + + if (!add_hlist(&rbconn->hhead, tuple, addr)) + return 0; /* hotdrop */ + + return count + 1; + } + + if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + /* only used for GC on hhead, retval and 'addit' ignored */ + check_hlist(net, &rbconn->hhead, tuple, &addit); + if (hlist_empty(&rbconn->hhead)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + no_gc = true; + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_nodes_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0).
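+ *
+ * Since no_gc is set before the restart and gc_nodes holds at most
+ * CONNLIMIT_GC_MAX_NODES entries, one lookup frees at most eight
+ * empty subtrees and restarts the walk at most once.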
+ */ + goto restart; + } + + /* no match, need to insert new node */ + rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + return 0; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(connlimit_rb_cachep, rbconn); + return 0; + } + + conn->tuple = *tuple; + conn->addr = *addr; + rbconn->addr = *addr; + + INIT_HLIST_HEAD(&rbconn->hhead); + hlist_add_head(&conn->node, &rbconn->hhead); + + rb_link_node(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + return 1; +} + +static int count_them(struct net *net, + struct xt_connlimit_data *data, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + u_int8_t family) +{ + struct rb_root *root; + int count; + u32 hash; + + if (family == NFPROTO_IPV6) { + hash = connlimit_iphash6(addr, mask); + root = &data->climit_root6[hash]; + } else { + hash = connlimit_iphash(addr->ip & mask->ip); + root = &data->climit_root4[hash]; + } + + spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + count = count_tree(net, root, tuple, addr, mask, family); + + spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + return count; +} + +static bool +connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_connlimit_info *info = par->matchinfo; + union nf_inet_addr addr; + struct nf_conntrack_tuple tuple; + const struct nf_conntrack_tuple *tuple_ptr = &tuple; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int connections; + + ct = nf_ct_get(skb, &ctinfo); + if (ct != NULL) + tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + par->family, &tuple)) + goto hotdrop; + + if (par->family == NFPROTO_IPV6) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? + &iph->daddr : &iph->saddr, sizeof(addr.ip6)); + } else { + const struct iphdr *iph = ip_hdr(skb); + addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? 
+ iph->daddr : iph->saddr; + } + + connections = count_them(net, info->data, tuple_ptr, &addr, + &info->mask, par->family); + if (connections == 0) + /* kmalloc failed, drop it entirely */ + goto hotdrop; + + return (connections > info->limit) ^ + !!(info->flags & XT_CONNLIMIT_INVERT); + + hotdrop: + par->hotdrop = true; + return false; +} + +static int connlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_connlimit_info *info = par->matchinfo; + unsigned int i; + int ret; + + if (unlikely(!connlimit_rnd)) { + u_int32_t rand; + + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&connlimit_rnd, 0, rand); + } + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for " + "address family %u\n", par->family); + return ret; + } + + /* init private data */ + info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); + if (info->data == NULL) { + nf_ct_l3proto_module_put(par->family); + return -ENOMEM; + } + + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + info->data->climit_root4[i] = RB_ROOT; + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + info->data->climit_root6[i] = RB_ROOT; + + return 0; +} + +static void destroy_tree(struct rb_root *r) +{ + struct xt_connlimit_conn *conn; + struct xt_connlimit_rb *rbconn; + struct hlist_node *n; + struct rb_node *node; + + while ((node = rb_first(r)) != NULL) { + rbconn = container_of(node, struct xt_connlimit_rb, node); + + rb_erase(node, r); + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) + kmem_cache_free(connlimit_conn_cachep, conn); + + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_connlimit_info *info = par->matchinfo; + unsigned int i; + + nf_ct_l3proto_module_put(par->family); + + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + destroy_tree(&info->data->climit_root4[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + destroy_tree(&info->data->climit_root6[i]); + + kfree(info->data); +} + +static struct xt_match connlimit_mt_reg __read_mostly = { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connlimit_mt_init(void) +{ + int ret, i; + + BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); + BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) + spin_lock_init(&xt_connlimit_locks[i]); + + connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", + sizeof(struct xt_connlimit_conn), + 0, 0, NULL); + if (!connlimit_conn_cachep) + return -ENOMEM; + + connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", + sizeof(struct xt_connlimit_rb), + 0, 0, NULL); + if (!connlimit_rb_cachep) { + kmem_cache_destroy(connlimit_conn_cachep); + return -ENOMEM; + } + ret = xt_register_match(&connlimit_mt_reg); + if (ret != 0) { + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); + } + return ret; +} + +static void __exit connlimit_mt_exit(void) +{ + xt_unregister_match(&connlimit_mt_reg); + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); +} + +module_init(connlimit_mt_init); +module_exit(connlimit_mt_exit); +MODULE_AUTHOR("Jan Engelhardt 
<jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: Number of connections matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_connlimit"); +MODULE_ALIAS("ip6t_connlimit"); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index e1803256c79..69f78e96fdb 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -1,8 +1,10 @@ -/* This kernel module matches connection mark values set by the - * CONNMARK target +/* + * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> - * by Henrik Nordstrom <hno@marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@medozas.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,136 +17,150 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connmark.h> -MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); -MODULE_DESCRIPTION("IP tables connmark match module"); +MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>"); +MODULE_DESCRIPTION("Xtables: connection mark operations"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_CONNMARK"); +MODULE_ALIAS("ip6t_CONNMARK"); MODULE_ALIAS("ipt_connmark"); +MODULE_ALIAS("ip6t_connmark"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static unsigned int +connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_connmark_info *info = matchinfo; - struct nf_conn *ct; + const struct xt_connmark_tginfo1 *info = par->targinfo; enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u_int32_t newmark; ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return 0; + if (ct == NULL) + return XT_CONTINUE; - return (((ct->mark) & info->mask) == info->mark) ^ info->invert; + switch (info->mode) { + case XT_CONNMARK_SET: + newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_SAVE: + newmark = (ct->mark & ~info->ctmask) ^ + (skb->mark & info->nfmask); + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_RESTORE: + newmark = (skb->mark & ~info->nfmask) ^ + (ct->mark & info->ctmask); + skb->mark = newmark; + break; + } + + return XT_CONTINUE; } -static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int connmark_tg_check(const struct xt_tgchk_param *par) { - struct xt_connmark_info *cm = matchinfo; + int ret; - if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { - printk(KERN_WARNING 
"connmark: only support 32bit mark\n"); - return 0; - } - if (nf_ct_l3proto_try_module_get(match->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); - return 0; - } - return 1; + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; } -static void -destroy(const struct xt_match *match, void *matchinfo) +static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->family); } -#ifdef CONFIG_COMPAT -struct compat_xt_connmark_info { - compat_ulong_t mark, mask; - u_int8_t invert; - u_int8_t __pad1; - u_int16_t __pad2; -}; +static bool +connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connmark_mtinfo1 *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; -static void compat_from_user(void *dst, void *src) + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static int connmark_mt_check(const struct xt_mtchk_param *par) { - struct compat_xt_connmark_info *cm = src; - struct xt_connmark_info m = { - .mark = cm->mark, - .mask = cm->mask, - .invert = cm->invert, - }; - memcpy(dst, &m, sizeof(m)); + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; } -static int compat_to_user(void __user *dst, void *src) +static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { - struct xt_connmark_info *m = src; - struct compat_xt_connmark_info cm = { - .mark = m->mark, - .mask = m->mask, - .invert = m->invert, - }; - return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; + nf_ct_l3proto_module_put(par->family); } -#endif /* CONFIG_COMPAT */ - -static struct xt_match xt_connmark_match[] = { - { - .name = "connmark", - .family = AF_INET, - .checkentry = checkentry, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_connmark_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_connmark_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, -#endif - .me = THIS_MODULE - }, - { - .name = "connmark", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_connmark_info), - .me = THIS_MODULE - }, + +static struct xt_target connmark_tg_reg __read_mostly = { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, }; -static int __init xt_connmark_init(void) +static struct xt_match connmark_mt_reg __read_mostly = { + .name = "connmark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_mt_check, + .match = connmark_mt, + .matchsize = sizeof(struct xt_connmark_mtinfo1), + .destroy = connmark_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connmark_mt_init(void) { - return xt_register_matches(xt_connmark_match, - ARRAY_SIZE(xt_connmark_match)); + int ret; + + ret = xt_register_target(&connmark_tg_reg); + if (ret < 0) + return ret; + ret = xt_register_match(&connmark_mt_reg); + if (ret < 0) { + xt_unregister_target(&connmark_tg_reg); + return ret; + } + return 0; } -static void __exit xt_connmark_fini(void) +static void __exit connmark_mt_exit(void) { - xt_unregister_matches(xt_connmark_match, ARRAY_SIZE(xt_connmark_match)); + xt_unregister_match(&connmark_mt_reg); + xt_unregister_target(&connmark_tg_reg); } -module_init(xt_connmark_init); -module_exit(xt_connmark_fini); +module_init(connmark_mt_init); +module_exit(connmark_mt_exit); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 189ded5f378..188404b9b00 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -1,212 +1,333 @@ -/* Kernel module to match connection tracking information. - * Superset of Rusty's minimalistic state match. +/* + * xt_conntrack - Netfilter module to match connection tracking + * information. (Superset of Rusty's minimalistic state match.) * - * (C) 2001 Marc Boucher (marc@mbsi.ca). + * (C) 2001 Marc Boucher (marc@mbsi.ca). + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
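+ *
+ * The canonical stateful-firewall rule built on this match is
+ * (usage sketch):
+ *	iptables -A INPUT -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT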
*/ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> +#include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_conntrack.h> #include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables connection tracking match module"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: connection tracking state match"); MODULE_ALIAS("ipt_conntrack"); +MODULE_ALIAS("ip6t_conntrack"); + +static bool +conntrack_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; + else + return false; +} + +static inline bool +conntrack_mt_origsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + &info->origsrc_addr, &info->origsrc_mask, family); +} + +static inline bool +conntrack_mt_origdst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, + &info->origdst_addr, &info->origdst_mask, family); +} + +static inline bool +conntrack_mt_replsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, + &info->replsrc_addr, &info->replsrc_mask, family); +} + +static inline bool +conntrack_mt_repldst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, + &info->repldst_addr, &info->repldst_mask, family); +} + +static inline bool +ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. 
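+ * The relevant part of the tuple layout is roughly (see the
+ * nf_conntrack tuple headers):
+ *	union nf_conntrack_man_proto {
+ *		__be16 all;
+ *		struct { __be16 port; } tcp;
+ *		struct { __be16 port; } udp;
+ *		...
+ *	};
+ * so a single ->src.u.all comparison covers every port-based protocol.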
*/ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + (tuple->src.u.all == info->origsrc_port) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + (tuple->dst.u.all == info->origdst_port) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + (tuple->src.u.all == info->replsrc_port) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + (tuple->dst.u.all == info->repldst_port) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + +static inline bool +port_match(u16 min, u16 max, u16 port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static inline bool +ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - const struct xt_conntrack_info *sinfo = matchinfo; - struct nf_conn *ct; + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. */ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + !port_match(info->origsrc_port, info->origsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + !port_match(info->origdst_port, info->origdst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + !port_match(info->replsrc_port, info->replsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + !port_match(info->repldst_port, info->repldst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + +static bool +conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, + u16 state_mask, u16 status_mask) +{ + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; unsigned int statebit; - ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); - -#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + ct = nf_ct_get(skb, &ctinfo); - if (ct == &nf_conntrack_untracked) - statebit = XT_CONNTRACK_STATE_UNTRACKED; - else if (ct) - statebit = XT_CONNTRACK_STATE_BIT(ctinfo); - else + if (ct) { + if (nf_ct_is_untracked(ct)) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + } else statebit = XT_CONNTRACK_STATE_INVALID; - if (sinfo->flags & XT_CONNTRACK_STATE) { - if (ct) { + if (info->match_flags & XT_CONNTRACK_STATE) { + if (ct != NULL) { if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_SNAT; if (test_bit(IPS_DST_NAT_BIT, &ct->status)) statebit 
|= XT_CONNTRACK_STATE_DNAT; } - if (FWINV((statebit & sinfo->statemask) == 0, - XT_CONNTRACK_STATE)) - return 0; + if (!!(state_mask & statebit) ^ + !(info->invert_flags & XT_CONNTRACK_STATE)) + return false; } - if (ct == NULL) { - if (sinfo->flags & ~XT_CONNTRACK_STATE) - return 0; - return 1; + if (ct == NULL) + return info->match_flags & XT_CONNTRACK_STATE; + if ((info->match_flags & XT_CONNTRACK_DIRECTION) && + (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ + !(info->invert_flags & XT_CONNTRACK_DIRECTION)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGSRC) + if (conntrack_mt_origsrc(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGDST) + if (conntrack_mt_origdst(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLSRC) + if (conntrack_mt_replsrc(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLDST) + if (conntrack_mt_repldst(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST)) + return false; + + if (par->match->revision != 3) { + if (!ct_proto_port_check(info, ct)) + return false; + } else { + if (!ct_proto_port_check_v3(par->matchinfo, ct)) + return false; } - if (sinfo->flags & XT_CONNTRACK_PROTO && - FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != - sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, - XT_CONNTRACK_PROTO)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_ORIGSRC && - FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip & - sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != - sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, - XT_CONNTRACK_ORIGSRC)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_ORIGDST && - FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip & - sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != - sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, - XT_CONNTRACK_ORIGDST)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_REPLSRC && - FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip & - sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != - sinfo->tuple[IP_CT_DIR_REPLY].src.ip, - XT_CONNTRACK_REPLSRC)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_REPLDST && - FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip & - sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != - sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, - XT_CONNTRACK_REPLDST)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_STATUS && - FWINV((ct->status & sinfo->statusmask) == 0, - XT_CONNTRACK_STATUS)) - return 0; - - if(sinfo->flags & XT_CONNTRACK_EXPIRES) { - unsigned long expires = timer_pending(&ct->timeout) ? 
- (ct->timeout.expires - jiffies)/HZ : 0; - - if (FWINV(!(expires >= sinfo->expires_min && - expires <= sinfo->expires_max), - XT_CONNTRACK_EXPIRES)) - return 0; + if ((info->match_flags & XT_CONNTRACK_STATUS) && + (!!(status_mask & ct->status) ^ + !(info->invert_flags & XT_CONNTRACK_STATUS))) + return false; + + if (info->match_flags & XT_CONNTRACK_EXPIRES) { + unsigned long expires = 0; + + if (timer_pending(&ct->timeout)) + expires = (ct->timeout.expires - jiffies) / HZ; + if ((expires >= info->expires_min && + expires <= info->expires_max) ^ + !(info->invert_flags & XT_CONNTRACK_EXPIRES)) + return false; } - return 1; + return true; } -static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); - return 0; - } - return 1; + const struct xt_conntrack_mtinfo1 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); } -static void destroy(const struct xt_match *match, void *matchinfo) +static bool +conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { - nf_ct_l3proto_module_put(match->family); + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); } -#ifdef CONFIG_COMPAT -struct compat_xt_conntrack_info +static bool +conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) { - compat_uint_t statemask; - compat_uint_t statusmask; - struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX]; - struct in_addr sipmsk[IP_CT_DIR_MAX]; - struct in_addr dipmsk[IP_CT_DIR_MAX]; - compat_ulong_t expires_min; - compat_ulong_t expires_max; - u_int8_t flags; - u_int8_t invflags; -}; + const struct xt_conntrack_mtinfo3 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static int conntrack_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} -static void compat_from_user(void *dst, void *src) -{ - struct compat_xt_conntrack_info *cm = src; - struct xt_conntrack_info m = { - .statemask = cm->statemask, - .statusmask = cm->statusmask, - .expires_min = cm->expires_min, - .expires_max = cm->expires_max, - .flags = cm->flags, - .invflags = cm->invflags, - }; - memcpy(m.tuple, cm->tuple, sizeof(m.tuple)); - memcpy(m.sipmsk, cm->sipmsk, sizeof(m.sipmsk)); - memcpy(m.dipmsk, cm->dipmsk, sizeof(m.dipmsk)); - memcpy(dst, &m, sizeof(m)); -} - -static int compat_to_user(void __user *dst, void *src) -{ - struct xt_conntrack_info *m = src; - struct compat_xt_conntrack_info cm = { - .statemask = m->statemask, - .statusmask = m->statusmask, - .expires_min = m->expires_min, - .expires_max = m->expires_max, - .flags = m->flags, - .invflags = m->invflags, - }; - memcpy(cm.tuple, m->tuple, sizeof(cm.tuple)); - memcpy(cm.sipmsk, m->sipmsk, sizeof(cm.sipmsk)); - memcpy(cm.dipmsk, m->dipmsk, sizeof(cm.dipmsk)); - return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; -} -#endif - -static struct xt_match conntrack_match = { - .name = "conntrack", - .match = match, - .checkentry = checkentry, - .destroy = destroy, - .matchsize = sizeof(struct xt_conntrack_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_conntrack_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, -#endif - .family = AF_INET, - .me = THIS_MODULE, +static struct xt_match conntrack_mt_reg[] __read_mostly = { + { + .name = "conntrack", + .revision = 1, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo1), + .match = conntrack_mt_v1, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 2, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo2), + .match = conntrack_mt_v2, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 3, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo3), + .match = conntrack_mt_v3, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, }; -static int __init xt_conntrack_init(void) +static int __init conntrack_mt_init(void) { - return xt_register_match(&conntrack_match); + return xt_register_matches(conntrack_mt_reg, + ARRAY_SIZE(conntrack_mt_reg)); } -static void __exit xt_conntrack_fini(void) +static void __exit conntrack_mt_exit(void) { - xt_unregister_match(&conntrack_match); + xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } -module_init(xt_conntrack_init); -module_exit(xt_conntrack_fini); +module_init(conntrack_mt_init); +module_exit(conntrack_mt_exit); diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c new file mode 100644 index 00000000000..c7a2e5466bc --- /dev/null +++ b/net/netfilter/xt_cpu.c @@ -0,0 +1,65 @@ +/* Kernel module to match running CPU */ + +/* + * Might be used to distribute connections on several daemons, if + * RPS (Receive Packet Steering) is enabled or NIC is multiqueue capable, + * each RX queue IRQ affined to one CPU (1:1 mapping) + * + */ + +/* (C) 2010 Eric Dumazet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
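+ *
+ * Usage sketch (assumes two web server instances listening on local
+ * ports 8080 and 8081):
+ *	iptables -t nat -A PREROUTING -p tcp --dport 80 \
+ *		-m cpu --cpu 0 -j REDIRECT --to-port 8080
+ *	iptables -t nat -A PREROUTING -p tcp --dport 80 \
+ *		-m cpu --cpu 1 -j REDIRECT --to-port 8081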
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_cpu.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); +MODULE_DESCRIPTION("Xtables: CPU match"); +MODULE_ALIAS("ipt_cpu"); +MODULE_ALIAS("ip6t_cpu"); + +static int cpu_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + return 0; +} + +static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + return (info->cpu == smp_processor_id()) ^ info->invert; +} + +static struct xt_match cpu_mt_reg __read_mostly = { + .name = "cpu", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cpu_mt_check, + .match = cpu_mt, + .matchsize = sizeof(struct xt_cpu_info), + .me = THIS_MODULE, +}; + +static int __init cpu_mt_init(void) +{ + return xt_register_match(&cpu_mt_reg); +} + +static void __exit cpu_mt_exit(void) +{ + xt_unregister_match(&cpu_mt_reg); +} + +module_init(cpu_mt_init); +module_exit(cpu_mt_exit); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 2c9c0dee8aa..b63d2a3d80b 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/spinlock.h> #include <net/ip.h> #include <linux/dccp.h> @@ -22,8 +23,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Match for DCCP protocol packets"); +MODULE_DESCRIPTION("Xtables: DCCP protocol packet match"); MODULE_ALIAS("ipt_dccp"); +MODULE_ALIAS("ip6t_dccp"); #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ || (!!((invflag) & (option)) ^ (cond))) @@ -31,40 +33,36 @@ MODULE_ALIAS("ipt_dccp"); static unsigned char *dccp_optbuf; static DEFINE_SPINLOCK(dccp_buflock); -static inline int +static inline bool dccp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, const struct dccp_hdr *dh, - int *hotdrop) + bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - unsigned char *op; + const unsigned char *op; unsigned int optoff = __dccp_hdr_len(dh); unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); unsigned int i; - if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) { - *hotdrop = 1; - return 0; - } + if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) + goto invalid; if (!optlen) - return 0; + return false; spin_lock_bh(&dccp_buflock); op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf); if (op == NULL) { /* If we don't have the whole header, drop packet. 
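 * (Setting *hotdrop makes the x_tables core drop the packet outright
 * rather than treating it as a plain non-match.)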
*/ - spin_unlock_bh(&dccp_buflock); - *hotdrop = 1; - return 0; + goto partial; } for (i = 0; i < optlen; ) { if (op[i] == option) { spin_unlock_bh(&dccp_buflock); - return 1; + return true; } if (op[i] < 2) @@ -74,94 +72,93 @@ dccp_find_option(u_int8_t option, } spin_unlock_bh(&dccp_buflock); - return 0; + return false; + +partial: + spin_unlock_bh(&dccp_buflock); +invalid: + *hotdrop = true; + return false; } -static inline int +static inline bool match_types(const struct dccp_hdr *dh, u_int16_t typemask) { - return (typemask & (1 << dh->dccph_type)); + return typemask & (1 << dh->dccph_type); } -static inline int +static inline bool match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, - const struct dccp_hdr *dh, int *hotdrop) + const struct dccp_hdr *dh, bool *hotdrop) { return dccp_find_option(option, skb, protoff, dh, hotdrop); } -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +dccp_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_dccp_info *info = matchinfo; - struct dccp_hdr _dh, *dh; + const struct xt_dccp_info *info = par->matchinfo; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; - if (offset) - return 0; + if (par->fragoff != 0) + return false; - dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh); + dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); if (dh == NULL) { - *hotdrop = 1; - return 0; + par->hotdrop = true; + return false; } - return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) - && (ntohs(dh->dccph_sport) <= info->spts[1])), + return DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0] + && ntohs(dh->dccph_sport) <= info->spts[1], XT_DCCP_SRC_PORTS, info->flags, info->invflags) - && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) - && (ntohs(dh->dccph_dport) <= info->dpts[1])), + && DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0] + && ntohs(dh->dccph_dport) <= info->dpts[1], XT_DCCP_DEST_PORTS, info->flags, info->invflags) && DCCHECK(match_types(dh, info->typemask), XT_DCCP_TYPE, info->flags, info->invflags) - && DCCHECK(match_option(info->option, skb, protoff, dh, - hotdrop), + && DCCHECK(match_option(info->option, skb, par->thoff, dh, + &par->hotdrop), XT_DCCP_OPTION, info->flags, info->invflags); } -static int -checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int dccp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_dccp_info *info = matchinfo; - - return !(info->flags & ~XT_DCCP_VALID_FLAGS) - && !(info->invflags & ~XT_DCCP_VALID_FLAGS) - && !(info->invflags & ~info->flags); + const struct xt_dccp_info *info = par->matchinfo; + + if (info->flags & ~XT_DCCP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~XT_DCCP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~info->flags) + return -EINVAL; + return 0; } -static struct xt_match xt_dccp_match[] = { +static struct xt_match dccp_mt_reg[] __read_mostly = { { .name = "dccp", - .family = AF_INET, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV4, + .checkentry = dccp_mt_check, + .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, { .name = "dccp", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV6, + .checkentry = dccp_mt_check, + 
.match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, }; -static int __init xt_dccp_init(void) +static int __init dccp_mt_init(void) { int ret; @@ -171,7 +168,7 @@ static int __init xt_dccp_init(void) dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); if (!dccp_optbuf) return -ENOMEM; - ret = xt_register_matches(xt_dccp_match, ARRAY_SIZE(xt_dccp_match)); + ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); if (ret) goto out_kfree; return ret; @@ -181,11 +178,11 @@ out_kfree: return ret; } -static void __exit xt_dccp_fini(void) +static void __exit dccp_mt_exit(void) { - xt_unregister_matches(xt_dccp_match, ARRAY_SIZE(xt_dccp_match)); + xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); kfree(dccp_optbuf); } -module_init(xt_dccp_init); -module_exit(xt_dccp_fini); +module_init(dccp_mt_init); +module_exit(dccp_mt_exit); diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c new file mode 100644 index 00000000000..d9202cdd25c --- /dev/null +++ b/net/netfilter/xt_devgroup.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> + +#include <linux/netfilter/xt_devgroup.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Device group match"); +MODULE_ALIAS("ipt_devgroup"); +MODULE_ALIAS("ip6t_devgroup"); + +static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & XT_DEVGROUP_MATCH_SRC && + (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^ + ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) + return false; + + if (info->flags & XT_DEVGROUP_MATCH_DST && + (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^ + ((info->flags & XT_DEVGROUP_INVERT_DST) ? 
1 : 0))) + return false; + + return true; +} + +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + if (info->flags & XT_DEVGROUP_MATCH_SRC && + par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD))) + return -EINVAL; + + if (info->flags & XT_DEVGROUP_MATCH_DST && + par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) + return -EINVAL; + + return 0; +} + +static struct xt_match devgroup_mt_reg __read_mostly = { + .name = "devgroup", + .match = devgroup_mt, + .checkentry = devgroup_mt_checkentry, + .matchsize = sizeof(struct xt_devgroup_info), + .family = NFPROTO_UNSPEC, + .me = THIS_MODULE +}; + +static int __init devgroup_mt_init(void) +{ + return xt_register_match(&devgroup_mt_reg); +} + +static void __exit devgroup_mt_exit(void) +{ + xt_unregister_match(&devgroup_mt_reg); +} + +module_init(devgroup_mt_init); +module_exit(devgroup_mt_exit); diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 56b247ecc28..64670fc5d0e 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -6,96 +6,110 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/dsfield.h> -#include <linux/netfilter/xt_dscp.h> #include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_dscp.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("x_tables DSCP matching module"); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_dscp"); MODULE_ALIAS("ip6t_dscp"); +MODULE_ALIAS("ipt_tos"); +MODULE_ALIAS("ip6t_tos"); -static int match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +dscp_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_dscp_info *info = matchinfo; + const struct xt_dscp_info *info = par->matchinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; } -static int match6(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +dscp_mt6(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_dscp_info *info = matchinfo; + const struct xt_dscp_info *info = par->matchinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; } -static int checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int dscp_mt_check(const struct xt_mtchk_param *par) { - const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp; + const struct xt_dscp_info *info = par->matchinfo; - if (dscp > XT_DSCP_MAX) { - printk(KERN_ERR "xt_dscp: dscp %x out of range\n", dscp); - return 0; + if (info->dscp > XT_DSCP_MAX) { + pr_info("dscp %x out of 
range\n", info->dscp); + return -EDOM; } - return 1; + return 0; +} + +static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_tos_match_info *info = par->matchinfo; + + if (par->family == NFPROTO_IPV4) + return ((ip_hdr(skb)->tos & info->tos_mask) == + info->tos_value) ^ !!info->invert; + else + return ((ipv6_get_dsfield(ipv6_hdr(skb)) & info->tos_mask) == + info->tos_value) ^ !!info->invert; } -static struct xt_match xt_dscp_match[] = { +static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "dscp", - .family = AF_INET, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV4, + .checkentry = dscp_mt_check, + .match = dscp_mt, .matchsize = sizeof(struct xt_dscp_info), .me = THIS_MODULE, }, { .name = "dscp", - .family = AF_INET6, - .checkentry = checkentry, - .match = match6, + .family = NFPROTO_IPV6, + .checkentry = dscp_mt_check, + .match = dscp_mt6, .matchsize = sizeof(struct xt_dscp_info), .me = THIS_MODULE, }, + { + .name = "tos", + .revision = 1, + .family = NFPROTO_IPV4, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = NFPROTO_IPV6, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, }; -static int __init xt_dscp_match_init(void) +static int __init dscp_mt_init(void) { - return xt_register_matches(xt_dscp_match, ARRAY_SIZE(xt_dscp_match)); + return xt_register_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); } -static void __exit xt_dscp_match_fini(void) +static void __exit dscp_mt_exit(void) { - xt_unregister_matches(xt_dscp_match, ARRAY_SIZE(xt_dscp_match)); + xt_unregister_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); } -module_init(xt_dscp_match_init); -module_exit(xt_dscp_match_fini); +module_init(dscp_mt_init); +module_exit(dscp_mt_exit); diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c new file mode 100644 index 00000000000..3c831a8efeb --- /dev/null +++ b/net/netfilter/xt_ecn.c @@ -0,0 +1,179 @@ +/* + * Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits + * + * (C) 2002 by Harald Welte <laforge@gnumonks.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ecn.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ecn"); +MODULE_ALIAS("ip6t_ecn"); + +static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ecn_info *einfo = par->matchinfo; + struct tcphdr _tcph; + const struct tcphdr *th; + + /* In practice, TCP match does this, so can't fail. But let's + * be good citizens. 
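+ * The checkentry functions below ensure these TCP-bit tests are only
+ * admitted in rules that also match "-p tcp", e.g. (usage sketch):
+ *	iptables -A INPUT -p tcp -m ecn --ecn-tcp-ece --ecn-tcp-cwr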
+ */ + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + + if (einfo->operation & XT_ECN_OP_MATCH_ECE) { + if (einfo->invert & XT_ECN_OP_MATCH_ECE) { + if (th->ece == 1) + return false; + } else { + if (th->ece == 0) + return false; + } + } + + if (einfo->operation & XT_ECN_OP_MATCH_CWR) { + if (einfo->invert & XT_ECN_OP_MATCH_CWR) { + if (th->cwr == 1) + return false; + } else { + if (th->cwr == 0) + return false; + } + } + + return true; +} + +static inline bool match_ip(const struct sk_buff *skb, + const struct xt_ecn_info *einfo) +{ + return ((ip_hdr(skb)->tos & XT_ECN_IP_MASK) == einfo->ip_ect) ^ + !!(einfo->invert & XT_ECN_OP_MATCH_IP); +} + +static bool ecn_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + + if (info->operation & XT_ECN_OP_MATCH_IP && !match_ip(skb, info)) + return false; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + !match_tcp(skb, par)) + return false; + + return true; +} + +static int ecn_mt_check4(const struct xt_mtchk_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + const struct ipt_ip *ip = par->entryinfo; + + if (info->operation & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->invert & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { + pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + return -EINVAL; + } + + return 0; +} + +static inline bool match_ipv6(const struct sk_buff *skb, + const struct xt_ecn_info *einfo) +{ + return (((ipv6_hdr(skb)->flow_lbl[0] >> 4) & XT_ECN_IP_MASK) == + einfo->ip_ect) ^ + !!(einfo->invert & XT_ECN_OP_MATCH_IP); +} + +static bool ecn_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + + if (info->operation & XT_ECN_OP_MATCH_IP && !match_ipv6(skb, info)) + return false; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + !match_tcp(skb, par)) + return false; + + return true; +} + +static int ecn_mt_check6(const struct xt_mtchk_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + + if (info->operation & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->invert & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) { + pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + return -EINVAL; + } + + return 0; +} + +static struct xt_match ecn_mt_reg[] __read_mostly = { + { + .name = "ecn", + .family = NFPROTO_IPV4, + .match = ecn_mt4, + .matchsize = sizeof(struct xt_ecn_info), + .checkentry = ecn_mt_check4, + .me = THIS_MODULE, + }, + { + .name = "ecn", + .family = NFPROTO_IPV6, + .match = ecn_mt6, + .matchsize = sizeof(struct xt_ecn_info), + .checkentry = ecn_mt_check6, + .me = THIS_MODULE, + }, +}; + +static int __init ecn_mt_init(void) +{ + return xt_register_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); +} + +static void __exit ecn_mt_exit(void) +{ + xt_unregister_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); +} + +module_init(ecn_mt_init); +module_exit(ecn_mt_exit); diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index 7c95f149d94..171ba82b590 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -6,7 +6,7 @@ * it 
under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/in.h> @@ -20,107 +20,88 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); -MODULE_DESCRIPTION("x_tables ESP SPI match module"); +MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match"); MODULE_ALIAS("ipt_esp"); MODULE_ALIAS("ip6t_esp"); -#if 0 -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - /* Returns 1 if the spi is matched by the range, 0 otherwise */ -static inline int -spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { - int r = 0; - duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ', - min, spi, max); + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; - duprintf(" result %s\n", r ? "PASS" : "FAILED"); + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool esp_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct ip_esp_hdr _esp, *eh; - const struct xt_esp *espinfo = matchinfo; + const struct ip_esp_hdr *eh; + struct ip_esp_hdr _esp; + const struct xt_esp *espinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) - return 0; + if (par->fragoff != 0) + return false; - eh = skb_header_pointer(skb, protoff, sizeof(_esp), &_esp); + eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp); if (eh == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ESP tinygram.\n"); - *hotdrop = 1; - return 0; + pr_debug("Dropping evil ESP tinygram.\n"); + par->hotdrop = true; + return false; } return spi_match(espinfo->spis[0], espinfo->spis[1], ntohl(eh->spi), !!(espinfo->invflags & XT_ESP_INV_SPI)); } -/* Called when user tries to insert an entry of this type. 
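[Aside: spi_match() above is just an inclusive range test with optional inversion. A self-contained sketch of the same idiom, with illustrative SPI values not taken from the patch:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool spi_in_range(uint32_t min, uint32_t max, uint32_t spi, bool invert)
{
        return (spi >= min && spi <= max) ^ invert;
}

int main(void)
{
        assert(spi_in_range(0x100, 0x1ff, 0x150, false));  /* inside, plain match */
        assert(!spi_in_range(0x100, 0x1ff, 0x150, true));  /* inside, inverted */
        assert(spi_in_range(0x100, 0x1ff, 0x200, true));   /* outside, inverted */
        return 0;
}

End of aside.]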
*/ -static int -checkentry(const char *tablename, - const void *ip_void, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int esp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_esp *espinfo = matchinfo; + const struct xt_esp *espinfo = par->matchinfo; if (espinfo->invflags & ~XT_ESP_INV_MASK) { - duprintf("xt_esp: unknown flags %X\n", espinfo->invflags); - return 0; + pr_debug("unknown flags %X\n", espinfo->invflags); + return -EINVAL; } - return 1; + return 0; } -static struct xt_match xt_esp_match[] = { +static struct xt_match esp_mt_reg[] __read_mostly = { { .name = "esp", - .family = AF_INET, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV4, + .checkentry = esp_mt_check, + .match = esp_mt, .matchsize = sizeof(struct xt_esp), .proto = IPPROTO_ESP, .me = THIS_MODULE, }, { .name = "esp", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV6, + .checkentry = esp_mt_check, + .match = esp_mt, .matchsize = sizeof(struct xt_esp), .proto = IPPROTO_ESP, .me = THIS_MODULE, }, }; -static int __init xt_esp_init(void) +static int __init esp_mt_init(void) { - return xt_register_matches(xt_esp_match, ARRAY_SIZE(xt_esp_match)); + return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); } -static void __exit xt_esp_cleanup(void) +static void __exit esp_mt_exit(void) { - xt_unregister_matches(xt_esp_match, ARRAY_SIZE(xt_esp_match)); + xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); } -module_init(xt_esp_init); -module_exit(xt_esp_cleanup); +module_init(esp_mt_init); +module_exit(esp_mt_exit); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index d3043fa32eb..a3910fc2122 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1,12 +1,14 @@ -/* iptables match extension to limit the number of packets per second - * seperately for each hashbucket (sourceip/sourceport/dstip/dstport) +/* + * xt_hashlimit - Netfilter module to limit the number of packets per time + * separately for each hashbucket (sourceip/sourceport/dstip/dstport) * - * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> - * - * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $ + * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * Development of this code was funded by Astaro AG, http://www.astaro.com/ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/random.h> @@ -20,7 +22,13 @@ #include <linux/mm.h> #include <linux/in.h> #include <linux/ip.h> +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #include <linux/ipv6.h> +#include <net/ipv6.h> +#endif + +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -30,13 +38,24 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables match for limiting per hash-bucket"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match"); MODULE_ALIAS("ipt_hashlimit"); MODULE_ALIAS("ip6t_hashlimit"); +struct hashlimit_net { + struct hlist_head htables; + struct proc_dir_entry *ipt_hashlimit; + struct proc_dir_entry *ip6t_hashlimit; +}; + +static int hashlimit_net_id; +static inline struct hashlimit_net *hashlimit_pernet(struct net 
*net) +{ + return net_generic(net, hashlimit_net_id); +} + /* need to declare this at the top */ -static struct proc_dir_entry *hashlimit_procdir4; -static struct proc_dir_entry *hashlimit_procdir6; static const struct file_operations dl_file_ops; /* hash table crap */ @@ -46,11 +65,13 @@ struct dsthash_dst { __be32 src; __be32 dst; } ip; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) struct { __be32 src[4]; __be32 dst[4]; } ip6; - } addr; +#endif + }; __be16 src_port; __be16 dst_port; }; @@ -61,40 +82,43 @@ struct dsthash_ent { struct dsthash_dst dst; /* modified structure members in the end */ + spinlock_t lock; unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ u_int32_t credit; u_int32_t credit_cap, cost; } rateinfo; + struct rcu_head rcu; }; struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ - atomic_t use; - int family; + int use; + u_int8_t family; + bool rnd_initialized; - struct hashlimit_cfg cfg; /* config */ + struct hashlimit_cfg1 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ - int rnd_initialized; unsigned int count; /* number entries in table */ struct timer_list timer; /* timer for gc */ /* seq_file stuff */ struct proc_dir_entry *pde; + const char *name; + struct net *net; struct hlist_head hash[0]; /* hashtable itself */ }; -static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ -static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */ -static HLIST_HEAD(hashlimit_htables); +static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ static struct kmem_cache *hashlimit_cachep __read_mostly; -static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) +static inline bool dst_cmp(const struct dsthash_ent *ent, + const struct dsthash_dst *b) { return !memcmp(&ent->dst, b, sizeof(ent->dst)); } @@ -102,98 +126,124 @@ static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) static u_int32_t hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) { - return jhash(dst, sizeof(*dst), ht->rnd) % ht->cfg.size; + u_int32_t hash = jhash2((const u32 *)dst, + sizeof(*dst)/sizeof(u32), + ht->rnd); + /* + * Instead of returning hash % ht->cfg.size (implying a divide) + * we return the high 32 bits of the (hash * ht->cfg.size) that will + * give results between [0 and cfg.size-1] and same hash distribution, + * but using a multiply, less expensive than a divide + */ + return ((u64)hash * ht->cfg.size) >> 32; } static struct dsthash_ent * -dsthash_find(const struct xt_hashlimit_htable *ht, struct dsthash_dst *dst) +dsthash_find(const struct xt_hashlimit_htable *ht, + const struct dsthash_dst *dst) { struct dsthash_ent *ent; - struct hlist_node *pos; u_int32_t hash = hash_dst(ht, dst); if (!hlist_empty(&ht->hash[hash])) { - hlist_for_each_entry(ent, pos, &ht->hash[hash], node) - if (dst_cmp(ent, dst)) + hlist_for_each_entry_rcu(ent, &ht->hash[hash], node) + if (dst_cmp(ent, dst)) { + spin_lock(&ent->lock); return ent; + } } return NULL; } /* allocate dsthash_ent, initialize dst, put in htable and lock it */ static struct dsthash_ent * -dsthash_alloc_init(struct xt_hashlimit_htable *ht, struct dsthash_dst *dst) +dsthash_alloc_init(struct xt_hashlimit_htable *ht, + const struct dsthash_dst *dst, bool *race) { struct dsthash_ent *ent; + spin_lock(&ht->lock); + + /* Two or more packets may race to create the same entry in the + * 
hashtable, double check if this packet lost race. + */ + ent = dsthash_find(ht, dst); + if (ent != NULL) { + spin_unlock(&ht->lock); + *race = true; + return ent; + } + /* initialize hash with random val at the time we allocate * the first hashtable entry */ - if (!ht->rnd_initialized) { - get_random_bytes(&ht->rnd, 4); - ht->rnd_initialized = 1; + if (unlikely(!ht->rnd_initialized)) { + get_random_bytes(&ht->rnd, sizeof(ht->rnd)); + ht->rnd_initialized = true; } if (ht->cfg.max && ht->count >= ht->cfg.max) { /* FIXME: do something. question is what.. */ - if (net_ratelimit()) - printk(KERN_WARNING - "xt_hashlimit: max count of %u reached\n", - ht->cfg.max); - return NULL; + net_err_ratelimited("max count of %u reached\n", ht->cfg.max); + ent = NULL; + } else + ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); + if (ent) { + memcpy(&ent->dst, dst, sizeof(ent->dst)); + spin_lock_init(&ent->lock); + + spin_lock(&ent->lock); + hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]); + ht->count++; } + spin_unlock(&ht->lock); + return ent; +} - ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); - if (!ent) { - if (net_ratelimit()) - printk(KERN_ERR - "xt_hashlimit: can't allocate dsthash_ent\n"); - return NULL; - } - memcpy(&ent->dst, dst, sizeof(ent->dst)); +static void dsthash_free_rcu(struct rcu_head *head) +{ + struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu); - hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]); - ht->count++; - return ent; + kmem_cache_free(hashlimit_cachep, ent); } static inline void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) { - hlist_del(&ent->node); - kmem_cache_free(hashlimit_cachep, ent); + hlist_del_rcu(&ent->node); + call_rcu_bh(&ent->rcu, dsthash_free_rcu); ht->count--; } static void htable_gc(unsigned long htlong); -static int htable_create(struct xt_hashlimit_info *minfo, int family) +static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, + u_int8_t family) { + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; unsigned int size; unsigned int i; - if (minfo->cfg.size) + if (minfo->cfg.size) { size = minfo->cfg.size; - else { - size = ((num_physpages << PAGE_SHIFT) / 16384) / + } else { + size = (totalram_pages << PAGE_SHIFT) / 16384 / sizeof(struct list_head); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; } /* FIXME: don't use vmalloc() here or anywhere else -HW */ hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + - sizeof(struct list_head) * size); - if (!hinfo) { - printk(KERN_ERR "xt_hashlimit: unable to create hashtable\n"); - return -1; - } + sizeof(struct list_head) * size); + if (hinfo == NULL) + return -ENOMEM; minfo->hinfo = hinfo; /* copy match config into hashtable config */ memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); hinfo->cfg.size = size; - if (!hinfo->cfg.max) + if (hinfo->cfg.max == 0) hinfo->cfg.max = 8 * hinfo->cfg.size; else if (hinfo->cfg.max < hinfo->cfg.size) hinfo->cfg.max = hinfo->cfg.size; @@ -201,45 +251,52 @@ static int htable_create(struct xt_hashlimit_info *minfo, int family) for (i = 0; i < hinfo->cfg.size; i++) INIT_HLIST_HEAD(&hinfo->hash[i]); - atomic_set(&hinfo->use, 1); + hinfo->use = 1; hinfo->count = 0; hinfo->family = family; - hinfo->rnd_initialized = 0; + hinfo->rnd_initialized = false; + hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + if (!hinfo->name) { + vfree(hinfo); + 
return -ENOMEM; + } spin_lock_init(&hinfo->lock); - hinfo->pde = create_proc_entry(minfo->name, 0, - family == AF_INET ? hashlimit_procdir4 : - hashlimit_procdir6); - if (!hinfo->pde) { + + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? + hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, + &dl_file_ops, hinfo); + if (hinfo->pde == NULL) { + kfree(hinfo->name); vfree(hinfo); - return -1; + return -ENOMEM; } - hinfo->pde->proc_fops = &dl_file_ops; - hinfo->pde->data = hinfo; + hinfo->net = net; - setup_timer(&hinfo->timer, htable_gc, (unsigned long )hinfo); + setup_timer(&hinfo->timer, htable_gc, (unsigned long)hinfo); hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); add_timer(&hinfo->timer); - spin_lock_bh(&hashlimit_lock); - hlist_add_head(&hinfo->node, &hashlimit_htables); - spin_unlock_bh(&hashlimit_lock); + hlist_add_head(&hinfo->node, &hashlimit_net->htables); return 0; } -static int select_all(struct xt_hashlimit_htable *ht, struct dsthash_ent *he) +static bool select_all(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he) { return 1; } -static int select_gc(struct xt_hashlimit_htable *ht, struct dsthash_ent *he) +static bool select_gc(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he) { - return (jiffies >= he->expires); + return time_after_eq(jiffies, he->expires); } static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, - int (*select)(struct xt_hashlimit_htable *ht, - struct dsthash_ent *he)) + bool (*select)(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he)) { unsigned int i; @@ -247,8 +304,8 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); for (i = 0; i < ht->cfg.size; i++) { struct dsthash_ent *dh; - struct hlist_node *pos, *n; - hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + struct hlist_node *n; + hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { if ((*select)(ht, dh)) dsthash_free(ht, dh); } @@ -268,46 +325,54 @@ static void htable_gc(unsigned long htlong) add_timer(&ht->timer); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) +static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) { - /* remove timer, if it is pending */ - if (timer_pending(&hinfo->timer)) - del_timer(&hinfo->timer); + struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); + struct proc_dir_entry *parent; + + if (hinfo->family == NFPROTO_IPV4) + parent = hashlimit_net->ipt_hashlimit; + else + parent = hashlimit_net->ip6t_hashlimit; - /* remove proc entry */ - remove_proc_entry(hinfo->pde->name, - hinfo->family == AF_INET ? 
hashlimit_procdir4 : - hashlimit_procdir6); + if (parent != NULL) + remove_proc_entry(hinfo->name, parent); +} + +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ + del_timer_sync(&hinfo->timer); + htable_remove_proc_entry(hinfo); htable_selective_cleanup(hinfo, select_all); + kfree(hinfo->name); vfree(hinfo); } -static struct xt_hashlimit_htable *htable_find_get(char *name, int family) +static struct xt_hashlimit_htable *htable_find_get(struct net *net, + const char *name, + u_int8_t family) { + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; - struct hlist_node *pos; - spin_lock_bh(&hashlimit_lock); - hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { - if (!strcmp(name, hinfo->pde->name) && + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { + if (!strcmp(name, hinfo->name) && hinfo->family == family) { - atomic_inc(&hinfo->use); - spin_unlock_bh(&hashlimit_lock); + hinfo->use++; return hinfo; } } - spin_unlock_bh(&hashlimit_lock); return NULL; } static void htable_put(struct xt_hashlimit_htable *hinfo) { - if (atomic_dec_and_test(&hinfo->use)) { - spin_lock_bh(&hashlimit_lock); + mutex_lock(&hashlimit_mutex); + if (--hinfo->use == 0) { hlist_del(&hinfo->node); - spin_unlock_bh(&hashlimit_lock); htable_destroy(hinfo); } + mutex_unlock(&hashlimit_mutex); } /* The algorithm used is the Simple Token Bucket Filter (TBF) @@ -346,9 +411,20 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) +/* in byte mode, the lowest possible rate is one packet/second. + * credit_cap is used as a counter that tells us how many times we can + * refill the "credits available" counter when it becomes empty. + */ +#define MAX_CPJ_BYTES (0xFFFFFFFF / HZ) +#define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES) + +static u32 xt_hashlimit_len_to_chunks(u32 len) +{ + return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1; +} + /* Precision saver. */ -static inline u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user) { /* If multiplying would overflow... 
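[Aside: the credit arithmetic around user2credits()/rateinfo_recalc() is a Simple Token Bucket Filter: credit accrues per elapsed jiffy up to a cap, and each matching packet pays a fixed cost. A toy model under stated assumptions (TOY_HZ and TOY_CPJ are illustrative constants; the real code additionally rounds CREDITS_PER_JIFFY down to a power of two via POW2_BELOW32):

#include <stdbool.h>
#include <stdint.h>

#define TOY_HZ  100
#define TOY_CPJ (0xFFFFFFFFu / (TOY_HZ * 60 * 60 * 24))  /* like CREDITS_PER_JIFFY */

struct bucket {
        uint32_t credit, cap, cost;
        unsigned long prev;
};

static bool bucket_take(struct bucket *b, unsigned long now)
{
        b->credit += (uint32_t)((now - b->prev) * TOY_CPJ); /* refill */
        if (b->credit > b->cap)
                b->credit = b->cap;                          /* clamp burst */
        b->prev = now;
        if (b->credit >= b->cost) {
                b->credit -= b->cost;
                return true;    /* under the limit: rule matches */
        }
        return false;           /* over the limit */
}

int main(void)
{
        struct bucket b = { .credit = 1000, .cap = 1000, .cost = 300, .prev = 0 };
        /* three packets fit in the initial burst, the fourth must wait */
        int ok = bucket_take(&b, 0) + bucket_take(&b, 0) + bucket_take(&b, 0);
        return (ok == 3 && !bucket_take(&b, 0)) ? 0 : 1;
}

End of aside.]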
*/ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -358,70 +434,148 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; } -static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +static u32 user2credits_byte(u32 user) { - dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; - if (dh->rateinfo.credit > dh->rateinfo.credit_cap) - dh->rateinfo.credit = dh->rateinfo.credit_cap; + u64 us = user; + us *= HZ * CREDITS_PER_JIFFY_BYTES; + return (u32) (us >> 32); +} + +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) +{ + unsigned long delta = now - dh->rateinfo.prev; + u32 cap; + + if (delta == 0) + return; + dh->rateinfo.prev = now; + + if (mode & XT_HASHLIMIT_BYTES) { + u32 tmp = dh->rateinfo.credit; + dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; + cap = CREDITS_PER_JIFFY_BYTES * HZ; + if (tmp >= dh->rateinfo.credit) {/* overflow */ + dh->rateinfo.credit = cap; + return; + } + } else { + dh->rateinfo.credit += delta * CREDITS_PER_JIFFY; + cap = dh->rateinfo.credit_cap; + } + if (dh->rateinfo.credit > cap) + dh->rateinfo.credit = cap; +} + +static void rateinfo_init(struct dsthash_ent *dh, + struct xt_hashlimit_htable *hinfo) +{ + dh->rateinfo.prev = jiffies; + if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { + dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; + dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg); + dh->rateinfo.credit_cap = hinfo->cfg.burst; + } else { + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + dh->rateinfo.credit_cap = dh->rateinfo.credit; + } +} + +static inline __be32 maskl(__be32 a, unsigned int l) +{ + return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0; } +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static void hashlimit_ipv6_mask(__be32 *i, unsigned int p) +{ + switch (p) { + case 0 ... 31: + i[0] = maskl(i[0], p); + i[1] = i[2] = i[3] = 0; + break; + case 32 ... 63: + i[1] = maskl(i[1], p - 32); + i[2] = i[3] = 0; + break; + case 64 ... 95: + i[2] = maskl(i[2], p - 64); + i[3] = 0; + break; + case 96 ... 
127: + i[3] = maskl(i[3], p - 96); + break; + case 128: + break; + } +} +#endif + static int -hashlimit_init_dst(struct xt_hashlimit_htable *hinfo, struct dsthash_dst *dst, +hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, + struct dsthash_dst *dst, const struct sk_buff *skb, unsigned int protoff) { __be16 _ports[2], *ports; - int nexthdr; + u8 nexthdr; + int poff; memset(dst, 0, sizeof(*dst)); switch (hinfo->family) { - case AF_INET: + case NFPROTO_IPV4: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) - dst->addr.ip.dst = ip_hdr(skb)->daddr; + dst->ip.dst = maskl(ip_hdr(skb)->daddr, + hinfo->cfg.dstmask); if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) - dst->addr.ip.src = ip_hdr(skb)->saddr; + dst->ip.src = maskl(ip_hdr(skb)->saddr, + hinfo->cfg.srcmask); if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; nexthdr = ip_hdr(skb)->protocol; break; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - case AF_INET6: - if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) - memcpy(&dst->addr.ip6.dst, &ipv6_hdr(skb)->daddr, - sizeof(dst->addr.ip6.dst)); - if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) - memcpy(&dst->addr.ip6.src, &ipv6_hdr(skb)->saddr, - sizeof(dst->addr.ip6.src)); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + case NFPROTO_IPV6: + { + __be16 frag_off; + + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) { + memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr, + sizeof(dst->ip6.dst)); + hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask); + } + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) { + memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr, + sizeof(dst->ip6.src)); + hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask); + } if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; - nexthdr = ipv6_find_hdr(skb, &protoff, -1, NULL); - if (nexthdr < 0) + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); + if ((int)protoff < 0) return -1; break; + } #endif default: BUG(); return 0; } - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - ports = skb_header_pointer(skb, protoff, sizeof(_ports), + poff = proto_ports_offset(nexthdr); + if (poff >= 0) { + ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), &_ports); - break; - default: + } else { _ports[0] = _ports[1] = 0; ports = _ports; - break; } if (!ports) return -1; @@ -432,181 +586,167 @@ hashlimit_init_dst(struct xt_hashlimit_htable *hinfo, struct dsthash_dst *dst, return 0; } -static int -hashlimit_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - struct xt_hashlimit_info *r = - ((struct xt_hashlimit_info *)matchinfo)->u.master; - struct xt_hashlimit_htable *hinfo = r->hinfo; +static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) +{ + u64 tmp = xt_hashlimit_len_to_chunks(len); + tmp = tmp * dh->rateinfo.cost; + + if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ)) + tmp = CREDITS_PER_JIFFY_BYTES * HZ; + + if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) { + dh->rateinfo.credit_cap--; + dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; + } + return (u32) tmp; +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct 
xt_hashlimit_htable *hinfo = info->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; + bool race = false; + u32 cost; - if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0) + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; - spin_lock_bh(&hinfo->lock); + rcu_read_lock_bh(); dh = dsthash_find(hinfo, &dst); - if (!dh) { - dh = dsthash_alloc_init(hinfo, &dst); - if (!dh) { - spin_unlock_bh(&hinfo->lock); + if (dh == NULL) { + dh = dsthash_alloc_init(hinfo, &dst, &race); + if (dh == NULL) { + rcu_read_unlock_bh(); goto hotdrop; + } else if (race) { + /* Already got an entry, update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_recalc(dh, now, hinfo->cfg.mode); + } else { + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_init(dh, hinfo); } - - dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); - dh->rateinfo.prev = jiffies; - dh->rateinfo.credit = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.cost = user2credits(hinfo->cfg.avg); } else { /* update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now); + rateinfo_recalc(dh, now, hinfo->cfg.mode); } - if (dh->rateinfo.credit >= dh->rateinfo.cost) { - /* We're underlimit. */ - dh->rateinfo.credit -= dh->rateinfo.cost; - spin_unlock_bh(&hinfo->lock); - return 1; + if (info->cfg.mode & XT_HASHLIMIT_BYTES) + cost = hashlimit_byte_cost(skb->len, dh); + else + cost = dh->rateinfo.cost; + + if (dh->rateinfo.credit >= cost) { + /* below the limit */ + dh->rateinfo.credit -= cost; + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + return !(info->cfg.mode & XT_HASHLIMIT_INVERT); } - spin_unlock_bh(&hinfo->lock); - - /* default case: we're overlimit, thus don't match */ - return 0; + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + /* default match is underlimit - so over the limit, we need to invert */ + return info->cfg.mode & XT_HASHLIMIT_INVERT; -hotdrop: - *hotdrop = 1; - return 0; + hotdrop: + par->hotdrop = true; + return false; } -static int -hashlimit_checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int hashlimit_mt_check(const struct xt_mtchk_param *par) { - struct xt_hashlimit_info *r = matchinfo; - - /* Check for overflow. */ - if (r->cfg.burst == 0 || - user2credits(r->cfg.avg * r->cfg.burst) < user2credits(r->cfg.avg)) { - printk(KERN_ERR "xt_hashlimit: overflow, try lower: %u/%u\n", - r->cfg.avg, r->cfg.burst); - return 0; + struct net *net = par->net; + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + int ret; + + if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) + return -EINVAL; + if (info->name[sizeof(info->name)-1] != '\0') + return -EINVAL; + if (par->family == NFPROTO_IPV4) { + if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) + return -EINVAL; + } else { + if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) + return -EINVAL; } - if (r->cfg.mode == 0 || - r->cfg.mode > (XT_HASHLIMIT_HASH_DPT | - XT_HASHLIMIT_HASH_DIP | - XT_HASHLIMIT_HASH_SIP | - XT_HASHLIMIT_HASH_SPT)) - return 0; - if (!r->cfg.gc_interval) - return 0; - if (!r->cfg.expire) - return 0; - if (r->name[sizeof(r->name) - 1] != '\0') - return 0; - /* This is the best we've got: We cannot release and re-grab lock, - * since checkentry() is called before x_tables.c grabs xt_mutex. 
- * We also cannot grab the hashtable spinlock, since htable_create will - * call vmalloc, and that can sleep. And we cannot just re-search - * the list of htable's in htable_create(), since then we would - * create duplicate proc files. -HW */ - mutex_lock(&hlimit_mutex); - r->hinfo = htable_find_get(r->name, match->family); - if (!r->hinfo && htable_create(r, match->family) != 0) { - mutex_unlock(&hlimit_mutex); - return 0; + if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { + pr_info("Unknown mode mask %X, kernel too old?\n", + info->cfg.mode); + return -EINVAL; } - mutex_unlock(&hlimit_mutex); - - /* Ugly hack: For SMP, we only want to use one set */ - r->u.master = r; - return 1; -} -static void -hashlimit_destroy(const struct xt_match *match, void *matchinfo) -{ - struct xt_hashlimit_info *r = matchinfo; - - htable_put(r->hinfo); -} - -#ifdef CONFIG_COMPAT -struct compat_xt_hashlimit_info { - char name[IFNAMSIZ]; - struct hashlimit_cfg cfg; - compat_uptr_t hinfo; - compat_uptr_t master; -}; - -static void compat_from_user(void *dst, void *src) -{ - int off = offsetof(struct compat_xt_hashlimit_info, hinfo); + /* Check for overflow. */ + if (info->cfg.mode & XT_HASHLIMIT_BYTES) { + if (user2credits_byte(info->cfg.avg) == 0) { + pr_info("overflow, rate too high: %u\n", info->cfg.avg); + return -EINVAL; + } + } else if (info->cfg.burst == 0 || + user2credits(info->cfg.avg * info->cfg.burst) < + user2credits(info->cfg.avg)) { + pr_info("overflow, try lower: %u/%u\n", + info->cfg.avg, info->cfg.burst); + return -ERANGE; + } - memcpy(dst, src, off); - memset(dst + off, 0, sizeof(struct compat_xt_hashlimit_info) - off); + mutex_lock(&hashlimit_mutex); + info->hinfo = htable_find_get(net, info->name, par->family); + if (info->hinfo == NULL) { + ret = htable_create(net, info, par->family); + if (ret < 0) { + mutex_unlock(&hashlimit_mutex); + return ret; + } + } + mutex_unlock(&hashlimit_mutex); + return 0; } -static int compat_to_user(void __user *dst, void *src) +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) { - int off = offsetof(struct compat_xt_hashlimit_info, hinfo); + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; - return copy_to_user(dst, src, off) ? 
-EFAULT : 0; + htable_put(info->hinfo); } -#endif -static struct xt_match xt_hashlimit[] = { +static struct xt_match hashlimit_mt_reg[] __read_mostly = { { - .name = "hashlimit", - .family = AF_INET, - .match = hashlimit_match, - .matchsize = sizeof(struct xt_hashlimit_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_hashlimit_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, -#endif - .checkentry = hashlimit_checkentry, - .destroy = hashlimit_destroy, - .me = THIS_MODULE + .name = "hashlimit", + .revision = 1, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { - .name = "hashlimit", - .family = AF_INET6, - .match = hashlimit_match, - .matchsize = sizeof(struct xt_hashlimit_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_hashlimit_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, -#endif - .checkentry = hashlimit_checkentry, - .destroy = hashlimit_destroy, - .me = THIS_MODULE + .name = "hashlimit", + .revision = 1, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, }, +#endif }; /* PROC stuff */ static void *dl_seq_start(struct seq_file *s, loff_t *pos) + __acquires(htable->lock) { - struct proc_dir_entry *pde = s->private; - struct xt_hashlimit_htable *htable = pde->data; + struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket; spin_lock_bh(&htable->lock); @@ -623,8 +763,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct proc_dir_entry *pde = s->private; - struct xt_hashlimit_htable *htable = pde->data; + struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket = (unsigned int *)v; *pos = ++(*bucket); @@ -636,65 +775,72 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) } static void dl_seq_stop(struct seq_file *s, void *v) + __releases(htable->lock) { - struct proc_dir_entry *pde = s->private; - struct xt_hashlimit_htable *htable = pde->data; + struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket = (unsigned int *)v; - kfree(bucket); + if (!IS_ERR(bucket)) + kfree(bucket); spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show(struct dsthash_ent *ent, int family, +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { + int res; + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies); + rateinfo_recalc(ent, jiffies, ht->cfg.mode); switch (family) { - case AF_INET: - return seq_printf(s, "%ld %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u %u %u %u\n", + case NFPROTO_IPV4: + res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", (long)(ent->expires - jiffies)/HZ, - NIPQUAD(ent->dst.addr.ip.src), + &ent->dst.ip.src, ntohs(ent->dst.src_port), - NIPQUAD(ent->dst.addr.ip.dst), + &ent->dst.ip.dst, ntohs(ent->dst.dst_port), ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); - case AF_INET6: - return seq_printf(s, "%ld " NIP6_FMT ":%u->" - NIP6_FMT ":%u %u %u %u\n", + break; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + case NFPROTO_IPV6: + res = seq_printf(s, "%ld 
%pI6:%u->%pI6:%u %u %u %u\n", (long)(ent->expires - jiffies)/HZ, - NIP6(*(struct in6_addr *)&ent->dst.addr.ip6.src), + &ent->dst.ip6.src, ntohs(ent->dst.src_port), - NIP6(*(struct in6_addr *)&ent->dst.addr.ip6.dst), + &ent->dst.ip6.dst, ntohs(ent->dst.dst_port), ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); + break; +#endif default: BUG(); - return 0; + res = 0; } + spin_unlock(&ent->lock); + return res; } static int dl_seq_show(struct seq_file *s, void *v) { - struct proc_dir_entry *pde = s->private; - struct xt_hashlimit_htable *htable = pde->data; + struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; - struct hlist_node *pos; if (!hlist_empty(&htable->hash[*bucket])) { - hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) + hlist_for_each_entry(ent, &htable->hash[*bucket], node) if (dl_seq_real_show(ent, htable->family, s)) - return 1; + return -1; } return 0; } -static struct seq_operations dl_seq_ops = { +static const struct seq_operations dl_seq_ops = { .start = dl_seq_start, .next = dl_seq_next, .stop = dl_seq_stop, @@ -707,7 +853,7 @@ static int dl_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode); + sf->private = PDE_DATA(inode); } return ret; } @@ -720,53 +866,103 @@ static const struct file_operations dl_file_ops = { .release = seq_release }; -static int __init xt_hashlimit_init(void) +static int __net_init hashlimit_proc_net_init(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net); + if (!hashlimit_net->ipt_hashlimit) + return -ENOMEM; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); + if (!hashlimit_net->ip6t_hashlimit) { + remove_proc_entry("ipt_hashlimit", net->proc_net); + return -ENOMEM; + } +#endif + return 0; +} + +static void __net_exit hashlimit_proc_net_exit(struct net *net) +{ + struct xt_hashlimit_htable *hinfo; + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + /* hashlimit_net_exit() is called before hashlimit_mt_destroy(). + * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc + * entries is empty before trying to remove it. 
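[Aside: the conversion to per-namespace proc directories above follows the standard pernet pattern: declare an id and a size, and the core allocates one private blob per struct net, retrievable with net_generic(). A minimal sketch (demo_* names are illustrative, error handling trimmed):

#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct demo_net {
        struct hlist_head tables;       /* per-namespace state */
};

static int demo_net_id;

static int __net_init demo_net_init(struct net *net)
{
        struct demo_net *dn = net_generic(net, demo_net_id);

        INIT_HLIST_HEAD(&dn->tables);
        return 0;
}

static void __net_exit demo_net_exit(struct net *net)
{
        /* per-namespace teardown would walk dn->tables here */
}

static struct pernet_operations demo_net_ops = {
        .init = demo_net_init,
        .exit = demo_net_exit,
        .id   = &demo_net_id,
        .size = sizeof(struct demo_net),        /* core kzallocs this per net */
};

static int __init demo_init(void)
{
        return register_pernet_subsys(&demo_net_ops);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
        unregister_pernet_subsys(&demo_net_ops);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");

End of aside.]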
+ */ + mutex_lock(&hashlimit_mutex); + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) + htable_remove_proc_entry(hinfo); + hashlimit_net->ipt_hashlimit = NULL; + hashlimit_net->ip6t_hashlimit = NULL; + mutex_unlock(&hashlimit_mutex); + + remove_proc_entry("ipt_hashlimit", net->proc_net); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + remove_proc_entry("ip6t_hashlimit", net->proc_net); +#endif +} + +static int __net_init hashlimit_net_init(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + INIT_HLIST_HEAD(&hashlimit_net->htables); + return hashlimit_proc_net_init(net); +} + +static void __net_exit hashlimit_net_exit(struct net *net) +{ + hashlimit_proc_net_exit(net); +} + +static struct pernet_operations hashlimit_net_ops = { + .init = hashlimit_net_init, + .exit = hashlimit_net_exit, + .id = &hashlimit_net_id, + .size = sizeof(struct hashlimit_net), +}; + +static int __init hashlimit_mt_init(void) { int err; - err = xt_register_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); + err = register_pernet_subsys(&hashlimit_net_ops); + if (err < 0) + return err; + err = xt_register_matches(hashlimit_mt_reg, + ARRAY_SIZE(hashlimit_mt_reg)); if (err < 0) goto err1; err = -ENOMEM; hashlimit_cachep = kmem_cache_create("xt_hashlimit", sizeof(struct dsthash_ent), 0, 0, - NULL, NULL); + NULL); if (!hashlimit_cachep) { - printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n"); + pr_warning("unable to create slab cache\n"); goto err2; } - hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net); - if (!hashlimit_procdir4) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); - goto err3; - } - hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net); - if (!hashlimit_procdir6) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); - goto err4; - } return 0; -err4: - remove_proc_entry("ipt_hashlimit", proc_net); -err3: - kmem_cache_destroy(hashlimit_cachep); + err2: - xt_unregister_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); err1: + unregister_pernet_subsys(&hashlimit_net_ops); return err; } -static void __exit xt_hashlimit_fini(void) +static void __exit hashlimit_mt_exit(void) { - remove_proc_entry("ipt_hashlimit", proc_net); - remove_proc_entry("ip6t_hashlimit", proc_net); + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); + unregister_pernet_subsys(&hashlimit_net_ops); + + rcu_barrier_bh(); kmem_cache_destroy(hashlimit_cachep); - xt_unregister_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); } -module_init(xt_hashlimit_init); -module_exit(xt_hashlimit_fini); +module_init(hashlimit_mt_init); +module_exit(hashlimit_mt_exit); diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index c139b2f43a1..9f4ab00c805 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> @@ -18,119 +18,82 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); -MODULE_DESCRIPTION("iptables helper match module"); +MODULE_DESCRIPTION("Xtables: Related connection matching"); MODULE_ALIAS("ipt_helper"); MODULE_ALIAS("ip6t_helper"); -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +helper_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_helper_info *info = matchinfo; - struct nf_conn *ct; - struct nf_conn_help *master_help; + const struct xt_helper_info *info = par->matchinfo; + const struct nf_conn *ct; + const struct nf_conn_help *master_help; + const struct nf_conntrack_helper *helper; enum ip_conntrack_info ctinfo; - int ret = info->invert; + bool ret = info->invert; - ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); - if (!ct) { - DEBUGP("xt_helper: Eek! invalid conntrack?\n"); + ct = nf_ct_get(skb, &ctinfo); + if (!ct || !ct->master) return ret; - } - - if (!ct->master) { - DEBUGP("xt_helper: conntrack %p has no master\n", ct); - return ret; - } - read_lock_bh(&nf_conntrack_lock); master_help = nfct_help(ct->master); - if (!master_help || !master_help->helper) { - DEBUGP("xt_helper: master ct %p has no helper\n", - exp->expectant); - goto out_unlock; - } + if (!master_help) + return ret; - DEBUGP("master's name = %s , info->name = %s\n", - ct->master->helper->name, info->name); + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(master_help->helper); + if (!helper) + return ret; if (info->name[0] == '\0') - ret ^= 1; + ret = !ret; else - ret ^= !strncmp(master_help->helper->name, info->name, - strlen(master_help->helper->name)); -out_unlock: - read_unlock_bh(&nf_conntrack_lock); + ret ^= !strncmp(helper->name, info->name, + strlen(helper->name)); return ret; } -static int check(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int helper_mt_check(const struct xt_mtchk_param *par) { - struct xt_helper_info *info = matchinfo; + struct xt_helper_info *info = par->matchinfo; + int ret; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); - return 0; + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; } info->name[29] = '\0'; - return 1; + return 0; } -static void -destroy(const struct xt_match *match, void *matchinfo) +static void helper_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_match xt_helper_match[] = { - { - .name = "helper", - .family = AF_INET, - .checkentry = check, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_helper_info), - .me = THIS_MODULE, - }, - { - .name = "helper", - .family = AF_INET6, - .checkentry = check, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_helper_info), - .me = THIS_MODULE, - }, +static struct xt_match helper_mt_reg __read_mostly = { + .name = "helper", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = helper_mt_check, + .match = helper_mt, + .destroy = helper_mt_destroy, + .matchsize = sizeof(struct xt_helper_info), + .me = THIS_MODULE, }; -static int __init xt_helper_init(void) +static int __init helper_mt_init(void) { - return xt_register_matches(xt_helper_match, - ARRAY_SIZE(xt_helper_match)); + return xt_register_match(&helper_mt_reg); } -static void __exit xt_helper_fini(void) +static void __exit 
helper_mt_exit(void) { - xt_unregister_matches(xt_helper_match, ARRAY_SIZE(xt_helper_match)); + xt_unregister_match(&helper_mt_reg); } -module_init(xt_helper_init); -module_exit(xt_helper_fini); - +module_init(helper_mt_init); +module_exit(helper_mt_exit); diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c new file mode 100644 index 00000000000..003951149c9 --- /dev/null +++ b/net/netfilter/xt_hl.c @@ -0,0 +1,96 @@ +/* + * IP tables module for matching the value of the TTL + * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> + * + * Hop Limit matching module + * (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_ttl.h> +#include <linux/netfilter_ipv6/ip6t_hl.h> + +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL field match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ttl"); +MODULE_ALIAS("ip6t_hl"); + +static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ipt_ttl_info *info = par->matchinfo; + const u8 ttl = ip_hdr(skb)->ttl; + + switch (info->mode) { + case IPT_TTL_EQ: + return ttl == info->ttl; + case IPT_TTL_NE: + return ttl != info->ttl; + case IPT_TTL_LT: + return ttl < info->ttl; + case IPT_TTL_GT: + return ttl > info->ttl; + } + + return false; +} + +static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_hl_info *info = par->matchinfo; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_EQ: + return ip6h->hop_limit == info->hop_limit; + case IP6T_HL_NE: + return ip6h->hop_limit != info->hop_limit; + case IP6T_HL_LT: + return ip6h->hop_limit < info->hop_limit; + case IP6T_HL_GT: + return ip6h->hop_limit > info->hop_limit; + } + + return false; +} + +static struct xt_match hl_mt_reg[] __read_mostly = { + { + .name = "ttl", + .revision = 0, + .family = NFPROTO_IPV4, + .match = ttl_mt, + .matchsize = sizeof(struct ipt_ttl_info), + .me = THIS_MODULE, + }, + { + .name = "hl", + .revision = 0, + .family = NFPROTO_IPV6, + .match = hl_mt6, + .matchsize = sizeof(struct ip6t_hl_info), + .me = THIS_MODULE, + }, +}; + +static int __init hl_mt_init(void) +{ + return xt_register_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +static void __exit hl_mt_exit(void) +{ + xt_unregister_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +module_init(hl_mt_init); +module_exit(hl_mt_exit); diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c new file mode 100644 index 00000000000..89d53104c6b --- /dev/null +++ b/net/netfilter/xt_ipcomp.c @@ -0,0 +1,111 @@ +/* Kernel module to match IPComp parameters for IPv4 and IPv6 + * + * Copyright (C) 2013 WindRiver + * + * Author: + * Fan Du <fan.du@windriver.com> + * + * Based on: + * net/netfilter/xt_esp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
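[Aside: the xt_helper conversion above also illustrates that a match with no layer-3 dependence can register a single xt_match with family NFPROTO_UNSPEC instead of duplicate AF_INET/AF_INET6 entries. A minimal sketch of that shape (demo_* names illustrative; a zero matchsize assumes the match carries no per-rule data):

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

static bool demo_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
        return true;    /* family-independent logic would go here */
}

static struct xt_match demo_mt_reg __read_mostly = {
        .name     = "demo",
        .revision = 0,
        .family   = NFPROTO_UNSPEC,     /* one registration covers IPv4 and IPv6 */
        .match    = demo_mt,
        .me       = THIS_MODULE,
};

static int __init demo_mt_init(void)
{
        return xt_register_match(&demo_mt_reg);
}
module_init(demo_mt_init);

static void __exit demo_mt_exit(void)
{
        xt_unregister_match(&demo_mt_reg);
}
module_exit(demo_mt_exit);

MODULE_LICENSE("GPL");

End of aside.]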
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter/xt_ipcomp.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fan Du <fan.du@windriver.com>"); +MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); + return r; +} + +static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip_comp_hdr _comphdr; + const struct ip_comp_hdr *chdr; + const struct xt_ipcomp *compinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr); + if (chdr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + pr_debug("Dropping evil IPComp tinygram.\n"); + par->hotdrop = true; + return 0; + } + + return spi_match(compinfo->spis[0], compinfo->spis[1], + ntohs(chdr->cpi), + !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); +} + +static int comp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_ipcomp *compinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { + pr_err("unknown flags %X\n", compinfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match comp_mt_reg[] __read_mostly = { + { + .name = "ipcomp", + .family = NFPROTO_IPV4, + .match = comp_mt, + .matchsize = sizeof(struct xt_ipcomp), + .proto = IPPROTO_COMP, + .checkentry = comp_mt_check, + .me = THIS_MODULE, + }, + { + .name = "ipcomp", + .family = NFPROTO_IPV6, + .match = comp_mt, + .matchsize = sizeof(struct xt_ipcomp), + .proto = IPPROTO_COMP, + .checkentry = comp_mt_check, + .me = THIS_MODULE, + }, +}; + +static int __init comp_mt_init(void) +{ + return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +static void __exit comp_mt_exit(void) +{ + xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +module_init(comp_mt_init); +module_exit(comp_mt_exit); diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c new file mode 100644 index 00000000000..b46626cddd9 --- /dev/null +++ b/net/netfilter/xt_iprange.c @@ -0,0 +1,140 @@ +/* + * xt_iprange - Netfilter module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) CC Computer Consultants GmbH, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
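[Aside: unlike ESP's 32-bit SPI, the value xt_ipcomp compares is the 16-bit CPI from the small IPComp header (RFC 3173), hence the ntohs() in comp_mt(). A standalone sketch of the parse and range test, assuming a linear buffer:

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

/* RFC 3173 IPComp header: next header, flags, 16-bit CPI. */
struct ipcomp_hdr {
        uint8_t  nexthdr;
        uint8_t  flags;
        uint16_t cpi;   /* network byte order on the wire */
};

static bool cpi_in_range(const struct ipcomp_hdr *ch,
                         uint16_t min, uint16_t max, bool invert)
{
        uint16_t cpi = ntohs(ch->cpi);

        return (cpi >= min && cpi <= max) ^ invert;
}

int main(void)
{
        struct ipcomp_hdr h = { .nexthdr = 4, .flags = 0, .cpi = htons(0x1234) };
        return cpi_in_range(&h, 0x1000, 0x2000, false) ? 0 : 1;
}

End of aside.]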
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_iprange.h> + +static bool +iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_iprange_mtinfo *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = ntohl(iph->saddr) < ntohl(info->src_min.ip); + m |= ntohl(iph->saddr) > ntohl(info->src_max.ip); + m ^= !!(info->flags & IPRANGE_SRC_INV); + if (m) { + pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->saddr, + (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", + &info->src_min.ip, + &info->src_max.ip); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = ntohl(iph->daddr) < ntohl(info->dst_min.ip); + m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip); + m ^= !!(info->flags & IPRANGE_DST_INV); + if (m) { + pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->daddr, + (info->flags & IPRANGE_DST_INV) ? "(INV) " : "", + &info->dst_min.ip, + &info->dst_max.ip); + return false; + } + } + return true; +} + +static inline int +iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b) +{ + unsigned int i; + + for (i = 0; i < 4; ++i) { + if (a->s6_addr32[i] != b->s6_addr32[i]) + return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]); + } + + return 0; +} + +static bool +iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_iprange_mtinfo *info = par->matchinfo; + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6); + m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr); + m ^= !!(info->flags & IPRANGE_SRC_INV); + if (m) { + pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n", + &iph->saddr, + (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", + &info->src_min.in6, + &info->src_max.in6); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6); + m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr); + m ^= !!(info->flags & IPRANGE_DST_INV); + if (m) { + pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n", + &iph->daddr, + (info->flags & IPRANGE_DST_INV) ? 
"(INV) " : "", + &info->dst_min.in6, + &info->dst_max.in6); + return false; + } + } + return true; +} + +static struct xt_match iprange_mt_reg[] __read_mostly = { + { + .name = "iprange", + .revision = 1, + .family = NFPROTO_IPV4, + .match = iprange_mt4, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, + { + .name = "iprange", + .revision = 1, + .family = NFPROTO_IPV6, + .match = iprange_mt6, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, +}; + +static int __init iprange_mt_init(void) +{ + return xt_register_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +static void __exit iprange_mt_exit(void) +{ + xt_unregister_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +module_init(iprange_mt_init); +module_exit(iprange_mt_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); +MODULE_ALIAS("ipt_iprange"); +MODULE_ALIAS("ip6t_iprange"); diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c new file mode 100644 index 00000000000..8d47c3780fd --- /dev/null +++ b/net/netfilter/xt_ipvs.c @@ -0,0 +1,188 @@ +/* + * xt_ipvs - kernel module to match IPVS connection properties + * + * Author: Hannes Eder <heder@google.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#endif +#include <linux/ip_vs.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ipvs.h> +#include <net/netfilter/nf_conntrack.h> + +#include <net/ip_vs.h> + +MODULE_AUTHOR("Hannes Eder <heder@google.com>"); +MODULE_DESCRIPTION("Xtables: match IPVS connection properties"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ipvs"); +MODULE_ALIAS("ip6t_ipvs"); + +/* borrowed from xt_conntrack */ +static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, + unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; +#ifdef CONFIG_IP_VS_IPV6 + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; +#endif + else + return false; +} + +static bool +ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ipvs_mtinfo *data = par->matchinfo; + /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. 
*/ + const u_int8_t family = par->family; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + bool match = true; + + if (data->bitmask == XT_IPVS_IPVS_PROPERTY) { + match = skb->ipvs_property ^ + !!(data->invert & XT_IPVS_IPVS_PROPERTY); + goto out; + } + + /* other flags than XT_IPVS_IPVS_PROPERTY are set */ + if (!skb->ipvs_property) { + match = false; + goto out; + } + + ip_vs_fill_iph_skb(family, skb, &iph); + + if (data->bitmask & XT_IPVS_PROTO) + if ((iph.protocol == data->l4proto) ^ + !(data->invert & XT_IPVS_PROTO)) { + match = false; + goto out; + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) { + match = false; + goto out; + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */); + if (unlikely(cp == NULL)) { + match = false; + goto out; + } + + /* + * We found a connection, i.e. ct != 0, make sure to call + * __ip_vs_conn_put before returning. In our case jump to out_put_con. + */ + + if (data->bitmask & XT_IPVS_VPORT) + if ((cp->vport == data->vport) ^ + !(data->invert & XT_IPVS_VPORT)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VPORTCTL) + if ((cp->control != NULL && + cp->control->vport == data->vportctl) ^ + !(data->invert & XT_IPVS_VPORTCTL)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_DIR) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct == NULL || nf_ct_is_untracked(ct)) { + match = false; + goto out_put_cp; + } + + if ((ctinfo >= IP_CT_IS_REPLY) ^ + !!(data->invert & XT_IPVS_DIR)) { + match = false; + goto out_put_cp; + } + } + + if (data->bitmask & XT_IPVS_METHOD) + if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^ + !(data->invert & XT_IPVS_METHOD)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VADDR) { + if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr, + &data->vmask, family) ^ + !(data->invert & XT_IPVS_VADDR)) { + match = false; + goto out_put_cp; + } + } + +out_put_cp: + __ip_vs_conn_put(cp); +out: + pr_debug("match=%d\n", match); + return match; +} + +static int ipvs_mt_check(const struct xt_mtchk_param *par) +{ + if (par->family != NFPROTO_IPV4 +#ifdef CONFIG_IP_VS_IPV6 + && par->family != NFPROTO_IPV6 +#endif + ) { + pr_info("protocol family %u not supported\n", par->family); + return -EINVAL; + } + + return 0; +} + +static struct xt_match xt_ipvs_mt_reg __read_mostly = { + .name = "ipvs", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = ipvs_mt, + .checkentry = ipvs_mt_check, + .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)), + .me = THIS_MODULE, +}; + +static int __init ipvs_mt_init(void) +{ + return xt_register_match(&xt_ipvs_mt_reg); +} + +static void __exit ipvs_mt_exit(void) +{ + xt_unregister_match(&xt_ipvs_mt_reg); +} + +module_init(ipvs_mt_init); +module_exit(ipvs_mt_exit); diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c new file mode 100644 index 00000000000..8aee572771f --- /dev/null +++ b/net/netfilter/xt_l2tp.c @@ -0,0 +1,354 @@ +/* Kernel module to match L2TP header parameters. */ + +/* (C) 2013 James Chapman <jchapman@katalix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
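[Aside: a pattern that recurs through ipvs_mt() above, e.g. in the PROTO and VPORT checks: each tested property XORs the raw comparison with its per-flag invert bit and bails out on mismatch. Reduced to a helper for clarity (names illustrative, not from the patch):

#include <stdbool.h>
#include <stdint.h>

/* True when the comparison result, possibly inverted by 'flag' in
 * invert_mask, means the rule does not match.  With the flag clear a
 * false comparison is a mismatch; with it set a true comparison is.
 */
static bool mismatch(bool cond, uint32_t invert_mask, uint32_t flag)
{
        return cond ^ !(invert_mask & flag);
}

int main(void)
{
        return (mismatch(false, 0, 1 << 0) &&   /* plain: unequal fails   */
                !mismatch(true, 0, 1 << 0) &&   /* plain: equal passes    */
                mismatch(true, 1 << 0, 1 << 0)) /* inverted: equal fails  */
               ? 0 : 1;
}

End of aside.]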
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <linux/l2tp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_l2tp.h> + +/* L2TP header masks */ +#define L2TP_HDR_T_BIT 0x8000 +#define L2TP_HDR_L_BIT 0x4000 +#define L2TP_HDR_VER 0x000f + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); +MODULE_DESCRIPTION("Xtables: L2TP header match"); +MODULE_ALIAS("ipt_l2tp"); +MODULE_ALIAS("ip6t_l2tp"); + +/* The L2TP fields that can be matched */ +struct l2tp_data { + u32 tid; + u32 sid; + u8 type; + u8 version; +}; + +union l2tp_val { + __be16 val16[2]; + __be32 val32; +}; + +static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) +{ + if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) + return false; + + if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) + return false; + + /* Check tid only for L2TPv3 control or any L2TPv2 packets */ + if ((info->flags & XT_L2TP_TID) && + ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && + (info->tid != data->tid)) + return false; + + /* Check sid only for L2TP data packets */ + if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && + (info->sid != data->sid)) + return false; + + return true; +} + +/* Parse L2TP header fields when UDP encapsulation is used. Handles + * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a + * different format. See + * RFC2661, Section 3.1, L2TPv2 Header Format + * RFC3931, Section 3.2.1, L2TPv3 Control Message Header + * RFC3931, Section 3.2.2, L2TPv3 Data Message Header + * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP + */ +static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ + const struct xt_l2tp_info *info = par->matchinfo; + int uhlen = sizeof(struct udphdr); + int offs = thoff + uhlen; + union l2tp_val *lh; + union l2tp_val lhbuf; + u16 flags; + struct l2tp_data data = { 0, }; + + if (par->fragoff != 0) + return false; + + /* Extract L2TP header fields. The flags in the first 16 bits + * tell us where the other fields are. + */ + lh = skb_header_pointer(skb, offs, 2, &lhbuf); + if (lh == NULL) + return false; + + flags = ntohs(lh->val16[0]); + if (flags & L2TP_HDR_T_BIT) + data.type = XT_L2TP_TYPE_CONTROL; + else + data.type = XT_L2TP_TYPE_DATA; + data.version = (u8) flags & L2TP_HDR_VER; + + /* Now extract the L2TP tid/sid. These are in different places + * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we + * must also check to see if the length field is present, + * since this affects the offsets into the packet of the + * tid/sid fields. 
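+ *
+ * Sketch of the layouts assumed below (byte offsets from the
+ * start of the L2TP header):
+ *   v3 control: flags/ver(2) length(2)   control-conn-id(4) ...
+ *   v3 data:    flags/ver(2) reserved(2) session-id(4) ...
+ *   v2:         flags/ver(2) [length(2) if L bit] tid(2) sid(2)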
+ */ + if (data.version == 3) { + lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); + if (lh == NULL) + return false; + if (data.type == XT_L2TP_TYPE_CONTROL) + data.tid = ntohl(lh->val32); + else + data.sid = ntohl(lh->val32); + } else if (data.version == 2) { + if (flags & L2TP_HDR_L_BIT) + offs += 2; + lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); + if (lh == NULL) + return false; + data.tid = (u32) ntohs(lh->val16[0]); + data.sid = (u32) ntohs(lh->val16[1]); + } else + return false; + + return l2tp_match(info, &data); +} + +/* Parse L2TP header fields for IP encapsulation (no UDP header). + * L2TPv3 data packets have a different form with IP encap. See + * RC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. + * RC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. + */ +static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ + const struct xt_l2tp_info *info = par->matchinfo; + union l2tp_val *lh; + union l2tp_val lhbuf; + struct l2tp_data data = { 0, }; + + /* For IP encap, the L2TP sid is the first 32-bits. */ + lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); + if (lh == NULL) + return false; + if (lh->val32 == 0) { + /* Must be a control packet. The L2TP tid is further + * into the packet. + */ + data.type = XT_L2TP_TYPE_CONTROL; + lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), + &lhbuf); + if (lh == NULL) + return false; + data.tid = ntohl(lh->val32); + } else { + data.sid = ntohl(lh->val32); + data.type = XT_L2TP_TYPE_DATA; + } + + data.version = 3; + + return l2tp_match(info, &data); +} + +static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct iphdr *iph = ip_hdr(skb); + u8 ipproto = iph->protocol; + + /* l2tp_mt_check4 already restricts the transport protocol */ + switch (ipproto) { + case IPPROTO_UDP: + return l2tp_udp_mt(skb, par, par->thoff); + case IPPROTO_L2TP: + return l2tp_ip_mt(skb, par, par->thoff); + } + + return false; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + unsigned int thoff = 0; + unsigned short fragoff = 0; + int ipproto; + + ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (fragoff != 0) + return false; + + /* l2tp_mt_check6 already restricts the transport protocol */ + switch (ipproto) { + case IPPROTO_UDP: + return l2tp_udp_mt(skb, par, thoff); + case IPPROTO_L2TP: + return l2tp_ip_mt(skb, par, thoff); + } + + return false; +} +#endif + +static int l2tp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + + /* Check for invalid flags */ + if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | + XT_L2TP_TYPE)) { + pr_info("unknown flags: %x\n", info->flags); + return -EINVAL; + } + + /* At least one of tid, sid or type=control must be specified */ + if ((!(info->flags & XT_L2TP_TID)) && + (!(info->flags & XT_L2TP_SID)) && + ((!(info->flags & XT_L2TP_TYPE)) || + (info->type != XT_L2TP_TYPE_CONTROL))) { + pr_info("invalid flags combination: %x\n", info->flags); + return -EINVAL; + } + + /* If version 2 is specified, check that incompatible params + * are not supplied + */ + if (info->flags & XT_L2TP_VERSION) { + if ((info->version < 2) || (info->version > 3)) { + pr_info("wrong L2TP version: %u\n", info->version); + return -EINVAL; + } + + if (info->version == 2) { + if ((info->flags & XT_L2TP_TID) && + (info->tid > 0xffff)) { + pr_info("v2 tid > 0xffff: %u\n", info->tid); + return -EINVAL; + } + 
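+ /* L2TPv2 session IDs are 16 bits wide, like tunnel
+ * IDs, hence the same range check.
+ */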
if ((info->flags & XT_L2TP_SID) && + (info->sid > 0xffff)) { + pr_info("v2 sid > 0xffff: %u\n", info->sid); + return -EINVAL; + } + } + } + + return 0; +} + +static int l2tp_mt_check4(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + const struct ipt_entry *e = par->entryinfo; + const struct ipt_ip *ip = &e->ip; + int ret; + + ret = l2tp_mt_check(par); + if (ret != 0) + return ret; + + if ((ip->proto != IPPROTO_UDP) && + (ip->proto != IPPROTO_L2TP)) { + pr_info("missing protocol rule (udp|l2tpip)\n"); + return -EINVAL; + } + + if ((ip->proto == IPPROTO_L2TP) && + (info->version == 2)) { + pr_info("v2 doesn't support IP mode\n"); + return -EINVAL; + } + + return 0; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int l2tp_mt_check6(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + const struct ip6t_entry *e = par->entryinfo; + const struct ip6t_ip6 *ip = &e->ipv6; + int ret; + + ret = l2tp_mt_check(par); + if (ret != 0) + return ret; + + if ((ip->proto != IPPROTO_UDP) && + (ip->proto != IPPROTO_L2TP)) { + pr_info("missing protocol rule (udp|l2tpip)\n"); + return -EINVAL; + } + + if ((ip->proto == IPPROTO_L2TP) && + (info->version == 2)) { + pr_info("v2 doesn't support IP mode\n"); + return -EINVAL; + } + + return 0; +} +#endif + +static struct xt_match l2tp_mt_reg[] __read_mostly = { + { + .name = "l2tp", + .revision = 0, + .family = NFPROTO_IPV4, + .match = l2tp_mt4, + .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), + .checkentry = l2tp_mt_check4, + .hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD)), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "l2tp", + .revision = 0, + .family = NFPROTO_IPV6, + .match = l2tp_mt6, + .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), + .checkentry = l2tp_mt_check6, + .hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD)), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init l2tp_mt_init(void) +{ + return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +static void __exit l2tp_mt_exit(void) +{ + xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +module_init(l2tp_mt_init); +module_exit(l2tp_mt_exit); diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 77288c5ada7..176e5570a99 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -15,71 +15,56 @@ #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("IP tables packet length matching module"); +MODULE_DESCRIPTION("Xtables: Packet length (Layer3,4,5) match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_length"); MODULE_ALIAS("ip6t_length"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +length_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_length_info *info = matchinfo; + const struct xt_length_info *info = par->matchinfo; u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } -static int -match6(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void 
*matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +length_mt6(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_length_info *info = matchinfo; - const u_int16_t pktlen = (ntohs(ipv6_hdr(skb)->payload_len) + - sizeof(struct ipv6hdr)); + const struct xt_length_info *info = par->matchinfo; + const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } -static struct xt_match xt_length_match[] = { +static struct xt_match length_mt_reg[] __read_mostly = { { .name = "length", - .family = AF_INET, - .match = match, + .family = NFPROTO_IPV4, + .match = length_mt, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, }, { .name = "length", - .family = AF_INET6, - .match = match6, + .family = NFPROTO_IPV6, + .match = length_mt6, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, }, }; -static int __init xt_length_init(void) +static int __init length_mt_init(void) { - return xt_register_matches(xt_length_match, - ARRAY_SIZE(xt_length_match)); + return xt_register_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); } -static void __exit xt_length_fini(void) +static void __exit length_mt_exit(void) { - xt_unregister_matches(xt_length_match, ARRAY_SIZE(xt_length_match)); + xt_unregister_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); } -module_init(xt_length_init); -module_exit(xt_length_fini); +module_init(length_mt_init); +module_exit(length_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 571a72ab89a..bef85059655 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -1,11 +1,14 @@ -/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> - * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> +/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
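+ *
+ * Hypothetical example: accept at most 3 pings per second with an
+ * initial burst of 5:
+ *   iptables -A INPUT -p icmp -m limit --limit 3/second
+ *            --limit-burst 5 -j ACCEPT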
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/spinlock.h> @@ -14,9 +17,14 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_limit.h> +struct xt_limit_priv { + unsigned long prev; + uint32_t credit; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); -MODULE_DESCRIPTION("iptables rate limit match"); +MODULE_DESCRIPTION("Xtables: rate-limit match"); MODULE_ALIAS("ipt_limit"); MODULE_ALIAS("ip6t_limit"); @@ -57,38 +65,31 @@ static DEFINE_SPINLOCK(limit_lock); #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) -static int -ipt_limit_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +limit_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct xt_rateinfo *r = ((struct xt_rateinfo *)matchinfo)->master; + const struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv = r->master; unsigned long now = jiffies; spin_lock_bh(&limit_lock); - r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; - if (r->credit > r->credit_cap) - r->credit = r->credit_cap; + priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + if (priv->credit > r->credit_cap) + priv->credit = r->credit_cap; - if (r->credit >= r->cost) { + if (priv->credit >= r->cost) { /* We're not limited. */ - r->credit -= r->cost; + priv->credit -= r->cost; spin_unlock_bh(&limit_lock); - return 1; + return true; } spin_unlock_bh(&limit_lock); - return 0; + return false; } /* Precision saver. */ -static u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user) { /* If multiplying would overflow... */ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -98,34 +99,41 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; } -static int -ipt_limit_checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int limit_mt_check(const struct xt_mtchk_param *par) { - struct xt_rateinfo *r = matchinfo; + struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv; /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Overflow in xt_limit, try lower: %u/%u\n", - r->avg, r->burst); - return 0; + pr_info("Overflow, try lower: %u/%u\n", + r->avg, r->burst); + return -ERANGE; } - /* For SMP, we only want to use one set of counters. */ - r->master = r; + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + /* For SMP, we only want to use one set of state. */ + r->master = priv; + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * + 128. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ if (r->cost == 0) { - /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * - 128. */ - r->prev = jiffies; - r->credit = user2credits(r->avg * r->burst); /* Credits full. */ - r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = priv->credit; /* Credits full. 
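+ * Reading the arithmetic off the code: a 1/second rule with
+ * burst 5 starts with five packets' worth of credit and regains
+ * one packet-cost per second, since user2credits(avg) for 1/s
+ * equals one second of the CREDITS_PER_JIFFY refill.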
*/ r->cost = user2credits(r->avg); } - return 1; + return 0; +} + +static void limit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_rateinfo *info = par->matchinfo; + + kfree(info->master); } #ifdef CONFIG_COMPAT @@ -142,9 +150,9 @@ struct compat_xt_rateinfo { /* To keep the full "prev" timestamp, the upper 32 bits are stored in the * master pointer, which does not need to be preserved. */ -static void compat_from_user(void *dst, void *src) +static void limit_mt_compat_from_user(void *dst, const void *src) { - struct compat_xt_rateinfo *cm = src; + const struct compat_xt_rateinfo *cm = src; struct xt_rateinfo m = { .avg = cm->avg, .burst = cm->burst, @@ -156,9 +164,9 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user(void __user *dst, void *src) +static int limit_mt_compat_to_user(void __user *dst, const void *src) { - struct xt_rateinfo *m = src; + const struct xt_rateinfo *m = src; struct compat_xt_rateinfo cm = { .avg = m->avg, .burst = m->burst, @@ -172,39 +180,31 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_match xt_limit_match[] = { - { - .name = "limit", - .family = AF_INET, - .checkentry = ipt_limit_checkentry, - .match = ipt_limit_match, - .matchsize = sizeof(struct xt_rateinfo), +static struct xt_match limit_mt_reg __read_mostly = { + .name = "limit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = limit_mt, + .checkentry = limit_mt_check, + .destroy = limit_mt_destroy, + .matchsize = sizeof(struct xt_rateinfo), #ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_rateinfo), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compatsize = sizeof(struct compat_xt_rateinfo), + .compat_from_user = limit_mt_compat_from_user, + .compat_to_user = limit_mt_compat_to_user, #endif - .me = THIS_MODULE, - }, - { - .name = "limit", - .family = AF_INET6, - .checkentry = ipt_limit_checkentry, - .match = ipt_limit_match, - .matchsize = sizeof(struct xt_rateinfo), - .me = THIS_MODULE, - }, + .me = THIS_MODULE, }; -static int __init xt_limit_init(void) +static int __init limit_mt_init(void) { - return xt_register_matches(xt_limit_match, ARRAY_SIZE(xt_limit_match)); + return xt_register_match(&limit_mt_reg); } -static void __exit xt_limit_fini(void) +static void __exit limit_mt_exit(void) { - xt_unregister_matches(xt_limit_match, ARRAY_SIZE(xt_limit_match)); + xt_unregister_match(&limit_mt_reg); } -module_init(xt_limit_init); -module_exit(xt_limit_fini); +module_init(limit_mt_init); +module_exit(limit_mt_exit); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 1d3a1d98b88..d5b4fd4f91e 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/etherdevice.h> @@ -20,62 +21,46 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables mac matching module"); +MODULE_DESCRIPTION("Xtables: MAC address match"); MODULE_ALIAS("ipt_mac"); MODULE_ALIAS("ip6t_mac"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_mac_info *info = matchinfo; + 
const struct xt_mac_info *info = par->matchinfo; + bool ret; - /* Is mac pointer valid? */ - return (skb_mac_header(skb) >= skb->head && - (skb_mac_header(skb) + ETH_HLEN) <= skb->data - /* If so, compare... */ - && ((!compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr)) - ^ info->invert)); + if (skb->dev == NULL || skb->dev->type != ARPHRD_ETHER) + return false; + if (skb_mac_header(skb) < skb->head) + return false; + if (skb_mac_header(skb) + ETH_HLEN > skb->data) + return false; + ret = ether_addr_equal(eth_hdr(skb)->h_source, info->srcaddr); + ret ^= info->invert; + return ret; } -static struct xt_match xt_mac_match[] = { - { - .name = "mac", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_IP_PRE_ROUTING) | - (1 << NF_IP_LOCAL_IN) | - (1 << NF_IP_FORWARD), - .me = THIS_MODULE, - }, - { - .name = "mac", - .family = AF_INET6, - .match = match, - .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_IP6_PRE_ROUTING) | - (1 << NF_IP6_LOCAL_IN) | - (1 << NF_IP6_FORWARD), - .me = THIS_MODULE, - }, +static struct xt_match mac_mt_reg __read_mostly = { + .name = "mac", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = mac_mt, + .matchsize = sizeof(struct xt_mac_info), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, }; -static int __init xt_mac_init(void) +static int __init mac_mt_init(void) { - return xt_register_matches(xt_mac_match, ARRAY_SIZE(xt_mac_match)); + return xt_register_match(&mac_mt_reg); } -static void __exit xt_mac_fini(void) +static void __exit mac_mt_exit(void) { - xt_unregister_matches(xt_mac_match, ARRAY_SIZE(xt_mac_match)); + xt_unregister_match(&mac_mt_reg); } -module_init(xt_mac_init); -module_exit(xt_mac_fini); +module_init(mac_mt_init); +module_exit(mac_mt_exit); diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 39911dddb01..23345238711 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -1,10 +1,13 @@ -/* Kernel module to match NFMARK values. */ - -/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> +/* + * xt_mark - Netfilter module to match NFMARK value + * + * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@medozas.de> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
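+ *
+ * Hypothetical example: tag forwarded traffic in mangle and match
+ * the tag later:
+ *   iptables -t mangle -A PREROUTING -i eth0 -j MARK --set-xmark 0x4/0xff
+ *   iptables -A FORWARD -m mark --mark 0x4/0xff -j ACCEPT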
*/ #include <linux/module.h> @@ -15,105 +18,67 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables mark matching module"); +MODULE_DESCRIPTION("Xtables: packet mark operations"); MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); +MODULE_ALIAS("ipt_MARK"); +MODULE_ALIAS("ip6t_MARK"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static unsigned int +mark_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_mark_info *info = matchinfo; + const struct xt_mark_tginfo2 *info = par->targinfo; - return ((skb->mark & info->mask) == info->mark) ^ info->invert; + skb->mark = (skb->mark & ~info->mask) ^ info->mark; + return XT_CONTINUE; } -static int -checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +mark_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_mark_info *minfo = matchinfo; + const struct xt_mark_mtinfo1 *info = par->matchinfo; - if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { - printk(KERN_WARNING "mark: only supports 32bit mark\n"); - return 0; - } - return 1; + return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -#ifdef CONFIG_COMPAT -struct compat_xt_mark_info { - compat_ulong_t mark, mask; - u_int8_t invert; - u_int8_t __pad1; - u_int16_t __pad2; +static struct xt_target mark_tg_reg __read_mostly = { + .name = "MARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, }; -static void compat_from_user(void *dst, void *src) -{ - struct compat_xt_mark_info *cm = src; - struct xt_mark_info m = { - .mark = cm->mark, - .mask = cm->mask, - .invert = cm->invert, - }; - memcpy(dst, &m, sizeof(m)); -} - -static int compat_to_user(void __user *dst, void *src) -{ - struct xt_mark_info *m = src; - struct compat_xt_mark_info cm = { - .mark = m->mark, - .mask = m->mask, - .invert = m->invert, - }; - return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_match xt_mark_match[] = { - { - .name = "mark", - .family = AF_INET, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_mark_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_mark_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, -#endif - .me = THIS_MODULE, - }, - { - .name = "mark", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_mark_info), - .me = THIS_MODULE, - }, +static struct xt_match mark_mt_reg __read_mostly = { + .name = "mark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = mark_mt, + .matchsize = sizeof(struct xt_mark_mtinfo1), + .me = THIS_MODULE, }; -static int __init xt_mark_init(void) +static int __init mark_mt_init(void) { - return xt_register_matches(xt_mark_match, ARRAY_SIZE(xt_mark_match)); + int ret; + + ret = xt_register_target(&mark_tg_reg); + if (ret < 0) + return ret; + ret = xt_register_match(&mark_mt_reg); + if (ret < 0) { + xt_unregister_target(&mark_tg_reg); + return ret; + } + return 0; } -static void __exit xt_mark_fini(void) +static void __exit mark_mt_exit(void) { - xt_unregister_matches(xt_mark_match, ARRAY_SIZE(xt_mark_match)); + xt_unregister_match(&mark_mt_reg); + xt_unregister_target(&mark_tg_reg); } -module_init(xt_mark_init); -module_exit(xt_mark_fini); +module_init(mark_mt_init); +module_exit(mark_mt_exit); diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index 4dce2a81702..ac1d3c3d09e 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -8,7 +8,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/udp.h> @@ -22,35 +22,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("x_tables multiple port match module"); +MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP"); MODULE_ALIAS("ipt_multiport"); MODULE_ALIAS("ip6t_multiport"); -#if 0 -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - -/* Returns 1 if the port is matched by the test, 0 otherwise. */ -static inline int -ports_match(const u_int16_t *portlist, enum xt_multiport_flags flags, - u_int8_t count, u_int16_t src, u_int16_t dst) -{ - unsigned int i; - for (i = 0; i < count; i++) { - if (flags != XT_MULTIPORT_DESTINATION && portlist[i] == src) - return 1; - - if (flags != XT_MULTIPORT_SOURCE && portlist[i] == dst) - return 1; - } - - return 0; -} - /* Returns 1 if the port is matched by the test, 0 otherwise. 
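+ * Ranges are encoded pairwise: when minfo->pflags[i] is set,
+ * ports[i] and ports[i+1] bound one inclusive range instead of
+ * naming two single ports.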
*/ -static inline int +static inline bool ports_match_v1(const struct xt_multiport_v1 *minfo, u_int16_t src, u_int16_t dst) { @@ -63,98 +40,61 @@ ports_match_v1(const struct xt_multiport_v1 *minfo, if (minfo->pflags[i]) { /* range port matching */ e = minfo->ports[++i]; - duprintf("src or dst matches with %d-%d?\n", s, e); + pr_debug("src or dst matches with %d-%d?\n", s, e); if (minfo->flags == XT_MULTIPORT_SOURCE && src >= s && src <= e) - return 1 ^ minfo->invert; + return true ^ minfo->invert; if (minfo->flags == XT_MULTIPORT_DESTINATION && dst >= s && dst <= e) - return 1 ^ minfo->invert; + return true ^ minfo->invert; if (minfo->flags == XT_MULTIPORT_EITHER && ((dst >= s && dst <= e) || (src >= s && src <= e))) - return 1 ^ minfo->invert; + return true ^ minfo->invert; } else { /* exact port matching */ - duprintf("src or dst matches with %d?\n", s); + pr_debug("src or dst matches with %d?\n", s); if (minfo->flags == XT_MULTIPORT_SOURCE && src == s) - return 1 ^ minfo->invert; + return true ^ minfo->invert; if (minfo->flags == XT_MULTIPORT_DESTINATION && dst == s) - return 1 ^ minfo->invert; + return true ^ minfo->invert; if (minfo->flags == XT_MULTIPORT_EITHER && (src == s || dst == s)) - return 1 ^ minfo->invert; + return true ^ minfo->invert; } } return minfo->invert; } -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +multiport_mt(const struct sk_buff *skb, struct xt_action_param *par) { - __be16 _ports[2], *pptr; - const struct xt_multiport *multiinfo = matchinfo; + const __be16 *pptr; + __be16 _ports[2]; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; - if (offset) - return 0; + if (par->fragoff != 0) + return false; - pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; - } - - return ports_match(multiinfo->ports, - multiinfo->flags, multiinfo->count, - ntohs(pptr[0]), ntohs(pptr[1])); -} - -static int -match_v1(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - __be16 _ports[2], *pptr; - const struct xt_multiport_v1 *multiinfo = matchinfo; - - if (offset) - return 0; - - pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); - if (pptr == NULL) { - /* We've been asked to examine this packet, and we - * can't. Hence, no choice but to drop. - */ - duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; + pr_debug("Dropping evil offset=0 tinygram.\n"); + par->hotdrop = true; + return false; } return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); } -static inline int +static inline bool check(u_int16_t proto, u_int8_t ip_invflags, u_int8_t match_flags, @@ -171,113 +111,55 @@ check(u_int16_t proto, && count <= XT_MULTI_PORTS; } -/* Called when user tries to insert an entry of this type. 
*/ -static int -checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) -{ - const struct ipt_ip *ip = info; - const struct xt_multiport *multiinfo = matchinfo; - - return check(ip->proto, ip->invflags, multiinfo->flags, - multiinfo->count); -} - -static int -checkentry_v1(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int multiport_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ip *ip = info; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, - multiinfo->count); + multiinfo->count) ? 0 : -EINVAL; } -static int -checkentry6(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int multiport_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ip6 *ip = info; - const struct xt_multiport *multiinfo = matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, - multiinfo->count); + multiinfo->count) ? 0 : -EINVAL; } -static int -checkentry6_v1(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) -{ - const struct ip6t_ip6 *ip = info; - const struct xt_multiport_v1 *multiinfo = matchinfo; - - return check(ip->proto, ip->invflags, multiinfo->flags, - multiinfo->count); -} - -static struct xt_match xt_multiport_match[] = { - { - .name = "multiport", - .family = AF_INET, - .revision = 0, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_multiport), - .me = THIS_MODULE, - }, +static struct xt_match multiport_mt_reg[] __read_mostly = { { .name = "multiport", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, - .checkentry = checkentry_v1, - .match = match_v1, + .checkentry = multiport_mt_check, + .match = multiport_mt, .matchsize = sizeof(struct xt_multiport_v1), .me = THIS_MODULE, }, { .name = "multiport", - .family = AF_INET6, - .revision = 0, - .checkentry = checkentry6, - .match = match, - .matchsize = sizeof(struct xt_multiport), - .me = THIS_MODULE, - }, - { - .name = "multiport", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 1, - .checkentry = checkentry6_v1, - .match = match_v1, + .checkentry = multiport_mt6_check, + .match = multiport_mt, .matchsize = sizeof(struct xt_multiport_v1), .me = THIS_MODULE, }, }; -static int __init xt_multiport_init(void) +static int __init multiport_mt_init(void) { - return xt_register_matches(xt_multiport_match, - ARRAY_SIZE(xt_multiport_match)); + return xt_register_matches(multiport_mt_reg, + ARRAY_SIZE(multiport_mt_reg)); } -static void __exit xt_multiport_fini(void) +static void __exit multiport_mt_exit(void) { - xt_unregister_matches(xt_multiport_match, - ARRAY_SIZE(xt_multiport_match)); + xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg)); } -module_init(xt_multiport_init); -module_exit(xt_multiport_fini); +module_init(multiport_mt_init); +module_exit(multiport_mt_exit); diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c new file mode 100644 index 00000000000..bea7464cc43 --- /dev/null +++ b/net/netfilter/xt_nat.c @@ -0,0 +1,170 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * 
(C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat_core.h> + +static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + + if (mr->rangesize != 1) { + pr_info("%s: multiple ranges no longer supported\n", + par->target->name); + return -EINVAL; + } + return 0; +} + +static void xt_nat_convert_range(struct nf_nat_range *dst, + const struct nf_nat_ipv4_range *src) +{ + memset(&dst->min_addr, 0, sizeof(dst->min_addr)); + memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + + dst->flags = src->flags; + dst->min_addr.ip = src->min_ip; + dst->max_addr.ip = src->max_ip; + dst->min_proto = src->min; + dst->max_proto = src->max; +} + +static unsigned int +xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + xt_nat_convert_range(&range, &mr->range[0]); + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + xt_nat_convert_range(&range, &mr->range[0]); + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); +} + +static struct xt_target xt_nat_target_reg[] __read_mostly = { + { + .name = "SNAT", + .revision = 0, + .checkentry = xt_nat_checkentry_v0, + .target = xt_snat_target_v0, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .family = NFPROTO_IPV4, + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 0, + .checkentry = xt_nat_checkentry_v0, + .target = xt_dnat_target_v0, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .family = NFPROTO_IPV4, + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << 
NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, + { + .name = "SNAT", + .revision = 1, + .target = xt_snat_target_v1, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 1, + .target = xt_dnat_target_v1, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, +}; + +static int __init xt_nat_init(void) +{ + return xt_register_targets(xt_nat_target_reg, + ARRAY_SIZE(xt_nat_target_reg)); +} + +static void __exit xt_nat_exit(void) +{ + xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); +} + +module_init(xt_nat_init); +module_exit(xt_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ipt_SNAT"); +MODULE_ALIAS("ipt_DNAT"); +MODULE_ALIAS("ip6t_SNAT"); +MODULE_ALIAS("ip6t_DNAT"); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c new file mode 100644 index 00000000000..8c646ed9c92 --- /dev/null +++ b/net/netfilter/xt_nfacct.c @@ -0,0 +1,79 @@ +/* + * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2011 Intra2net AG <http://www.intra2net.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 (or any + * later at your option) as published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/nfnetlink_acct.h> +#include <linux/netfilter/xt_nfacct.h> + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: match for the extended accounting infrastructure"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_nfacct"); +MODULE_ALIAS("ip6t_nfacct"); + +static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + int overquota; + const struct xt_nfacct_match_info *info = par->targinfo; + + nfnl_acct_update(skb, info->nfacct); + + overquota = nfnl_acct_overquota(skb, info->nfacct); + + return overquota == NFACCT_UNDERQUOTA ? 
false : true; +} + +static int +nfacct_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_nfacct_match_info *info = par->matchinfo; + struct nf_acct *nfacct; + + nfacct = nfnl_acct_find_get(info->name); + if (nfacct == NULL) { + pr_info("xt_nfacct: accounting object with name `%s' " + "does not exists\n", info->name); + return -ENOENT; + } + info->nfacct = nfacct; + return 0; +} + +static void +nfacct_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_nfacct_match_info *info = par->matchinfo; + + nfnl_acct_put(info->nfacct); +} + +static struct xt_match nfacct_mt_reg __read_mostly = { + .name = "nfacct", + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info), + .me = THIS_MODULE, +}; + +static int __init nfacct_mt_init(void) +{ + return xt_register_match(&nfacct_mt_reg); +} + +static void __exit nfacct_mt_exit(void) +{ + xt_unregister_match(&nfacct_mt_reg); +} + +module_init(nfacct_mt_init); +module_exit(nfacct_mt_exit); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c new file mode 100644 index 00000000000..c529161cdbf --- /dev/null +++ b/net/netfilter/xt_osf.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2003+ Evgeniy Polyakov <zbr@ioremap.net> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/kernel.h> + +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/tcp.h> + +#include <net/ip.h> +#include <net/tcp.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netfilter/xt_osf.h> + +struct xt_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct xt_osf_user_finger finger; +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +/* + * Indexed by dont-fragment bit. + * It is the only constant value in the fingerprint. 
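+ * fingers[0] holds fingerprints with DF clear, fingers[1] those
+ * with DF set, so a lookup only walks the list that matches the
+ * packet's own DF bit.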
+ */ +static struct list_head xt_osf_fingers[2]; + +static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { + [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, +}; + +static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *kf = NULL, *sf; + int err = 0; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL); + if (!kf) + return -ENOMEM; + + memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger)); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + kfree(kf); + kf = NULL; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + err = -EEXIST; + break; + } + + /* + * We are protected by nfnl mutex. + */ + if (kf) + list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]); + + return err; +} + +static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *sf; + int err = -ENOENT; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + /* + * We are protected by nfnl mutex. + */ + list_del_rcu(&sf->finger_entry); + kfree_rcu(sf, rcu_head); + + err = 0; + break; + } + + return err; +} + +static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = { + [OSF_MSG_ADD] = { + .call = xt_osf_add_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, + [OSF_MSG_REMOVE] = { + .call = xt_osf_remove_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, +}; + +static const struct nfnetlink_subsystem xt_osf_nfnetlink = { + .name = "osf", + .subsys_id = NFNL_SUBSYS_OSF, + .cb_count = OSF_MSG_MAX, + .cb = xt_osf_nfnetlink_callbacks, +}; + +static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, + unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (info->flags & XT_OSF_TTL) { + if (info->ttl == XT_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (info->ttl == XT_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +static bool +xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) +{ + const struct xt_osf_info *info = p->matchinfo; + const struct iphdr *ip = ip_hdr(skb); + const struct tcphdr *tcp; + struct tcphdr _tcph; + int fmatch = FMATCH_WRONG, fcount = 0; + unsigned int optsize = 0, check_WSS = 0; + u16 window, totlen, mss = 0; + bool df; + const unsigned char *optp = NULL, *_optp = NULL; + unsigned char opts[MAX_IPOPTLEN]; + const struct xt_osf_finger *kf; + const struct xt_osf_user_finger *f; + struct net *net = dev_net(p->in ? 
p->in : p->out); + + if (!info) + return false; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return false; + + if (!tcp->syn) + return false; + + totlen = ntohs(ip->tot_len); + df = ntohs(ip->frag_off) & IP_DF; + window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), optsize, opts); + } + + rcu_read_lock(); + list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { + f = &kf->finger; + + if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + optp = _optp; + fmatch = FMATCH_WRONG; + + if (totlen == f->ss && xt_osf_ttl(skb, info, f->ttl)) { + int foptsize, optnum; + + /* + * Should not happen if userspace parser was written correctly. + */ + if (f->wss.wc >= OSF_WSS_MAX) + continue; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + continue; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + int loop_cont = 0; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs((__force __be16)mss); + break; + case OSFOPT_TS: + loop_cont = 1; + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. 
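+ * E.g. with wc == OSF_WSS_MSS and wss.val == 4,
+ * windows of 4*mss, 4*1460 and 4*1448 all match.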
+ */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + if (fmatch != FMATCH_OK) + continue; + + fcount++; + + if (info->flags & XT_OSF_LOG) + nf_log_packet(net, p->family, p->hooknum, skb, + p->in, p->out, NULL, + "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & XT_OSF_LOG) && + info->loglevel == XT_OSF_LOGLEVEL_FIRST) + break; + } + } + rcu_read_unlock(); + + if (!fcount && (info->flags & XT_OSF_LOG)) + nf_log_packet(net, p->family, p->hooknum, skb, p->in, + p->out, NULL, + "Remote OS is not known: %pI4:%u -> %pI4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} + +static struct xt_match xt_osf_match = { + .name = "osf", + .revision = 0, + .family = NFPROTO_IPV4, + .proto = IPPROTO_TCP, + .hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD), + .match = xt_osf_match_packet, + .matchsize = sizeof(struct xt_osf_info), + .me = THIS_MODULE, +}; + +static int __init xt_osf_init(void) +{ + int err = -EINVAL; + int i; + + for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) + INIT_LIST_HEAD(&xt_osf_fingers[i]); + + err = nfnetlink_subsys_register(&xt_osf_nfnetlink); + if (err < 0) { + pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); + goto err_out_exit; + } + + err = xt_register_match(&xt_osf_match); + if (err) { + pr_err("Failed to register OS fingerprint " + "matching module (%d)\n", err); + goto err_out_remove; + } + + return 0; + +err_out_remove: + nfnetlink_subsys_unregister(&xt_osf_nfnetlink); +err_out_exit: + return err; +} + +static void __exit xt_osf_fini(void) +{ + struct xt_osf_finger *f; + int i; + + nfnetlink_subsys_unregister(&xt_osf_nfnetlink); + xt_unregister_match(&xt_osf_match); + + rcu_read_lock(); + for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) { + + list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) { + list_del_rcu(&f->finger_entry); + kfree_rcu(f, rcu_head); + } + } + rcu_read_unlock(); + + rcu_barrier(); +} + +module_init(xt_osf_init); +module_exit(xt_osf_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); +MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c new file mode 100644 index 00000000000..ca2e577ed8a --- /dev/null +++ b/net/netfilter/xt_owner.c @@ -0,0 +1,100 @@ +/* + * Kernel module to match various things tied to sockets associated with + * locally generated outgoing packets. + * + * (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
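+ *
+ * Hypothetical example: let a range of local uids send traffic:
+ *   iptables -A OUTPUT -m owner --uid-owner 1000-1050 -j ACCEPT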
+ */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/file.h> +#include <net/sock.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_owner.h> + +static int owner_check(const struct xt_mtchk_param *par) +{ + struct xt_owner_match_info *info = par->matchinfo; + + /* For now only allow adding matches from the initial user namespace */ + if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && + (current_user_ns() != &init_user_ns)) + return -EINVAL; + return 0; +} + +static bool +owner_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_owner_match_info *info = par->matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return (info->match ^ info->invert) == 0; + else if (info->match & info->invert & XT_OWNER_SOCKET) + /* + * Socket exists but user wanted ! --socket-exists. + * (Single ampersands intended.) + */ + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return ((info->match ^ info->invert) & + (XT_OWNER_UID | XT_OWNER_GID)) == 0; + + if (info->match & XT_OWNER_UID) { + kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); + kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); + if ((uid_gte(filp->f_cred->fsuid, uid_min) && + uid_lte(filp->f_cred->fsuid, uid_max)) ^ + !(info->invert & XT_OWNER_UID)) + return false; + } + + if (info->match & XT_OWNER_GID) { + kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); + kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); + if ((gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) ^ + !(info->invert & XT_OWNER_GID)) + return false; + } + + return true; +} + +static struct xt_match owner_mt_reg __read_mostly = { + .name = "owner", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = owner_check, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, +}; + +static int __init owner_mt_init(void) +{ + return xt_register_match(&owner_mt_reg); +} + +static void __exit owner_mt_exit(void) +{ + xt_unregister_match(&owner_mt_reg); +} + +module_init(owner_mt_init); +module_exit(owner_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: socket owner matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_owner"); +MODULE_ALIAS("ip6t_owner"); diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 35a0fe200c3..d7ca16b8b8d 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -7,38 +7,28 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
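+ *
+ * Hypothetical example: match bridged traffic that entered via a
+ * particular bridge port:
+ *   iptables -A FORWARD -m physdev --physdev-in eth0 -j ACCEPT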
*/ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter/xt_physdev.h> #include <linux/netfilter/x_tables.h> -#include <linux/netfilter_bridge.h> -#define MATCH 1 -#define NOMATCH 0 MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); -MODULE_DESCRIPTION("iptables bridge physical device match module"); +MODULE_DESCRIPTION("Xtables: Bridge physical device match"); MODULE_ALIAS("ipt_physdev"); MODULE_ALIAS("ip6t_physdev"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) + +static bool +physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) { - int i; - static const char nulldevname[IFNAMSIZ]; - const struct xt_physdev_info *info = matchinfo; - unsigned int ret; + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + const struct xt_physdev_info *info = par->matchinfo; + unsigned long ret; const char *indev, *outdev; - struct nf_bridge_info *nf_bridge; + const struct nf_bridge_info *nf_bridge; /* Not a bridged IP packet or no info available yet: * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if @@ -47,115 +37,92 @@ match(const struct sk_buff *skb, /* Return MATCH if the invert flags of the used options are on */ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && !(info->invert & XT_PHYSDEV_OP_BRIDGED)) - return NOMATCH; + return false; if ((info->bitmask & XT_PHYSDEV_OP_ISIN) && !(info->invert & XT_PHYSDEV_OP_ISIN)) - return NOMATCH; + return false; if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) && !(info->invert & XT_PHYSDEV_OP_ISOUT)) - return NOMATCH; + return false; if ((info->bitmask & XT_PHYSDEV_OP_IN) && !(info->invert & XT_PHYSDEV_OP_IN)) - return NOMATCH; + return false; if ((info->bitmask & XT_PHYSDEV_OP_OUT) && !(info->invert & XT_PHYSDEV_OP_OUT)) - return NOMATCH; - return MATCH; + return false; + return true; } /* This only makes sense in the FORWARD and POSTROUTING chains */ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && (!!(nf_bridge->mask & BRNF_BRIDGED) ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED))) - return NOMATCH; + return false; if ((info->bitmask & XT_PHYSDEV_OP_ISIN && (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) || (info->bitmask & XT_PHYSDEV_OP_ISOUT && (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT)))) - return NOMATCH; + return false; if (!(info->bitmask & XT_PHYSDEV_OP_IN)) goto match_outdev; indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { - ret |= (((const unsigned int *)indev)[i] - ^ ((const unsigned int *)info->physindev)[i]) - & ((const unsigned int *)info->in_mask)[i]; - } + ret = ifname_compare_aligned(indev, info->physindev, info->in_mask); - if ((ret == 0) ^ !(info->invert & XT_PHYSDEV_OP_IN)) - return NOMATCH; + if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) + return false; match_outdev: if (!(info->bitmask & XT_PHYSDEV_OP_OUT)) - return MATCH; + return true; outdev = nf_bridge->physoutdev ? 
nf_bridge->physoutdev->name : nulldevname; - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { - ret |= (((const unsigned int *)outdev)[i] - ^ ((const unsigned int *)info->physoutdev)[i]) - & ((const unsigned int *)info->out_mask)[i]; - } + ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask); - return (ret != 0) ^ !(info->invert & XT_PHYSDEV_OP_OUT); + return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } -static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int physdev_mt_check(const struct xt_mtchk_param *par) { - const struct xt_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = par->matchinfo; if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) - return 0; + return -EINVAL; if (info->bitmask & XT_PHYSDEV_OP_OUT && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && - hook_mask & ((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_POST_ROUTING))) { - printk(KERN_WARNING "physdev match: using --physdev-out in the " - "OUTPUT, FORWARD and POSTROUTING chains for non-bridged " - "traffic is not supported anymore.\n"); - if (hook_mask & (1 << NF_IP_LOCAL_OUT)) - return 0; + par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { + pr_info("using --physdev-out in the OUTPUT, FORWARD and " + "POSTROUTING chains for non-bridged traffic is not " + "supported anymore.\n"); + if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) + return -EINVAL; } - return 1; + return 0; } -static struct xt_match xt_physdev_match[] = { - { - .name = "physdev", - .family = AF_INET, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_physdev_info), - .me = THIS_MODULE, - }, - { - .name = "physdev", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_physdev_info), - .me = THIS_MODULE, - }, +static struct xt_match physdev_mt_reg __read_mostly = { + .name = "physdev", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = physdev_mt_check, + .match = physdev_mt, + .matchsize = sizeof(struct xt_physdev_info), + .me = THIS_MODULE, }; -static int __init xt_physdev_init(void) +static int __init physdev_mt_init(void) { - return xt_register_matches(xt_physdev_match, - ARRAY_SIZE(xt_physdev_match)); + return xt_register_match(&physdev_mt_reg); } -static void __exit xt_physdev_fini(void) +static void __exit physdev_mt_exit(void) { - xt_unregister_matches(xt_physdev_match, ARRAY_SIZE(xt_physdev_match)); + xt_unregister_match(&physdev_mt_reg); } -module_init(xt_physdev_init); -module_exit(xt_physdev_fini); +module_init(physdev_mt_init); +module_exit(physdev_mt_exit); diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index e1409fc5c28..5b645cb598f 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -11,65 +11,55 @@ #include <linux/if_packet.h> #include <linux/in.h> #include <linux/ip.h> +#include <linux/ipv6.h> #include <linux/netfilter/xt_pkttype.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>"); -MODULE_DESCRIPTION("IP tables match to match on linklayer packet type"); +MODULE_DESCRIPTION("Xtables: link layer packet type match"); MODULE_ALIAS("ipt_pkttype"); MODULE_ALIAS("ip6t_pkttype"); -static int match(const struct sk_buff *skb, - const struct net_device *in, - const struct 
net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) { + const struct xt_pkttype_info *info = par->matchinfo; u_int8_t type; - const struct xt_pkttype_info *info = matchinfo; - if (skb->pkt_type == PACKET_LOOPBACK) - type = (MULTICAST(ip_hdr(skb)->daddr) - ? PACKET_MULTICAST - : PACKET_BROADCAST); - else + if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; + else if (par->family == NFPROTO_IPV4 && + ipv4_is_multicast(ip_hdr(skb)->daddr)) + type = PACKET_MULTICAST; + else if (par->family == NFPROTO_IPV6 && + ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + type = PACKET_MULTICAST; + else + type = PACKET_BROADCAST; return (type == info->pkttype) ^ info->invert; } -static struct xt_match xt_pkttype_match[] = { - { - .name = "pkttype", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct xt_pkttype_info), - .me = THIS_MODULE, - }, - { - .name = "pkttype", - .family = AF_INET6, - .match = match, - .matchsize = sizeof(struct xt_pkttype_info), - .me = THIS_MODULE, - }, +static struct xt_match pkttype_mt_reg __read_mostly = { + .name = "pkttype", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = pkttype_mt, + .matchsize = sizeof(struct xt_pkttype_info), + .me = THIS_MODULE, }; -static int __init xt_pkttype_init(void) +static int __init pkttype_mt_init(void) { - return xt_register_matches(xt_pkttype_match, - ARRAY_SIZE(xt_pkttype_match)); + return xt_register_match(&pkttype_mt_reg); } -static void __exit xt_pkttype_fini(void) +static void __exit pkttype_mt_exit(void) { - xt_unregister_matches(xt_pkttype_match, ARRAY_SIZE(xt_pkttype_match)); + xt_unregister_match(&pkttype_mt_reg); } -module_init(xt_pkttype_init); -module_exit(xt_pkttype_fini); +module_init(pkttype_mt_init); +module_exit(pkttype_mt_exit); diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 15b45a95ec1..f23e97bb42d 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -6,44 +6,45 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
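 */

/* The pkttype match above cannot trust skb->pkt_type for looped-back
 * traffic (it is always PACKET_LOOPBACK), so it reconstructs the class
 * from the destination address. A userspace sketch of the IPv4 branch,
 * not part of the patch; constants mirror linux/if_packet.h:
 */
#include <assert.h>
#include <stdint.h>

enum { PACKET_BROADCAST = 1, PACKET_MULTICAST = 2 };

/* IPv4 multicast is 224.0.0.0/4: top nibble 0xE (host byte order here). */
static int loopback_pkttype4(uint32_t daddr)
{
	return (daddr >> 28) == 0xE ? PACKET_MULTICAST : PACKET_BROADCAST;
}

int main(void)
{
	assert(loopback_pkttype4(0xE0000001) == PACKET_MULTICAST); /* 224.0.0.1 */
	assert(loopback_pkttype4(0xC0A80001) == PACKET_BROADCAST); /* 192.168.0.1 */
	return 0;
}
/*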
*/ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <net/xfrm.h> +#include <linux/netfilter.h> #include <linux/netfilter/xt_policy.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables IPsec policy matching module"); +MODULE_DESCRIPTION("Xtables: IPsec policy match"); MODULE_LICENSE("GPL"); -static inline int -xt_addr_cmp(const union xt_policy_addr *a1, const union xt_policy_addr *m, - const union xt_policy_addr *a2, unsigned short family) +static inline bool +xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, + const union nf_inet_addr *a2, unsigned short family) { switch (family) { - case AF_INET: - return !((a1->a4.s_addr ^ a2->a4.s_addr) & m->a4.s_addr); - case AF_INET6: - return !ipv6_masked_addr_cmp(&a1->a6, &m->a6, &a2->a6); + case NFPROTO_IPV4: + return ((a1->ip ^ a2->ip) & m->ip) == 0; + case NFPROTO_IPV6: + return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } - return 0; + return false; } -static inline int -match_xfrm_state(struct xfrm_state *x, const struct xt_policy_elem *e, +static bool +match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, unsigned short family) { #define MATCH_ADDR(x,y,z) (!e->match.x || \ - (xt_addr_cmp(&e->x, &e->y, z, family) \ + (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ ^ e->invert.x)) #define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) - return MATCH_ADDR(saddr, smask, (union xt_policy_addr *)&x->props.saddr) && - MATCH_ADDR(daddr, dmask, (union xt_policy_addr *)&x->id.daddr) && + return MATCH_ADDR(saddr, smask, &x->props.saddr) && + MATCH_ADDR(daddr, dmask, &x->id.daddr) && MATCH(proto, x->id.proto) && MATCH(mode, x->props.mode) && MATCH(spi, x->id.spi) && @@ -55,7 +56,7 @@ match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; - struct sec_path *sp = skb->sp; + const struct sec_path *sp = skb->sp; int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; @@ -85,7 +86,7 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; - struct dst_entry *dst = skb->dst; + const struct dst_entry *dst = skb_dst(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; @@ -108,93 +109,80 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, return strict ? i == info->len : 0; } -static int match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +policy_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_policy_info *info = matchinfo; + const struct xt_policy_info *info = par->matchinfo; int ret; if (info->flags & XT_POLICY_MATCH_IN) - ret = match_policy_in(skb, info, match->family); + ret = match_policy_in(skb, info, par->family); else - ret = match_policy_out(skb, info, match->family); + ret = match_policy_out(skb, info, par->family); if (ret < 0) - ret = info->flags & XT_POLICY_MATCH_NONE ? 1 : 0; + ret = info->flags & XT_POLICY_MATCH_NONE ? 
true : false; else if (info->flags & XT_POLICY_MATCH_NONE) - ret = 0; + ret = false; return ret; } -static int checkentry(const char *tablename, const void *ip_void, - const struct xt_match *match, - void *matchinfo, unsigned int hook_mask) +static int policy_mt_check(const struct xt_mtchk_param *par) { - struct xt_policy_info *info = matchinfo; + const struct xt_policy_info *info = par->matchinfo; if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) { - printk(KERN_ERR "xt_policy: neither incoming nor " - "outgoing policy selected\n"); - return 0; + pr_info("neither incoming nor outgoing policy selected\n"); + return -EINVAL; } - /* hook values are equal for IPv4 and IPv6 */ - if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN) - && info->flags & XT_POLICY_MATCH_OUT) { - printk(KERN_ERR "xt_policy: output policy not valid in " - "PRE_ROUTING and INPUT\n"); - return 0; + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { + pr_info("output policy not valid in PREROUTING and INPUT\n"); + return -EINVAL; } - if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT) - && info->flags & XT_POLICY_MATCH_IN) { - printk(KERN_ERR "xt_policy: input policy not valid in " - "POST_ROUTING and OUTPUT\n"); - return 0; + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { + pr_info("input policy not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; } if (info->len > XT_POLICY_MAX_ELEM) { - printk(KERN_ERR "xt_policy: too many policy elements\n"); - return 0; + pr_info("too many policy elements\n"); + return -EINVAL; } - return 1; + return 0; } -static struct xt_match xt_policy_match[] = { +static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", - .family = AF_INET, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV4, + .checkentry = policy_mt_check, + .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, { .name = "policy", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV6, + .checkentry = policy_mt_check, + .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, }; -static int __init init(void) +static int __init policy_mt_init(void) { - return xt_register_matches(xt_policy_match, - ARRAY_SIZE(xt_policy_match)); + return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } -static void __exit fini(void) +static void __exit policy_mt_exit(void) { - xt_unregister_matches(xt_policy_match, ARRAY_SIZE(xt_policy_match)); + xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } -module_init(init); -module_exit(fini); +module_init(policy_mt_init); +module_exit(policy_mt_exit); MODULE_ALIAS("ipt_policy"); MODULE_ALIAS("ip6t_policy"); diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index bfdde06ca0b..44c8eb4c9d6 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -4,82 +4,87 @@ * Sam Johnston <samj@samj.net> */ #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_quota.h> +#include <linux/module.h> + +struct xt_quota_priv { + spinlock_t lock; + uint64_t quota; +}; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); +MODULE_DESCRIPTION("Xtables: countdown quota match"); MODULE_ALIAS("ipt_quota"); MODULE_ALIAS("ip6t_quota"); -static 
DEFINE_SPINLOCK(quota_lock); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, int *hotdrop) +static bool +quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master; - int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0; + struct xt_quota_info *q = (void *)par->matchinfo; + struct xt_quota_priv *priv = q->master; + bool ret = q->flags & XT_QUOTA_INVERT; - spin_lock_bh("a_lock); - if (q->quota >= skb->len) { - q->quota -= skb->len; - ret ^= 1; + spin_lock_bh(&priv->lock); + if (priv->quota >= skb->len) { + priv->quota -= skb->len; + ret = !ret; } else { /* we do not allow even small packets from now on */ - q->quota = 0; + priv->quota = 0; } - spin_unlock_bh("a_lock); + spin_unlock_bh(&priv->lock); return ret; } -static int -checkentry(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int quota_mt_check(const struct xt_mtchk_param *par) { - struct xt_quota_info *q = (struct xt_quota_info *)matchinfo; + struct xt_quota_info *q = par->matchinfo; if (q->flags & ~XT_QUOTA_MASK) - return 0; - /* For SMP, we only want to use one set of counters. */ - q->master = q; - return 1; + return -EINVAL; + + q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); + if (q->master == NULL) + return -ENOMEM; + + spin_lock_init(&q->master->lock); + q->master->quota = q->quota; + return 0; +} + +static void quota_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_quota_info *q = par->matchinfo; + + kfree(q->master); } -static struct xt_match xt_quota_match[] = { - { - .name = "quota", - .family = AF_INET, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_quota_info), - .me = THIS_MODULE - }, - { - .name = "quota", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_quota_info), - .me = THIS_MODULE - }, +static struct xt_match quota_mt_reg __read_mostly = { + .name = "quota", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = quota_mt, + .checkentry = quota_mt_check, + .destroy = quota_mt_destroy, + .matchsize = sizeof(struct xt_quota_info), + .me = THIS_MODULE, }; -static int __init xt_quota_init(void) +static int __init quota_mt_init(void) { - return xt_register_matches(xt_quota_match, ARRAY_SIZE(xt_quota_match)); + return xt_register_match("a_mt_reg); } -static void __exit xt_quota_fini(void) +static void __exit quota_mt_exit(void) { - xt_unregister_matches(xt_quota_match, ARRAY_SIZE(xt_quota_match)); + xt_unregister_match("a_mt_reg); } -module_init(xt_quota_init); -module_exit(xt_quota_fini); +module_init(quota_mt_init); +module_exit(quota_mt_exit); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c new file mode 100644 index 00000000000..7720b036d76 --- /dev/null +++ b/net/netfilter/xt_rateest.c @@ -0,0 +1,157 @@ +/* + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
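 */

/* The xt_quota rewrite above moves the countdown into a kernel-private
 * xt_quota_priv so the remaining byte count is no longer stored in (or
 * reset through) the userspace-visible matchinfo blob. The accounting
 * itself is "deduct skb->len while the quota lasts" under a per-rule
 * spinlock; a single-threaded userspace sketch, not part of the patch:
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct quota { uint64_t remaining; };

static bool quota_consume(struct quota *q, uint64_t pktlen, bool invert)
{
	if (q->remaining >= pktlen) {
		q->remaining -= pktlen;
		return !invert;		/* still within quota */
	}
	q->remaining = 0;		/* exhausted: even small packets fail */
	return invert;
}

int main(void)
{
	struct quota q = { .remaining = 1500 };
	assert(quota_consume(&q, 1000, false));	/* 500 left */
	assert(!quota_consume(&q, 600, false));	/* overshoot: zeroed */
	assert(!quota_consume(&q, 1, false));	/* stays exhausted */
	return 0;
}
/*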
+ */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/gen_stats.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_rateest.h> +#include <net/netfilter/xt_rateest.h> + + +static bool +xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rateest_match_info *info = par->matchinfo; + struct gnet_stats_rate_est64 *r; + u_int32_t bps1, bps2, pps1, pps2; + bool ret = true; + + spin_lock_bh(&info->est1->lock); + r = &info->est1->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0; + pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0; + } else { + bps1 = r->bps; + pps1 = r->pps; + } + spin_unlock_bh(&info->est1->lock); + + if (info->flags & XT_RATEEST_MATCH_ABS) { + bps2 = info->bps2; + pps2 = info->pps2; + } else { + spin_lock_bh(&info->est2->lock); + r = &info->est2->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0; + pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0; + } else { + bps2 = r->bps; + pps2 = r->pps; + } + spin_unlock_bh(&info->est2->lock); + } + + switch (info->mode) { + case XT_RATEEST_MATCH_LT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 < bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 < pps2; + break; + case XT_RATEEST_MATCH_GT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 > bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 > pps2; + break; + case XT_RATEEST_MATCH_EQ: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 == bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 == pps2; + break; + } + + ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; + return ret; +} + +static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_rateest_match_info *info = par->matchinfo; + struct xt_rateest *est1, *est2; + int ret = -EINVAL; + + if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | + XT_RATEEST_MATCH_REL)) != 1) + goto err1; + + if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) + goto err1; + + switch (info->mode) { + case XT_RATEEST_MATCH_EQ: + case XT_RATEEST_MATCH_LT: + case XT_RATEEST_MATCH_GT: + break; + default: + goto err1; + } + + ret = -ENOENT; + est1 = xt_rateest_lookup(info->name1); + if (!est1) + goto err1; + + est2 = NULL; + if (info->flags & XT_RATEEST_MATCH_REL) { + est2 = xt_rateest_lookup(info->name2); + if (!est2) + goto err2; + } + + info->est1 = est1; + info->est2 = est2; + return 0; + +err2: + xt_rateest_put(est1); +err1: + return ret; +} + +static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_rateest_match_info *info = par->matchinfo; + + xt_rateest_put(info->est1); + if (info->est2) + xt_rateest_put(info->est2); +} + +static struct xt_match xt_rateest_mt_reg __read_mostly = { + .name = "rateest", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = xt_rateest_mt, + .checkentry = xt_rateest_mt_checkentry, + .destroy = xt_rateest_mt_destroy, + .matchsize = sizeof(struct xt_rateest_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_rateest_mt_init(void) +{ + return xt_register_match(&xt_rateest_mt_reg); +} + +static void __exit xt_rateest_mt_fini(void) +{ + xt_unregister_match(&xt_rateest_mt_reg); +} + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xtables rate estimator match"); +MODULE_ALIAS("ipt_rateest"); +MODULE_ALIAS("ip6t_rateest"); 
+module_init(xt_rateest_mt_init); +module_exit(xt_rateest_mt_fini); diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index c2017f8af9c..459a7b256eb 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -18,44 +18,37 @@ MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("X_tables realm match"); +MODULE_DESCRIPTION("Xtables: Routing realm match"); MODULE_ALIAS("ipt_realm"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +realm_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_realm_info *info = matchinfo; - struct dst_entry *dst = skb->dst; + const struct xt_realm_info *info = par->matchinfo; + const struct dst_entry *dst = skb_dst(skb); return (info->id == (dst->tclassid & info->mask)) ^ info->invert; } -static struct xt_match realm_match = { +static struct xt_match realm_mt_reg __read_mostly = { .name = "realm", - .match = match, + .match = realm_mt, .matchsize = sizeof(struct xt_realm_info), - .hooks = (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN), - .family = AF_INET, + .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), + .family = NFPROTO_UNSPEC, .me = THIS_MODULE }; -static int __init xt_realm_init(void) +static int __init realm_mt_init(void) { - return xt_register_match(&realm_match); + return xt_register_match(&realm_mt_reg); } -static void __exit xt_realm_fini(void) +static void __exit realm_mt_exit(void) { - xt_unregister_match(&realm_match); + xt_unregister_match(&realm_mt_reg); } -module_init(xt_realm_init); -module_exit(xt_realm_fini); +module_init(realm_mt_init); +module_exit(realm_mt_exit); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c new file mode 100644 index 00000000000..a9faae89f95 --- /dev/null +++ b/net/netfilter/xt_recent.c @@ -0,0 +1,740 @@ +/* + * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
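 */

/* The xt_realm match above is a one-line masked compare against the
 * route's tclassid, now registered as NFPROTO_UNSPEC rather than IPv4
 * only. A trivial userspace sketch of the predicate, not part of the
 * patch:
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool realm_match(uint32_t tclassid, uint32_t id, uint32_t mask,
			bool invert)
{
	return (id == (tclassid & mask)) ^ invert;
}

int main(void)
{
	assert(realm_match(0x15, 0x5, 0xf, false));	/* low nibble matches */
	assert(!realm_match(0x15, 0x5, 0xff, false));	/* full compare fails */
	return 0;
}
/*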
+ * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost <sfrost@snowman.net> + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/list.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/bitops.h> +#include <linux/skbuff.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_recent.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_recent"); +MODULE_ALIAS("ip6t_recent"); + +static unsigned int ip_list_tot = 100; +static unsigned int ip_pkt_list_tot = 20; +static unsigned int ip_list_hash_size = 0; +static unsigned int ip_list_perms = 0644; +static unsigned int ip_list_uid = 0; +static unsigned int ip_list_gid = 0; +module_param(ip_list_tot, uint, 0400); +module_param(ip_pkt_list_tot, uint, 0400); +module_param(ip_list_hash_size, uint, 0400); +module_param(ip_list_perms, uint, 0400); +module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR); +module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 
255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files"); + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + union nf_inet_addr addr; + u_int16_t family; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; +}; + +struct recent_table { + struct list_head list; + char name[XT_RECENT_NAME_LEN]; + union nf_inet_addr mask; + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; +}; + +struct recent_net { + struct list_head tables; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *xt_recent; +#endif +}; + +static int recent_net_id; +static inline struct recent_net *recent_pernet(struct net *net) +{ + return net_generic(net, recent_net_id); +} + +static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); + +#ifdef CONFIG_PROC_FS +static const struct file_operations recent_old_fops, recent_mt_fops; +#endif + +static u_int32_t hash_rnd __read_mostly; +static bool hash_rnd_inited __read_mostly; + +static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) +{ + return jhash_1word((__force u32)addr->ip, hash_rnd) & + (ip_list_hash_size - 1); +} + +static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr) +{ + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) & + (ip_list_hash_size - 1); +} + +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, + const union nf_inet_addr *addrp, u_int16_t family, + u_int8_t ttl) +{ + struct recent_entry *e; + unsigned int h; + + if (family == NFPROTO_IPV4) + h = recent_entry_hash4(addrp); + else + h = recent_entry_hash6(addrp); + + list_for_each_entry(e, &table->iphash[h], list) + if (e->family == family && + memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 && + (ttl == e->ttl || ttl == 0 || e->ttl == 0)) + return e; + return NULL; +} + +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} + +/* + * Drop entries with timestamps older then 'time'. + */ +static void recent_entry_reap(struct recent_table *t, unsigned long time) +{ + struct recent_entry *e; + + /* + * The head of the LRU list is always the oldest entry. + */ + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + + /* + * The last time stamp is the most recent. 
+ */ + if (time_after(time, e->stamps[e->index-1])) + recent_entry_remove(t, e); +} + +static struct recent_entry * +recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, + u_int16_t family, u_int8_t ttl) +{ + struct recent_entry *e; + + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); + } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + memcpy(&e->addr, addr, sizeof(e->addr)); + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + e->family = family; + if (family == NFPROTO_IPV4) + list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); + else + list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} + +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->index %= ip_pkt_list_tot; + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + list_move_tail(&e->lru_list, &t->lru_list); +} + +static struct recent_table *recent_table_lookup(struct recent_net *recent_net, + const char *name) +{ + struct recent_table *t; + + list_for_each_entry(t, &recent_net->tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} + +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; + + for (i = 0; i < ip_list_hash_size; i++) + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); +} + +static bool +recent_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + struct recent_net *recent_net = recent_pernet(net); + const struct xt_recent_mtinfo_v1 *info = par->matchinfo; + struct recent_table *t; + struct recent_entry *e; + union nf_inet_addr addr = {}, addr_mask; + u_int8_t ttl; + bool ret = info->invert; + + if (par->family == NFPROTO_IPV4) { + const struct iphdr *iph = ip_hdr(skb); + + if (info->side == XT_RECENT_DEST) + addr.ip = iph->daddr; + else + addr.ip = iph->saddr; + + ttl = iph->ttl; + } else { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (info->side == XT_RECENT_DEST) + memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6)); + else + memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6)); + + ttl = iph->hop_limit; + } + + /* use TTL as seen before forwarding */ + if (par->out != NULL && skb->sk == NULL) + ttl++; + + spin_lock_bh(&recent_lock); + t = recent_table_lookup(recent_net, info->name); + + nf_inet_addr_mask(&addr, &addr_mask, &t->mask); + + e = recent_entry_lookup(t, &addr_mask, par->family, + (info->check_set & XT_RECENT_TTL) ? 
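/* (Note: passing 0 here makes recent_entry_lookup() ignore TTL, via its
 * "ttl == 0 || e->ttl == 0" wildcard arms; with --rttl the packet's TTL,
 * as adjusted above for forwarded traffic, must equal the stored one.) */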
ttl : 0); + if (e == NULL) { + if (!(info->check_set & XT_RECENT_SET)) + goto out; + e = recent_entry_init(t, &addr_mask, par->family, ttl); + if (e == NULL) + par->hotdrop = true; + ret = !ret; + goto out; + } + + if (info->check_set & XT_RECENT_SET) + ret = !ret; + else if (info->check_set & XT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret = !ret; + } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) { + unsigned long time = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(time, e->stamps[i])) + continue; + if (!info->hit_count || ++hits >= info->hit_count) { + ret = !ret; + break; + } + } + + /* info->seconds must be non-zero */ + if (info->check_set & XT_RECENT_REAP) + recent_entry_reap(t, time); + } + + if (info->check_set & XT_RECENT_SET || + (info->check_set & XT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; +} + +static void recent_table_free(void *addr) +{ + kvfree(addr); +} + +static int recent_mt_check(const struct xt_mtchk_param *par, + const struct xt_recent_mtinfo_v1 *info) +{ + struct recent_net *recent_net = recent_pernet(par->net); + struct recent_table *t; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; + kuid_t uid; + kgid_t gid; +#endif + unsigned int i; + int ret = -EINVAL; + size_t sz; + + if (unlikely(!hash_rnd_inited)) { + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_inited = true; + } + if (info->check_set & ~XT_RECENT_VALID_FLAGS) { + pr_info("Unsupported user space flags (%08x)\n", + info->check_set); + return -EINVAL; + } + if (hweight8(info->check_set & + (XT_RECENT_SET | XT_RECENT_REMOVE | + XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1) + return -EINVAL; + if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) && + (info->seconds || info->hit_count || + (info->check_set & XT_RECENT_MODIFIERS))) + return -EINVAL; + if ((info->check_set & XT_RECENT_REAP) && !info->seconds) + return -EINVAL; + if (info->hit_count > ip_pkt_list_tot) { + pr_info("hitcount (%u) is larger than " + "packets to be remembered (%u)\n", + info->hit_count, ip_pkt_list_tot); + return -EINVAL; + } + if (info->name[0] == '\0' || + strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) + return -EINVAL; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(recent_net, info->name); + if (t != NULL) { + t->refcnt++; + ret = 0; + goto out; + } + + sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; + if (sz <= PAGE_SIZE) + t = kzalloc(sz, GFP_KERNEL); + else + t = vzalloc(sz); + if (t == NULL) { + ret = -ENOMEM; + goto out; + } + t->refcnt = 1; + + memcpy(&t->mask, &info->mask, sizeof(t->mask)); + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + uid = make_kuid(&init_user_ns, ip_list_uid); + gid = make_kgid(&init_user_ns, ip_list_gid); + if (!uid_valid(uid) || !gid_valid(gid)) { + recent_table_free(t); + ret = -EINVAL; + goto out; + } + pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, + &recent_mt_fops, t); + if (pde == NULL) { + recent_table_free(t); + ret = -ENOMEM; + goto out; + } + proc_set_user(pde, uid, gid); +#endif + spin_lock_bh(&recent_lock); + list_add_tail(&t->list, &recent_net->tables); + spin_unlock_bh(&recent_lock); + ret = 0; +out: + mutex_unlock(&recent_mutex); + return ret; +} + +static int recent_mt_check_v0(const struct 
xt_mtchk_param *par) +{ + const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo; + struct xt_recent_mtinfo_v1 info_v1; + + /* Copy revision 0 structure to revision 1 */ + memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo)); + /* Set default mask to ensure backward compatible behaviour */ + memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all)); + + return recent_mt_check(par, &info_v1); +} + +static int recent_mt_check_v1(const struct xt_mtchk_param *par) +{ + return recent_mt_check(par, par->matchinfo); +} + +static void recent_mt_destroy(const struct xt_mtdtor_param *par) +{ + struct recent_net *recent_net = recent_pernet(par->net); + const struct xt_recent_mtinfo_v1 *info = par->matchinfo; + struct recent_table *t; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(recent_net, info->name); + if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); + list_del(&t->list); + spin_unlock_bh(&recent_lock); +#ifdef CONFIG_PROC_FS + if (recent_net->xt_recent != NULL) + remove_proc_entry(t->name, recent_net->xt_recent); +#endif + recent_table_flush(t); + recent_table_free(t); + } + mutex_unlock(&recent_mutex); +} + +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + const struct recent_table *table; + unsigned int bucket; +}; + +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(recent_lock) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; + + spin_lock_bh(&recent_lock); + + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) + list_for_each_entry(e, &t->iphash[st->bucket], list) + if (p-- == 0) + return e; + return NULL; +} + +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + const struct recent_entry *e = v; + const struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); +} + +static void recent_seq_stop(struct seq_file *s, void *v) + __releases(recent_lock) +{ + spin_unlock_bh(&recent_lock); +} + +static int recent_seq_show(struct seq_file *seq, void *v) +{ + const struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + if (e->family == NFPROTO_IPV4) + seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u", + &e->addr.ip, e->ttl, e->stamps[i], e->index); + else + seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u", + &e->addr.in6, e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? 
"," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} + +static const struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; + +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct recent_iter_state *st; + + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); + if (st == NULL) + return -ENOMEM; + + st->table = PDE_DATA(inode); + return 0; +} + +static ssize_t +recent_mt_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + struct recent_table *t = PDE_DATA(file_inode(file)); + struct recent_entry *e; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + const char *c = buf; + union nf_inet_addr addr = {}; + u_int16_t family; + bool add, succ; + + if (size == 0) + return 0; + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + + /* Strict protocol! */ + if (*loff != 0) + return -ESPIPE; + switch (*c) { + case '/': /* flush table */ + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return size; + case '-': /* remove address */ + add = false; + break; + case '+': /* add address */ + add = true; + break; + default: + pr_info("Need \"+ip\", \"-ip\" or \"/\"\n"); + return -EINVAL; + } + + ++c; + --size; + if (strnchr(c, size, ':') != NULL) { + family = NFPROTO_IPV6; + succ = in6_pton(c, size, (void *)&addr, '\n', NULL); + } else { + family = NFPROTO_IPV4; + succ = in4_pton(c, size, (void *)&addr, '\n', NULL); + } + + if (!succ) { + pr_info("illegal address written to procfs\n"); + return -EINVAL; + } + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, &addr, family, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, &addr, family, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + /* Note we removed one above */ + *loff += size + 1; + return size + 1; +} + +static const struct file_operations recent_mt_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_mt_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, + .llseek = seq_lseek, +}; + +static int __net_init recent_proc_net_init(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net); + if (!recent_net->xt_recent) + return -ENOMEM; + return 0; +} + +static void __net_exit recent_proc_net_exit(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + struct recent_table *t; + + /* recent_net_exit() is called before recent_mt_destroy(). Make sure + * that the parent xt_recent proc entry is is empty before trying to + * remove it. 
+ */ + spin_lock_bh(&recent_lock); + list_for_each_entry(t, &recent_net->tables, list) + remove_proc_entry(t->name, recent_net->xt_recent); + + recent_net->xt_recent = NULL; + spin_unlock_bh(&recent_lock); + + remove_proc_entry("xt_recent", net->proc_net); +} +#else +static inline int recent_proc_net_init(struct net *net) +{ + return 0; +} + +static inline void recent_proc_net_exit(struct net *net) +{ +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init recent_net_init(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + INIT_LIST_HEAD(&recent_net->tables); + return recent_proc_net_init(net); +} + +static void __net_exit recent_net_exit(struct net *net) +{ + recent_proc_net_exit(net); +} + +static struct pernet_operations recent_net_ops = { + .init = recent_net_init, + .exit = recent_net_exit, + .id = &recent_net_id, + .size = sizeof(struct recent_net), +}; + +static struct xt_match recent_mt_reg[] __read_mostly = { + { + .name = "recent", + .revision = 0, + .family = NFPROTO_IPV4, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check_v0, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 0, + .family = NFPROTO_IPV6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check_v0, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 1, + .family = NFPROTO_IPV4, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo_v1), + .checkentry = recent_mt_check_v1, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 1, + .family = NFPROTO_IPV6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo_v1), + .checkentry = recent_mt_check_v1, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + } +}; + +static int __init recent_mt_init(void) +{ + int err; + + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = register_pernet_subsys(&recent_net_ops); + if (err) + return err; + err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + if (err) + unregister_pernet_subsys(&recent_net_ops); + return err; +} + +static void __exit recent_mt_exit(void) +{ + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + unregister_pernet_subsys(&recent_net_ops); +} + +module_init(recent_mt_init); +module_exit(recent_mt_exit); diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h new file mode 100644 index 00000000000..8fd324116e6 --- /dev/null +++ b/net/netfilter/xt_repldata.h @@ -0,0 +1,47 @@ +/* + * Today's hack: quantum tunneling in structs + * + * 'entries' and 'term' are never anywhere referenced by word in code. In fact, + * they serve as the hanging-off data accessed through repl.data[]. 
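+ * (Concretely: the macro below over-allocates one flat blob in which
+ * the replace header is followed by one ACCEPT-verdict standard entry
+ * per hooked chain plus a terminating error entry; that blob is the
+ * initial ruleset handed to the table registration code.)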
+ */ + +/* tbl has the following structure equivalent, but is C99 compliant: + * struct { + * struct type##_replace repl; + * struct type##_standard entries[nhooks]; + * struct type##_error term; + * } *tbl; + */ + +#define xt_alloc_initial_table(type, typ2) ({ \ + unsigned int hook_mask = info->valid_hooks; \ + unsigned int nhooks = hweight32(hook_mask); \ + unsigned int bytes = 0, hooknum = 0, i = 0; \ + struct { \ + struct type##_replace repl; \ + struct type##_standard entries[]; \ + } *tbl; \ + struct type##_error *term; \ + size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ + __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ + tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \ + if (tbl == NULL) \ + return NULL; \ + term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ + strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + *term = (struct type##_error)typ2##_ERROR_INIT; \ + tbl->repl.valid_hooks = hook_mask; \ + tbl->repl.num_entries = nhooks + 1; \ + tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ + sizeof(struct type##_error); \ + for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ + if (!(hook_mask & 1)) \ + continue; \ + tbl->repl.hook_entry[hooknum] = bytes; \ + tbl->repl.underflow[hooknum] = bytes; \ + tbl->entries[i++] = (struct type##_standard) \ + typ2##_STANDARD_INIT(NF_ACCEPT); \ + bytes += sizeof(struct type##_standard); \ + } \ + tbl; \ +}) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index f86d8d769d4..ef36a56a02c 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -1,7 +1,9 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/sctp/sctp.h> #include <linux/sctp.h> #include <linux/netfilter/x_tables.h> @@ -11,19 +13,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kiran Kumar Immidi"); -MODULE_DESCRIPTION("Match for SCTP protocol packets"); +MODULE_DESCRIPTION("Xtables: SCTP protocol packet match"); MODULE_ALIAS("ipt_sctp"); - -#ifdef DEBUG_SCTP -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) 
-#endif +MODULE_ALIAS("ip6t_sctp"); #define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ || (!!((invflag) & (option)) ^ (cond))) -static int +static bool match_flags(const struct xt_sctp_flag_info *flag_info, const int flag_count, u_int8_t chunktype, @@ -31,182 +28,171 @@ match_flags(const struct xt_sctp_flag_info *flag_info, { int i; - for (i = 0; i < flag_count; i++) { - if (flag_info[i].chunktype == chunktype) { + for (i = 0; i < flag_count; i++) + if (flag_info[i].chunktype == chunktype) return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; - } - } - return 1; + return true; } -static inline int +static inline bool match_packet(const struct sk_buff *skb, unsigned int offset, - const u_int32_t *chunkmap, - int chunk_match_type, - const struct xt_sctp_flag_info *flag_info, - const int flag_count, - int *hotdrop) + const struct xt_sctp_info *info, + bool *hotdrop) { u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; - sctp_chunkhdr_t _sch, *sch; + const sctp_chunkhdr_t *sch; + sctp_chunkhdr_t _sch; + int chunk_match_type = info->chunk_match_type; + const struct xt_sctp_flag_info *flag_info = info->flag_info; + int flag_count = info->flag_count; -#ifdef DEBUG_SCTP +#ifdef DEBUG int i = 0; #endif - if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) { - SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap); - } + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) + SCTP_CHUNKMAP_COPY(chunkmapcopy, info->chunkmap); do { sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); if (sch == NULL || sch->length == 0) { - duprintf("Dropping invalid SCTP packet.\n"); - *hotdrop = 1; - return 0; + pr_debug("Dropping invalid SCTP packet.\n"); + *hotdrop = true; + return false; } +#ifdef DEBUG + pr_debug("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d" + "\tflags: %x\n", + ++i, offset, sch->type, htons(sch->length), + sch->flags); +#endif + offset += WORD_ROUND(ntohs(sch->length)); - duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", - ++i, offset, sch->type, htons(sch->length), sch->flags); - - offset += (ntohs(sch->length) + 3) & ~3; - - duprintf("skb->len: %d\toffset: %d\n", skb->len, offset); + pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); - if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) { + if (SCTP_CHUNKMAP_IS_SET(info->chunkmap, sch->type)) { switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ANY: if (match_flags(flag_info, flag_count, sch->type, sch->flags)) { - return 1; + return true; } break; case SCTP_CHUNK_MATCH_ALL: if (match_flags(flag_info, flag_count, - sch->type, sch->flags)) { + sch->type, sch->flags)) SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); - } break; case SCTP_CHUNK_MATCH_ONLY: if (!match_flags(flag_info, flag_count, - sch->type, sch->flags)) { - return 0; - } + sch->type, sch->flags)) + return false; break; } } else { switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ONLY: - return 0; + return false; } } } while (offset < skb->len); switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ALL: - return SCTP_CHUNKMAP_IS_CLEAR(chunkmap); + return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy); case SCTP_CHUNK_MATCH_ANY: - return 0; + return false; case SCTP_CHUNK_MATCH_ONLY: - return 1; + return true; } /* This will never be reached, but required to stop compiler whine */ - return 0; + return false; } -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +sctp_mt(const 
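/* (Note: with --chunk-types the three modes differ as follows. ALL: every
 * listed chunk type must appear in the packet; ANY: at least one must;
 * ONLY: listed types and no others may appear. E.g. an assumed iptables
 * invocation: "-m sctp --chunk-types any INIT,INIT_ACK".) */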
struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_sctp_info *info = matchinfo; - sctp_sctphdr_t _sh, *sh; + const struct xt_sctp_info *info = par->matchinfo; + const sctp_sctphdr_t *sh; + sctp_sctphdr_t _sh; - if (offset) { - duprintf("Dropping non-first fragment.. FIXME\n"); - return 0; + if (par->fragoff != 0) { + pr_debug("Dropping non-first fragment.. FIXME\n"); + return false; } - sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh); + sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh); if (sh == NULL) { - duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; + pr_debug("Dropping evil TCP offset=0 tinygram.\n"); + par->hotdrop = true; + return false; } - duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); + pr_debug("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); - return SCCHECK(((ntohs(sh->source) >= info->spts[0]) - && (ntohs(sh->source) <= info->spts[1])), + return SCCHECK(ntohs(sh->source) >= info->spts[0] + && ntohs(sh->source) <= info->spts[1], XT_SCTP_SRC_PORTS, info->flags, info->invflags) - && SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) - && (ntohs(sh->dest) <= info->dpts[1])), + && SCCHECK(ntohs(sh->dest) >= info->dpts[0] + && ntohs(sh->dest) <= info->dpts[1], XT_SCTP_DEST_PORTS, info->flags, info->invflags) - && SCCHECK(match_packet(skb, protoff + sizeof (sctp_sctphdr_t), - info->chunkmap, info->chunk_match_type, - info->flag_info, info->flag_count, - hotdrop), + && SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t), + info, &par->hotdrop), XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } -static int -checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int sctp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_sctp_info *info = matchinfo; - - return !(info->flags & ~XT_SCTP_VALID_FLAGS) - && !(info->invflags & ~XT_SCTP_VALID_FLAGS) - && !(info->invflags & ~info->flags) - && ((!(info->flags & XT_SCTP_CHUNK_TYPES)) || - (info->chunk_match_type & - (SCTP_CHUNK_MATCH_ALL - | SCTP_CHUNK_MATCH_ANY - | SCTP_CHUNK_MATCH_ONLY))); + const struct xt_sctp_info *info = par->matchinfo; + + if (info->flags & ~XT_SCTP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~XT_SCTP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~info->flags) + return -EINVAL; + if (!(info->flags & XT_SCTP_CHUNK_TYPES)) + return 0; + if (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL | + SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY)) + return 0; + return -EINVAL; } -static struct xt_match xt_sctp_match[] = { +static struct xt_match sctp_mt_reg[] __read_mostly = { { .name = "sctp", - .family = AF_INET, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV4, + .checkentry = sctp_mt_check, + .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), .proto = IPPROTO_SCTP, .me = THIS_MODULE }, { .name = "sctp", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .family = NFPROTO_IPV6, + .checkentry = sctp_mt_check, + .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), .proto = IPPROTO_SCTP, .me = THIS_MODULE }, }; -static int __init xt_sctp_init(void) +static int __init sctp_mt_init(void) { - return xt_register_matches(xt_sctp_match, ARRAY_SIZE(xt_sctp_match)); + return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); } -static void __exit xt_sctp_fini(void) +static void __exit sctp_mt_exit(void) { - xt_unregister_matches(xt_sctp_match, 
ARRAY_SIZE(xt_sctp_match)); + xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); } -module_init(xt_sctp_init); -module_exit(xt_sctp_fini); +module_init(sctp_mt_init); +module_exit(sctp_mt_exit); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c new file mode 100644 index 00000000000..80c2e2d603e --- /dev/null +++ b/net/netfilter/xt_set.c @@ -0,0 +1,523 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Martin Josefsson <gandalf@wlug.westbo.se> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module which implements the set match and SET target + * for netfilter/iptables. */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("Xtables: IP set match and target module"); +MODULE_ALIAS("xt_SET"); +MODULE_ALIAS("ipt_set"); +MODULE_ALIAS("ip6t_set"); +MODULE_ALIAS("ipt_SET"); +MODULE_ALIAS("ip6t_SET"); + +static inline int +match_set(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, int inv) +{ + if (ip_set_test(index, skb, par, opt)) + inv = !inv; + return inv; +} + +#define ADT_OPT(n, f, d, fs, cfs, t) \ +struct ip_set_adt_opt n = { \ + .family = f, \ + .dim = d, \ + .flags = fs, \ + .cmdflags = cfs, \ + .ext.timeout = t, \ +} + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + +static bool +set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v0 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.u.compat.dim, + info->match_set.u.compat.flags, 0, UINT_MAX); + + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.u.compat.flags & IPSET_INV_MATCH); +} + +static void +compat_flags(struct xt_set_info_v0 *info) +{ + u_int8_t i; + + /* Fill out compatibility data according to enum ip_set_kopt */ + info->u.compat.dim = IPSET_DIM_ZERO; + if (info->u.flags[0] & IPSET_MATCH_INV) + info->u.compat.flags |= IPSET_INV_MATCH; + for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + info->u.compat.dim++; + if (info->u.flags[i] & IPSET_SRC) + info->u.compat.flags |= (1<<info->u.compat.dim); + } +} + +static int +set_match_v0_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match_v0 *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set identified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(par->net, info->match_set.index); + return -ERANGE; + } + + /* Fill out compatibility data */ + compat_flags(&info->match_set); + + return 0; +} + +static void +set_match_v0_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match_v0 *info = par->matchinfo; + + ip_set_nfnl_put(par->net, info->match_set.index); +} + +/* Revision 1 match */ + +static bool 
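/* (Note: revisions 1 and 2 share this handler; revision 2 only adds
 * --return-nomatch handling, and revision 3 below extends the same
 * scheme with per-entry packet/byte counter matching.) */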
+set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, 0, UINT_MAX); + + if (opt.flags & IPSET_RETURN_NOMATCH) + opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; + + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); +} + +static int +set_match_v1_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match_v1 *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set identified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(par->net, info->match_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_match_v1_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match_v1 *info = par->matchinfo; + + ip_set_nfnl_put(par->net, info->match_set.index); +} + +/* Revision 3 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ + switch (info->op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == info->value; + case IPSET_COUNTER_NE: + return counter != info->value; + case IPSET_COUNTER_LT: + return counter < info->value; + case IPSET_COUNTER_GT: + return counter > info->value; + } + return false; +} + +static bool +set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v3 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, info->flags, UINT_MAX); + int ret; + + if (info->packets.op != IPSET_COUNTER_NONE || + info->bytes.op != IPSET_COUNTER_NONE) + opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + + ret = match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); + + if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) + return ret; + + if (!match_counter(opt.ext.packets, &info->packets)) + return 0; + return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v3_checkentry set_match_v1_checkentry +#define set_match_v3_destroy set_match_v1_destroy + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + +static unsigned int +set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v0 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, + info->add_set.u.compat.flags, 0, UINT_MAX); + ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, + info->del_set.u.compat.flags, 0, UINT_MAX); + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + + return XT_CONTINUE; +} + +static int +set_target_v0_checkentry(const struct xt_tgchk_param *par) +{ + struct xt_set_info_target_v0 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != 
IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + return -ENOENT; + } + } + if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || + info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + pr_warning("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + return -ERANGE; + } + + /* Fill out compatibility data */ + compat_flags(&info->add_set); + compat_flags(&info->del_set); + + return 0; +} + +static void +set_target_v0_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v0 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); +} + +/* Revision 1 target */ + +static unsigned int +set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v1 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, 0, UINT_MAX); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + + return XT_CONTINUE; +} + +static int +set_target_v1_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v1 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + return -ENOENT; + } + } + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v1_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v1 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); +} + +/* Revision 2 target */ + +static unsigned int +set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v2 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, 
info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + + return XT_CONTINUE; +} + +#define set_target_v2_checkentry set_target_v1_checkentry +#define set_target_v2_destroy set_target_v1_destroy + +static struct xt_match set_matches[] __read_mostly = { + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 0, + .match = set_match_v0, + .matchsize = sizeof(struct xt_set_info_match_v0), + .checkentry = set_match_v0_checkentry, + .destroy = set_match_v0_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 1, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 1, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + /* --return-nomatch flag support */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 2, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 2, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + /* counters support: update, match */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, +}; + +static struct xt_target set_targets[] __read_mostly = { + { + .name = "SET", + .revision = 0, + .family = NFPROTO_IPV4, + .target = set_target_v0, + .targetsize = sizeof(struct xt_set_info_target_v0), + .checkentry = set_target_v0_checkentry, + .destroy = set_target_v0_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 1, + .family = NFPROTO_IPV4, + .target = set_target_v1, + .targetsize = sizeof(struct xt_set_info_target_v1), + .checkentry = set_target_v1_checkentry, + .destroy = set_target_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 1, + .family = NFPROTO_IPV6, + .target = set_target_v1, + .targetsize = sizeof(struct xt_set_info_target_v1), + .checkentry = set_target_v1_checkentry, + .destroy = set_target_v1_destroy, + .me = THIS_MODULE + }, + /* --timeout and --exist flags support */ + { + .name = "SET", + .revision = 2, + .family = NFPROTO_IPV4, + .target = set_target_v2, + .targetsize = sizeof(struct xt_set_info_target_v2), + .checkentry = set_target_v2_checkentry, + .destroy = set_target_v2_destroy, + .me = 
THIS_MODULE + }, + { + .name = "SET", + .revision = 2, + .family = NFPROTO_IPV6, + .target = set_target_v2, + .targetsize = sizeof(struct xt_set_info_target_v2), + .checkentry = set_target_v2_checkentry, + .destroy = set_target_v2_destroy, + .me = THIS_MODULE + }, +}; + +static int __init xt_set_init(void) +{ + int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches)); + + if (!ret) { + ret = xt_register_targets(set_targets, + ARRAY_SIZE(set_targets)); + if (ret) + xt_unregister_matches(set_matches, + ARRAY_SIZE(set_matches)); + } + return ret; +} + +static void __exit xt_set_fini(void) +{ + xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches)); + xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets)); +} + +module_init(xt_set_init); +module_exit(xt_set_fini); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c new file mode 100644 index 00000000000..1ba67931eb1 --- /dev/null +++ b/net/netfilter/xt_socket.c @@ -0,0 +1,495 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/sock.h> +#include <net/inet_sock.h> +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#define XT_SOCKET_HAVE_IPV6 1 +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/inet6_hashtables.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#endif + +#include <linux/netfilter/xt_socket.h> + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#define XT_SOCKET_HAVE_CONNTRACK 1 +#include <net/netfilter/nf_conntrack.h> +#endif + +static int +extract_icmp4_fields(const struct sk_buff *skb, + u8 *protocol, + __be32 *raddr, + __be32 *laddr, + __be16 *rport, + __be16 *lport) +{ + unsigned int outside_hdrlen = ip_hdrlen(skb); + struct iphdr *inside_iph, _inside_iph; + struct icmphdr *icmph, _icmph; + __be16 *ports, _ports[2]; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return 1; + } + + inside_iph = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr), + sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + + if (inside_iph->protocol != IPPROTO_TCP && + inside_iph->protocol != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr) + + (inside_iph->ihl << 2), + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_iph->protocol; + *laddr = inside_iph->saddr; + *lport = ports[0]; + *raddr = inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +/* "socket" match based redirection (no specific rule) + * =================================================== + * + * There are connections with dynamic endpoints (e.g. 
FTP data + * connection) that the user is unable to add explicit rules + * for. These are taken care of by a generic "socket" rule. It is + * assumed that the proxy application is trusted to open such + * connections without explicit iptables rule (except of course the + * generic 'socket' rule). In this case the following sockets are + * matched in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple + * + * - match: if there's a non-zero bound listener (possibly with a + * non-local address) We don't accept zero-bound listeners, since + * then local services could intercept traffic going through the + * box. + */ +static struct sock * +xt_socket_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return __inet_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + return NULL; +} + +static bool +socket_match(const struct sk_buff *skb, struct xt_action_param *par, + const struct xt_socket_mtinfo1 *info) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk = skb->sk; + __be32 uninitialized_var(daddr), uninitialized_var(saddr); + __be16 uninitialized_var(dport), uninitialized_var(sport); + u8 uninitialized_var(protocol); +#ifdef XT_SOCKET_HAVE_CONNTRACK + struct nf_conn const *ct; + enum ip_conntrack_info ctinfo; +#endif + + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + protocol = iph->protocol; + saddr = iph->saddr; + sport = hp->source; + daddr = iph->daddr; + dport = hp->dest; + + } else if (iph->protocol == IPPROTO_ICMP) { + if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, + &sport, &dport)) + return false; + } else { + return false; + } + +#ifdef XT_SOCKET_HAVE_CONNTRACK + /* Do the lookup with the original socket address in case this is a + * reply packet of an established SNAT-ted connection. */ + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && + ((iph->protocol != IPPROTO_ICMP && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (iph->protocol == IPPROTO_ICMP && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + + daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; + dport = (iph->protocol == IPPROTO_TCP) ? 
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + + if (!sk) + sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, + saddr, daddr, sport, dport, + par->in); + if (sk) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY, + * unless XT_SOCKET_NOWILDCARD is set + */ + wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && + sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->inet_rcv_saddr == 0); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + if (sk != skb->sk) + sock_gen_put(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n", + protocol, &saddr, ntohs(sport), + &daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); + + return (sk != NULL); +} + +static bool +socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + static struct xt_socket_mtinfo1 xt_info_v0 = { + .flags = 0, + }; + + return socket_match(skb, par, &xt_info_v0); +} + +static bool +socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + return socket_match(skb, par, par->matchinfo); +} + +#ifdef XT_SOCKET_HAVE_IPV6 + +static int +extract_icmp6_fields(const struct sk_buff *skb, + unsigned int outside_hdrlen, + int *protocol, + struct in6_addr **raddr, + struct in6_addr **laddr, + __be16 *rport, + __be16 *lport) +{ + struct ipv6hdr *inside_iph, _inside_iph; + struct icmp6hdr *icmph, _icmph; + __be16 *ports, _ports[2]; + u8 inside_nexthdr; + __be16 inside_fragoff; + int inside_hdrlen; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) + return 1; + + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + inside_nexthdr = inside_iph->nexthdr; + + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), + &inside_nexthdr, &inside_fragoff); + if (inside_hdrlen < 0) + return 1; /* hjm: Packet has no/incomplete transport layer headers. 
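
Aside: the post-lookup filtering in socket_match() above is easy to misread, so here is a minimal userspace sketch of just that decision, detached from the sk_buff plumbing. The struct, the function name socket_accepted(), and the two flag values are invented stand-ins (the real bits live in linux/netfilter/xt_socket.h); treat this as an illustration of the logic, not the kernel API.

#include <stdbool.h>
#include <stdint.h>

/* assumed flag values, mirroring xt_socket.h */
#define XT_SOCKET_TRANSPARENT 0x01
#define XT_SOCKET_NOWILDCARD  0x02

struct looked_up_sock {
	bool time_wait;      /* sk->sk_state == TCP_TIME_WAIT */
	uint32_t rcv_saddr;  /* bound local address; 0 == INADDR_ANY */
	bool transparent;    /* IP_TRANSPARENT (or tw_transparent) set */
};

/* true if the match should report this socket, per socket_match() */
static bool socket_accepted(const struct looked_up_sock *sk, uint8_t flags)
{
	/* zero-bound listeners are ignored unless --nowildcard is given */
	bool wildcard = !(flags & XT_SOCKET_NOWILDCARD) &&
			!sk->time_wait && sk->rcv_saddr == 0;
	/* with --transparent, only IP_TRANSPARENT sockets may match */
	bool transparent = !(flags & XT_SOCKET_TRANSPARENT) ||
			   sk->transparent;

	return !wildcard && transparent;
}
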
*/ + + if (inside_nexthdr != IPPROTO_TCP && + inside_nexthdr != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, inside_hdrlen, + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_nexthdr; + *laddr = &inside_iph->saddr; + *lport = ports[0]; + *raddr = &inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +static struct sock * +xt_socket_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return inet6_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + + return NULL; +} + +static bool +socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk = skb->sk; + struct in6_addr *daddr = NULL, *saddr = NULL; + __be16 uninitialized_var(dport), uninitialized_var(sport); + int thoff = 0, uninitialized_var(tproto); + const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) { + hp = skb_header_pointer(skb, thoff, + sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + saddr = &iph->saddr; + sport = hp->source; + daddr = &iph->daddr; + dport = hp->dest; + + } else if (tproto == IPPROTO_ICMPV6) { + if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, + &sport, &dport)) + return false; + } else { + return false; + } + + if (!sk) + sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, + saddr, daddr, sport, dport, + par->in); + if (sk) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY + * unless XT_SOCKET_NOWILDCARD is set + */ + wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && + sk->sk_state != TCP_TIME_WAIT && + ipv6_addr_any(&sk->sk_v6_rcv_saddr)); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + if (sk != skb->sk) + sock_gen_put(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu " + "(orig %pI6:%hu) sock %p\n", + tproto, saddr, ntohs(sport), + daddr, ntohs(dport), + &iph->daddr, hp ? 
ntohs(hp->dest) : 0, sk); + + return (sk != NULL); +} +#endif + +static int socket_mt_v1_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V1) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); + return -EINVAL; + } + return 0; +} + +static int socket_mt_v2_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V2) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); + return -EINVAL; + } + return 0; +} + +static struct xt_match socket_mt_reg[] __read_mostly = { + { + .name = "socket", + .revision = 0, + .family = NFPROTO_IPV4, + .match = socket_mt4_v0, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2, + .checkentry = socket_mt_v1_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2, + .checkentry = socket_mt_v1_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif + { + .name = "socket", + .revision = 2, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2, + .checkentry = socket_mt_v2_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 2, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2, + .checkentry = socket_mt_v2_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init socket_mt_init(void) +{ + nf_defrag_ipv4_enable(); +#ifdef XT_SOCKET_HAVE_IPV6 + nf_defrag_ipv6_enable(); +#endif + + return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); +} + +static void __exit socket_mt_exit(void) +{ + xt_unregister_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); +} + +module_init(socket_mt_init); +module_exit(socket_mt_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); +MODULE_DESCRIPTION("x_tables socket match module"); +MODULE_ALIAS("ipt_socket"); +MODULE_ALIAS("ip6t_socket"); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 149294f7df7..a507922d80c 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -20,80 +20,60 @@ MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module"); MODULE_ALIAS("ipt_state"); MODULE_ALIAS("ip6t_state"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +state_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_state_info *sinfo = matchinfo; + const struct xt_state_info *sinfo = par->matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_untracked(skb)) - statebit = 
XT_STATE_UNTRACKED; - else if (!nf_ct_get(skb, &ctinfo)) + if (!ct) statebit = XT_STATE_INVALID; - else - statebit = XT_STATE_BIT(ctinfo); - + else { + if (nf_ct_is_untracked(ct)) + statebit = XT_STATE_UNTRACKED; + else + statebit = XT_STATE_BIT(ctinfo); + } return (sinfo->statemask & statebit); } -static int check(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int state_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { - printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); - return 0; - } - return 1; + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; } -static void -destroy(const struct xt_match *match, void *matchinfo) +static void state_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_match xt_state_match[] = { - { - .name = "state", - .family = AF_INET, - .checkentry = check, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_state_info), - .me = THIS_MODULE, - }, - { - .name = "state", - .family = AF_INET6, - .checkentry = check, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_state_info), - .me = THIS_MODULE, - }, +static struct xt_match state_mt_reg __read_mostly = { + .name = "state", + .family = NFPROTO_UNSPEC, + .checkentry = state_mt_check, + .match = state_mt, + .destroy = state_mt_destroy, + .matchsize = sizeof(struct xt_state_info), + .me = THIS_MODULE, }; -static int __init xt_state_init(void) +static int __init state_mt_init(void) { - return xt_register_matches(xt_state_match, ARRAY_SIZE(xt_state_match)); + return xt_register_match(&state_mt_reg); } -static void __exit xt_state_fini(void) +static void __exit state_mt_exit(void) { - xt_unregister_matches(xt_state_match, ARRAY_SIZE(xt_state_match)); + xt_unregister_match(&state_mt_reg); } -module_init(xt_state_init); -module_exit(xt_state_fini); +module_init(state_mt_init); +module_exit(state_mt_exit); diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 091a9f89f5d..11de55e7a86 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -12,90 +12,90 @@ #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/net.h> +#include <linux/slab.h> #include <linux/netfilter/xt_statistic.h> #include <linux/netfilter/x_tables.h> +#include <linux/module.h> + +struct xt_statistic_priv { + atomic_t count; +} ____cacheline_aligned_in_smp; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("xtables statistical match module"); +MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); -static DEFINE_SPINLOCK(nth_lock); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, int *hotdrop) +static bool +statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; - int ret = info->flags & XT_STATISTIC_INVERT ? 
1 : 0; + const struct xt_statistic_info *info = par->matchinfo; + bool ret = info->flags & XT_STATISTIC_INVERT; + int nval, oval; switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: - if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) - ret ^= 1; + if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability) + ret = !ret; break; case XT_STATISTIC_MODE_NTH: - info = info->master; - spin_lock_bh(&nth_lock); - if (info->u.nth.count++ == info->u.nth.every) { - info->u.nth.count = 0; - ret ^= 1; - } - spin_unlock_bh(&nth_lock); + do { + oval = atomic_read(&info->master->count); + nval = (oval == info->u.nth.every) ? 0 : oval + 1; + } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); + if (nval == 0) + ret = !ret; break; } return ret; } -static int -checkentry(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int statistic_mt_check(const struct xt_mtchk_param *par) { - struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + struct xt_statistic_info *info = par->matchinfo; if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) - return 0; - info->master = info; - return 1; + return -EINVAL; + + info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); + if (info->master == NULL) + return -ENOMEM; + atomic_set(&info->master->count, info->u.nth.count); + + return 0; +} + +static void statistic_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + + kfree(info->master); } -static struct xt_match xt_statistic_match[] = { - { - .name = "statistic", - .family = AF_INET, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_statistic_info), - .me = THIS_MODULE, - }, - { - .name = "statistic", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .matchsize = sizeof(struct xt_statistic_info), - .me = THIS_MODULE, - }, +static struct xt_match xt_statistic_mt_reg __read_mostly = { + .name = "statistic", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = statistic_mt, + .checkentry = statistic_mt_check, + .destroy = statistic_mt_destroy, + .matchsize = sizeof(struct xt_statistic_info), + .me = THIS_MODULE, }; -static int __init xt_statistic_init(void) +static int __init statistic_mt_init(void) { - return xt_register_matches(xt_statistic_match, - ARRAY_SIZE(xt_statistic_match)); + return xt_register_match(&xt_statistic_mt_reg); } -static void __exit xt_statistic_fini(void) +static void __exit statistic_mt_exit(void) { - xt_unregister_matches(xt_statistic_match, - ARRAY_SIZE(xt_statistic_match)); + xt_unregister_match(&xt_statistic_mt_reg); } -module_init(xt_statistic_init); -module_exit(xt_statistic_fini); +module_init(statistic_mt_init); +module_exit(statistic_mt_exit); diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 999a005dbd0..d3c48b14ab9 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -7,6 +7,7 @@ * published by the Free Software Foundation. 
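
Aside: the conversion of the old spinlock-protected Nth counter into the atomic_cmpxchg() loop in statistic_mt() above is the interesting part of that hunk. A minimal userspace sketch of the same lock-free counter, using C11 atomics in place of the kernel primitives; nth_hit() and the file-scope counter are invented names. It returns true once per (every + 1) calls, matching the kernel's wrap-on-zero convention.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint nth_count;

/* lock-free "match every Nth packet": retry the compare-and-swap
 * until our increment (or wrap to zero) lands without a race */
static bool nth_hit(unsigned int every)
{
	unsigned int oval, nval;

	do {
		oval = atomic_load(&nth_count);
		nval = (oval == every) ? 0 : oval + 1;
	} while (!atomic_compare_exchange_weak(&nth_count, &oval, nval));

	return nval == 0;	/* wrapped -> this is the Nth call */
}
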
*/ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> @@ -16,93 +17,80 @@ #include <linux/textsearch.h> MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); -MODULE_DESCRIPTION("IP tables string match module"); +MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); -static int match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +string_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_string_info *conf = matchinfo; + const struct xt_string_info *conf = par->matchinfo; struct ts_state state; + bool invert; memset(&state, 0, sizeof(struct ts_state)); + invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, conf->to_offset, conf->config, &state) - != UINT_MAX) ^ conf->invert; + != UINT_MAX) ^ invert; } -#define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m) +#define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) -static int checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int string_mt_check(const struct xt_mtchk_param *par) { - struct xt_string_info *conf = matchinfo; + struct xt_string_info *conf = par->matchinfo; struct ts_config *ts_conf; + int flags = TS_AUTOLOAD; /* Damn, can't handle this case properly with iptables... */ if (conf->from_offset > conf->to_offset) - return 0; + return -EINVAL; if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') - return 0; + return -EINVAL; if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) - return 0; + return -EINVAL; + if (conf->u.v1.flags & + ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) + return -EINVAL; + if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE) + flags |= TS_IGNORECASE; ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, - GFP_KERNEL, TS_AUTOLOAD); + GFP_KERNEL, flags); if (IS_ERR(ts_conf)) - return 0; + return PTR_ERR(ts_conf); conf->config = ts_conf; - - return 1; + return 0; } -static void destroy(const struct xt_match *match, void *matchinfo) +static void string_mt_destroy(const struct xt_mtdtor_param *par) { - textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); + textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); } -static struct xt_match xt_string_match[] = { - { - .name = "string", - .family = AF_INET, - .checkentry = checkentry, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_string_info), - .me = THIS_MODULE - }, - { - .name = "string", - .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .destroy = destroy, - .matchsize = sizeof(struct xt_string_info), - .me = THIS_MODULE - }, +static struct xt_match xt_string_mt_reg __read_mostly = { + .name = "string", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = string_mt_check, + .match = string_mt, + .destroy = string_mt_destroy, + .matchsize = sizeof(struct xt_string_info), + .me = THIS_MODULE, }; -static int __init xt_string_init(void) +static int __init string_mt_init(void) { - return xt_register_matches(xt_string_match, ARRAY_SIZE(xt_string_match)); + return xt_register_match(&xt_string_mt_reg); } -static void __exit xt_string_fini(void) +static void __exit string_mt_exit(void) { - 
xt_unregister_matches(xt_string_match, ARRAY_SIZE(xt_string_match)); + xt_unregister_match(&xt_string_mt_reg); } -module_init(xt_string_init); -module_exit(xt_string_fini); +module_init(string_mt_init); +module_exit(string_mt_exit); diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 80571d0749f..c53d4d18ead 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -20,27 +20,23 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables TCP MSS match module"); +MODULE_DESCRIPTION("Xtables: TCP MSS match"); MODULE_ALIAS("ipt_tcpmss"); +MODULE_ALIAS("ip6t_tcpmss"); -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool +tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_tcpmss_match_info *info = matchinfo; - struct tcphdr _tcph, *th; + const struct xt_tcpmss_match_info *info = par->matchinfo; + const struct tcphdr *th; + struct tcphdr _tcph; /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - u8 _opt[15 * 4 - sizeof(_tcph)], *op; + const u_int8_t *op; + u8 _opt[15 * 4 - sizeof(_tcph)]; unsigned int i, optlen; /* If we don't have the whole header, drop packet. */ - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) goto dropit; @@ -53,7 +49,7 @@ match(const struct sk_buff *skb, goto out; /* Truncated options. */ - op = skb_header_pointer(skb, protoff + sizeof(*th), optlen, _opt); + op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt); if (op == NULL) goto dropit; @@ -77,39 +73,38 @@ out: return info->invert; dropit: - *hotdrop = 1; - return 0; + par->hotdrop = true; + return false; } -static struct xt_match xt_tcpmss_match[] = { +static struct xt_match tcpmss_mt_reg[] __read_mostly = { { .name = "tcpmss", - .family = AF_INET, - .match = match, + .family = NFPROTO_IPV4, + .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "tcpmss", - .family = AF_INET6, - .match = match, + .family = NFPROTO_IPV6, + .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, }; -static int __init xt_tcpmss_init(void) +static int __init tcpmss_mt_init(void) { - return xt_register_matches(xt_tcpmss_match, - ARRAY_SIZE(xt_tcpmss_match)); + return xt_register_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); } -static void __exit xt_tcpmss_fini(void) +static void __exit tcpmss_mt_exit(void) { - xt_unregister_matches(xt_tcpmss_match, ARRAY_SIZE(xt_tcpmss_match)); + xt_unregister_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); } -module_init(xt_tcpmss_init); -module_exit(xt_tcpmss_fini); +module_init(tcpmss_mt_init); +module_exit(tcpmss_mt_exit); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 46414b562a1..c14d4645daa 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -1,3 +1,4 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> @@ -10,7 +11,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> -MODULE_DESCRIPTION("x_tables match for TCP and UDP(-Lite), supports IPv4 and IPv6"); +MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); 
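
Aside: the tcpmss hunk above elides the middle of the option-walking loop, so here is a self-contained sketch of the same idea: scan a TCP option block for the MSS option and pull out its 16-bit value. This is a slightly more defensive variant than the kernel loop (it validates every length byte); find_mss() is an invented helper for illustration.

#include <stdbool.h>
#include <stdint.h>

#define TCPOPT_EOL  0
#define TCPOPT_NOP  1
#define TCPOPT_MSS  2
#define TCPOLEN_MSS 4

/* 'op' points just past the fixed 20-byte TCP header; 'optlen' is
 * doff*4 - 20. Returns true and stores the MSS if the option exists. */
static bool find_mss(const uint8_t *op, unsigned int optlen, uint16_t *mss)
{
	unsigned int i;

	for (i = 0; i < optlen; ) {
		if (op[i] == TCPOPT_EOL)
			return false;
		if (op[i] == TCPOPT_NOP) {
			i++;
			continue;
		}
		/* every other option carries a length byte; reject
		 * truncated or bogus encodings outright */
		if (i + 1 >= optlen || op[i + 1] < 2 || i + op[i + 1] > optlen)
			return false;
		if (op[i] == TCPOPT_MSS && op[i + 1] == TCPOLEN_MSS) {
			*mss = (uint16_t)((op[i + 2] << 8) | op[i + 3]);
			return true;
		}
		i += op[i + 1];
	}
	return false;
}
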
MODULE_LICENSE("GPL"); MODULE_ALIAS("xt_tcp"); MODULE_ALIAS("xt_udp"); @@ -19,36 +20,27 @@ MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - - /* Returns 1 if the port is matched by the range, 0 otherwise */ -static inline int -port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +static inline bool +port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) { - int ret; - - ret = (port >= min && port <= max) ^ invert; - return ret; + return (port >= min && port <= max) ^ invert; } -static int +static bool tcp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, unsigned int optlen, - int invert, - int *hotdrop) + bool invert, + bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + const u_int8_t *op; + u_int8_t _opt[60 - sizeof(struct tcphdr)]; unsigned int i; - duprintf("tcp_match: finding option\n"); + pr_debug("finding option\n"); if (!optlen) return invert; @@ -57,8 +49,8 @@ tcp_find_option(u_int8_t option, op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), optlen, _opt); if (op == NULL) { - *hotdrop = 1; - return 0; + *hotdrop = true; + return false; } for (i = 0; i < optlen; ) { @@ -70,109 +62,89 @@ tcp_find_option(u_int8_t option, return invert; } -static int -tcp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct tcphdr _tcph, *th; - const struct xt_tcp *tcpinfo = matchinfo; + const struct tcphdr *th; + struct tcphdr _tcph; + const struct xt_tcp *tcpinfo = par->matchinfo; - if (offset) { + if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ - if (offset == 1) { - duprintf("Dropping evil TCP offset=1 frag.\n"); - *hotdrop = 1; + if (par->fragoff == 1) { + pr_debug("Dropping evil TCP offset=1 frag.\n"); + par->hotdrop = true; } /* Must not be a fragment. */ - return 0; + return false; } -#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) +#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. 
*/ - duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; + pr_debug("Dropping evil TCP offset=0 tinygram.\n"); + par->hotdrop = true; + return false; } if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], ntohs(th->source), !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) - return 0; + return false; if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) - return 0; + return false; if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp, XT_TCP_INV_FLAGS)) - return 0; + return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { - *hotdrop = 1; - return 0; + par->hotdrop = true; + return false; } - if (!tcp_find_option(tcpinfo->option, skb, protoff, + if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, - hotdrop)) - return 0; + &par->hotdrop)) + return false; } - return 1; + return true; } -/* Called when user tries to insert an entry of this type. */ -static int -tcp_checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int tcp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_tcp *tcpinfo = matchinfo; + const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(tcpinfo->invflags & ~XT_TCP_INV_MASK); + return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; } -static int -udp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct udphdr _udph, *uh; - const struct xt_udp *udpinfo = matchinfo; + const struct udphdr *uh; + struct udphdr _udph; + const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) - return 0; + if (par->fragoff != 0) + return false; - uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); + uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ - duprintf("Dropping evil UDP tinygram.\n"); - *hotdrop = 1; - return 0; + pr_debug("Dropping evil UDP tinygram.\n"); + par->hotdrop = true; + return false; } return port_match(udpinfo->spts[0], udpinfo->spts[1], @@ -183,87 +155,80 @@ udp_match(const struct sk_buff *skb, !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } -/* Called when user tries to insert an entry of this type. */ -static int -udp_checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int udp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_tcp *udpinfo = matchinfo; + const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(udpinfo->invflags & ~XT_UDP_INV_MASK); + return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? 
-EINVAL : 0; } -static struct xt_match xt_tcpudp_match[] = { +static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", - .family = AF_INET, - .checkentry = tcp_checkentry, - .match = tcp_match, + .family = NFPROTO_IPV4, + .checkentry = tcp_mt_check, + .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "tcp", - .family = AF_INET6, - .checkentry = tcp_checkentry, - .match = tcp_match, + .family = NFPROTO_IPV6, + .checkentry = tcp_mt_check, + .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "udp", - .family = AF_INET, - .checkentry = udp_checkentry, - .match = udp_match, + .family = NFPROTO_IPV4, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udp", - .family = AF_INET6, - .checkentry = udp_checkentry, - .match = udp_match, + .family = NFPROTO_IPV6, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udplite", - .family = AF_INET, - .checkentry = udp_checkentry, - .match = udp_match, + .family = NFPROTO_IPV4, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "udplite", - .family = AF_INET6, - .checkentry = udp_checkentry, - .match = udp_match, + .family = NFPROTO_IPV6, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, }; -static int __init xt_tcpudp_init(void) +static int __init tcpudp_mt_init(void) { - return xt_register_matches(xt_tcpudp_match, - ARRAY_SIZE(xt_tcpudp_match)); + return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } -static void __exit xt_tcpudp_fini(void) +static void __exit tcpudp_mt_exit(void) { - xt_unregister_matches(xt_tcpudp_match, ARRAY_SIZE(xt_tcpudp_match)); + xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } -module_init(xt_tcpudp_init); -module_exit(xt_tcpudp_fini); +module_init(tcpudp_mt_init); +module_exit(tcpudp_mt_exit); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c new file mode 100644 index 00000000000..0ae55a36f49 --- /dev/null +++ b/net/netfilter/xt_time.c @@ -0,0 +1,291 @@ +/* + * xt_time + * Copyright © CC Computer Consultants GmbH, 2007 + * + * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> + * This is a module which is used for time matching + * It is using some modified code from dietlibc (localtime() function) + * that you can find at http://www.fefe.de/dietlibc/ + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. 
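
Aside: tcp_mt_check() and udp_mt_check() above both reduce to the same recurring checkentry idiom: reject any option bit this revision does not know about, so that newer userspace flags fail loudly instead of being silently ignored. A generic sketch of that pattern, with entirely made-up flag names:

#include <errno.h>
#include <stdint.h>

#define XT_EXAMPLE_FLAG_A 0x01
#define XT_EXAMPLE_FLAG_B 0x02
#define XT_EXAMPLE_ALL    (XT_EXAMPLE_FLAG_A | XT_EXAMPLE_FLAG_B)

static int example_mt_check(uint8_t flags)
{
	if (flags & ~XT_EXAMPLE_ALL)
		return -EINVAL;	/* unknown flag bits -> refuse the rule */
	return 0;
}
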
+ */ +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_time.h> + +struct xtm { + u_int8_t month; /* (1-12) */ + u_int8_t monthday; /* (1-31) */ + u_int8_t weekday; /* (1-7) */ + u_int8_t hour; /* (0-23) */ + u_int8_t minute; /* (0-59) */ + u_int8_t second; /* (0-59) */ + unsigned int dse; +}; + +extern struct timezone sys_tz; /* ouch */ + +static const u_int16_t days_since_year[] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, +}; + +static const u_int16_t days_since_leapyear[] = { + 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, +}; + +/* + * Since time progresses forward, it is best to organize this array in reverse, + * to minimize lookup time. + */ +enum { + DSE_FIRST = 2039, + SECONDS_PER_DAY = 86400, +}; +static const u_int16_t days_since_epoch[] = { + /* 2039 - 2030 */ + 25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915, + /* 2029 - 2020 */ + 21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262, + /* 2019 - 2010 */ + 17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610, + /* 2009 - 2000 */ + 14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957, + /* 1999 - 1990 */ + 10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305, + /* 1989 - 1980 */ + 6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652, + /* 1979 - 1970 */ + 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, +}; + +static inline bool is_leap(unsigned int y) +{ + return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); +} + +/* + * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. + * Since we match against days and daytime, the SSTE value needs to be + * computed back into human-readable dates. + * + * This is done in three separate functions so that the most expensive + * calculations are done last, in case a "simple match" can be found earlier. + */ +static inline unsigned int localtime_1(struct xtm *r, time_t time) +{ + unsigned int v, w; + + /* Each day has 86400s, so finding the hour/minute is actually easy. */ + v = time % SECONDS_PER_DAY; + r->second = v % 60; + w = v / 60; + r->minute = w % 60; + r->hour = w / 60; + return v; +} + +static inline void localtime_2(struct xtm *r, time_t time) +{ + /* + * Here comes the rest (weekday, monthday). First, divide the SSTE + * by seconds-per-day to get the number of _days_ since the epoch. + */ + r->dse = time / 86400; + + /* + * 1970-01-01 (w=0) was a Thursday (4). + * -1 and +1 map Sunday properly onto 7. + */ + r->weekday = (4 + r->dse - 1) % 7 + 1; +} + +static void localtime_3(struct xtm *r, time_t time) +{ + unsigned int year, i, w = r->dse; + + /* + * In each year, a certain number of days-since-the-epoch have passed. + * Find the year that is closest to said days. + * + * Consider, for example, w=21612 (2029-03-04). Loop will abort on + * dse[i] <= w, which happens when dse[i] == 21550. This implies + * year == 2009. w will then be 62. + */ + for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w; + ++i, --year) + /* just loop */; + + w -= days_since_epoch[i]; + + /* + * By now we have the current year, and the day of the year. + * r->yearday = w; + * + * On to finding the month (like above). In each month, a certain + * number of days-since-New Year have passed, and find the closest + * one. + * + * Consider w=62 (in a non-leap year). Loop will abort on + * dsy[i] < w, which happens when dsy[i] == 31+28 (i == 2). 
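
Aside: localtime_1() and localtime_2() above do the cheap part of the date decomposition (time of day and weekday) without any tables. A userspace re-derivation you can compile and compare against gmtime(), to convince yourself of the "+4, -1, +1" weekday trick (1970-01-01 was a Thursday, and the shift maps Sunday onto 7 rather than 0). The sample timestamp is arbitrary.

#include <stdio.h>
#include <time.h>

int main(void)
{
	time_t t = 1234567890;          /* 2009-02-13 23:31:30 UTC, a Friday */
	unsigned int v = t % 86400;     /* seconds into the day */
	unsigned int sec = v % 60, min = (v / 60) % 60, hour = v / 3600;
	unsigned int dse = t / 86400;   /* days since the epoch */
	unsigned int wday = (4 + dse - 1) % 7 + 1;   /* Mon=1 .. Sun=7 */

	printf("%02u:%02u:%02u, weekday %u\n", hour, min, sec, wday);

	struct tm *tm = gmtime(&t);     /* cross-check: tm_wday is 0=Sun..6=Sat */
	printf("gmtime says %02d:%02d:%02d, tm_wday %d\n",
	       tm->tm_hour, tm->tm_min, tm->tm_sec, tm->tm_wday);
	return 0;
}
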
+ * Concludes i == 2, i.e. 3rd month => March. + * + * (A different approach to use would be to subtract a monthlength + * from w repeatedly while counting.) + */ + if (is_leap(year)) { + /* use days_since_leapyear[] in a leap year */ + for (i = ARRAY_SIZE(days_since_leapyear) - 1; + i > 0 && days_since_leapyear[i] > w; --i) + /* just loop */; + r->monthday = w - days_since_leapyear[i] + 1; + } else { + for (i = ARRAY_SIZE(days_since_year) - 1; + i > 0 && days_since_year[i] > w; --i) + /* just loop */; + r->monthday = w - days_since_year[i] + 1; + } + + r->month = i + 1; +} + +static bool +time_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_time_info *info = par->matchinfo; + unsigned int packet_time; + struct xtm current_time; + s64 stamp; + + /* + * We cannot use get_seconds() instead of __net_timestamp() here. + * Suppose you have two rules: + * 1. match before 13:00 + * 2. match after 13:00 + * If you match against processing time (get_seconds) it + * may happen that the same packet matches both rules if + * it arrived at the right moment before 13:00. + */ + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + stamp = ktime_to_ns(skb->tstamp); + stamp = div_s64(stamp, NSEC_PER_SEC); + + if (info->flags & XT_TIME_LOCAL_TZ) + /* Adjust for local timezone */ + stamp -= 60 * sys_tz.tz_minuteswest; + + /* + * xt_time will match when _all_ of the following hold: + * - 'now' is in the global time range date_start..date_end + * - 'now' is in the monthday mask + * - 'now' is in the weekday mask + * - 'now' is in the daytime range time_start..time_end + * (and by default, libxt_time will set these so as to match) + */ + + if (stamp < info->date_start || stamp > info->date_stop) + return false; + + packet_time = localtime_1(¤t_time, stamp); + + if (info->daytime_start < info->daytime_stop) { + if (packet_time < info->daytime_start || + packet_time > info->daytime_stop) + return false; + } else { + if (packet_time < info->daytime_start && + packet_time > info->daytime_stop) + return false; + + /** if user asked to ignore 'next day', then e.g. + * '1 PM Wed, August 1st' should be treated + * like 'Tue 1 PM July 31st'. + * + * This also causes + * 'Monday, "23:00 to 01:00", to match for 2 hours, starting + * Monday 23:00 to Tuesday 01:00. 
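
Aside: the daytime comparison above has two shapes depending on whether the configured range crosses midnight, and the inverted else-branch condition is easy to get backwards. A minimal sketch of the same test, restated positively (times are seconds since midnight; in_daytime() is an invented name):

#include <stdbool.h>

static bool in_daytime(unsigned int now, unsigned int start, unsigned int stop)
{
	if (start < stop)                    /* normal range, e.g. 09:00..17:00 */
		return now >= start && now <= stop;
	return now >= start || now <= stop;  /* wraps midnight, e.g. 23:00..01:00 */
}

The kernel writes the wrapped case as "fail if (now < start && now > stop)", which is the same predicate with De Morgan applied.
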
+ */ + if ((info->flags & XT_TIME_CONTIGUOUS) && + packet_time <= info->daytime_stop) + stamp -= SECONDS_PER_DAY; + } + + localtime_2(¤t_time, stamp); + + if (!(info->weekdays_match & (1 << current_time.weekday))) + return false; + + /* Do not spend time computing monthday if all days match anyway */ + if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) { + localtime_3(¤t_time, stamp); + if (!(info->monthdays_match & (1 << current_time.monthday))) + return false; + } + + return true; +} + +static int time_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_time_info *info = par->matchinfo; + + if (info->daytime_start > XT_TIME_MAX_DAYTIME || + info->daytime_stop > XT_TIME_MAX_DAYTIME) { + pr_info("invalid argument - start or " + "stop time greater than 23:59:59\n"); + return -EDOM; + } + + if (info->flags & ~XT_TIME_ALL_FLAGS) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS); + return -EINVAL; + } + + if ((info->flags & XT_TIME_CONTIGUOUS) && + info->daytime_start < info->daytime_stop) + return -EINVAL; + + return 0; +} + +static struct xt_match xt_time_mt_reg __read_mostly = { + .name = "time", + .family = NFPROTO_UNSPEC, + .match = time_mt, + .checkentry = time_mt_check, + .matchsize = sizeof(struct xt_time_info), + .me = THIS_MODULE, +}; + +static int __init time_mt_init(void) +{ + int minutes = sys_tz.tz_minuteswest; + + if (minutes < 0) /* east of Greenwich */ + printk(KERN_INFO KBUILD_MODNAME + ": kernel timezone is +%02d%02d\n", + -minutes / 60, -minutes % 60); + else /* west of Greenwich */ + printk(KERN_INFO KBUILD_MODNAME + ": kernel timezone is -%02d%02d\n", + minutes / 60, minutes % 60); + + return xt_register_match(&xt_time_mt_reg); +} + +static void __exit time_mt_exit(void) +{ + xt_unregister_match(&xt_time_mt_reg); +} + +module_init(time_mt_init); +module_exit(time_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: time-based matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_time"); +MODULE_ALIAS("ip6t_time"); diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c new file mode 100644 index 00000000000..a95b50342db --- /dev/null +++ b/net/netfilter/xt_u32.c @@ -0,0 +1,123 @@ +/* + * xt_u32 - kernel module to match u32 packet content + * + * Original author: Don Cohen <don@isis.cs3-inc.com> + * (C) CC Computer Consultants GmbH, 2007 + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_u32.h> + +static bool u32_match_it(const struct xt_u32 *data, + const struct sk_buff *skb) +{ + const struct xt_u32_test *ct; + unsigned int testind; + unsigned int nnums; + unsigned int nvals; + unsigned int i; + __be32 n; + u_int32_t pos; + u_int32_t val; + u_int32_t at; + + /* + * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17" + * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands. 
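
Aside: working the comment's example by hand makes the u32 bytecode less opaque. The first clause, "0 >> 28 = 4", loads the 32-bit word at offset 0 of the packet, shifts it right 28 bits, and compares against the value range 4:4; on an IPv4 packet that word starts with the version nibble. A standalone sketch against a made-up 20-byte header (version 4, IHL 5):

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
	const uint8_t pkt[20] = { 0x45, 0x00, 0x00, 0x14 /* rest zeroed */ };
	uint32_t val;

	memcpy(&val, pkt, 4);   /* stands in for skb_copy_bits() */
	val = ntohl(val);       /* u32 operates on host-order words */
	val >>= 28;             /* XT_U32_RIGHTSH by 28 */
	assert(val == 4);       /* falls in the value range 4:4 -> clause matches */
	return 0;
}
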
+ */ + for (testind = 0; testind < data->ntests; ++testind) { + ct = &data->tests[testind]; + at = 0; + pos = ct->location[0].number; + + if (skb->len < 4 || pos > skb->len - 4) + return false; + + if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0) + BUG(); + val = ntohl(n); + nnums = ct->nnums; + + /* Inner loop runs over "&", "<<", ">>" and "@" operands */ + for (i = 1; i < nnums; ++i) { + u_int32_t number = ct->location[i].number; + switch (ct->location[i].nextop) { + case XT_U32_AND: + val &= number; + break; + case XT_U32_LEFTSH: + val <<= number; + break; + case XT_U32_RIGHTSH: + val >>= number; + break; + case XT_U32_AT: + if (at + val < at) + return false; + at += val; + pos = number; + if (at + 4 < at || skb->len < at + 4 || + pos > skb->len - at - 4) + return false; + + if (skb_copy_bits(skb, at + pos, &n, + sizeof(n)) < 0) + BUG(); + val = ntohl(n); + break; + } + } + + /* Run over the "," and ":" operands */ + nvals = ct->nvalues; + for (i = 0; i < nvals; ++i) + if (ct->value[i].min <= val && val <= ct->value[i].max) + break; + + if (i >= ct->nvalues) + return false; + } + + return true; +} + +static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_u32 *data = par->matchinfo; + bool ret; + + ret = u32_match_it(data, skb); + return ret ^ data->invert; +} + +static struct xt_match xt_u32_mt_reg __read_mostly = { + .name = "u32", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = u32_mt, + .matchsize = sizeof(struct xt_u32), + .me = THIS_MODULE, +}; + +static int __init u32_mt_init(void) +{ + return xt_register_match(&xt_u32_mt_reg); +} + +static void __exit u32_mt_exit(void) +{ + xt_unregister_match(&xt_u32_mt_reg); +} + +module_init(u32_mt_init); +module_exit(u32_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: arbitrary byte matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_u32"); +MODULE_ALIAS("ip6t_u32"); |
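
Aside: nearly every file in this series follows the same skeleton: a match callback, an xt_match descriptor, and register/unregister calls in module init/exit. Reduced to its bones it looks like the sketch below, written against the xt_match API as it appears in this diff; the "example" names are invented, and this is an illustration of the pattern rather than a buildable out-of-tree module recipe for current kernels.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

static bool example_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	return true;	/* match everything; real logic goes here */
}

static struct xt_match example_mt_reg __read_mostly = {
	.name      = "example",
	.revision  = 0,
	.family    = NFPROTO_UNSPEC,	/* one registration covers IPv4 and IPv6 */
	.match     = example_mt,
	.matchsize = 0,			/* no per-rule match data */
	.me        = THIS_MODULE,
};

static int __init example_mt_init(void)
{
	return xt_register_match(&example_mt_reg);
}

static void __exit example_mt_exit(void)
{
	xt_unregister_match(&example_mt_reg);
}

module_init(example_mt_init);
module_exit(example_mt_exit);
MODULE_LICENSE("GPL");
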