Diffstat (limited to 'net/netfilter')
170 files changed, 32820 insertions, 10020 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f8ac4ef0b79..e9410d17619 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -103,6 +103,16 @@ config NF_CONNTRACK_EVENTS If unsure, say `N'. +config NF_CONNTRACK_TIMEOUT + bool 'Connection tracking timeout' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timeout + extension. This allows you to attach timeout policies to flows + via the CT target. + + If unsure, say `N'. + config NF_CONNTRACK_TIMESTAMP bool 'Connection tracking timestamping' depends on NETFILTER_ADVANCED @@ -114,9 +124,14 @@ config NF_CONNTRACK_TIMESTAMP If unsure, say `N'. +config NF_CONNTRACK_LABELS + bool + help + This option enables support for assigning user-defined flag bits + to connection tracking entries. It is selected by the connlabel match. + config NF_CT_PROTO_DCCP - tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate 'DCCP protocol connection tracking support' depends on NETFILTER_ADVANCED default IP_DCCP help @@ -129,8 +144,7 @@ config NF_CT_PROTO_GRE tristate config NF_CT_PROTO_SCTP - tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default IP_SCTP help @@ -271,8 +285,7 @@ config NF_CONNTRACK_PPTP To compile it as a module, choose M here. If unsure, say N. config NF_CONNTRACK_SANE - tristate "SANE protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "SANE protocol support" depends on NETFILTER_ADVANCED help SANE is a protocol for remote access to scanners as implemented @@ -314,22 +327,212 @@ config NF_CT_NETLINK help This option enables support for a netlink-based userspace interface -endif # NF_CONNTRACK +config NF_CT_NETLINK_TIMEOUT + tristate 'Connection tracking timeout tuning via Netlink' + select NETFILTER_NETLINK + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timeout + fine-grain tuning. This allows you to attach specific timeout + policies to flows, instead of using the global timeout policy. -# transparent proxy support -config NETFILTER_TPROXY - tristate "Transparent proxying support (EXPERIMENTAL)" - depends on EXPERIMENTAL - depends on IP_NF_MANGLE + If unsure, say `N'. + +config NF_CT_NETLINK_HELPER + tristate 'Connection tracking helpers in user-space via Netlink' + select NETFILTER_NETLINK + depends on NF_CT_NETLINK + depends on NETFILTER_NETLINK_QUEUE + depends on NETFILTER_NETLINK_QUEUE_CT depends on NETFILTER_ADVANCED help - This option enables transparent proxying support, that is, - support for handling non-locally bound IPv4 TCP and UDP sockets. - For it to work you will have to configure certain iptables rules - and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. + This option enables the user-space connection tracking helpers + infrastructure. - To compile it as a module, choose M here. If unsure, say N. + If unsure, say `N'. + +config NETFILTER_NETLINK_QUEUE_CT + bool "NFQUEUE integration with Connection Tracking" + default n + depends on NETFILTER_NETLINK_QUEUE + help + If this option is enabled, NFQUEUE can include Connection Tracking + information together with the packet if it is enqueued via NFNETLINK.
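The new bool symbols above (NF_CONNTRACK_TIMEOUT, NF_CONNTRACK_LABELS, NETFILTER_NETLINK_QUEUE_CT) compile code in or out rather than building separate modules. A minimal sketch of how such a symbol is typically consumed on the C side, assuming a hypothetical example_setup() caller; IS_ENABLED() is the generic <linux/kconfig.h> helper, not something this patch adds:

    #include <linux/kconfig.h>

    static void example_setup(void)
    {
        /* IS_ENABLED(CONFIG_FOO) is 1 when FOO=y (or =m for tristates),
         * so the disabled branch is discarded at compile time. */
        if (IS_ENABLED(CONFIG_NF_CONNTRACK_TIMEOUT)) {
            /* attach/initialize the timeout extension here */
        }
    }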
+ +config NF_NAT + tristate + +config NF_NAT_NEEDED + bool + depends on NF_NAT + default y + +config NF_NAT_PROTO_DCCP + tristate + depends on NF_NAT && NF_CT_PROTO_DCCP + default NF_NAT && NF_CT_PROTO_DCCP + +config NF_NAT_PROTO_UDPLITE + tristate + depends on NF_NAT && NF_CT_PROTO_UDPLITE + default NF_NAT && NF_CT_PROTO_UDPLITE + +config NF_NAT_PROTO_SCTP + tristate + default NF_NAT && NF_CT_PROTO_SCTP + depends on NF_NAT && NF_CT_PROTO_SCTP + select LIBCRC32C + +config NF_NAT_AMANDA + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_AMANDA + +config NF_NAT_FTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_FTP + +config NF_NAT_IRC + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_IRC + +config NF_NAT_SIP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_SIP + +config NF_NAT_TFTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_TFTP + +config NETFILTER_SYNPROXY + tristate + +endif # NF_CONNTRACK + +config NF_TABLES + select NETFILTER_NETLINK + tristate "Netfilter nf_tables support" + help + nftables is the new packet classification framework that intends to + replace the existing {ip,ip6,arp,eb}_tables infrastructure. It + provides a pseudo-state machine with an extensible instruction-set + (also known as expressions) that the userspace 'nft' utility + (http://www.netfilter.org/projects/nftables) uses to build the + rule-set. It also comes with the generic set infrastructure that + allows you to construct mappings between matchings and actions + for performance lookups. + + To compile it as a module, choose M here. + +config NF_TABLES_INET + depends on NF_TABLES && IPV6 + select NF_TABLES_IPV4 + select NF_TABLES_IPV6 + tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" + help + This option enables support for a mixed IPv4/IPv6 "inet" table. + +config NFT_EXTHDR + depends on NF_TABLES + tristate "Netfilter nf_tables IPv6 exthdr module" + help + This option adds the "exthdr" expression that you can use to match + IPv6 extension headers. + +config NFT_META + depends on NF_TABLES + tristate "Netfilter nf_tables meta module" + help + This option adds the "meta" expression that you can use to match and + to set packet metainformation such as the packet mark. + +config NFT_CT + depends on NF_TABLES + depends on NF_CONNTRACK + tristate "Netfilter nf_tables conntrack module" + help + This option adds the "ct" expression that you can use to match + connection tracking information such as the flow state. + +config NFT_RBTREE + depends on NF_TABLES + tristate "Netfilter nf_tables rbtree set module" + help + This option adds the "rbtree" set type (Red Black tree) that is used + to build interval-based sets. + +config NFT_HASH + depends on NF_TABLES + tristate "Netfilter nf_tables hash set module" + help + This option adds the "hash" set type that is used to build one-way + mappings between matchings and actions. + +config NFT_COUNTER + depends on NF_TABLES + tristate "Netfilter nf_tables counter module" + help + This option adds the "counter" expression that you can use to + include packet and byte counters in a rule. + +config NFT_LOG + depends on NF_TABLES + tristate "Netfilter nf_tables log module" + help + This option adds the "log" expression that you can use to log + packets matching some criteria.
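The "rbtree" set type above exists to answer interval queries. Its lookup maps onto the kernel's generic red-black tree API; a hedged sketch with a hypothetical my_interval element (not nft_rbtree's actual layout), assuming stored ranges do not overlap:

    #include <linux/rbtree.h>
    #include <linux/types.h>

    struct my_interval {
        struct rb_node node;
        u32 from, to;               /* closed range [from, to] */
    };

    /* Descend comparing against both bounds; O(log n) per lookup. */
    static struct my_interval *my_interval_lookup(struct rb_root *root, u32 key)
    {
        struct rb_node *n = root->rb_node;

        while (n) {
            struct my_interval *e = rb_entry(n, struct my_interval, node);

            if (key < e->from)
                n = n->rb_left;
            else if (key > e->to)
                n = n->rb_right;
            else
                return e;           /* key falls inside the interval */
        }
        return NULL;
    }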
+ +config NFT_LIMIT + depends on NF_TABLES + tristate "Netfilter nf_tables limit module" + help + This option adds the "limit" expression that you can use to + ratelimit rule matchings. + +config NFT_NAT + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables nat module" + help + This option adds the "nat" expression that you can use to perform + typical Network Address Translation (NAT) packet transformations. + +config NFT_QUEUE + depends on NF_TABLES + depends on NETFILTER_XTABLES + depends on NETFILTER_NETLINK_QUEUE + tristate "Netfilter nf_tables queue module" + help + This is required if you intend to use the userspace queueing + infrastructure (also known as NFQUEUE) from nftables. + +config NFT_REJECT + depends on NF_TABLES + default m if NETFILTER_ADVANCED=n + tristate "Netfilter nf_tables reject support" + help + This option adds the "reject" expression that you can use to + explicitly deny unallowed traffic and notify the sender via + TCP reset or ICMP informational errors. + +config NFT_REJECT_INET + depends on NF_TABLES_INET + default NFT_REJECT + tristate + +config NFT_COMPAT + depends on NF_TABLES + depends on NETFILTER_XTABLES + tristate "Netfilter x_tables over nf_tables module" + help + This is required if you intend to use any of the existing + x_tables match/target extensions over the nf_tables + framework. config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" @@ -488,6 +691,21 @@ config NETFILTER_XT_TARGET_HL since you can easily create immortal packets that loop forever on the network. +config NETFILTER_XT_TARGET_HMARK + tristate '"HMARK" target support' + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HMARK" target. + + The target allows you to create rules in the "raw" and "mangle" tables + which set the skbuff mark by means of hash calculation within a given + range. The nfmark can influence the routing method (see "Use netfilter + MARK value as routing key") and can also be used by other subsystems to + change their behaviour. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_IDLETIMER tristate "IDLETIMER target support" depends on NETFILTER_ADVANCED @@ -524,6 +742,15 @@ config NETFILTER_XT_TARGET_LED For more information on the LEDs available on your system, see Documentation/leds/leds-class.txt +config NETFILTER_XT_TARGET_LOG + tristate "LOG target support" + default m if NETFILTER_ADVANCED=n + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which record the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' depends on NETFILTER_ADVANCED @@ -533,6 +760,16 @@ config NETFILTER_XT_TARGET_MARK (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). +config NETFILTER_XT_TARGET_NETMAP + tristate '"NETMAP" target support' + depends on NF_NAT + ---help--- + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_NFLOG tristate '"NFLOG" target support' default m if NETFILTER_ADVANCED=n @@ -556,17 +793,11 @@ config NETFILTER_XT_TARGET_NFQUEUE To compile it as a module, choose M here. If unsure, say N.
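The HMARK help above describes the mark as a hash over packet fields folded into a configured range. That idea, reduced to a sketch with the kernel's jhash; xt_HMARK's real field selection and folding differ, so the names and the modulus/offset scheme here are illustrative only:

    #include <linux/jhash.h>
    #include <linux/types.h>

    /* Fold a hash of (saddr, daddr, ports) into [offset, offset + modulus);
     * modulus must be non-zero. The result would be written to skb->mark. */
    static u32 example_hmark(u32 saddr, u32 daddr, u32 ports,
                             u32 modulus, u32 offset)
    {
        u32 h = jhash_3words(saddr, daddr, ports, 0);

        return (h % modulus) + offset;
    }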
config NETFILTER_XT_TARGET_NOTRACK - tristate '"NOTRACK" target support' - depends on IP_NF_RAW || IP6_NF_RAW + tristate '"NOTRACK" target support (DEPRECATED)' depends on NF_CONNTRACK - help - The NOTRACK target allows a select rule to specify - which packets *not* to enter the conntrack/NAT - subsystem with all the consequences (no ICMP error tracking, - no protocol helpers for the selected packets). - - If you want to compile it as a module, say M here and read - <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_CT config NETFILTER_XT_TARGET_RATEEST tristate '"RATEEST" target support' @@ -578,6 +809,17 @@ config NETFILTER_XT_TARGET_RATEEST To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_REDIRECT + tristate "REDIRECT target support" + depends on NF_NAT + ---help--- + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED @@ -588,11 +830,10 @@ config NETFILTER_XT_TARGET_TEE this clone be rerouted to another nexthop. config NETFILTER_XT_TARGET_TPROXY - tristate '"TPROXY" target support (EXPERIMENTAL)' - depends on EXPERIMENTAL - depends on NETFILTER_TPROXY + tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help @@ -600,6 +841,9 @@ config NETFILTER_XT_TARGET_TPROXY REDIRECT. It can only be used in the mangle table and is useful to redirect traffic to a transparent proxy. It does _not_ depend on Netfilter connection tracking and NAT, unlike REDIRECT. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. To compile it as a module, choose M here. If unsure, say N. @@ -653,8 +897,7 @@ config NETFILTER_XT_TARGET_TCPMSS To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_TCPOPTSTRIP - tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate '"TCPOPTSTRIP" target support' depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -675,6 +918,25 @@ config NETFILTER_XT_MATCH_ADDRTYPE If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_BPF + tristate '"bpf" match support' + depends on NETFILTER_ADVANCED + help + BPF matching applies a linux socket filter to each packet and + accepts those for which the filter returns non-zero. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_CGROUP + tristate '"control group" match support' + depends on NETFILTER_ADVANCED + depends on CGROUPS + select CGROUP_NET_CLASSID + ---help--- + Socket/process control group matching allows you to match locally + generated packets based on which net_cls control group processes + belong to. 
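The "bpf" match added above consumes a classic BPF (socket filter) program and accepts packets for which it returns non-zero. A minimal sketch of such a program as userspace would supply it, using the <linux/filter.h> instruction macros; the xt_bpf attachment plumbing is omitted and example_prog is hypothetical:

    #include <linux/filter.h>

    /* One instruction: return 0xffff, i.e. accept the packet. A real
     * filter would first load and test packet bytes. */
    static struct sock_filter example_prog[] = {
        BPF_STMT(BPF_RET | BPF_K, 0xffff),
    };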
+ config NETFILTER_XT_MATCH_CLUSTER tristate '"cluster" match support' depends on NF_CONNTRACK @@ -712,8 +974,21 @@ config NETFILTER_XT_MATCH_CONNBYTES If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_CONNLABEL + tristate '"connlabel" match support' + select NF_CONNTRACK_LABELS + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This match allows you to test and assign userspace-defined label names + to a connection. The kernel only stores bit values - mapping + names to bits is done by userspace. + + Unlike connmark, more than 32 flag bits may be assigned to a + connection simultaneously. + config NETFILTER_XT_MATCH_CONNLIMIT - tristate '"connlimit" match support"' + tristate '"connlimit" match support' depends on NF_CONNTRACK depends on NETFILTER_ADVANCED ---help--- @@ -839,6 +1114,15 @@ config NETFILTER_XT_MATCH_HL in the IPv6 header, or the time-to-live field in the IPv4 header of the packet. +config NETFILTER_XT_MATCH_IPCOMP + tristate '"ipcomp" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of CPIs (16 bits) + inside the IPComp header of IPsec packets. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_IPRANGE tristate '"iprange" address range match support' depends on NETFILTER_ADVANCED @@ -859,6 +1143,16 @@ config NETFILTER_XT_MATCH_IPVS If unsure, say N. +config NETFILTER_XT_MATCH_L2TP + tristate '"l2tp" match support' + depends on NETFILTER_ADVANCED + default L2TP + ---help--- + This option adds an "L2TP" match, which allows you to match against + L2TP protocol header fields. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' depends on NETFILTER_ADVANCED @@ -1015,8 +1309,7 @@ config NETFILTER_XT_MATCH_RECENT Official Website: <http://snowman.net/projects/ipt_recent/> config NETFILTER_XT_MATCH_SCTP - tristate '"sctp" protocol match support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate '"sctp" protocol match support' depends on NETFILTER_ADVANCED default IP_SCTP help @@ -1028,12 +1321,11 @@ config NETFILTER_XT_MATCH_SCTP <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
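The connlabel help above stresses that the kernel stores only bits and userspace owns the name-to-bit mapping, with room for more than connmark's 32 bits. That storage model, sketched with the generic bitmap helpers; example_labels is hypothetical, not nf_conntrack_labels' real extension layout:

    #include <linux/bitmap.h>
    #include <linux/bitops.h>
    #include <linux/types.h>

    #define EXAMPLE_LABEL_BITS 128      /* beyond connmark's 32-bit limit */

    struct example_labels {
        DECLARE_BITMAP(bits, EXAMPLE_LABEL_BITS);
    };

    /* The match only tests a bit number; what the bit means is decided
     * entirely by the userspace name mapping. */
    static bool example_label_match(const struct example_labels *l,
                                    unsigned int bit)
    {
        return test_bit(bit, l->bits);
    }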
config NETFILTER_XT_MATCH_SOCKET - tristate '"socket" match support (EXPERIMENTAL)' - depends on EXPERIMENTAL - depends on NETFILTER_TPROXY + tristate '"socket" match support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK + depends on (IPV6 || IPV6=n) select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 40f4c3d636c..bffdad774da 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,13 +1,17 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o obj-$(CONFIG_NETFILTER) = netfilter.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o +nfnetlink_queue-y := nfnetlink_queue_core.o +nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o @@ -22,6 +26,8 @@ obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o # netlink interface for nf_conntrack obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o +obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o +obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o # connection tracking helpers nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o @@ -38,8 +44,46 @@ obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o -# transparent proxy support -obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o +nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ + nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o + +obj-$(CONFIG_NF_NAT) += nf_nat.o + +# NAT protocols (nf_nat) +obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o +obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o +obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o + +# NAT helpers +obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o +obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o +obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o +obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o +obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o + +# SYNPROXY +obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o + +# nf_tables +nf_tables-objs += nf_tables_core.o nf_tables_api.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o + +obj-$(CONFIG_NF_TABLES) += nf_tables.o +obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o +obj-$(CONFIG_NFT_COMPAT) += nft_compat.o +obj-$(CONFIG_NFT_EXTHDR) += 
nft_exthdr.o +obj-$(CONFIG_NFT_META) += nft_meta.o +obj-$(CONFIG_NFT_CT) += nft_ct.o +obj-$(CONFIG_NFT_LIMIT) += nft_limit.o +obj-$(CONFIG_NFT_NAT) += nft_nat.o +obj-$(CONFIG_NFT_QUEUE) += nft_queue.o +obj-$(CONFIG_NFT_REJECT) += nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o +obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o +obj-$(CONFIG_NFT_HASH) += nft_hash.o +obj-$(CONFIG_NFT_COUNTER) += nft_counter.o +obj-$(CONFIG_NFT_LOG) += nft_log.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o @@ -48,6 +92,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o +obj-$(CONFIG_NF_NAT) += xt_nat.o # targets obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o @@ -57,11 +102,14 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o -obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o +obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o @@ -72,9 +120,11 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o @@ -86,8 +136,10 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o @@ -95,6 +147,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b30..1fbab0cdd30 100644 --- a/net/netfilter/core.c 
+++ b/net/netfilter/core.c @@ -5,6 +5,7 @@ * way. * * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/netfilter.h> @@ -29,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); +const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_ipv6_ops); int nf_register_afinfo(const struct nf_afinfo *afinfo) { @@ -56,7 +59,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +80,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +92,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } @@ -126,7 +129,7 @@ unsigned int nf_iterate(struct list_head *head, unsigned int hook, const struct net_device *indev, const struct net_device *outdev, - struct list_head **i, + struct nf_hook_ops **elemp, int (*okfn)(struct sk_buff *), int hook_thresh) { @@ -136,22 +139,20 @@ unsigned int nf_iterate(struct list_head *head, * The caller must not block between calls to this * function because of risk of continuing from deleted element. */ - list_for_each_continue_rcu(*i, head) { - struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; - - if (hook_thresh > elem->priority) + list_for_each_entry_continue_rcu((*elemp), head, list) { + if (hook_thresh > (*elemp)->priority) continue; /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ repeat: - verdict = elem->hook(hook, skb, indev, outdev, okfn); + verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - elem->hook, hook); + (*elemp)->hook, hook); continue; } #endif @@ -172,14 +173,14 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, int (*okfn)(struct sk_buff *), int hook_thresh) { - struct list_head *elem; + struct nf_hook_ops *elem; unsigned int verdict; int ret = 0; /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = &nf_hooks[pf][hook]; + elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list); next_hook: verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, outdev, &elem, okfn, hook_thresh); @@ -233,12 +234,13 @@ EXPORT_SYMBOL(skb_make_writable); /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. 
*/ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; +void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) + __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { - void (*attach)(struct sk_buff *, struct sk_buff *); + void (*attach)(struct sk_buff *, const struct sk_buff *); if (skb->nfct) { rcu_read_lock(); @@ -264,38 +266,65 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_unlock(); } EXPORT_SYMBOL(nf_conntrack_destroy); + +struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfq_ct_hook); + +struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); + #endif /* CONFIG_NF_CONNTRACK */ +#ifdef CONFIG_NF_NAT_NEEDED +void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); +EXPORT_SYMBOL(nf_nat_decode_session_hook); +#endif + +static int __net_init netfilter_net_init(struct net *net) +{ #ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_net_netfilter; -EXPORT_SYMBOL(proc_net_netfilter); + net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", + net->proc_net); + if (!net->nf.proc_netfilter) { + if (!net_eq(net, &init_net)) + pr_err("cannot create netfilter proc entry"); + + return -ENOMEM; + } #endif + return 0; +} + +static void __net_exit netfilter_net_exit(struct net *net) +{ + remove_proc_entry("netfilter", net->proc_net); +} + +static struct pernet_operations netfilter_net_ops = { + .init = netfilter_net_init, + .exit = netfilter_net_exit, +}; -void __init netfilter_init(void) +int __init netfilter_init(void) { - int i, h; + int i, h, ret; + for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); } -#ifdef CONFIG_PROC_FS - proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); - if (!proc_net_netfilter) - panic("cannot create netfilter proc entry"); -#endif + ret = register_pernet_subsys(&netfilter_net_ops); + if (ret < 0) + goto err; - if (netfilter_queue_init() < 0) - panic("cannot initialize nf_queue"); - if (netfilter_log_init() < 0) - panic("cannot initialize nf_log"); -} + ret = netfilter_log_init(); + if (ret < 0) + goto err_pernet; -#ifdef CONFIG_SYSCTL -struct ctl_path nf_net_netfilter_sysctl_path[] = { - { .procname = "net", }, - { .procname = "netfilter", }, - { } -}; -EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); -#endif /* CONFIG_SYSCTL */ + return 0; +err_pernet: + unregister_pernet_subsys(&netfilter_net_ops); +err: + return ret; +} diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index ba36c283d83..2f7f5c32c6f 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -1,7 +1,7 @@ menuconfig IP_SET tristate "IP set support" depends on INET && NETFILTER - depends on NETFILTER_NETLINK + select NETFILTER_NETLINK help This option adds IP set support to the kernel. In order to define and use the sets, you need the userspace utility @@ -21,7 +21,7 @@ config IP_SET_MAX You can define here default value of the maximum number of IP sets for the kernel. - The value can be overriden by the 'max_sets' module + The value can be overridden by the 'max_sets' module parameter of the 'ip_set' module. config IP_SET_BITMAP_IP @@ -61,6 +61,15 @@ config IP_SET_HASH_IP To compile it as a module, choose M here. If unsure, say N. 
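The core.c hunk above replaces the global /proc/net/netfilter setup with per-network-namespace init/exit callbacks registered through register_pernet_subsys(). The pattern generalizes; a minimal sketch with hypothetical example_* names, mirroring the netfilter_net_ops usage in the patch:

    #include <linux/init.h>
    #include <net/net_namespace.h>

    static int __net_init example_net_init(struct net *net)
    {
        /* allocate per-namespace state; called for every new netns */
        return 0;
    }

    static void __net_exit example_net_exit(struct net *net)
    {
        /* release per-namespace state; called when the netns is torn down */
    }

    static struct pernet_operations example_net_ops = {
        .init = example_net_init,
        .exit = example_net_exit,
    };

    static int __init example_module_init(void)
    {
        return register_pernet_subsys(&example_net_ops);
    }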
+config IP_SET_HASH_IPMARK + tristate "hash:ip,mark set support" + depends on IP_SET + help + This option adds the hash:ip,mark set type support, by which one + can store IPv4/IPv6 address and mark pairs. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_IPPORT tristate "hash:ip,port set support" depends on IP_SET @@ -90,6 +99,15 @@ config IP_SET_HASH_IPPORTNET To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_NETPORTNET + tristate "hash:net,port,net set support" + depends on IP_SET + help + This option adds the hash:net,port,net set type support, by which + one can store two IPv4/IPv6 subnets, and a protocol/port in a set. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_NET tristate "hash:net set support" depends on IP_SET @@ -99,6 +117,15 @@ config IP_SET_HASH_NET To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_NETNET + tristate "hash:net,net set support" + depends on IP_SET + help + This option adds the hash:net,net set type support, by which + one can store IPv4/IPv6 network address/prefix pairs in a set. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_NETPORT tristate "hash:net,port set support" depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 6e965ecd544..231f10196cb 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -14,12 +14,15 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o # hash types obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o +obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o +obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o # list types obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h new file mode 100644 index 00000000000..f2c7d83dc23 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -0,0 +1,289 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __IP_SET_BITMAP_IP_GEN_H +#define __IP_SET_BITMAP_IP_GEN_H + +#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) +#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) +#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) +#define mtype_do_add IPSET_TOKEN(MTYPE, _do_add) +#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_do_del IPSET_TOKEN(MTYPE, _do_del) +#define mtype_do_list IPSET_TOKEN(MTYPE, _do_list) +#define mtype_do_head IPSET_TOKEN(MTYPE, _do_head) +#define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem) +#define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) +#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) +#define mtype_flush IPSET_TOKEN(MTYPE, _flush) +#define mtype_head IPSET_TOKEN(MTYPE, _head) +#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) +#define mtype_elem IPSET_TOKEN(MTYPE, _elem) +#define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_add IPSET_TOKEN(MTYPE, _add) +#define mtype_del IPSET_TOKEN(MTYPE, _del) +#define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype MTYPE + +#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id)) + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct mtype *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&map->gc); +} + +static void +mtype_ext_cleanup(struct ip_set *set) +{ + struct mtype *map = set->data; + u32 id; + + for (id = 0; id < map->elements; id++) + if (test_bit(id, map->members)) + ip_set_ext_destroy(set, get_ext(set, map, id)); +} + +static void +mtype_destroy(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + if (set->dsize) { + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + ip_set_free(map->extensions); + } + kfree(map); + + set->data = NULL; +} + +static void +mtype_flush(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + memset(map->members, 0, map->memsize); +} + +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct mtype *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (mtype_do_head(skb, map) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + + map->memsize + + set->dsize * map->elements))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(set, map, e->id); + int ret = mtype_do_test(e, map, set->dsize); + + if (ret <= 0) + return ret; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, set))) + return 0; + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(x, set), ext, mext, flags); + return 1; +} + +static int 
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(set, map, e->id); + int ret = mtype_do_add(e, map, flags, set->dsize); + + if (ret == IPSET_ADD_FAILED) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, set))) + ret = 0; + else if (!(flags & IPSET_FLAG_EXIST)) + return -IPSET_ERR_EXIST; + /* Element is re-added, cleanup extensions */ + ip_set_ext_destroy(set, x); + } + + if (SET_WITH_TIMEOUT(set)) +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); +#else + ip_set_timeout_set(ext_timeout(x, set), ext->timeout); +#endif + + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(x, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(x, set), ext); + return 0; +} + +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(set, map, e->id); + + if (mtype_do_del(e, map)) + return -IPSET_ERR_EXIST; + + ip_set_ext_destroy(set, x); + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, set))) + return -IPSET_ERR_EXIST; + + return 0; +} + +#ifndef IP_SET_BITMAP_STORED_TIMEOUT +static inline bool +mtype_is_filled(const struct mtype_elem *x) +{ + return true; +} +#endif + +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mtype *map = set->data; + struct nlattr *adt, *nested; + void *x; + u32 id, first = cb->args[IPSET_CB_ARG0]; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[IPSET_CB_ARG0] < map->elements; + cb->args[IPSET_CB_ARG0]++) { + id = cb->args[IPSET_CB_ARG0]; + x = get_ext(set, map, id); + if (!test_bit(id, map->members) || + (SET_WITH_TIMEOUT(set) && +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_is_filled((const struct mtype_elem *) x) && +#endif + ip_set_timeout_expired(ext_timeout(x, set)))) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_do_list(skb, map, id, set->dsize)) + goto nla_put_failure; + if (ip_set_put_extensions(skb, set, x, + mtype_is_filled((const struct mtype_elem *) x))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[IPSET_CB_ARG0] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + if (unlikely(id == first)) { + cb->args[IPSET_CB_ARG0] = 0; + return -EMSGSIZE; + } + ipset_nest_end(skb, adt); + return 0; +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct mtype *map = set->data; + void *x; + u32 id; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id < map->elements; id++) + if (mtype_gc_test(id, map, set->dsize)) { + x = get_ext(set, map, id); + if (ip_set_timeout_expired(ext_timeout(x, set))) { + clear_bit(id, map->members); + ip_set_ext_destroy(set, x); + } + } + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&map->gc); +} + +static const struct 
ip_set_type_variant mtype = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .same_set = mtype_same_set, +}; + +#endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e3e73997c3b..6f1f9f49480 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -24,28 +24,35 @@ #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> -#define IP_SET_BITMAP_TIMEOUT -#include <linux/netfilter/ipset/ip_set_timeout.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counter support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("bitmap:ip type of IP sets"); +IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); +#define MTYPE bitmap_ip + /* Type structure */ struct bitmap_ip { void *members; /* the set members */ + void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ u32 hosts; /* number of hosts in a subnet */ size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ - u32 timeout; /* timeout parameter */ struct timer_list gc; /* garbage collection */ }; -/* Base variant */ +/* ADT structure for generic function args */ +struct bitmap_ip_adt_elem { + u16 id; +}; static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) @@ -53,186 +60,69 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; } -static int -bitmap_ip_test(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - const struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - return !!test_bit(id, map->members); -} - -static int -bitmap_ip_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - if (test_and_set_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; -} +/* Common functions */ -static int -bitmap_ip_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, + struct bitmap_ip *map, size_t dsize) { - struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - if (!test_and_clear_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; + return !!test_bit(e->id, map->members); } -static int -bitmap_ip_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) { - const struct bitmap_ip *map = set->data; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - - atd = 
ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] < map->elements; cb->args[2]++) { - id = cb->args[2]; - if (!test_bit(id, map->members)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id * map->hosts)); - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return !!test_bit(id, map->members); } -/* Timeout variant */ - -static int -bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, + u32 flags, size_t dsize) { - const struct bitmap_ip *map = set->data; - const unsigned long *members = map->members; - u16 id = *(u16 *)value; - - return ip_set_timeout_test(members[id]); + return !!test_and_set_bit(e->id, map->members); } -static int -bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - - if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) - return -IPSET_ERR_EXIST; - - members[id] = ip_set_timeout_set(timeout); - - return 0; + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, + size_t dsize) { - struct bitmap_ip *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - int ret = -IPSET_ERR_EXIST; - - if (ip_set_timeout_test(members[id])) - ret = 0; - - members[id] = IPSET_ELEM_UNSET; - return ret; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); } -static int -bitmap_ip_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { - const struct bitmap_ip *map = set->data; - struct nlattr *adt, *nested; - u32 id, first = cb->args[2]; - const unsigned long *members = map->members; - - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!adt) - return -EMSGSIZE; - for (; cb->args[2] < map->elements; cb->args[2]++) { - id = cb->args[2]; - if (!ip_set_timeout_test(members[id])) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id * map->hosts)); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(members[id]))); - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, adt); - - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, adt); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, 
IPSET_ATTR_IP_TO, htonl(map->last_ip)) || + (map->netmask != 32 && + nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)); } static int bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; - ip = ip_to_id(map, ip); + e.id = ip_to_id(map, ip); - return adtfn(set, &ip, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -241,33 +131,31 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 timeout = map->timeout; - u32 ip, ip_to, id; + u32 ip = 0, ip_to = 0; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - if (adt == IPSET_TEST) { - id = ip_to_id(map, ip); - return adtfn(set, &id, timeout, flags); + e.id = ip_to_id(map, ip); + return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_IP_TO]) { @@ -282,7 +170,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr > 32) + if (!cidr || cidr > 32) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } else @@ -292,8 +180,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { - id = ip_to_id(map, ip); - ret = adtfn(set, &id, timeout, flags); + e.id = ip_to_id(map, ip); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -303,53 +191,6 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static void -bitmap_ip_destroy(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_ip_flush(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - memset(map->members, 0, map->memsize); -} - -static int -bitmap_ip_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_ip *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, 
htonl(map->last_ip)); - if (map->netmask != 32) - NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); - NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->memsize)); - if (with_timeout(map->timeout)) - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -359,70 +200,16 @@ bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) return x->first_ip == y->first_ip && x->last_ip == y->last_ip && x->netmask == y->netmask && - x->timeout == y->timeout; + a->timeout == b->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_ip = { - .kadt = bitmap_ip_kadt, - .uadt = bitmap_ip_uadt, - .adt = { - [IPSET_ADD] = bitmap_ip_add, - [IPSET_DEL] = bitmap_ip_del, - [IPSET_TEST] = bitmap_ip_test, - }, - .destroy = bitmap_ip_destroy, - .flush = bitmap_ip_flush, - .head = bitmap_ip_head, - .list = bitmap_ip_list, - .same_set = bitmap_ip_same_set, -}; +/* Plain variant */ -static const struct ip_set_type_variant bitmap_tip = { - .kadt = bitmap_ip_kadt, - .uadt = bitmap_ip_uadt, - .adt = { - [IPSET_ADD] = bitmap_ip_tadd, - [IPSET_DEL] = bitmap_ip_tdel, - [IPSET_TEST] = bitmap_ip_ttest, - }, - .destroy = bitmap_ip_destroy, - .flush = bitmap_ip_flush, - .head = bitmap_ip_head, - .list = bitmap_ip_tlist, - .same_set = bitmap_ip_same_set, +struct bitmap_ip_elem { }; -static void -bitmap_ip_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_ip *map = set->data; - unsigned long *table = map->members; - u32 id; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id < map->elements; id++) - if (ip_set_timeout_expired(table[id])) - table[id] = IPSET_ELEM_UNSET; - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} - -static void -bitmap_ip_gc_init(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_ip_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ @@ -434,29 +221,39 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (set->dsize) { + map->extensions = ip_set_alloc(set->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; map->hosts = hosts; map->netmask = netmask; - map->timeout = IPSET_NO_TIMEOUT; + set->timeout = IPSET_NO_TIMEOUT; set->data = map; - set->family = AF_INET; + set->family = NFPROTO_IPV4; return true; } static int -bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], + u32 flags) { struct bitmap_ip *map; - u32 first_ip, last_ip, hosts, elements; + u32 first_ip = 0, last_ip = 0, hosts; + u64 elements; u8 netmask = 32; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + 
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); @@ -494,7 +291,7 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (netmask == 32) { hosts = 1; - elements = last_ip - first_ip + 1; + elements = (u64)last_ip - first_ip + 1; } else { u8 mask_bits; u32 mask; @@ -512,35 +309,24 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; - pr_debug("hosts %u, elements %u\n", hosts, elements); + pr_debug("hosts %u, elements %llu\n", + hosts, (unsigned long long)elements); map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return -ENOMEM; + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ip; + set->dsize = ip_set_elem_len(set, tb, 0); + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } if (tb[IPSET_ATTR_TIMEOUT]) { - map->memsize = elements * sizeof(unsigned long); - - if (!init_map_ip(set, map, first_ip, last_ip, - elements, hosts, netmask)) { - kfree(map); - return -ENOMEM; - } - - map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - set->variant = &bitmap_tip; - - bitmap_ip_gc_init(set); - } else { - map->memsize = bitmap_bytes(0, elements - 1); - - if (!init_map_ip(set, map, first_ip, last_ip, - elements, hosts, netmask)) { - kfree(map); - return -ENOMEM; - } - - set->variant = &bitmap_ip; + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + bitmap_ip_gc_init(set, bitmap_ip_gc); } return 0; } @@ -550,9 +336,9 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP, .dimension = IPSET_DIM_ONE, - .family = AF_INET, - .revision_min = 0, - .revision_max = 0, + .family = NFPROTO_IPV4, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_ip_create, .create_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -560,6 +346,7 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -567,6 +354,9 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 56096f54497..740eabededd 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,334 +23,196 @@ #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include 
<linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counter support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("bitmap:ip,mac type of IP sets"); +IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); +#define MTYPE bitmap_ipmac +#define IP_SET_BITMAP_STORED_TIMEOUT + enum { - MAC_EMPTY, /* element is not set */ - MAC_FILLED, /* element is set with MAC */ MAC_UNSET, /* element is set, without MAC */ + MAC_FILLED, /* element is set with MAC */ }; /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ + void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ - u32 timeout; /* timeout value */ + u32 elements; /* number of max elements in the set */ + size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ - size_t dsize; /* size of element */ }; /* ADT structure for generic function args */ -struct ipmac { - u32 id; /* id in array */ - unsigned char *ether; /* ethernet address */ +struct bitmap_ipmac_adt_elem { + u16 id; + unsigned char *ether; }; -/* Member element without and with timeout */ - -struct ipmac_elem { +struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; - unsigned char match; + unsigned char filled; } __attribute__ ((aligned)); -struct ipmac_telem { - unsigned char ether[ETH_ALEN]; - unsigned char match; - unsigned long timeout; -} __attribute__ ((aligned)); - -static inline void * -bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id) +static inline u32 +ip_to_id(const struct bitmap_ipmac *m, u32 ip) { - return (void *)((char *)map->members + id * map->dsize); + return ip - m->first_ip; } -static inline bool -bitmap_timeout(const struct bitmap_ipmac *map, u32 id) +static inline struct bitmap_ipmac_elem * +get_elem(void *extensions, u16 id, size_t dsize) { - const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); - - return ip_set_timeout_test(elem->timeout); + return (struct bitmap_ipmac_elem *)(extensions + id * dsize); } -static inline bool -bitmap_expired(const struct bitmap_ipmac *map, u32 id) -{ - const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); - - return ip_set_timeout_expired(elem->timeout); -} +/* Common functions */ static inline int -bitmap_ipmac_exist(const struct ipmac_telem *elem) -{ - return elem->match == MAC_UNSET || - (elem->match == MAC_FILLED && - !ip_set_timeout_expired(elem->timeout)); -} - -/* Base variant */ - -static int -bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, + const struct bitmap_ipmac *map, size_t dsize) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - /* Trigger kernel to fill out the ethernet address */ - return -EAGAIN; - case MAC_FILLED: - return data->ether == NULL || - compare_ether_addr(data->ether, elem->ether) == 0; - } - return 0; -} + const struct bitmap_ipmac_elem *elem; -static int -bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_elem *elem = 
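The new `get_elem()` above addresses extensions as a flat blob of `dsize`-byte slots parallel to the members bitmap, one slot per element id. A hedged userspace model of that layout; the struct contents and sizes are illustrative assumptions, only the pointer arithmetic mirrors the diff:

#include <stdlib.h>
#include <string.h>

struct elem {                     /* stands in for bitmap_ipmac_elem */
        unsigned char ether[6];
        unsigned char filled;
};

static inline void *get_elem(void *extensions, unsigned id, size_t dsize)
{
        /* slot i starts dsize bytes after slot i-1 */
        return (char *)extensions + (size_t)id * dsize;
}

int main(void)
{
        size_t dsize = sizeof(struct elem); /* kernel adds extension space here */
        unsigned elements = 256;
        void *ext = calloc(elements, dsize);
        struct elem *e = get_elem(ext, 42, dsize);

        memset(e->ether, 0xaa, sizeof(e->ether));
        e->filled = 1;
        free(ext);
        return 0;
}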
bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - if (!data->ether) - /* Already added without ethernet address */ - return -IPSET_ERR_EXIST; - /* Fill the MAC address */ - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - break; - case MAC_FILLED: - return -IPSET_ERR_EXIST; - case MAC_EMPTY: - if (data->ether) { - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - } else - elem->match = MAC_UNSET; - } - - return 0; -} - -static int -bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - if (elem->match == MAC_EMPTY) - return -IPSET_ERR_EXIST; - - elem->match = MAC_EMPTY; - - return 0; + if (!test_bit(e->id, map->members)) + return 0; + elem = get_elem(map->extensions, e->id, dsize); + if (elem->filled == MAC_FILLED) + return e->ether == NULL || + ether_addr_equal(e->ether, elem->ether); + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; } -static int -bitmap_ipmac_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac_elem *elem; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - u32 last = map->last_ip - map->first_ip; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - elem = bitmap_ipmac_elem(map, id); - if (elem->match == MAC_EMPTY) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id)); - if (elem->match == MAC_FILLED) - NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, - elem->ether); - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - - return 0; + const struct bitmap_ipmac_elem *elem; -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + if (!test_bit(id, map->members)) + return 0; + elem = get_elem(map->extensions, id, dsize); + /* Timer not started for the incomplete elements */ + return elem->filled == MAC_FILLED; } -/* Timeout variant */ - -static int -bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - /* Trigger kernel to fill out the ethernet address */ - return -EAGAIN; - case MAC_FILLED: - return (data->ether == NULL || - compare_ether_addr(data->ether, elem->ether) == 0) && - !bitmap_expired(map, data->id); - } - return 0; + return elem->filled == MAC_FILLED; } -static int -bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_add_timeout(unsigned long *timeout, + const struct bitmap_ipmac_adt_elem *e, + const struct ip_set_ext *ext, struct ip_set *set, + struct bitmap_ipmac *map, int mode) { - struct 
bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); - bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 t = ext->timeout; - switch (elem->match) { - case MAC_UNSET: - if (!(data->ether || flag_exist)) - /* Already added without ethernet address */ - return -IPSET_ERR_EXIST; - /* Fill the MAC address and activate the timer */ - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - if (timeout == map->timeout) + if (mode == IPSET_ADD_START_STORED_TIMEOUT) { + if (t == set->timeout) /* Timeout was not specified, get stored one */ - timeout = elem->timeout; - elem->timeout = ip_set_timeout_set(timeout); - break; - case MAC_FILLED: - if (!(bitmap_expired(map, data->id) || flag_exist)) - return -IPSET_ERR_EXIST; - /* Fall through */ - case MAC_EMPTY: - if (data->ether) { - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - } else - elem->match = MAC_UNSET; + t = *timeout; + ip_set_timeout_set(timeout, t); + } else { /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, * possibly by the kernel */ - elem->timeout = data->ether ? ip_set_timeout_set(timeout) - : timeout; - break; + if (e->ether) + ip_set_timeout_set(timeout, t); + else + *timeout = t; } - return 0; } -static int -bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map, u32 flags, size_t dsize) { - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); - - if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id)) - return -IPSET_ERR_EXIST; - - elem->match = MAC_EMPTY; + struct bitmap_ipmac_elem *elem; + + elem = get_elem(map->extensions, e->id, dsize); + if (test_and_set_bit(e->id, map->members)) { + if (elem->filled == MAC_FILLED) { + if (e->ether && (flags & IPSET_FLAG_EXIST)) + memcpy(elem->ether, e->ether, ETH_ALEN); + return IPSET_ADD_FAILED; + } else if (!e->ether) + /* Already added without ethernet address */ + return IPSET_ADD_FAILED; + /* Fill the MAC address and trigger the timer activation */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return IPSET_ADD_START_STORED_TIMEOUT; + } else if (e->ether) { + /* We can store MAC too */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return 0; + } else { + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; + } +} - return 0; +static inline int +bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map) +{ + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_ipmac_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, + u32 id, size_t dsize) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac_telem *elem; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - u32 timeout, last = map->last_ip - map->first_ip; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - elem = bitmap_ipmac_elem(map, id); - if (!bitmap_ipmac_exist(elem)) - continue; - nested = 
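`bitmap_ipmac_do_add()` above returns one of several distinct codes so the generic add path knows how to treat the timeout: arm the stored timer once the MAC finally arrives, store a plain timeout value while the MAC is still unknown, or report a duplicate. A compressed sketch of that state machine; the enum names echo the diff but the types are simplified and the IPSET_FLAG_EXIST refresh path is omitted:

#include <stdbool.h>

enum {
        ADD_OK,
        ADD_FAILED,
        ADD_START_STORED_TIMEOUT,  /* MAC just filled in: arm timer now  */
        ADD_STORE_PLAIN_TIMEOUT,   /* no MAC yet: keep raw timeout value */
};

struct entry {
        bool set;                  /* bit set in map->members            */
        bool has_mac;              /* elem->filled == MAC_FILLED         */
};

static int do_add(struct entry *e, bool mac_given)
{
        if (e->set) {
                if (e->has_mac || !mac_given)
                        return ADD_FAILED;       /* duplicate, or still no MAC */
                e->has_mac = true;
                return ADD_START_STORED_TIMEOUT; /* activate the stored timer  */
        }
        e->set = true;
        if (mac_given) {
                e->has_mac = true;
                return ADD_OK;
        }
        return ADD_STORE_PLAIN_TIMEOUT;          /* timer must not start yet   */
}

int main(void)
{
        struct entry e = { false, false };

        return !(do_add(&e, false) == ADD_STORE_PLAIN_TIMEOUT &&
                 do_add(&e, true) == ADD_START_STORED_TIMEOUT &&
                 do_add(&e, true) == ADD_FAILED);
}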
ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id)); - if (elem->match == MAC_FILLED) - NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, - elem->ether); - timeout = elem->match == MAC_UNSET ? elem->timeout - : ip_set_timeout_get(elem->timeout); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout)); - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; + const struct bitmap_ipmac_elem *elem = + get_elem(map->extensions, id, dsize); - return 0; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)) || + (elem->filled == MAC_FILLED && + nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); +} -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - return -EMSGSIZE; +static inline int +bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) +{ + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); } static int bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct ipmac data; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + u32 ip; /* MAC can be src only */ if (!(opt->flags & IPSET_DIM_TWO_SRC)) return 0; - data.id = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); - if (data.id < map->first_ip || data.id > map->last_ip) + ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ @@ -358,10 +220,10 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - data.id -= map->first_ip; - data.ether = eth_hdr(skb)->h_source; + e.id = ip_to_id(map, ip); + e.ether = eth_hdr(skb)->h_source; - return adtfn(set, &data, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -370,89 +232,39 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct ipmac data; - u32 timeout = map->timeout; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0; int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (data.id < map->first_ip || data.id > map->last_ip) + if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; + e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) - data.ether = nla_data(tb[IPSET_ATTR_ETHER]); + e.ether = 
nla_data(tb[IPSET_ATTR_ETHER]); else - data.ether = NULL; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - data.id -= map->first_ip; + e.ether = NULL; - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } -static void -bitmap_ipmac_destroy(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_ipmac_flush(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - memset(map->members, 0, - (map->last_ip - map->first_ip + 1) * map->dsize); -} - -static int -bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_ipmac *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); - NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) - + (map->last_ip - map->first_ip + 1) * map->dsize)); - if (with_timeout(map->timeout)) - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -461,103 +273,53 @@ bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) return x->first_ip == y->first_ip && x->last_ip == y->last_ip && - x->timeout == y->timeout; + a->timeout == b->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_ipmac = { - .kadt = bitmap_ipmac_kadt, - .uadt = bitmap_ipmac_uadt, - .adt = { - [IPSET_ADD] = bitmap_ipmac_add, - [IPSET_DEL] = bitmap_ipmac_del, - [IPSET_TEST] = bitmap_ipmac_test, - }, - .destroy = bitmap_ipmac_destroy, - .flush = bitmap_ipmac_flush, - .head = bitmap_ipmac_head, - .list = bitmap_ipmac_list, - .same_set = bitmap_ipmac_same_set, -}; +/* Plain variant */ -static const struct ip_set_type_variant bitmap_tipmac = { - .kadt = bitmap_ipmac_kadt, - .uadt = bitmap_ipmac_uadt, - .adt = { - [IPSET_ADD] = bitmap_ipmac_tadd, - [IPSET_DEL] = bitmap_ipmac_tdel, - [IPSET_TEST] = bitmap_ipmac_ttest, - }, - .destroy = bitmap_ipmac_destroy, - .flush = bitmap_ipmac_flush, - .head = bitmap_ipmac_head, - .list = bitmap_ipmac_tlist, - .same_set = bitmap_ipmac_same_set, -}; - -static void -bitmap_ipmac_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_ipmac *map = set->data; - struct ipmac_telem *elem; - u32 id, last = map->last_ip - map->first_ip; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id <= last; id++) { - elem = bitmap_ipmac_elem(map, id); - if (elem->match == MAC_FILLED && - ip_set_timeout_expired(elem->timeout)) - elem->match = MAC_EMPTY; - } - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} - -static void -bitmap_ipmac_gc_init(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = 
bitmap_ipmac_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip,mac type of sets */ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, - u32 first_ip, u32 last_ip) + u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize); + map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (set->dsize) { + map->extensions = ip_set_alloc(set->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_ip = first_ip; map->last_ip = last_ip; - map->timeout = IPSET_NO_TIMEOUT; + map->elements = elements; + set->timeout = IPSET_NO_TIMEOUT; set->data = map; - set->family = AF_INET; + set->family = NFPROTO_IPV4; return true; } static int -bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], +bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { - u32 first_ip, last_ip, elements; + u32 first_ip = 0, last_ip = 0; + u64 elements; struct bitmap_ipmac *map; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); @@ -583,7 +345,7 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], } else return -IPSET_ERR_PROTOCOL; - elements = last_ip - first_ip + 1; + elements = (u64)last_ip - first_ip + 1; if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; @@ -592,28 +354,17 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ipmac; + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct bitmap_ipmac_elem)); + if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { + kfree(map); + return -ENOMEM; + } if (tb[IPSET_ATTR_TIMEOUT]) { - map->dsize = sizeof(struct ipmac_telem); - - if (!init_map_ipmac(set, map, first_ip, last_ip)) { - kfree(map); - return -ENOMEM; - } - - map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = &bitmap_tipmac; - - bitmap_ipmac_gc_init(set); - } else { - map->dsize = sizeof(struct ipmac_elem); - - if (!init_map_ipmac(set, map, first_ip, last_ip)) { - kfree(map); - return -ENOMEM; - } - set->variant = &bitmap_ipmac; - + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); } return 0; } @@ -623,15 +374,16 @@ static struct ip_set_type bitmap_ipmac_type = { .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, .dimension = IPSET_DIM_TWO, - .family = AF_INET, - .revision_min = 0, - .revision_max = 0, + .family = NFPROTO_IPV4, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_ipmac_create, .create_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -639,6 +391,9 @@ static struct ip_set_type bitmap_ipmac_type = { .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + 
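Each bitmap type now defines `MTYPE` and pulls in `ip_set_bitmap_gen.h`, which is why the per-type gc/destroy/flush/head/list copies above could be deleted. Presumably the shared header token-pastes the type name onto generic routines; a minimal model of that preprocessor trick (the header body here is invented for illustration, not the real ip_set_bitmap_gen.h):

#include <stdio.h>

/* --- what a "gen" header does, inlined for the example --- */
#define CONCAT2(a, b)  a##b
#define CONCAT(a, b)   CONCAT2(a, b)
#define MTYPE_FN(name) CONCAT(MTYPE, name)

#define MTYPE bitmap_port
static void MTYPE_FN(_gc)(void)      /* expands to bitmap_port_gc */
{
        printf("%s\n", __func__);
}
#undef MTYPE

int main(void)
{
        bitmap_port_gc();  /* the pasted name is ordinary C from here on */
        return 0;
}

One include per type file then stamps out a full set of `<type>_gc`, `<type>_destroy`, etc. without duplicated source.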
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 29ba93bb94b..cf99676e69f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,200 +19,93 @@ #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #include <linux/netfilter/ipset/ip_set_getport.h> -#define IP_SET_BITMAP_TIMEOUT -#include <linux/netfilter/ipset/ip_set_timeout.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counter support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("bitmap:port type of IP sets"); +IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); +#define MTYPE bitmap_port + /* Type structure */ struct bitmap_port { void *members; /* the set members */ + void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ - u32 timeout; /* timeout parameter */ struct timer_list gc; /* garbage collection */ }; -/* Base variant */ +/* ADT structure for generic function args */ +struct bitmap_port_adt_elem { + u16 id; +}; -static int -bitmap_port_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline u16 +port_to_id(const struct bitmap_port *m, u16 port) { - const struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - return !!test_bit(id, map->members); + return port - m->first_port; } -static int -bitmap_port_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; +/* Common functions */ - if (test_and_set_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; -} - -static int -bitmap_port_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_test(const struct bitmap_port_adt_elem *e, + const struct bitmap_port *map, size_t dsize) { - struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - if (!test_and_clear_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; + return !!test_bit(e->id, map->members); } -static int -bitmap_port_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { - const struct bitmap_port *map = set->data; - struct nlattr *atd, *nested; - u16 id, first = cb->args[2]; - u16 last = map->last_port - map->first_port; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - if (!test_bit(id, map->members)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - NLA_PUT_NET16(skb, 
IPSET_ATTR_PORT, - htons(map->first_port + id)); - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return !!test_bit(id, map->members); } -/* Timeout variant */ - -static int -bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_add(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map, u32 flags, size_t dsize) { - const struct bitmap_port *map = set->data; - const unsigned long *members = map->members; - u16 id = *(u16 *)value; - - return ip_set_timeout_test(members[id]); + return !!test_and_set_bit(e->id, map->members); } -static int -bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_del(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map) { - struct bitmap_port *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - - if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) - return -IPSET_ERR_EXIST; - - members[id] = ip_set_timeout_set(timeout); - - return 0; + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, + size_t dsize) { - struct bitmap_port *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - int ret = -IPSET_ERR_EXIST; - - if (ip_set_timeout_test(members[id])) - ret = 0; - - members[id] = IPSET_ELEM_UNSET; - return ret; + return nla_put_net16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); } -static int -bitmap_port_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { - const struct bitmap_port *map = set->data; - struct nlattr *adt, *nested; - u16 id, first = cb->args[2]; - u16 last = map->last_port - map->first_port; - const unsigned long *members = map->members; - - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!adt) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - if (!ip_set_timeout_test(members[id])) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, - htons(map->first_port + id)); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(members[id]))); - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, adt); - - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, adt); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || + nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct 
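With no MAC or extension bookkeeping of its own, bitmap:port's add/del above collapse onto atomic bit operations: `test_and_set_bit()` returns the previous bit value, so a nonzero result means "already present" and maps to the duplicate-entry error. A userspace approximation built on GCC/Clang atomic builtins; the kernel helpers differ in detail, only the return-value convention is mirrored:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long members[1024 / (8 * sizeof(unsigned long))];

static int test_and_set_bit(unsigned nr, unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % BITS_PER_LONG);
        unsigned long *word = addr + nr / BITS_PER_LONG;

        return !!(__atomic_fetch_or(word, mask, __ATOMIC_SEQ_CST) & mask);
}

static int test_and_clear_bit(unsigned nr, unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % BITS_PER_LONG);
        unsigned long *word = addr + nr / BITS_PER_LONG;

        return !!(__atomic_fetch_and(word, ~mask, __ATOMIC_SEQ_CST) & mask);
}

int main(void)
{
        printf("add: %d\n", test_and_set_bit(80, members));    /* 0: was new */
        printf("add: %d\n", test_and_set_bit(80, members));    /* 1: existed */
        printf("del: %d\n", !test_and_clear_bit(80, members)); /* 0: removed */
        return 0;
}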
bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be16 __port; u16 port = 0; @@ -225,9 +118,9 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; - port -= map->first_port; + e.id = port_to_id(map, port); - return adtfn(set, &port, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -236,14 +129,17 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 timeout = map->timeout; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port; /* wraparound */ - u16 id, port_to; + u16 port_to; int ret = 0; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -252,16 +148,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; if (adt == IPSET_TEST) { - id = port - map->first_port; - return adtfn(set, &id, timeout, flags); + e.id = port_to_id(map, port); + return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_PORT_TO]) { @@ -278,8 +171,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; for (; port <= port_to; port++) { - id = port - map->first_port; - ret = adtfn(set, &id, timeout, flags); + e.id = port_to_id(map, port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -289,51 +182,6 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static void -bitmap_port_destroy(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_port_flush(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - memset(map->members, 0, map->memsize); -} - -static int -bitmap_port_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_port *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port)); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); - NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->memsize)); - if (with_timeout(map->timeout)) - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -342,71 +190,16 @@ bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) return 
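The `u32 port; /* wraparound */` declaration above is deliberate: with a 16-bit loop variable, `port++` at 65535 wraps to 0 and `port <= port_to` never becomes false. A standalone demonstration of why the counter is widened:

#include <stdint.h>

/* With a u16 counter this loop never terminates when port_to == 65535:
 * 65535 + 1 wraps to 0, which is still <= 65535.
 */
static unsigned count_ports_u32(uint16_t first, uint16_t port_to)
{
        uint32_t port;          /* widened on purpose, as in the diff */
        unsigned n = 0;

        for (port = first; port <= port_to; port++)
                n++;
        return n;
}

int main(void)
{
        return count_ports_u32(0, 65535) == 65536 ? 0 : 1;
}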
x->first_port == y->first_port && x->last_port == y->last_port && - x->timeout == y->timeout; + a->timeout == b->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_port = { - .kadt = bitmap_port_kadt, - .uadt = bitmap_port_uadt, - .adt = { - [IPSET_ADD] = bitmap_port_add, - [IPSET_DEL] = bitmap_port_del, - [IPSET_TEST] = bitmap_port_test, - }, - .destroy = bitmap_port_destroy, - .flush = bitmap_port_flush, - .head = bitmap_port_head, - .list = bitmap_port_list, - .same_set = bitmap_port_same_set, -}; +/* Plain variant */ -static const struct ip_set_type_variant bitmap_tport = { - .kadt = bitmap_port_kadt, - .uadt = bitmap_port_uadt, - .adt = { - [IPSET_ADD] = bitmap_port_tadd, - [IPSET_DEL] = bitmap_port_tdel, - [IPSET_TEST] = bitmap_port_ttest, - }, - .destroy = bitmap_port_destroy, - .flush = bitmap_port_flush, - .head = bitmap_port_head, - .list = bitmap_port_tlist, - .same_set = bitmap_port_same_set, +struct bitmap_port_elem { }; -static void -bitmap_port_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_port *map = set->data; - unsigned long *table = map->members; - u32 id; /* wraparound */ - u16 last = map->last_port - map->first_port; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id <= last; id++) - if (ip_set_timeout_expired(table[id])) - table[id] = IPSET_ELEM_UNSET; - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} - -static void -bitmap_port_gc_init(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_port_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ @@ -417,26 +210,34 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (set->dsize) { + map->extensions = ip_set_alloc(set->dsize * map->elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_port = first_port; map->last_port = last_port; - map->timeout = IPSET_NO_TIMEOUT; + set->timeout = IPSET_NO_TIMEOUT; set->data = map; - set->family = AF_UNSPEC; + set->family = NFPROTO_UNSPEC; return true; } static int -bitmap_port_create(struct ip_set *set, struct nlattr *tb[], - u32 flags) +bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], + u32 flags) { struct bitmap_port *map; u16 first_port, last_port; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); @@ -452,28 +253,17 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; + map->elements = last_port - first_port + 1; + map->memsize = bitmap_bytes(0, map->elements); + set->variant = &bitmap_port; + set->dsize = ip_set_elem_len(set, tb, 0); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } if (tb[IPSET_ATTR_TIMEOUT]) { - map->memsize = (last_port - first_port + 1) - * sizeof(unsigned 
long); - - if (!init_map_port(set, map, first_port, last_port)) { - kfree(map); - return -ENOMEM; - } - - map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - set->variant = &bitmap_tport; - - bitmap_port_gc_init(set); - } else { - map->memsize = bitmap_bytes(0, last_port - first_port); - pr_debug("memsize: %zu\n", map->memsize); - if (!init_map_port(set, map, first_port, last_port)) { - kfree(map); - return -ENOMEM; - } - - set->variant = &bitmap_port; + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + bitmap_port_gc_init(set, bitmap_port_gc); } return 0; } @@ -483,20 +273,24 @@ static struct ip_set_type bitmap_port_type = { .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_PORT, .dimension = IPSET_DIM_ONE, - .family = AF_UNSPEC, - .revision_min = 0, - .revision_max = 0, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_port_create, .create_policy = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 32dbf0fa89d..ec8114fae50 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -15,9 +15,10 @@ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/spinlock.h> -#include <linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> @@ -28,9 +29,19 @@ static LIST_HEAD(ip_set_type_list); /* all registered set types */ static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ -static struct ip_set **ip_set_list; /* all individual sets */ -static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ +struct ip_set_net { + struct ip_set * __rcu *ip_set_list; /* all individual sets */ + ip_set_id_t ip_set_max; /* max number of sets */ + int is_deleted; /* deleted by ip_set_net_exit */ +}; +static int ip_set_net_id __read_mostly; +static inline struct ip_set_net *ip_set_pernet(struct net *net) +{ + return net_generic(net, ip_set_net_id); +} + +#define IP_SET_INC 64 #define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; @@ -42,6 +53,12 @@ MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); +/* When the nfnl mutex is held: */ +#define ip_set_dereference(p) \ + rcu_dereference_protected(p, 1) +#define ip_set(inst, id) \ + 
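ip_set_core.c now keeps `ip_set_list` and `ip_set_max` in a per-namespace `struct ip_set_net` reached through `net_generic()`, so each network namespace sees its own sets. A rough userspace model of that registry pattern; the slot array and id handling are simplified stand-ins, not the kernel pernet API:

#include <stdlib.h>

struct net { void **gen; };        /* per-namespace generic slot array */

struct ip_set_net_model {
        void **ip_set_list;        /* all sets in this netns */
        unsigned ip_set_max;
        int is_deleted;
};

static int ip_set_net_id;          /* slot index, assigned at registration */

static void *net_generic(const struct net *net, int id)
{
        return net->gen[id];       /* kernel version is RCU-protected */
}

static struct ip_set_net_model *ip_set_pernet(struct net *net)
{
        return net_generic(net, ip_set_net_id);
}

int main(void)
{
        struct ip_set_net_model inst = { .ip_set_max = 8 };
        void *slots[1] = { &inst };
        struct net net = { .gen = slots };

        return ip_set_pernet(&net)->ip_set_max == 8 ? 0 : 1;
}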
ip_set_dereference((inst)->ip_set_list)[id] + /* * The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -69,7 +86,8 @@ find_set_type(const char *name, u8 family, u8 revision) list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STREQ(type->name, name) && - (type->family == family || type->family == AF_UNSPEC) && + (type->family == family || + type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && revision <= type->revision_max) return type; @@ -80,14 +98,14 @@ find_set_type(const char *name, u8 family, u8 revision) static bool load_settype(const char *name) { - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warning("Can't find ip_set type %s\n", name); - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); return false; } - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); return true; } @@ -149,7 +167,8 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STREQ(type->name, name) && - (type->family == family || type->family == AF_UNSPEC)) { + (type->family == family || + type->family == NFPROTO_UNSPEC)) { found = true; if (type->revision_min < *min) *min = type->revision_min; @@ -164,8 +183,8 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, __find_set_type_minmax(name, family, min, max, true); } -#define family_name(f) ((f) == AF_INET ? "inet" : \ - (f) == AF_INET6 ? "inet6" : "any") +#define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \ + (f) == NFPROTO_IPV6 ? "inet6" : "any") /* Register a set type structure. The type is identified by * the unique triple of name, family and revision. @@ -252,10 +271,7 @@ ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); - if (is_vmalloc_addr(members)) - vfree(members); - else - kfree(members); + kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); @@ -307,6 +323,91 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +typedef void (*destroyer)(void *); +/* ipset data extension types, in size order */ + +const struct ip_set_ext_type ip_set_extensions[] = { + [IPSET_EXT_ID_COUNTER] = { + .type = IPSET_EXT_COUNTER, + .flag = IPSET_FLAG_WITH_COUNTERS, + .len = sizeof(struct ip_set_counter), + .align = __alignof__(struct ip_set_counter), + }, + [IPSET_EXT_ID_TIMEOUT] = { + .type = IPSET_EXT_TIMEOUT, + .len = sizeof(unsigned long), + .align = __alignof__(unsigned long), + }, + [IPSET_EXT_ID_COMMENT] = { + .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, + .flag = IPSET_FLAG_WITH_COMMENT, + .len = sizeof(struct ip_set_comment), + .align = __alignof__(struct ip_set_comment), + .destroy = (destroyer) ip_set_comment_free, + }, +}; +EXPORT_SYMBOL_GPL(ip_set_extensions); + +static inline bool +add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) +{ + return ip_set_extensions[id].flag ? 
+ (flags & ip_set_extensions[id].flag) : + !!tb[IPSET_ATTR_TIMEOUT]; +} + +size_t +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +{ + enum ip_set_ext_id id; + size_t offset = 0; + u32 cadt_flags = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) + set->flags |= IPSET_CREATE_FLAG_FORCEADD; + for (id = 0; id < IPSET_EXT_ID_MAX; id++) { + if (!add_extension(id, cadt_flags, tb)) + continue; + offset += ALIGN(len + offset, ip_set_extensions[id].align); + set->offset[id] = offset; + set->extensions |= ip_set_extensions[id].type; + offset += ip_set_extensions[id].len; + } + return len + offset; +} +EXPORT_SYMBOL_GPL(ip_set_elem_len); + +int +ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], + struct ip_set_ext *ext) +{ + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!(set->extensions & IPSET_EXT_TIMEOUT)) + return -IPSET_ERR_TIMEOUT; + ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { + if (!(set->extensions & IPSET_EXT_COUNTER)) + return -IPSET_ERR_COUNTER; + if (tb[IPSET_ATTR_BYTES]) + ext->bytes = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_BYTES])); + if (tb[IPSET_ATTR_PACKETS]) + ext->packets = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_PACKETS])); + } + if (tb[IPSET_ATTR_COMMENT]) { + if (!(set->extensions & IPSET_EXT_COMMENT)) + return -IPSET_ERR_COMMENT; + ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); + } + + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_extensions); + /* * Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace @@ -319,19 +420,19 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); */ static inline void -__ip_set_get(ip_set_id_t index) +__ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); - ip_set_list[index]->ref++; + set->ref++; write_unlock_bh(&ip_set_ref_lock); } static inline void -__ip_set_put(ip_set_id_t index) +__ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); - BUG_ON(ip_set_list[index]->ref == 0); - ip_set_list[index]->ref--; + BUG_ON(set->ref == 0); + set->ref--; write_unlock_bh(&ip_set_ref_lock); } @@ -342,19 +443,33 @@ __ip_set_put(ip_set_id_t index) * so it can't be destroyed (or changed) under our foot. */ +static inline struct ip_set * +ip_set_rcu_get(struct net *net, ip_set_id_t index) +{ + struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(net); + + rcu_read_lock(); + /* ip_set_list itself needs to be protected */ + set = rcu_dereference(inst->ip_set_list)[index]; + rcu_read_unlock(); + + return set; +} + int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set_rcu_get( + dev_net(par->in ? 
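`ip_set_elem_len()` above lays the requested extensions out after the fixed element, aligning each to its own `align` value and recording the result in `set->offset[id]`; element access then becomes base pointer plus stored offset. A standalone re-derivation of that loop, faithful to the `offset += ALIGN(len + offset, ...)` accumulation in the hunk; the extension table values are examples, not the exported `ip_set_extensions` array:

#include <stdio.h>
#include <stddef.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct ext_desc { size_t len, align; };

int main(void)
{
        /* example table: a 16-byte counter, then an unsigned long timeout */
        const struct ext_desc ext[] = {
                { 16, 8 },
                { sizeof(unsigned long), sizeof(unsigned long) },
        };
        size_t len = 7;            /* odd fixed element size on purpose */
        size_t offset = 0, slot[2];
        unsigned id;

        for (id = 0; id < 2; id++) {
                offset += ALIGN(len + offset, ext[id].align);
                slot[id] = offset; /* set->offset[id] in the kernel */
                offset += ext[id].len;
        }
        for (id = 0; id < 2; id++)
                printf("ext %u at +%zu\n", id, slot[id]);
        printf("dsize = %zu\n", len + offset);
        return 0;
}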
par->in : par->out), index); int ret = 0; BUG_ON(set == NULL); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == AF_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; read_lock_bh(&set->lock); @@ -368,6 +483,12 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, set->variant->kadt(set, skb, par, IPSET_ADD, opt); write_unlock_bh(&set->lock); ret = 1; + } else { + /* --return-nomatch: invert matched element */ + if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && + (set->type->features & IPSET_TYPE_NOMATCH) && + (ret > 0 || ret == -ENOTEMPTY)) + ret = -ret; } /* Convert error codes to nomatch */ @@ -377,18 +498,18 @@ EXPORT_SYMBOL_GPL(ip_set_test); int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set_rcu_get( + dev_net(par->in ? par->in : par->out), index); int ret; BUG_ON(set == NULL); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == AF_UNSPEC)) - return 0; + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); @@ -400,18 +521,18 @@ EXPORT_SYMBOL_GPL(ip_set_add); int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set_rcu_get( + dev_net(par->in ? par->in : par->out), index); int ret = 0; BUG_ON(set == NULL); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == AF_UNSPEC)) - return 0; + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); @@ -427,19 +548,23 @@ EXPORT_SYMBOL_GPL(ip_set_del); * */ ip_set_id_t -ip_set_get_byname(const char *name, struct ip_set **set) +ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) { ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; + struct ip_set_net *inst = ip_set_pernet(net); - for (i = 0; i < ip_set_max; i++) { - s = ip_set_list[i]; + rcu_read_lock(); + for (i = 0; i < inst->ip_set_max; i++) { + s = rcu_dereference(inst->ip_set_list)[i]; if (s != NULL && STREQ(s->name, name)) { - __ip_set_get(i); + __ip_set_get(s); index = i; *set = s; + break; } } + rcu_read_unlock(); return index; } @@ -451,11 +576,25 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * to be valid, after calling this function. 
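`ip_set_rcu_get()` above wraps the lookup in `rcu_read_lock()` and dereferences the list pointer itself, because the create path may swap in a larger array at any moment. A loose userspace analogue using C11 atomics in place of RCU; this only models the publish/consume ordering, real RCU also defers freeing the old array:

#include <stdatomic.h>
#include <stddef.h>

struct ip_set;                     /* opaque for the sketch */

static _Atomic(struct ip_set **) ip_set_list;

static struct ip_set *ip_set_rcu_get_model(size_t index)
{
        /* rcu_read_lock() would go here */
        struct ip_set **list =
                atomic_load_explicit(&ip_set_list, memory_order_consume);
        struct ip_set *set = list[index];
        /* rcu_read_unlock() would go here */
        return set;
}

int main(void)
{
        struct ip_set *sets[1] = { 0 };

        atomic_store(&ip_set_list, sets);
        return ip_set_rcu_get_model(0) != 0;  /* empty slot: returns NULL */
}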
* */ + +static inline void +__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) +{ + struct ip_set *set; + + rcu_read_lock(); + set = rcu_dereference(inst->ip_set_list)[index]; + if (set != NULL) + __ip_set_put(set); + rcu_read_unlock(); +} + void -ip_set_put_byindex(ip_set_id_t index) +ip_set_put_byindex(struct net *net, ip_set_id_t index) { - if (ip_set_list[index] != NULL) - __ip_set_put(index); + struct ip_set_net *inst = ip_set_pernet(net); + + __ip_set_put_byindex(inst, index); } EXPORT_SYMBOL_GPL(ip_set_put_byindex); @@ -467,9 +606,9 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex); * */ const char * -ip_set_name_byindex(ip_set_id_t index) +ip_set_name_byindex(struct net *net, ip_set_id_t index) { - const struct ip_set *set = ip_set_list[index]; + const struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(set == NULL); BUG_ON(set->ref == 0); @@ -485,43 +624,27 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex); */ /* - * Find set by name, reference it once. The reference makes sure the - * thing pointed to, does not go away under our feet. - * - * The nfnl mutex is used in the function. - */ -ip_set_id_t -ip_set_nfnl_get(const char *name) -{ - struct ip_set *s; - ip_set_id_t index; - - nfnl_lock(); - index = ip_set_get_byname(name, &s); - nfnl_unlock(); - - return index; -} -EXPORT_SYMBOL_GPL(ip_set_nfnl_get); - -/* * Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. */ ip_set_id_t -ip_set_nfnl_get_byindex(ip_set_id_t index) +ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) { - if (index > ip_set_max) + struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(net); + + if (index > inst->ip_set_max) return IPSET_INVALID_ID; - nfnl_lock(); - if (ip_set_list[index]) - __ip_set_get(index); + nfnl_lock(NFNL_SUBSYS_IPSET); + set = ip_set(inst, index); + if (set) + __ip_set_get(set); else index = IPSET_INVALID_ID; - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); return index; } @@ -535,11 +658,18 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); * The nfnl mutex is used in the function. 
*/ void -ip_set_nfnl_put(ip_set_id_t index) +ip_set_nfnl_put(struct net *net, ip_set_id_t index) { - nfnl_lock(); - ip_set_put_byindex(index); - nfnl_unlock(); + struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(net); + + nfnl_lock(NFNL_SUBSYS_IPSET); + if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ + set = ip_set(inst, index); + if (set != NULL) + __ip_set_put(set); + } + nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); @@ -563,19 +693,19 @@ flag_exist(const struct nlmsghdr *nlh) } static struct nlmsghdr * -start_msg(struct sk_buff *skb, u32 pid, u32 seq, unsigned int flags, +start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - nlh = nlmsg_put(skb, pid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), + nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), sizeof(*nfmsg), flags); if (nlh == NULL) return NULL; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_INET; + nfmsg->nfgen_family = NFPROTO_IPV4; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; @@ -595,41 +725,47 @@ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, }; -static ip_set_id_t -find_set_id(const char *name) +static struct ip_set * +find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) { - ip_set_id_t i, index = IPSET_INVALID_ID; - const struct ip_set *set; + struct ip_set *set = NULL; + ip_set_id_t i; - for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) { - set = ip_set_list[i]; - if (set != NULL && STREQ(set->name, name)) - index = i; + *id = IPSET_INVALID_ID; + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set != NULL && STREQ(set->name, name)) { + *id = i; + break; + } } - return index; + return (*id == IPSET_INVALID_ID ? NULL : set); } static inline struct ip_set * -find_set(const char *name) +find_set(struct ip_set_net *inst, const char *name) { - ip_set_id_t index = find_set_id(name); + ip_set_id_t id; - return index == IPSET_INVALID_ID ? 
NULL : ip_set_list[index]; + return find_set_and_id(inst, name, &id); } static int -find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) +find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, + struct ip_set **set) { + struct ip_set *s; ip_set_id_t i; *index = IPSET_INVALID_ID; - for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] == NULL) { + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s == NULL) { if (*index == IPSET_INVALID_ID) *index = i; - } else if (STREQ(name, ip_set_list[i]->name)) { + } else if (STREQ(name, s->name)) { /* Name clash */ - *set = ip_set_list[i]; + *set = s; return -EEXIST; } } @@ -640,10 +776,20 @@ find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) } static int +ip_set_none(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + return -EOPNOTSUPP; +} + +static int ip_set_create(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct net *net = sock_net(ctnl); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; @@ -702,7 +848,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, goto put_out; } - ret = set->type->create(set, tb, flags); + ret = set->type->create(net, set, tb, flags); if (ret != 0) goto put_out; @@ -713,10 +859,10 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. */ - if ((ret = find_free_id(set->name, &index, &clash)) != 0) { + ret = find_free_id(inst, set->name, &index, &clash); + if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ - if (ret == -EEXIST && - (flags & IPSET_FLAG_EXIST) && + if ((flags & IPSET_FLAG_EXIST) && STREQ(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && @@ -724,13 +870,36 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, set->variant->same_set(set, clash)) ret = 0; goto cleanup; - } + } else if (ret == -IPSET_ERR_MAX_SETS) { + struct ip_set **list, **tmp; + ip_set_id_t i = inst->ip_set_max + IP_SET_INC; + + if (i < inst->ip_set_max || i == IPSET_INVALID_ID) + /* Wraparound */ + goto cleanup; + + list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + if (!list) + goto cleanup; + /* nfnl mutex is held, both lists are valid */ + tmp = ip_set_dereference(inst->ip_set_list); + memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); + rcu_assign_pointer(inst->ip_set_list, list); + /* Make sure all current packets have passed through */ + synchronize_net(); + /* Use new list */ + index = inst->ip_set_max; + inst->ip_set_max = i; + kfree(tmp); + ret = 0; + } else if (ret) + goto cleanup; /* * Finally! Add our shiny new set to the list, and be done. 
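The `-IPSET_ERR_MAX_SETS` branch above grows the set array in place of the old hard limit: allocate a list larger by IP_SET_INC, copy the old pointers, publish the new list, wait for in-flight readers, then free the old one. A hedged userspace model of that sequence with `synchronize_net()` stubbed out; all names are local to the example:

#include <stdlib.h>
#include <string.h>

#define IP_SET_INC 64

struct ip_set;

static struct ip_set **ip_set_list;
static size_t ip_set_max;

static void synchronize_readers(void) { /* kernel: synchronize_net() */ }

static int grow_set_list(void)
{
        size_t new_max = ip_set_max + IP_SET_INC;
        struct ip_set **list, **tmp;

        if (new_max < ip_set_max)  /* wraparound check, as in the diff */
                return -1;
        list = calloc(new_max, sizeof(*list));
        if (!list)
                return -1;
        tmp = ip_set_list;
        if (tmp)
                memcpy(list, tmp, ip_set_max * sizeof(*list));
        ip_set_list = list;        /* kernel: rcu_assign_pointer() */
        synchronize_readers();     /* let old-list readers drain   */
        ip_set_max = new_max;
        free(tmp);
        return 0;
}

int main(void)
{
        return grow_set_list();
}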
*/ pr_debug("create: '%s' created with index %u!\n", set->name, index); - ip_set_list[index] = set; + ip_set(inst, index) = set; return ret; @@ -753,12 +922,12 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static void -ip_set_destroy_set(ip_set_id_t index) +ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set(inst, index); pr_debug("set: %s\n", set->name); - ip_set_list[index] = NULL; + ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -771,6 +940,8 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *s; ip_set_id_t i; int ret = 0; @@ -789,29 +960,32 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, */ read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { - for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL && s->ref) { ret = -IPSET_ERR_BUSY; goto out; } } read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL) - ip_set_destroy_set(i); + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL) + ip_set_destroy_set(inst, i); } } else { - i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (i == IPSET_INVALID_ID) { + s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), + &i); + if (s == NULL) { ret = -ENOENT; goto out; - } else if (ip_set_list[i]->ref) { + } else if (s->ref) { ret = -IPSET_ERR_BUSY; goto out; } read_unlock_bh(&ip_set_ref_lock); - ip_set_destroy_set(i); + ip_set_destroy_set(inst, i); } return 0; out: @@ -836,21 +1010,25 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *s; ip_set_id_t i; if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; if (!attr[IPSET_ATTR_SETNAME]) { - for (i = 0; i < ip_set_max; i++) - if (ip_set_list[i] != NULL) - ip_set_flush_set(ip_set_list[i]); + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL) + ip_set_flush_set(s); + } } else { - i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (i == IPSET_INVALID_ID) + s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (s == NULL) return -ENOENT; - ip_set_flush_set(ip_set_list[i]); + ip_set_flush_set(s); } return 0; @@ -872,7 +1050,8 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { - struct ip_set *set; + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set *set, *s; const char *name2; ip_set_id_t i; int ret = 0; @@ -882,7 +1061,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, attr[IPSET_ATTR_SETNAME2] == NULL)) return -IPSET_ERR_PROTOCOL; - set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (set == NULL) return -ENOENT; @@ -893,9 +1072,9 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, } name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); - for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL && - STREQ(ip_set_list[i]->name, name2)) { + for (i = 0; i < inst->ip_set_max; i++) { + s = ip_set(inst, i); + if (s != NULL && 
STREQ(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } @@ -921,6 +1100,7 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; @@ -930,22 +1110,21 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, attr[IPSET_ATTR_SETNAME2] == NULL)) return -IPSET_ERR_PROTOCOL; - from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (from_id == IPSET_INVALID_ID) + from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), + &from_id); + if (from == NULL) return -ENOENT; - to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2])); - if (to_id == IPSET_INVALID_ID) + to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), + &to_id); + if (to == NULL) return -IPSET_ERR_EXIST_SETNAME2; - from = ip_set_list[from_id]; - to = ip_set_list[to_id]; - /* Features must not change. * Not an artificial restriction anymore, as we must prevent * possible loops created by swapping in setlist type of sets. */ if (!(from->type->features == to->type->features && - from->type->family == to->type->family)) + from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; strncpy(from_name, from->name, IPSET_MAXNAMELEN); @@ -954,8 +1133,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); - ip_set_list[from_id] = to; - ip_set_list[to_id] = from; + ip_set(inst, from_id) = to; + ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; @@ -974,9 +1153,12 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, static int ip_set_dump_done(struct netlink_callback *cb) { - if (cb->args[2]) { - pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name); - ip_set_put_byindex((ip_set_id_t) cb->args[1]); + struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; + if (cb->args[IPSET_CB_ARG0]) { + pr_debug("release set %s\n", + ip_set(inst, cb->args[IPSET_CB_INDEX])->name); + __ip_set_put_byindex(inst, + (ip_set_id_t) cb->args[IPSET_CB_INDEX]); } return 0; } @@ -994,10 +1176,10 @@ dump_attrs(struct nlmsghdr *nlh) } static int -dump_init(struct netlink_callback *cb) +dump_init(struct netlink_callback *cb, struct ip_set_net *inst) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; @@ -1007,18 +1189,22 @@ dump_init(struct netlink_callback *cb) nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); - /* cb->args[0] : dump single set/all sets - * [1] : set index - * [..]: type specific + /* cb->args[IPSET_CB_NET]: net namespace + * [IPSET_CB_DUMP]: dump single set/all sets + * [IPSET_CB_INDEX]: set index + * [IPSET_CB_ARG0]: type specific */ if (cda[IPSET_ATTR_SETNAME]) { - index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME])); - if (index == IPSET_INVALID_ID) + struct ip_set *set; + + set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), + &index); + if (set == NULL) return -ENOENT; dump_type = DUMP_ONE; - cb->args[1] = index; + cb->args[IPSET_CB_INDEX] = index; } else dump_type = DUMP_ALL; @@ -1026,7 +1212,8 @@ dump_init(struct netlink_callback *cb) u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); dump_type |= (f << 16); } - cb->args[0] = dump_type; + 
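/* Editor's sketch: dump_init() above replaces the old magic cb->args[0..2]
 * indices with named IPSET_CB_* slots, which is what makes the resumable
 * dump cursor readable. A stripped-down model of that convention (names
 * are illustrative): */
enum {
	CB_NET,		/* per-netns instance the dump walks */
	CB_DUMP,	/* dump type | (dump flags << 16) */
	CB_INDEX,	/* set index to resume from */
	CB_ARG0,	/* type-specific cursor, e.g. hash bucket */
	CB_MAX,
};

#define DUMP_TYPE(a)	((a) & 0x0000ffffUL)
#define DUMP_FLAGS(a)	((a) >> 16)

static void cursor_save(unsigned long args[CB_MAX],
			unsigned long type, unsigned long flags,
			unsigned long index)
{
	args[CB_DUMP] = type | (flags << 16);
	args[CB_INDEX] = index;
}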
cb->args[IPSET_CB_NET] = (unsigned long)inst; + cb->args[IPSET_CB_DUMP] = dump_type; return 0; } @@ -1037,12 +1224,13 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; struct nlmsghdr *nlh = NULL; - unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0; + unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; + struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; int ret = 0; - if (!cb->args[0]) { - ret = dump_init(cb); + if (!cb->args[IPSET_CB_DUMP]) { + ret = dump_init(cb, inst); if (ret < 0) { nlh = nlmsg_hdr(cb->skb); /* We have to create and send the error message @@ -1053,18 +1241,19 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) } } - if (cb->args[1] >= ip_set_max) + if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; - dump_type = DUMP_TYPE(cb->args[0]); - dump_flags = DUMP_FLAGS(cb->args[0]); - max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max; + dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); + dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); + max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1 + : inst->ip_set_max; dump_last: - pr_debug("args[0]: %u %u args[1]: %ld\n", - dump_type, dump_flags, cb->args[1]); - for (; cb->args[1] < max; cb->args[1]++) { - index = (ip_set_id_t) cb->args[1]; - set = ip_set_list[index]; + pr_debug("dump type, flag: %u %u index: %ld\n", + dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); + for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { + index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + set = ip_set(inst, index); if (set == NULL) { if (dump_type == DUMP_ONE) { ret = -ENOENT; @@ -1080,31 +1269,33 @@ dump_last: !!(set->type->features & IPSET_DUMP_LAST))) continue; pr_debug("List set: %s\n", set->name); - if (!cb->args[2]) { + if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - __ip_set_get(index); + __ip_set_get(set); } - nlh = start_msg(skb, NETLINK_CB(cb->skb).pid, + nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); if (!nlh) { ret = -EMSGSIZE; goto release_refcount; } - NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); - NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name); + if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) + goto nla_put_failure; if (dump_flags & IPSET_FLAG_LIST_SETNAME) goto next_set; - switch (cb->args[2]) { + switch (cb->args[IPSET_CB_ARG0]) { case 0: /* Core header data */ - NLA_PUT_STRING(skb, IPSET_ATTR_TYPENAME, - set->type->name); - NLA_PUT_U8(skb, IPSET_ATTR_FAMILY, - set->family); - NLA_PUT_U8(skb, IPSET_ATTR_REVISION, - set->revision); + if (nla_put_string(skb, IPSET_ATTR_TYPENAME, + set->type->name) || + nla_put_u8(skb, IPSET_ATTR_FAMILY, + set->family) || + nla_put_u8(skb, IPSET_ATTR_REVISION, + set->revision)) + goto nla_put_failure; ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; @@ -1115,7 +1306,7 @@ dump_last: read_lock_bh(&set->lock); ret = set->variant->list(set, skb, cb); read_unlock_bh(&set->lock); - if (!cb->args[2]) + if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; goto release_refcount; @@ -1124,8 +1315,8 @@ dump_last: /* If we dump all sets, continue with dumping last ones */ if (dump_type == DUMP_ALL) { dump_type = DUMP_LAST; - cb->args[0] = dump_type | 
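/* Editor's sketch: the conversions above from NLA_PUT_*() macros (which
 * jumped to nla_put_failure behind the caller's back) to nla_put_*()
 * functions chained with || let one visible `if` cover a whole attribute
 * group, unwinding on the first attribute that does not fit. A userspace
 * model of the emit-or-unwind pattern with a bounded buffer (all names
 * hypothetical): */
#include <string.h>
#include <errno.h>

struct msg {
	char *buf;
	size_t len, cap;
};

static int put_attr(struct msg *m, const void *data, size_t len)
{
	if (m->len + len > m->cap)
		return -1;		/* no room: caller unwinds */
	memcpy(m->buf + m->len, data, len);
	m->len += len;
	return 0;
}

static int fill(struct msg *m, const char *name, unsigned char proto)
{
	size_t undo = m->len;

	if (put_attr(m, &proto, sizeof(proto)) ||
	    put_attr(m, name, strlen(name) + 1))
		goto put_failure;
	return 0;
put_failure:
	m->len = undo;			/* ~ nlmsg_trim() */
	return -EMSGSIZE;
}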
(dump_flags << 16); - cb->args[1] = 0; + cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); + cb->args[IPSET_CB_INDEX] = 0; goto dump_last; } goto out; @@ -1134,15 +1325,15 @@ nla_put_failure: ret = -EFAULT; next_set: if (dump_type == DUMP_ONE) - cb->args[1] = IPSET_INVALID_ID; + cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID; else - cb->args[1]++; + cb->args[IPSET_CB_INDEX]++; release_refcount: /* If there was an error or set is done, release set */ - if (ret || !cb->args[2]) { - pr_debug("release set %s\n", ip_set_list[index]->name); - ip_set_put_byindex(index); - cb->args[2] = 0; + if (ret || !cb->args[IPSET_CB_ARG0]) { + pr_debug("release set %s\n", ip_set(inst, index)->name); + __ip_set_put_byindex(inst, index); + cb->args[IPSET_CB_ARG0] = 0; } out: if (nlh) { @@ -1162,9 +1353,13 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb, if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; - return netlink_dump_start(ctnl, skb, nlh, - ip_set_dump_start, - ip_set_dump_done, 0); + { + struct netlink_dump_control c = { + .dump = ip_set_dump_start, + .done = ip_set_dump_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } } /* Add, del and test */ @@ -1204,7 +1399,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct sk_buff *skb2; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *cmdattr; u32 *errline; @@ -1212,7 +1407,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, skb2 = nlmsg_new(payload, GFP_KERNEL); if (skb2 == NULL) return -ENOMEM; - rep = __nlmsg_put(skb2, NETLINK_CB(skb).pid, + rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = ret; @@ -1227,7 +1422,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); /* Signal netlink not to send its ACK/errmsg. 
*/ return -EINTR; } @@ -1240,6 +1435,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; const struct nlattr *nla; @@ -1258,7 +1454,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, attr[IPSET_ATTR_LINENO] == NULL)))) return -IPSET_ERR_PROTOCOL; - set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (set == NULL) return -ENOENT; @@ -1294,6 +1490,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; const struct nlattr *nla; @@ -1312,7 +1509,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, attr[IPSET_ATTR_LINENO] == NULL)))) return -IPSET_ERR_PROTOCOL; - set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (set == NULL) return -ENOENT; @@ -1348,6 +1545,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; int ret = 0; @@ -1358,7 +1556,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; - set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (set == NULL) return -ENOENT; @@ -1373,7 +1571,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, if (ret == -EAGAIN) ret = 1; - return ret < 0 ? ret : ret > 0 ? 0 : -IPSET_ERR_EXIST; + return ret > 0 ? 
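/* Editor's sketch: every command handler above now begins with
 * ip_set_pernet(sock_net(ctnl)), so all set state is looked up per network
 * namespace instead of living in module-global variables. A self-contained
 * model of that lookup shape; the single static instance stands in for
 * what net_generic(net, ip_set_net_id) returns in the kernel, and the
 * names are illustrative: */
#include <stddef.h>

struct ip_set_net_model {
	void **ip_set_list;
	unsigned int ip_set_max;
};

static struct ip_set_net_model init_net_inst;

static struct ip_set_net_model *pernet_lookup(void *net)
{
	(void)net;			/* one namespace in this model */
	return &init_net_inst;
}

static void *find_in_net(void *net, unsigned int index)
{
	struct ip_set_net_model *inst = pernet_lookup(net);

	return index < inst->ip_set_max ? inst->ip_set_list[index] : NULL;
}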
0 : -IPSET_ERR_EXIST; } /* Get headed data of a set */ @@ -1383,37 +1581,37 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - ip_set_id_t index; int ret = 0; if (unlikely(protocol_failed(attr) || attr[IPSET_ATTR_SETNAME] == NULL)) return -IPSET_ERR_PROTOCOL; - index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (index == IPSET_INVALID_ID) + set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) return -ENOENT; - set = ip_set_list[index]; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; - NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); - NLA_PUT_STRING(skb2, IPSET_ATTR_SETNAME, set->name); - NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, set->type->name); - NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, set->family); - NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, set->revision); + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || + nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || + nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || + nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision)) + goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret < 0) return ret; @@ -1461,19 +1659,20 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, if (skb2 == NULL) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; - NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); - NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, typename); - NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, family); - NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, max); - NLA_PUT_U8(skb2, IPSET_ATTR_REVISION_MIN, min); + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || + nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || + nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || + nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min)) + goto nla_put_failure; nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret < 0) return ret; @@ -1509,14 +1708,15 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, if (skb2 == NULL) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; - NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) + goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret < 0) return ret; @@ -1530,6 +1730,10 @@ nlmsg_failure: } static const struct nfnl_callback 
ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { + [IPSET_CMD_NONE] = { + .call = ip_set_none, + .attr_count = IPSET_ATTR_CMD_MAX, + }, [IPSET_CMD_CREATE] = { .call = ip_set_create, .attr_count = IPSET_ATTR_CMD_MAX, @@ -1609,15 +1813,17 @@ static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { static int ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) { - unsigned *op; + unsigned int *op; void *data; int copylen = *len, ret = 0; + struct net *net = sock_net(sk); + struct ip_set_net *inst = ip_set_pernet(net); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optval != SO_IP_SET) return -EBADF; - if (*len < sizeof(unsigned)) + if (*len < sizeof(unsigned int)) return -EINVAL; data = vmalloc(*len); @@ -1627,7 +1833,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EFAULT; goto done; } - op = (unsigned *) data; + op = (unsigned int *) data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ @@ -1654,31 +1860,50 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } case IP_SET_OP_GET_BYNAME: { struct ip_set_req_get_set *req_get = data; + ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; - nfnl_lock(); - req_get->set.index = find_set_id(req_get->set.name); - nfnl_unlock(); + nfnl_lock(NFNL_SUBSYS_IPSET); + find_set_and_id(inst, req_get->set.name, &id); + req_get->set.index = id; + nfnl_unlock(NFNL_SUBSYS_IPSET); + goto copy; + } + case IP_SET_OP_GET_FNAME: { + struct ip_set_req_get_set_family *req_get = data; + ip_set_id_t id; + + if (*len != sizeof(struct ip_set_req_get_set_family)) { + ret = -EINVAL; + goto done; + } + req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; + nfnl_lock(NFNL_SUBSYS_IPSET); + find_set_and_id(inst, req_get->set.name, &id); + req_get->set.index = id; + if (id != IPSET_INVALID_ID) + req_get->family = ip_set(inst, id)->family; + nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_BYINDEX: { struct ip_set_req_get_set *req_get = data; + struct ip_set *set; if (*len != sizeof(struct ip_set_req_get_set) || - req_get->set.index >= ip_set_max) { + req_get->set.index >= inst->ip_set_max) { ret = -EINVAL; goto done; } - nfnl_lock(); - strncpy(req_get->set.name, - ip_set_list[req_get->set.index] - ? ip_set_list[req_get->set.index]->name : "", + nfnl_lock(NFNL_SUBSYS_IPSET); + set = ip_set(inst, req_get->set.index); + strncpy(req_get->set.name, set ? set->name : "", IPSET_MAXNAMELEN); - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } default: @@ -1704,46 +1929,81 @@ static struct nf_sockopt_ops so_set __read_mostly = { .owner = THIS_MODULE, }; -static int __init -ip_set_init(void) +static int __net_init +ip_set_net_init(struct net *net) { - int ret; + struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set **list; - if (max_sets) - ip_set_max = max_sets; - if (ip_set_max >= IPSET_INVALID_ID) - ip_set_max = IPSET_INVALID_ID - 1; + inst->ip_set_max = max_sets ? 
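/* Editor's sketch: the new [IPSET_CMD_NONE] entry above shows the dispatch
 * style used for the whole subsystem: a callback table keyed by command
 * number with designated initializers, so entries can be listed in any
 * order and unset slots stay NULL. A minimal standalone model (error
 * values approximate -EOPNOTSUPP/-EINVAL; names are hypothetical): */
#include <stdio.h>

enum cmd { CMD_NONE, CMD_CREATE, CMD_DESTROY, CMD_MAX };

struct callback {
	int (*call)(const char *arg);
};

static int do_none(const char *arg)
{
	(void)arg;
	return -95;			/* ~ -EOPNOTSUPP */
}

static int do_create(const char *arg)
{
	printf("create %s\n", arg);
	return 0;
}

static const struct callback cb_table[CMD_MAX] = {
	[CMD_NONE]   = { .call = do_none },
	[CMD_CREATE] = { .call = do_create },
	/* [CMD_DESTROY] left NULL: rejected before dispatch */
};

static int dispatch(enum cmd c, const char *arg)
{
	if (c >= CMD_MAX || !cb_table[c].call)
		return -22;		/* ~ -EINVAL */
	return cb_table[c].call(arg);
}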
max_sets : CONFIG_IP_SET_MAX; + if (inst->ip_set_max >= IPSET_INVALID_ID) + inst->ip_set_max = IPSET_INVALID_ID - 1; - ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max, - GFP_KERNEL); - if (!ip_set_list) + list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); + if (!list) return -ENOMEM; + inst->is_deleted = 0; + rcu_assign_pointer(inst->ip_set_list, list); + return 0; +} + +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); - ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + struct ip_set *set = NULL; + ip_set_id_t i; + + inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set != NULL) + ip_set_destroy_set(inst, i); + } + kfree(rcu_dereference_protected(inst->ip_set_list, 1)); +} + +static struct pernet_operations ip_set_net_ops = { + .init = ip_set_net_init, + .exit = ip_set_net_exit, + .id = &ip_set_net_id, + .size = sizeof(struct ip_set_net) +}; + + +static int __init +ip_set_init(void) +{ + int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); - kfree(ip_set_list); return ret; } ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - kfree(ip_set_list); return ret; } - - pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL); + ret = register_pernet_subsys(&ip_set_net_ops); + if (ret) { + pr_err("ip_set: cannot register pernet_subsys.\n"); + nf_unregister_sockopt(&so_set); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + return ret; + } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } static void __exit ip_set_fini(void) { - /* There can't be any existing set */ + unregister_pernet_subsys(&ip_set_net_ops); nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - kfree(ip_set_list); pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 1f03556666f..29fb01ddff9 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -102,9 +102,25 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, int protocol = iph->protocol; /* See comments at tcp_match in ip_tables.c */ - if (protocol <= 0 || (ntohs(iph->frag_off) & IP_OFFSET)) + if (protocol <= 0) return false; + if (ntohs(iph->frag_off) & IP_OFFSET) + switch (protocol) { + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + /* Port info not available for fragment offset > 0 */ + return false; + default: + /* Other protocols doesn't have ports, + so we can match fragments */ + *proto = protocol; + return true; + } + return get_port(skb, protocol, protooff, src, port, proto); } EXPORT_SYMBOL_GPL(ip_set_get_ip4_port); @@ -116,12 +132,12 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, { int protoff; u8 nexthdr; - __be16 frag_off; + __be16 frag_off = 0; nexthdr = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (protoff < 0) + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return false; return get_port(skb, nexthdr, protoff, src, port, proto); @@ -136,10 +152,10 @@ ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) u8 proto; switch (pf) { - case AF_INET: + case NFPROTO_IPV4: ret = 
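/* Editor's sketch: ip_set_init() above registers three facilities
 * (nfnetlink subsystem, sockopt handler, pernet subsystem) and unwinds the
 * earlier registrations whenever a later one fails; the patch does the
 * unwinding inline in each error branch, which is equivalent to the
 * classic goto ladder sketched below (register/unregister pairs are
 * stand-ins): */
static int reg_a(void) { return 0; }
static void unreg_a(void) { }
static int reg_b(void) { return 0; }
static void unreg_b(void) { }
static int reg_c(void) { return -1; }	/* pretend the last step fails */

int init_chain(void)
{
	int ret = reg_a();

	if (ret)
		return ret;
	ret = reg_b();
	if (ret)
		goto out_a;
	ret = reg_c();
	if (ret)
		goto out_b;
	return 0;
out_b:
	unreg_b();
out_a:
	unreg_a();
	return ret;
}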
ip_set_get_ip4_port(skb, src, port, &proto); break; - case AF_INET6: + case NFPROTO_IPV6: ret = ip_set_get_ip6_port(skb, src, port, &proto); break; default: diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h new file mode 100644 index 00000000000..61c7fb05280 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -0,0 +1,1160 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _IP_SET_HASH_GEN_H +#define _IP_SET_HASH_GEN_H + +#include <linux/rcupdate.h> +#include <linux/jhash.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#ifndef rcu_dereference_bh +#define rcu_dereference_bh(p) rcu_dereference(p) +#endif + +#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) + +/* Hashing which uses arrays to resolve clashing. The hash table is resized + * (doubled) when searching becomes too long. + * Internally jhash is used with the assumption that the size of the + * stored data is a multiple of sizeof(u32). If storage supports timeout, + * the timeout field must be the last one in the data structure - that field + * is ignored when computing the hash key. + * + * Readers and resizing + * + * Resizing can be triggered by userspace command only, and those + * are serialized by the nfnl mutex. During resizing the set is + * read-locked, so the only possible concurrent operations are + * the kernel side readers. Those must be protected by proper RCU locking. + */ + +/* Number of elements to store in an initial array block */ +#define AHASH_INIT_SIZE 4 +/* Max number of elements to store in an array block */ +#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) + +/* Max number of elements can be tuned */ +#ifdef IP_SET_HASH_WITH_MULTI +#define AHASH_MAX(h) ((h)->ahash_max) + +static inline u8 +tune_ahash_max(u8 curr, u32 multi) +{ + u32 n; + + if (multi < curr) + return curr; + + n = curr + AHASH_INIT_SIZE; + /* Currently, at listing one hash bucket must fit into a message. + * Therefore we have a hard limit here. + */ + return n > curr && n <= 64 ? 
n : curr; +} +#define TUNE_AHASH_MAX(h, multi) \ + ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#else +#define AHASH_MAX(h) AHASH_MAX_SIZE +#define TUNE_AHASH_MAX(h, multi) +#endif + +/* A hash bucket */ +struct hbucket { + void *value; /* the array of the values */ + u8 size; /* size of the array */ + u8 pos; /* position of the first free entry */ +}; + +/* The hash table: the table size stored here in order to make resizing easy */ +struct htable { + u8 htable_bits; /* size of hash table == 2^htable_bits */ + struct hbucket bucket[0]; /* hashtable buckets */ +}; + +#define hbucket(h, i) (&((h)->bucket[i])) + +#ifndef IPSET_NET_COUNT +#define IPSET_NET_COUNT 1 +#endif + +/* Book-keeping of the prefixes added to the set */ +struct net_prefixes { + u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ + u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */ +}; + +/* Compute the hash table size */ +static size_t +htable_size(u8 hbits) +{ + size_t hsize; + + /* We must fit both into u32 in jhash and size_t */ + if (hbits > 31) + return 0; + hsize = jhash_size(hbits); + if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + < hsize) + return 0; + + return hsize * sizeof(struct hbucket) + sizeof(struct htable); +} + +/* Compute htable_bits from the user input parameter hashsize */ +static u8 +htable_bits(u32 hashsize) +{ + /* Assume that hashsize == 2^htable_bits */ + u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) + /* Round up to the first 2^n value */ + bits = fls(hashsize); + + return bits; +} + +static int +hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) +{ + if (n->pos >= n->size) { + void *tmp; + + if (n->size >= ahash_max) + /* Trigger rehashing */ + return -EAGAIN; + + tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (n->size) { + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + } + n->value = tmp; + n->size += AHASH_INIT_SIZE; + } + return 0; +} + +#ifdef IP_SET_HASH_WITH_NETS +#if IPSET_NET_COUNT > 1 +#define __CIDR(cidr, i) (cidr[i]) +#else +#define __CIDR(cidr, i) (cidr) +#endif +#ifdef IP_SET_HASH_WITH_NETS_PACKED +/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */ +#define CIDR(cidr, i) (__CIDR(cidr, i) + 1) +#else +#define CIDR(cidr, i) (__CIDR(cidr, i)) +#endif + +#define SET_HOST_MASK(family) (family == AF_INET ? 
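/* Editor's sketch: hash clashes above are resolved with small per-bucket
 * arrays that grow in AHASH_INIT_SIZE steps; once a bucket would exceed
 * its maximum, -EAGAIN tells the caller to resize (double) the whole
 * table instead. A userspace model of hbucket_elem_add()'s make-room
 * logic (illustrative names): */
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define AHASH_INIT_SIZE	4

struct hbucket_model {
	void *value;			/* array of elements */
	unsigned char size, pos;	/* capacity, first free slot */
};

static int bucket_make_room(struct hbucket_model *n, unsigned char max,
			    size_t dsize)
{
	void *tmp;

	if (n->pos < n->size)
		return 0;		/* a free slot already exists */
	if (n->size >= max)
		return -EAGAIN;		/* trigger a full-table rehash */
	tmp = calloc(n->size + AHASH_INIT_SIZE, dsize);
	if (!tmp)
		return -ENOMEM;
	if (n->size) {
		memcpy(tmp, n->value, (size_t)n->size * dsize);
		free(n->value);
	}
	n->value = tmp;
	n->size += AHASH_INIT_SIZE;
	return 0;
}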
32 : 128) + +#ifdef IP_SET_HASH_WITH_MULTI +#define NLEN(family) (SET_HOST_MASK(family) + 1) +#else +#define NLEN(family) SET_HOST_MASK(family) +#endif + +#else +#define NLEN(family) 0 +#endif /* IP_SET_HASH_WITH_NETS */ + +#endif /* _IP_SET_HASH_GEN_H */ + +/* Family dependent templates */ + +#undef ahash_data +#undef mtype_data_equal +#undef mtype_do_data_match +#undef mtype_data_set_flags +#undef mtype_data_reset_flags +#undef mtype_data_netmask +#undef mtype_data_list +#undef mtype_data_next +#undef mtype_elem + +#undef mtype_ahash_destroy +#undef mtype_ext_cleanup +#undef mtype_add_cidr +#undef mtype_del_cidr +#undef mtype_ahash_memsize +#undef mtype_flush +#undef mtype_destroy +#undef mtype_gc_init +#undef mtype_same_set +#undef mtype_kadt +#undef mtype_uadt +#undef mtype + +#undef mtype_add +#undef mtype_del +#undef mtype_test_cidrs +#undef mtype_test +#undef mtype_expire +#undef mtype_resize +#undef mtype_head +#undef mtype_list +#undef mtype_gc +#undef mtype_gc_init +#undef mtype_variant +#undef mtype_data_match + +#undef HKEY + +#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal) +#ifdef IP_SET_HASH_WITH_NETS +#define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match) +#else +#define mtype_do_data_match(d) 1 +#endif +#define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags) +#define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem) +#define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags) +#define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask) +#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) +#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) +#define mtype_elem IPSET_TOKEN(MTYPE, _elem) +#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) +#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) +#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) +#define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr) +#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) +#define mtype_flush IPSET_TOKEN(MTYPE, _flush) +#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) +#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) +#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) +#define mtype MTYPE + +#define mtype_add IPSET_TOKEN(MTYPE, _add) +#define mtype_del IPSET_TOKEN(MTYPE, _del) +#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) +#define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_expire IPSET_TOKEN(MTYPE, _expire) +#define mtype_resize IPSET_TOKEN(MTYPE, _resize) +#define mtype_head IPSET_TOKEN(MTYPE, _head) +#define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_variant IPSET_TOKEN(MTYPE, _variant) +#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) + +#ifndef HKEY_DATALEN +#define HKEY_DATALEN sizeof(struct mtype_elem) +#endif + +#define HKEY(data, initval, htable_bits) \ +(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ + & jhash_mask(htable_bits)) + +#ifndef htype +#define htype HTYPE + +/* The generic hash structure */ +struct htype { + struct htable __rcu *table; /* the hash table */ + u32 maxelem; /* max elements in the hash */ + u32 elements; /* current element (vs timeout) */ + u32 initval; /* random jhash init value */ +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; /* markmask value for mark mask to store */ +#endif + struct timer_list gc; /* garbage collection when timeout enabled */ + struct mtype_elem next; /* temporary storage for 
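/* Editor's sketch: the #undef/#define block above makes ip_set_hash_gen.h
 * a C "template": it is included once per address family, and every
 * mtype_* name is token-pasted from MTYPE via IPSET_TOKEN(), so hash_ip4
 * and hash_ip6 each get their own add/del/test instances. A stripped-down
 * model of the double-expansion trick the header relies on: */
#define TOKEN2(a, b)	a##b
#define TOKEN(a, b)	TOKEN2(a, b)	/* expand MTYPE before pasting */

#define MTYPE hash_ip4
#define mtype_add TOKEN(MTYPE, _add)
static int mtype_add(int v) { return v + 1; }	/* emits hash_ip4_add() */
#undef mtype_add
#undef MTYPE

#define MTYPE hash_ip6
#define mtype_add TOKEN(MTYPE, _add)
static int mtype_add(int v) { return v + 6; }	/* emits hash_ip6_add() */
#undef mtype_add
#undef MTYPE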
uadd */ +#ifdef IP_SET_HASH_WITH_MULTI + u8 ahash_max; /* max elements in an array block */ +#endif +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; /* netmask value for subnets to store */ +#endif +#ifdef IP_SET_HASH_WITH_RBTREE + struct rb_root rbtree; +#endif +#ifdef IP_SET_HASH_WITH_NETS + struct net_prefixes nets[0]; /* book-keeping of prefixes */ +#endif +}; +#endif + +#ifdef IP_SET_HASH_WITH_NETS +/* Network cidr size book keeping when the hash stores different + * sized networks */ +static void +mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) +{ + int i, j; + + /* Add in increasing prefix order, so larger cidr first */ + for (i = 0, j = -1; i < nets_length && h->nets[i].nets[n]; i++) { + if (j != -1) + continue; + else if (h->nets[i].cidr[n] < cidr) + j = i; + else if (h->nets[i].cidr[n] == cidr) { + h->nets[i].nets[n]++; + return; + } + } + if (j != -1) { + for (; i > j; i--) { + h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; + h->nets[i].nets[n] = h->nets[i - 1].nets[n]; + } + } + h->nets[i].cidr[n] = cidr; + h->nets[i].nets[n] = 1; +} + +static void +mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) +{ + u8 i, j, net_end = nets_length - 1; + + for (i = 0; i < nets_length; i++) { + if (h->nets[i].cidr[n] != cidr) + continue; + if (h->nets[i].nets[n] > 1 || i == net_end || + h->nets[i + 1].nets[n] == 0) { + h->nets[i].nets[n]--; + return; + } + for (j = i; j < net_end && h->nets[j].nets[n]; j++) { + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].nets[n] = h->nets[j + 1].nets[n]; + } + h->nets[j].nets[n] = 0; + return; + } +} +#endif + +/* Calculate the actual memory size of the set data */ +static size_t +mtype_ahash_memsize(const struct htype *h, const struct htable *t, + u8 nets_length, size_t dsize) +{ + u32 i; + size_t memsize = sizeof(*h) + + sizeof(*t) +#ifdef IP_SET_HASH_WITH_NETS + + sizeof(struct net_prefixes) * nets_length +#endif + + jhash_size(t->htable_bits) * sizeof(struct hbucket); + + for (i = 0; i < jhash_size(t->htable_bits); i++) + memsize += t->bucket[i].size * dsize; + + return memsize; +} + +/* Get the ith element from the array block n */ +#define ahash_data(n, i, dsize) \ + ((struct mtype_elem *)((n)->value + ((i) * (dsize)))) + +static void +mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) +{ + int i; + + for (i = 0; i < n->pos; i++) + ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); +} + +/* Flush a hash type of set: destroy all elements */ +static void +mtype_flush(struct ip_set *set) +{ + struct htype *h = set->data; + struct htable *t; + struct hbucket *n; + u32 i; + + t = rcu_dereference_bh_nfnl(h->table); + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) { + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + n->size = n->pos = 0; + /* FIXME: use slab cache */ + kfree(n->value); + } + } +#ifdef IP_SET_HASH_WITH_NETS + memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); +#endif + h->elements = 0; +} + +/* Destroy the hashtable part of the set */ +static void +mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) +{ + struct hbucket *n; + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) { + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + kfree(n->value); + } + } + + ip_set_free(t); +} + +/* Destroy a hash type of set */ +static void +mtype_destroy(struct ip_set *set) +{ + struct htype *h = 
set->data; + + if (set->extensions & IPSET_EXT_TIMEOUT) + del_timer_sync(&h->gc); + + mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); +#ifdef IP_SET_HASH_WITH_RBTREE + rbtree_destroy(&h->rbtree); +#endif + kfree(h); + + set->data = NULL; +} + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct htype *h = set->data; + + init_timer(&h->gc); + h->gc.data = (unsigned long) set; + h->gc.function = gc; + h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&h->gc); + pr_debug("gc initialized, run in every %u\n", + IPSET_GC_PERIOD(set->timeout)); +} + +static bool +mtype_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct htype *x = a->data; + const struct htype *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + a->timeout == b->timeout && +#ifdef IP_SET_HASH_WITH_NETMASK + x->netmask == y->netmask && +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + x->markmask == y->markmask && +#endif + a->extensions == b->extensions; +} + +/* Delete expired elements from the hashtable */ +static void +mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) +{ + struct htable *t; + struct hbucket *n; + struct mtype_elem *data; + u32 i; + int j; +#ifdef IP_SET_HASH_WITH_NETS + u8 k; +#endif + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, dsize); + if (ip_set_timeout_expired(ext_timeout(data, set))) { + pr_debug("expired %u/%u\n", i, j); +#ifdef IP_SET_HASH_WITH_NETS + for (k = 0; k < IPSET_NET_COUNT; k++) + mtype_del_cidr(h, CIDR(data->cidr, k), + nets_length, k); +#endif + ip_set_ext_destroy(set, data); + if (j != n->pos - 1) + /* Not last one */ + memcpy(data, + ahash_data(n, n->pos - 1, dsize), + dsize); + n->pos--; + h->elements--; + } + } + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!tmp) + /* Still try to delete expired elements */ + continue; + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + n->value = tmp; + } + } + rcu_read_unlock_bh(); +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct htype *h = set->data; + + pr_debug("called\n"); + write_lock_bh(&set->lock); + mtype_expire(set, h, NLEN(set->family), set->dsize); + write_unlock_bh(&set->lock); + + h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; + add_timer(&h->gc); +} + +/* Resize a hash: create a new hash table with doubling the hashsize + * and inserting the elements to it. Repeat until we succeed or + * fail due to memory pressures. 
*/ +static int +mtype_resize(struct ip_set *set, bool retried) +{ + struct htype *h = set->data; + struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); + u8 htable_bits = orig->htable_bits; +#ifdef IP_SET_HASH_WITH_NETS + u8 flags; +#endif + struct mtype_elem *data; + struct mtype_elem *d; + struct hbucket *n, *m; + u32 i, j; + int ret; + + /* Try to cleanup once */ + if (SET_WITH_TIMEOUT(set) && !retried) { + i = h->elements; + write_lock_bh(&set->lock); + mtype_expire(set, set->data, NLEN(set->family), set->dsize); + write_unlock_bh(&set->lock); + if (h->elements < i) + return 0; + } + +retry: + ret = 0; + htable_bits++; + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); + if (!htable_bits) { + /* In case we have plenty of memory :-) */ + pr_warning("Cannot increase the hashsize of set %s further\n", + set->name); + return -IPSET_ERR_HASH_FULL; + } + t = ip_set_alloc(sizeof(*t) + + jhash_size(htable_bits) * sizeof(struct hbucket)); + if (!t) + return -ENOMEM; + t->htable_bits = htable_bits; + + read_lock_bh(&set->lock); + for (i = 0; i < jhash_size(orig->htable_bits); i++) { + n = hbucket(orig, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + flags = 0; + mtype_data_reset_flags(data, &flags); +#endif + m = hbucket(t, HKEY(data, h->initval, htable_bits)); + ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); + if (ret < 0) { +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(data, &flags); +#endif + read_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + return ret; + } + d = ahash_data(m, m->pos++, set->dsize); + memcpy(d, data, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(d, &flags); +#endif + } + } + + rcu_assign_pointer(h->table, t); + read_unlock_bh(&set->lock); + + /* Give time to other readers of the set */ + synchronize_rcu_bh(); + + pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, + orig->htable_bits, orig, t->htable_bits, t); + mtype_ahash_destroy(set, orig, false); + + return 0; +} + +/* Add an element to a hash and update the internal counters when succeeded, + * otherwise report the proper error code. 
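/* Editor's sketch: mtype_resize() above doubles htable_bits, re-inserts
 * every element into the new table, and restarts with yet a bigger size
 * whenever an overfull bucket reports -EAGAIN; only then is the new table
 * published and the old one freed after an RCU grace period. A userspace
 * skeleton of that retry loop (rehash_into() is a stub standing in for
 * the per-bucket re-insertion walk): */
#include <stdlib.h>
#include <errno.h>

struct table_model {
	unsigned char bits;		/* 2^bits buckets */
};

static int rehash_into(const struct table_model *old, struct table_model *t)
{
	(void)old; (void)t;
	return 0;	/* stub; the real loop walks every old bucket */
}

static int resize(struct table_model **live)
{
	unsigned char bits = (*live)->bits;
	struct table_model *t;
	int ret;

retry:
	if (++bits > 31)
		return -ERANGE;		/* cannot grow any further */
	t = calloc(1, sizeof(*t));
	if (!t)
		return -ENOMEM;
	t->bits = bits;
	ret = rehash_into(*live, t);
	if (ret == -EAGAIN) {
		free(t);
		goto retry;		/* a bucket overflowed: double again */
	}
	if (ret < 0) {
		free(t);
		return ret;
	}
	/* kernel: rcu_assign_pointer(); synchronize_rcu_bh(); then free */
	free(*live);
	*live = t;
	return 0;
}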
*/ +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i, ret = 0; + int j = AHASH_MAX(h) + 1; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 key, multi = 0; + + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) { + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t,key); + if (n->pos) { + /* Choosing the first entry in the array to replace */ + j = 0; + goto reuse_slot; + } + rcu_read_unlock_bh(); + } + if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) + /* FIXME: when set is full, we slow down here */ + mtype_expire(set, h, NLEN(set->family), set->dsize); + + if (h->elements >= h->maxelem) { + if (net_ratelimit()) + pr_warning("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (mtype_data_equal(data, d, &multi)) { + if (flag_exist || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set)))) { + /* Just the extensions could be overwritten */ + j = i; + goto reuse_slot; + } else { + ret = -IPSET_ERR_EXIST; + goto out; + } + } + /* Reuse first timed out entry */ + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set)) && + j != AHASH_MAX(h) + 1) + j = i; + } +reuse_slot: + if (j != AHASH_MAX(h) + 1) { + /* Fill out reused slot */ + data = ahash_data(n, j, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + for (i = 0; i < IPSET_NET_COUNT; i++) { + mtype_del_cidr(h, CIDR(data->cidr, i), + NLEN(set->family), i); + mtype_add_cidr(h, CIDR(d->cidr, i), + NLEN(set->family), i); + } +#endif + ip_set_ext_destroy(set, data); + } else { + /* Use/create a new slot */ + TUNE_AHASH_MAX(h, multi); + ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); + if (ret != 0) { + if (ret == -EAGAIN) + mtype_data_next(&h->next, d); + goto out; + } + data = ahash_data(n, n->pos++, set->dsize); +#ifdef IP_SET_HASH_WITH_NETS + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_add_cidr(h, CIDR(d->cidr, i), NLEN(set->family), + i); +#endif + h->elements++; + } + memcpy(data, d, sizeof(struct mtype_elem)); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_set_flags(data, flags); +#endif + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, set), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(data, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(data, set), ext); + +out: + rcu_read_unlock_bh(); + return ret; +} + +/* Delete an element from the hash: swap it with the last element + * and free up space if possible. 
+ */ +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i, ret = -IPSET_ERR_EXIST; +#ifdef IP_SET_HASH_WITH_NETS + u8 j; +#endif + u32 key, multi = 0; + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set))) + goto out; + if (i != n->pos - 1) + /* Not last one */ + memcpy(data, ahash_data(n, n->pos - 1, set->dsize), + set->dsize); + + n->pos--; + h->elements--; +#ifdef IP_SET_HASH_WITH_NETS + for (j = 0; j < IPSET_NET_COUNT; j++) + mtype_del_cidr(h, CIDR(d->cidr, j), NLEN(set->family), + j); +#endif + ip_set_ext_destroy(set, data); + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * set->dsize, + GFP_ATOMIC); + if (!tmp) { + ret = 0; + goto out; + } + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * set->dsize); + kfree(n->value); + n->value = tmp; + } + ret = 0; + goto out; + } + +out: + rcu_read_unlock_bh(); + return ret; +} + +static inline int +mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, + struct ip_set_ext *mext, struct ip_set *set, u32 flags) +{ + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(data, set), + ext, mext, flags); + return mtype_do_data_match(data); +} + +#ifdef IP_SET_HASH_WITH_NETS +/* Special test function which takes into account the different network + * sizes added to the set */ +static int +mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = rcu_dereference_bh(h->table); + struct hbucket *n; + struct mtype_elem *data; +#if IPSET_NET_COUNT == 2 + struct mtype_elem orig = *d; + int i, j = 0, k; +#else + int i, j = 0; +#endif + u32 key, multi = 0; + u8 nets_length = NLEN(set->family); + + pr_debug("test by nets\n"); + for (; j < nets_length && h->nets[j].nets[0] && !multi; j++) { +#if IPSET_NET_COUNT == 2 + mtype_data_reset_elem(d, &orig); + mtype_data_netmask(d, h->nets[j].cidr[0], false); + for (k = 0; k < nets_length && h->nets[k].nets[1] && !multi; + k++) { + mtype_data_netmask(d, h->nets[k].cidr[1], true); +#else + mtype_data_netmask(d, h->nets[j].cidr[0]); +#endif + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set)) { + if (!ip_set_timeout_expired( + ext_timeout(data, set))) + return mtype_data_match(data, ext, + mext, set, + flags); +#ifdef IP_SET_HASH_WITH_MULTI + multi = 0; +#endif + } else + return mtype_data_match(data, ext, + mext, set, flags); + } +#if IPSET_NET_COUNT == 2 + } +#endif + } + return 0; +} +#endif + +/* Test whether the element is added to the set */ +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + struct mtype_elem *d = value; + struct hbucket *n; + struct mtype_elem *data; + int i, ret = 0; + u32 key, multi = 0; + + 
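/* Editor's sketch: mtype_del() above keeps each bucket array dense by
 * copying the last element over the deleted slot, then shrinks the array
 * once AHASH_INIT_SIZE slots are spare; shrinking is best-effort, since
 * the entry is already gone if the smaller allocation fails. A userspace
 * model (caller guarantees i < n->pos): */
#include <stdlib.h>
#include <string.h>

#define AHASH_INIT_SIZE	4

struct dbucket {
	char *value;
	unsigned char size, pos;
};

static void delete_at(struct dbucket *n, unsigned char i, size_t dsize)
{
	if (i != n->pos - 1)		/* not the last one: */
		memcpy(n->value + (size_t)i * dsize,	/* move last into hole */
		       n->value + (size_t)(n->pos - 1) * dsize, dsize);
	n->pos--;
	if (n->pos + AHASH_INIT_SIZE < n->size) {
		char *tmp = malloc((size_t)(n->size - AHASH_INIT_SIZE) * dsize);

		if (!tmp)
			return;		/* shrinking is best-effort */
		n->size -= AHASH_INIT_SIZE;
		memcpy(tmp, n->value, (size_t)n->size * dsize);
		free(n->value);
		n->value = tmp;
	}
}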
rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); +#ifdef IP_SET_HASH_WITH_NETS + /* If we test an IP address and not a network address, + * try all possible network sizes */ + for (i = 0; i < IPSET_NET_COUNT; i++) + if (CIDR(d->cidr, i) != SET_HOST_MASK(set->family)) + break; + if (i == IPSET_NET_COUNT) { + ret = mtype_test_cidrs(set, d, ext, mext, flags); + goto out; + } +#endif + + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, set->dsize); + if (mtype_data_equal(data, d, &multi) && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set)))) { + ret = mtype_data_match(data, ext, mext, set, flags); + goto out; + } + } +out: + rcu_read_unlock_bh(); + return ret; +} + +/* Reply a HEADER request: fill out the header part of the set */ +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct htype *h = set->data; + const struct htable *t; + struct nlattr *nested; + size_t memsize; + + t = rcu_dereference_bh_nfnl(h->table); + memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, + htonl(jhash_size(t->htable_bits))) || + nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) + goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_NETMASK + if (h->netmask != HOST_MASK && + nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + goto nla_put_failure; +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) + goto nla_put_failure; +#endif + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +/* Reply a LIST/SAVE request: dump the elements of the specified set */ +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct htype *h = set->data; + const struct htable *t = rcu_dereference_bh_nfnl(h->table); + struct nlattr *atd, *nested; + const struct hbucket *n; + const struct mtype_elem *e; + u32 first = cb->args[IPSET_CB_ARG0]; + /* We assume that one hash bucket fills into one page */ + void *incomplete; + int i; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); + cb->args[IPSET_CB_ARG0]++) { + incomplete = skb_tail_pointer(skb); + n = hbucket(t, cb->args[IPSET_CB_ARG0]); + pr_debug("cb->arg bucket: %lu, t %p n %p\n", + cb->args[IPSET_CB_ARG0], t, n); + for (i = 0; i < n->pos; i++) { + e = ahash_data(n, i, set->dsize); + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + pr_debug("list hash %lu hbucket %p i %u, data %p\n", + cb->args[IPSET_CB_ARG0], n, i, e); + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (cb->args[IPSET_CB_ARG0] == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_data_list(skb, e)) + goto nla_put_failure; + if (ip_set_put_extensions(skb, set, e, true)) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[IPSET_CB_ARG0] = 
0; + + return 0; + +nla_put_failure: + nlmsg_trim(skb, incomplete); + if (unlikely(first == cb->args[IPSET_CB_ARG0])) { + pr_warning("Can't list set %s: one bucket does not fit into " + "a message. Please report it!\n", set->name); + cb->args[IPSET_CB_ARG0] = 0; + return -EMSGSIZE; + } + ipset_nest_end(skb, atd); + return 0; +} + +static int +IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); + +static int +IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + +static const struct ip_set_type_variant mtype_variant = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .resize = mtype_resize, + .same_set = mtype_same_set, +}; + +#ifdef IP_SET_EMIT_CREATE +static int +IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, + struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; +#ifdef IP_SET_HASH_WITH_MARKMASK + u32 markmask; +#endif + u8 hbits; +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; +#endif + size_t hsize; + struct HTYPE *h; + struct htable *t; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + +#ifdef IP_SET_HASH_WITH_MARKMASK + markmask = 0xffffffff; +#endif +#ifdef IP_SET_HASH_WITH_NETMASK + netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + pr_debug("Create set %s with family %s\n", + set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); +#endif + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || +#ifdef IP_SET_HASH_WITH_MARKMASK + !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || +#endif + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + +#ifdef IP_SET_HASH_WITH_NETMASK + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if ((set->family == NFPROTO_IPV4 && netmask > 32) || + (set->family == NFPROTO_IPV6 && netmask > 128) || + netmask == 0) + return -IPSET_ERR_INVALID_NETMASK; + } +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + if (tb[IPSET_ATTR_MARKMASK]) { + markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + + if ((markmask > 4294967295u) || markmask == 0) + return -IPSET_ERR_INVALID_MARKMASK; + } +#endif + + hsize = sizeof(*h); +#ifdef IP_SET_HASH_WITH_NETS + hsize += sizeof(struct net_prefixes) * + (set->family == NFPROTO_IPV4 ? 
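/* Editor's sketch: mtype_list() above remembers skb_tail_pointer() before
 * each bucket; if an attribute does not fit it trims back to that mark
 * with nlmsg_trim() and returns, so the next dump call resumes from the
 * same bucket -- unless even the first bucket never fits, which is the
 * "one bucket does not fit into a message" warning. A userspace model of
 * the emit/rollback/resume cursor (illustrative names): */
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct out {
	char buf[64];
	size_t len;
};

static int emit(struct out *o, const char *item)
{
	size_t n = strlen(item);

	if (o->len + n > sizeof(o->buf))
		return -EMSGSIZE;
	memcpy(o->buf + o->len, item, n);
	o->len += n;
	return 0;
}

/* Returns the index to resume from on the next call; n means "done". */
static size_t dump(struct out *o, const char **items, size_t n, size_t first)
{
	size_t i;

	for (i = first; i < n; i++) {
		size_t mark = o->len;		/* ~ skb_tail_pointer() */

		if (emit(o, items[i])) {
			o->len = mark;		/* ~ nlmsg_trim() */
			if (i == first)
				fprintf(stderr, "item %zu never fits\n", i);
			return i;		/* resume here next time */
		}
	}
	return n;
}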
32 : 128); +#endif + h = kzalloc(hsize, GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; +#ifdef IP_SET_HASH_WITH_NETMASK + h->netmask = netmask; +#endif +#ifdef IP_SET_HASH_WITH_MARKMASK + h->markmask = markmask; +#endif + get_random_bytes(&h->initval, sizeof(h->initval)); + set->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + t = ip_set_alloc(hsize); + if (!t) { + kfree(h); + return -ENOMEM; + } + t->htable_bits = hbits; + rcu_assign_pointer(h->table, t); + + set->data = h; + if (set->family == NFPROTO_IPV4) { + set->variant = &IPSET_TOKEN(HTYPE, 4_variant); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); + } else { + set->variant = &IPSET_TOKEN(HTYPE, 6_variant); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); + } + if (tb[IPSET_ATTR_TIMEOUT]) { + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (set->family == NFPROTO_IPV4) + IPSET_TOKEN(HTYPE, 4_gc_init)(set, + IPSET_TOKEN(HTYPE, 4_gc)); + else + IPSET_TOKEN(HTYPE, 6_gc_init)(set, + IPSET_TOKEN(HTYPE, 6_gc)); + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(t->htable_bits), + t->htable_bits, h->maxelem, set->data, t); + + return 0; +} +#endif /* IP_SET_EMIT_CREATE */ diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 4015fcaf87b..dd40607f878 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,107 +21,71 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counters support */ +/* 2 Comments support */ +#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:ip type of IP sets"); +IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ -#define TYPE hash_ip - -static bool -hash_ip_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ip4_same_set hash_ip_same_set -#define hash_ip6_same_set hash_ip_same_set +#define HTYPE hash_ip +#define IP_SET_HASH_WITH_NETMASK -/* The type variant functions: IPv4 */ +/* IPv4 variant */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ip4_elem { + /* Zero valued IP addresses cannot be stored */ __be32 ip; }; -/* Member elements with timeout support */ -struct hash_ip4_telem { - __be32 ip; - unsigned long timeout; -}; +/* Common functions */ static inline bool -hash_ip4_data_equal(const struct hash_ip4_elem *ip1, - const struct hash_ip4_elem *ip2, +hash_ip4_data_equal(const struct hash_ip4_elem *e1, + const struct hash_ip4_elem *e2, u32 *multi) { - return ip1->ip == ip2->ip; -} - -static inline bool -hash_ip4_data_isnull(const struct hash_ip4_elem *elem) -{ - return elem->ip == 0; -} - -static inline void -hash_ip4_data_copy(struct 
hash_ip4_elem *dst, const struct hash_ip4_elem *src) -{ - dst->ip = src->ip; -} - -/* Zero valued IP addresses cannot be stored */ -static inline void -hash_ip4_data_zero_out(struct hash_ip4_elem *elem) -{ - elem->ip = 0; + return e1->ip == e2->ip; } static inline bool -hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data) +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data) +static inline void +hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { - const struct hash_ip4_telem *tdata = - (const struct hash_ip4_telem *)data; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); - - return 0; - -nla_put_failure: - return 1; + next->ip = e->ip; } -#define IP_SET_HASH_WITH_NETMASK +#define MTYPE hash_ip4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d) -{ - h->next.ip = ntohl(d->ip); -} +#include "ip_set_hash_gen.h" static int hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); @@ -129,45 +93,45 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, if (ip == 0) return -EINVAL; - return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); + e.ip = ip; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 ip, ip_to, hosts, timeout = h->timeout; - __be32 nip; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, hosts; int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ip &= ip_set_hostmask(h->netmask); - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - if (adt == IPSET_TEST) { - nip = htonl(ip); - if (nip == 0) + e.ip = htonl(ip); + if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; - return adtfn(set, &nip, timeout, flags); + return adtfn(set, &e, &ext, &ext, flags); } + ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = 
ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -177,21 +141,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr > 32) + if (!cidr || cidr > 32) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else - ip_to = ip; + } hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip += hosts) { - nip = htonl(ip); - if (nip == 0) + e.ip = htonl(ip); + if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; - ret = adtfn(set, &nip, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -201,136 +164,89 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ip_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout && - x->netmask == y->netmask; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ +/* Member elements */ struct hash_ip6_elem { union nf_inet_addr ip; }; -struct hash_ip6_telem { - union nf_inet_addr ip; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0; -} - -static inline bool -hash_ip6_data_isnull(const struct hash_ip6_elem *elem) -{ - return ipv6_addr_any(&elem->ip.in6); + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } static inline void -hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src) +hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { - dst->ip.in6 = src->ip.in6; -} - -static inline void -hash_ip6_data_zero_out(struct hash_ip6_elem *elem) -{ - ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0); -} - -static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + ip6_netmask(ip, prefix); } static bool -hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data) +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data) +static inline void +hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) { - const struct hash_ip6_telem *e = - (const struct hash_ip6_telem *)data; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_ip6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> -static inline void -hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d) -{ -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - 
enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - union nf_inet_addr ip; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip.in6); - ip6_netmask(&ip, h->netmask); - if (ipv6_addr_any(&ip.in6)) + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; - return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } -static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = { - [IPSET_ATTR_IP] = { .type = NLA_NESTED }, - [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, - [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, -}; - static int hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - union nf_inet_addr ip; - u32 timeout = h->timeout; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -338,114 +254,28 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ip6_netmask(&ip, h->netmask); - if (ipv6_addr_any(&ip.in6)) + hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - ret = adtfn(set, &ip, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } -/* Create hash:ip type of sets */ - -static int -hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 netmask, hbits; - struct ip_set_hash *h; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - netmask = set->family == AF_INET ? 32 : 128; - pr_debug("Create set %s with family %s\n", - set->name, set->family == AF_INET ? 
"inet" : "inet6"); - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - if (tb[IPSET_ATTR_NETMASK]) { - netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - - if ((set->family == AF_INET && netmask > 32) || - (set->family == AF_INET6 && netmask > 128) || - netmask == 0) - return -IPSET_ERR_INVALID_NETMASK; - } - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - h->netmask = netmask; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_ip4_tvariant : &hash_ip6_tvariant; - - if (set->family == AF_INET) - hash_ip4_gc_init(set); - else - hash_ip6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? &hash_ip4_variant : &hash_ip6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ip_type __read_mostly = { .name = "hash:ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP, .dimension = IPSET_DIM_ONE, - .family = AF_UNSPEC, - .revision_min = 0, - .revision_max = 0, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -454,6 +284,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -461,6 +292,9 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c new file mode 100644 index 00000000000..4eff0a29725 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -0,0 +1,321 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,mark type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); +IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:ip,mark"); + +/* Type specific function prefix */ +#define HTYPE hash_ipmark +#define IP_SET_HASH_WITH_MARKMASK + +/* IPv4 variant */ + +/* Member elements */ +struct hash_ipmark4_elem { + __be32 ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, + const struct hash_ipmark4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark4_data_list(struct sk_buff *skb, + const struct hash_ipmark4_elem *data) +{ + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark4_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark4_elem *d) +{ + next->ip = d->ip; +} + +#define MTYPE hash_ipmark4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#include "ip_set_hash_gen.h" + +static int +hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip, ip_to = 0; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ip_to = ip = ntohl(e.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!cidr || cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + if (retried) + ip = ntohl(h->next.ip); + for (; !before(ip_to, ip); ip++) { + e.ip = htonl(ip); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_ipmark6_elem { + union nf_inet_addr ip; + __u32 mark; +}; + +/* Common functions */ + +static inline bool +hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, + const struct hash_ipmark6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ip1->mark == ip2->mark; +} + +static bool +hash_ipmark6_data_list(struct sk_buff *skb, + const struct hash_ipmark6_elem *data) +{ + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_ipmark6_data_next(struct hash_ipmark4_elem *next, + const struct hash_ipmark6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK +#undef HKEY_DATALEN + +#define MTYPE hash_ipmark6 +#define PF 6 +#define HOST_MASK 128 +#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + + +static int +hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.mark = skb->mark; + e.mark &= h->markmask; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_ipmark *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipmark6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark &= h->markmask; + + if (adt == IPSET_TEST) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + + return ret; +} + +static struct ip_set_type hash_ipmark_type __read_mostly = { + .name = "hash:ip,mark", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_ipmark_create, + .create_policy = { + [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_MARK] = { .type = NLA_U32 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipmark_init(void) +{ + return ip_set_type_register(&hash_ipmark_type); +} + +static void __exit +hash_ipmark_fini(void) +{ + ip_set_type_unregister(&hash_ipmark_type); +} + +module_init(hash_ipmark_init); +module_exit(hash_ipmark_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 37d667e3f6f..7597b82a8b0 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,27 +21,26 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Counters support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:ip,port type of IP sets"); +IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ -#define TYPE hash_ipport - -static bool -hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ipport4_same_set hash_ipport_same_set -#define hash_ipport6_same_set hash_ipport_same_set +#define HTYPE hash_ipport -/* The type variant functions: IPv4 */ +/* IPv4 variant */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ipport4_elem { __be32 ip; __be16 port; @@ -49,14 +48,7 @@ struct hash_ipport4_elem { u8 padding; }; -/* Member elements with timeout support */ -struct hash_ipport4_telem { - __be32 ip; - __be16 port; - u8 proto; - u8 padding; - unsigned long timeout; -}; +/* Common functions */ static inline bool 
hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, @@ -68,145 +60,104 @@ hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipport4_data_copy(struct hash_ipport4_elem *dst, - const struct hash_ipport4_elem *src) -{ - dst->ip = src->ip; - dst->port = src->port; - dst->proto = src->proto; -} - -static inline void -hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipport4_data_list(struct sk_buff *skb, const struct hash_ipport4_elem *data) { - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - return 0; - -nla_put_failure: - return 1; -} - -static bool -hash_ipport4_data_tlist(struct sk_buff *skb, - const struct hash_ipport4_elem *data) -{ - const struct hash_ipport4_telem *tdata = - (const struct hash_ipport4_telem *)data; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); - + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -#define PF 4 -#define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - static inline void -hash_ipport4_data_next(struct ip_set_hash *h, +hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { - h->next.ip = ntohl(d->ip); - h->next.port = ntohs(d->port); + next->ip = d->ip; + next->port = d->port; } +#define MTYPE hash_ipport4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#include "ip_set_hash_gen.h" + static int hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem data = { }; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem data = { }; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + 
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - ip = ntohl(data.ip); + ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -216,13 +167,12 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr > 32) + if (!cidr || cidr > 32) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else - ip_to = ip; + } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -230,13 +180,14 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip++) { - p = retried && ip == h->next.ip ? h->next.port : port; + p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) + : port; for (; p <= port_to; p++) { - data.ip = htonl(ip); - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -247,18 +198,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ struct hash_ipport6_elem { union nf_inet_addr ip; @@ -267,115 +207,77 @@ struct hash_ipport6_elem { u8 padding; }; -struct hash_ipport6_telem { - union nf_inet_addr ip; - __be16 port; - u8 proto; - u8 padding; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } -static inline bool -hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipport6_data_copy(struct hash_ipport6_elem *dst, - const struct hash_ipport6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipport6_data_list(struct sk_buff *skb, const struct hash_ipport6_elem *data) { - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ipport6_data_tlist(struct sk_buff *skb, - const struct hash_ipport6_elem *data) +static inline void +hash_ipport6_data_next(struct hash_ipport4_elem *next, + const struct hash_ipport6_elem *d) { - const struct hash_ipport6_telem *e = - (const struct hash_ipport6_telem *)data; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_ipport6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipport6_data_next(struct ip_set_hash *h, - const struct hash_ipport6_elem *d) -{ - h->next.port = ntohs(d->port); -} +#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem data = { }; + struct hash_ipport6_elem e 
= { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem data = { }; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; @@ -383,6 +285,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -390,48 +294,43 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 
0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) - port = h->next.port; + port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -441,82 +340,14 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_ipport4_tvariant : &hash_ipport6_tvariant; - - if (set->family == AF_INET) - hash_ipport4_gc_init(set); - else - hash_ipport6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? 
&hash_ipport4_variant : &hash_ipport6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipport_type __read_mostly = { .name = "hash:ip,port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, .dimension = IPSET_DIM_TWO, - .family = AF_UNSPEC, - .revision_min = 0, - .revision_max = 1, /* SCTP and UDPLITE support added */ + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -525,6 +356,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -535,6 +367,9 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index e69e2718fbe..672655ffd57 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,27 +21,26 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Counters support added */ +/* 3 Comments support added */ +#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:ip,port,ip type of IP sets"); +IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ -#define TYPE hash_ipportip - -static bool -hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ipportip4_same_set hash_ipportip_same_set -#define hash_ipportip6_same_set hash_ipportip_same_set +#define HTYPE hash_ipportip -/* The type variant functions: IPv4 */ +/* IPv4 variant */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ipportip4_elem { __be32 ip; __be32 ip2; @@ -50,16 +49,6 @@ struct hash_ipportip4_elem { u8 padding; }; -/* Member elements with timeout support */ -struct hash_ipportip4_telem { - __be32 ip; - __be32 ip2; - __be16 port; - u8 proto; - u8 padding; - unsigned long timeout; -}; - static inline bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, @@ -71,150 +60,110 @@ 
hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst, - const struct hash_ipportip4_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) { - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ipportip4_data_tlist(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) +static inline void +hash_ipportip4_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip4_elem *d) { - const struct hash_ipportip4_telem *tdata = - (const struct hash_ipportip4_telem *)data; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); - - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; } +/* Common functions */ +#define MTYPE hash_ipportip4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportip4_data_next(struct ip_set_hash *h, - const struct hash_ipportip4_elem *d) -{ - h->next.ip = ntohl(d->ip); - h->next.port = ntohs(d->port); -} +#include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem data = { }; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem data = { }; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; if 
(unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - ip = ntohl(data.ip); + ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -224,13 +173,12 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr > 32) + if (!cidr || cidr > 32) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else - ip_to = ip; + } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -238,13 +186,14 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip++) { - p = retried && ip == h->next.ip ? h->next.port : port; + p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) + : port; for (; p <= port_to; p++) { - data.ip = htonl(ip); - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -255,18 +204,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ struct hash_ipportip6_elem { union nf_inet_addr ip; @@ -276,120 +214,78 @@ struct hash_ipportip6_elem { u8 padding; }; -struct hash_ipportip6_telem { - union nf_inet_addr ip; - union nf_inet_addr ip2; - __be16 port; - u8 proto; - u8 padding; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && - ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } -static inline bool -hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst, - const struct hash_ipportip6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) { - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ipportip6_data_tlist(struct sk_buff *skb, - const struct hash_ipportip6_elem *data) +static inline void +hash_ipportip6_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip6_elem *d) { - const struct hash_ipportip6_telem *e = - (const struct hash_ipportip6_telem *)data; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_ipportip6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportip6_data_next(struct ip_set_hash *h, - const struct hash_ipportip6_elem *d) -{ - h->next.port = ntohs(d->port); -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct 
xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem data = { }; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem data = { }; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; @@ -397,6 +293,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -404,52 +302,47 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 
0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) - port = h->next.port; + port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -459,82 +352,14 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant; - - if (set->family == AF_INET) - hash_ipportip4_gc_init(set); - else - hash_ipportip6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? 
&hash_ipportip4_variant : &hash_ipportip6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, .dimension = IPSET_DIM_THREE, - .family = AF_UNSPEC, - .revision_min = 0, - .revision_max = 1, /* SCTP and UDPLITE support added */ + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -542,6 +367,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -553,6 +379,9 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 64199b4e93c..7308d84f927 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,44 +21,46 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Range as input support for IPv4 added */ +/* 3 nomatch flag support added */ +/* 4 Counters support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:ip,port,net type of IP sets"); +IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ -#define TYPE hash_ipportnet - -static bool -hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b); +#define HTYPE hash_ipportnet -#define hash_ipportnet4_same_set hash_ipportnet_same_set -#define hash_ipportnet6_same_set hash_ipportnet_same_set +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. 
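+ * E.g. a /24 element is stored with elem->cidr == 23 and the spare
+ * bit carries the "nomatch" flag; listing restores the user-visible
+ * prefix length via nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1).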
+ */ +#define IP_SET_HASH_WITH_NETS_PACKED +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS -/* The type variant functions: IPv4 */ +/* IPv4 variant */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; __be16 port; - u8 cidr; + u8 cidr:7; + u8 nomatch:1; u8 proto; }; -/* Member elements with timeout support */ -struct hash_ipportnet4_telem { - __be32 ip; - __be32 ip2; - __be16 port; - u8 cidr; - u8 proto; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, @@ -72,134 +74,119 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem) +static inline int +hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { - return elem->proto == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst, - const struct hash_ipportnet4_elem *src) +hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { - memcpy(dst, src, sizeof(*dst)); + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) +hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { - elem->ip2 &= ip_set_netmask(cidr); - elem->cidr = cidr; + swap(*flags, elem->nomatch); } static inline void -hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem) +hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { - elem->proto = 0; + elem->ip2 &= ip_set_netmask(cidr); + elem->cidr = cidr - 1; } static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) { - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ipportnet4_data_tlist(struct sk_buff *skb, - const struct hash_ipportnet4_elem *data) +static inline void +hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet4_elem *d) { - const struct hash_ipportnet4_telem *tdata = - (const struct hash_ipportnet4_telem *)data; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); - - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; + next->ip2 = d->ip2; } -#define IP_SET_HASH_WITH_PROTO -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_ipportnet4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportnet4_data_next(struct ip_set_hash *h, - const struct hash_ipportnet4_elem *d) -{ - h->next.ip = ntohl(d->ip); - h->next.port = ntohs(d->port); - h->next.ip2 = ntohl(d->ip2); -} +#include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = { - .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + struct hash_ipportnet4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (data.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); - data.ip2 &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + e.ip2 &= ip_set_netmask(e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = { .cidr = HOST_MASK }; - u32 ip, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to, ip2_last, ip2; - u32 timeout = h->timeout; + struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, p = 0, port, port_to; + u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; bool with_ports = false; + u8 cidr; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -208,44 +195,47 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (tb[IPSET_ATTR_CIDR2]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!data.cidr) + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; } if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = 
with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { - data.ip = htonl(ip); - data.ip2 = htonl(ip2_from & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip); + e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; } + ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -253,19 +243,21 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { - u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr > 32) + if (!cidr || cidr > 32) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } + + ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) @@ -274,24 +266,27 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip2_from, ip2_to, data.cidr); - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip++) { - data.ip = htonl(ip); - p = retried && ip == h->next.ip ? h->next.port : port; + e.ip = htonl(ip); + p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) + : port; for (; p <= port_to; p++) { - data.port = htons(p); - ip2 = retried && ip == h->next.ip && p == h->next.port - ? h->next.ip2 : ip2_from; + e.port = htons(p); + ip2 = retried && + ip == ntohl(h->next.ip) && + p == ntohs(h->next.port) + ? 
ntohl(h->next.ip2) : ip2_from; while (!after(ip2, ip2_to)) { - data.ip2 = htonl(ip2); + e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &data.cidr); - ret = adtfn(set, &data, timeout, flags); + &cidr); + e.cidr = cidr - 1; + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -304,175 +299,139 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ struct hash_ipportnet6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; - u8 cidr; + u8 cidr:7; + u8 nomatch:1; u8 proto; }; -struct hash_ipportnet6_telem { - union nf_inet_addr ip; - union nf_inet_addr ip2; - __be16 port; - u8 cidr; - u8 proto; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && - ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } -static inline bool -hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst, - const struct hash_ipportnet6_elem *src) +static inline int +hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { - memcpy(dst, src, sizeof(*dst)); + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem) +hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { - elem->proto = 0; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); - elem->cidr = cidr; + elem->cidr = cidr - 1; } static bool hash_ipportnet6_data_list(struct sk_buff *skb, const struct hash_ipportnet6_elem *data) { - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_ipportnet6_data_tlist(struct sk_buff *skb, - const struct hash_ipportnet6_elem *data) +static inline void +hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet6_elem *d) { - const struct hash_ipportnet6_telem *e = - (const struct hash_ipportnet6_telem *)data; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_ipportnet6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportnet6_data_next(struct ip_set_hash *h, - const struct hash_ipportnet6_elem *d) -{ - h->next.port = ntohs(d->port); -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = { - .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + struct hash_ipportnet6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (data.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); - ip6_netmask(&data.ip2, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + ip6_netmask(&e.ip2, e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = { .cidr = HOST_MASK }; + struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; + u8 cidr; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -482,60 +441,63 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR2]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (!data.cidr) - return -IPSET_ERR_INVALID_CIDR; + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; + } - ip6_netmask(&data.ip2, data.cidr); + ip6_netmask(&e.ip2, e.cidr + 1); if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = 
ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) - port = h->next.port; + port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -545,86 +507,15 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == AF_INET ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_ipportnet4_tvariant - : &hash_ipportnet6_tvariant; - - if (set->family == AF_INET) - hash_ipportnet4_gc_init(set); - else - hash_ipportnet6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? 
&hash_ipportnet4_variant : &hash_ipportnet6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, - .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | + IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, - .family = AF_UNSPEC, - .revision_min = 0, - /* 1 SCTP and UDPLITE support added */ - .revision_max = 2, /* Range as input support for IPv4 added */ + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -632,6 +523,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -643,8 +535,12 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 28988196775..4c7d495783a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,179 +20,164 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 Range as input support for IPv4 added */ +/* 2 nomatch flag support added */ +/* 3 Counters support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:net type of IP sets"); +IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); /* Type specific function prefix */ -#define TYPE hash_net - -static bool -hash_net_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_net4_same_set hash_net_same_set -#define hash_net6_same_set hash_net_same_set +#define HTYPE hash_net +#define IP_SET_HASH_WITH_NETS -/* The type variant functions: IPv4 */ +/* IPv4 variant */ -/* Member elements without timeout */ +/* Member elements */ struct hash_net4_elem { __be32 ip; u16 padding0; - u8 padding1; + u8 nomatch; u8 cidr; }; -/* Member elements with timeout support */ -struct hash_net4_telem { - __be32 ip; - u16 
padding0; - u8 padding1; - u8 cidr; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) { - return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr; + return ip1->ip == ip2->ip && + ip1->cidr == ip2->cidr; } -static inline bool -hash_net4_data_isnull(const struct hash_net4_elem *elem) +static inline int +hash_net4_do_data_match(const struct hash_net4_elem *elem) { - return elem->cidr == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_net4_data_copy(struct hash_net4_elem *dst, - const struct hash_net4_elem *src) +hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { - dst->ip = src->ip; - dst->cidr = src->cidr; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) +hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { - elem->ip &= ip_set_netmask(cidr); - elem->cidr = cidr; + swap(*flags, elem->nomatch); } -/* Zero CIDR values cannot be stored */ static inline void -hash_net4_data_zero_out(struct hash_net4_elem *elem) +hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { - elem->cidr = 0; + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; } static bool hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) { - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - return 0; - -nla_put_failure: - return 1; -} - -static bool -hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data) -{ - const struct hash_net4_telem *tdata = - (const struct hash_net4_telem *)data; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, tdata->cidr); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -#define IP_SET_HASH_WITH_NETS - -#define PF 4 -#define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - static inline void -hash_net4_data_next(struct ip_set_hash *h, +hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { - h->next.ip = ntohl(d->ip); + next->ip = d->ip; } +#define MTYPE hash_net4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + static int hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { - .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + struct hash_net4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; - u32 ip = 0, ip_to, last; + struct hash_net4_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, last; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret: + ip_set_eexist(ret, flags) ? 
0 : ret; } ip_to = ip; @@ -206,11 +191,11 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_HASH_RANGE; } if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; else @@ -220,70 +205,42 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_net_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ struct hash_net6_elem { union nf_inet_addr ip; u16 padding0; - u8 padding1; + u8 nomatch; u8 cidr; }; -struct hash_net6_telem { - union nf_inet_addr ip; - u16 padding0; - u8 padding1; - u8 cidr; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr; } -static inline bool -hash_net6_data_isnull(const struct hash_net6_elem *elem) -{ - return elem->cidr == 0; -} - -static inline void -hash_net6_data_copy(struct hash_net6_elem *dst, - const struct hash_net6_elem *src) +static inline int +hash_net6_do_data_match(const struct hash_net6_elem *elem) { - dst->ip.in6 = src->ip.in6; - dst->cidr = src->cidr; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_net6_data_zero_out(struct hash_net6_elem *elem) +hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { - elem->cidr = 0; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -296,77 +253,72 @@ hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) static bool hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) { - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data) +static inline void +hash_net6_data_next(struct hash_net4_elem *next, + const struct hash_net6_elem *d) { - const struct hash_net6_telem *e = - (const struct hash_net6_telem *)data; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, e->cidr); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_net6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_net6_data_next(struct ip_set_hash *h, - const struct hash_net6_elem *d) -{ -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { - .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + struct hash_net6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; + struct hash_net6_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -374,107 +326,39 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr) + if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); + ip6_netmask(&e.ip, e.cidr); - if (tb[IPSET_ATTR_TIMEOUT]) { - if 
(!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; -} - -/* Create hash:ip type of sets */ - -static int -hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - struct ip_set_hash *h; - u8 hbits; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == AF_INET ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_net4_tvariant : &hash_net6_tvariant; - - if (set->family == AF_INET) - hash_net4_gc_init(set); - else - hash_net6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? &hash_net4_variant : &hash_net6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_net_type __read_mostly = { .name = "hash:net", .protocol = IPSET_PROTOCOL, - .features = IPSET_TYPE_IP, + .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_ONE, - .family = AF_UNSPEC, - .revision_min = 0, - .revision_max = 1, /* Range as input support for IPv4 added */ + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -482,12 +366,17 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index e13095deb50..db2606805b3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,18 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 nomatch flag support added */ +/* 2 /0 support added */ +/* 3 Counters support added */ +/* 4 Comments support added */ +#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:net,iface type of IP sets"); +IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); /* Interface name rbtree */ @@ -38,58 +44,15 @@ struct iface_node { #define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) -static inline long -ifname_compare(const char *_a, const char *_b) -{ - const long *a = (const long *)_a; - const long *b = (const long *)_b; - - BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); - if (a[0] != b[0]) - return a[0] - b[0]; - if (IFNAMSIZ > sizeof(long)) { - if (a[1] != b[1]) - return a[1] - b[1]; - } - if (IFNAMSIZ > 2 * sizeof(long)) { - if (a[2] != b[2]) - return a[2] - b[2]; - } - if (IFNAMSIZ > 3 * sizeof(long)) { - if (a[3] != b[3]) - return a[3] - b[3]; - } - return 0; -} - static void rbtree_destroy(struct rb_root *root) { - struct rb_node *p, *n = root->rb_node; - struct iface_node *node; - - /* Non-recursive destroy, like in ext3 */ - while (n) { - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - p = rb_parent(n); - node = rb_entry(n, struct iface_node, node); - if (!p) - *root = RB_ROOT; - else if (p->rb_left == n) - p->rb_left = NULL; - else if (p->rb_right == n) - p->rb_right = NULL; 
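+	/* Post-order walk: each node is freed only after both of its
+	 * children have been visited, then the root is reset to an
+	 * empty tree.
+	 */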
+ struct iface_node *node, *next; + rbtree_postorder_for_each_entry_safe(node, next, root, node) kfree(node); - n = p; - } + + *root = RB_ROOT; } static int @@ -99,7 +62,7 @@ iface_test(struct rb_root *root, const char **iface) while (n) { const char *d = iface_data(n); - long res = ifname_compare(*iface, d); + int res = strcmp(*iface, d); if (res < 0) n = n->rb_left; @@ -121,7 +84,7 @@ iface_add(struct rb_root *root, const char **iface) while (*n) { char *ifname = iface_data(*n); - long res = ifname_compare(*iface, ifname); + int res = strcmp(*iface, ifname); p = *n; if (res < 0) @@ -147,45 +110,34 @@ iface_add(struct rb_root *root, const char **iface) } /* Type specific function prefix */ -#define TYPE hash_netiface - -static bool -hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_netiface4_same_set hash_netiface_same_set -#define hash_netiface6_same_set hash_netiface_same_set +#define HTYPE hash_netiface +#define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_RBTREE +#define IP_SET_HASH_WITH_MULTI #define STREQ(a, b) (strcmp(a, b) == 0) -/* The type variant functions: IPv4 */ +/* IPv4 variant */ struct hash_netiface4_elem_hashed { __be32 ip; u8 physdev; u8 cidr; - u16 padding; + u8 nomatch; + u8 elem; }; -#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) - -/* Member elements without timeout */ +/* Member elements */ struct hash_netiface4_elem { __be32 ip; u8 physdev; u8 cidr; - u16 padding; + u8 nomatch; + u8 elem; const char *iface; }; -/* Member elements with timeout support */ -struct hash_netiface4_telem { - __be32 ip; - u8 physdev; - u8 cidr; - u16 padding; - const char *iface; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, @@ -199,32 +151,29 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->iface == ip2->iface; } -static inline bool -hash_netiface4_data_isnull(const struct hash_netiface4_elem *elem) +static inline int +hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { - return elem->cidr == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netiface4_data_copy(struct hash_netiface4_elem *dst, - const struct hash_netiface4_elem *src) { - dst->ip = src->ip; - dst->cidr = src->cidr; - dst->physdev = src->physdev; - dst->iface = src->iface; +hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) +hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { - elem->ip &= ip_set_netmask(cidr); - elem->cidr = cidr; + swap(*flags, elem->nomatch); } static inline void -hash_netiface4_data_zero_out(struct hash_netiface4_elem *elem) +hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { - elem->cidr = 0; + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; } static bool @@ -233,73 +182,54 @@ hash_netiface4_data_list(struct sk_buff *skb, { u32 flags = data->physdev ? 
IPSET_FLAG_PHYSDEV : 0; - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); - if (flags) - NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags); + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_netiface4_data_tlist(struct sk_buff *skb, - const struct hash_netiface4_elem *data) +static inline void +hash_netiface4_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface4_elem *d) { - const struct hash_netiface4_telem *tdata = - (const struct hash_netiface4_telem *)data; - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); - if (flags) - NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); - - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; } -#define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE -#define IP_SET_HASH_WITH_MULTI - +#define MTYPE hash_netiface4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netiface4_data_next(struct ip_set_hash *h, - const struct hash_netiface4_elem *d) -{ - h->next.ip = ntohl(d->ip); -} +#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) +#include "ip_set_hash_gen.h" static int hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface4_elem data = { - .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + struct hash_netiface4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .elem = 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); int ret; - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); #define IFACE(dir) (par->dir ? par->dir->name : NULL) #define PHYSDEV(dir) (nf_bridge->dir ? nf_bridge->dir->name : NULL) @@ -311,72 +241,69 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (!nf_bridge) return -EINVAL; - data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); - data.physdev = 1; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; #else - data.iface = NULL; + e.iface = NULL; #endif } else - data.iface = SRCDIR ? IFACE(in) : IFACE(out); + e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); - if (!data.iface) + if (!e.iface) return -EINVAL; - ret = iface_test(&h->rbtree, &data.iface); + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } } else if (!ret) return ret; - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface4_elem data = { .cidr = HOST_MASK }; - u32 ip = 0, ip_to, last; - u32 timeout = h->timeout; - char iface[IFNAMSIZ] = {}; + struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, last; + char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - data.iface = iface; - ret = iface_test(&h->rbtree, &data.iface); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } @@ -386,13 +313,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) - data.physdev = 1; + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } - if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
0 : ret; } if (tb[IPSET_ATTR_IP_TO]) { @@ -403,16 +332,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip, ip_to, data.cidr); - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr); if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -423,82 +351,55 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ struct hash_netiface6_elem_hashed { union nf_inet_addr ip; u8 physdev; u8 cidr; - u16 padding; + u8 nomatch; + u8 elem; }; -#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) - struct hash_netiface6_elem { union nf_inet_addr ip; u8 physdev; u8 cidr; - u16 padding; + u8 nomatch; + u8 elem; const char *iface; }; -struct hash_netiface6_telem { - union nf_inet_addr ip; - u8 physdev; - u8 cidr; - u16 padding; - const char *iface; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && ip1->iface == ip2->iface; } -static inline bool -hash_netiface6_data_isnull(const struct hash_netiface6_elem *elem) +static inline int +hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { - return elem->cidr == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netiface6_data_copy(struct hash_netiface6_elem *dst, - const struct hash_netiface6_elem *src) +hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { - memcpy(dst, src, sizeof(*dst)); + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_netiface6_data_zero_out(struct hash_netiface6_elem *elem) +hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { -} - -static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -514,70 +415,59 @@ hash_netiface6_data_list(struct sk_buff *skb, { u32 flags = data->physdev ? 
IPSET_FLAG_PHYSDEV : 0; - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); - if (flags) - NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags); + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || + nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_netiface6_data_tlist(struct sk_buff *skb, - const struct hash_netiface6_elem *data) +static inline void +hash_netiface6_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface6_elem *d) { - const struct hash_netiface6_telem *e = - (const struct hash_netiface6_telem *)data; - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); - if (flags) - NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, flags); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_netiface6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netiface6_data_next(struct ip_set_hash *h, - const struct hash_netiface6_elem *d) -{ -} +#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface6_elem data = { - .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + struct hash_netiface6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .elem = 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); int ret; - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #ifdef CONFIG_BRIDGE_NETFILTER @@ -585,44 +475,46 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (!nf_bridge) return -EINVAL; - data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); - data.physdev = 1; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; #else - data.iface = NULL; + e.iface = NULL; #endif } else - data.iface = SRCDIR ? IFACE(in) : IFACE(out); + e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); - if (!data.iface) + if (!e.iface) return -EINVAL; - ret = iface_test(&h->rbtree, &data.iface); + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } } else if (!ret) return ret; - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface6_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; - char iface[IFNAMSIZ] = {}; + struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -630,28 +522,23 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ip6_netmask(&e.ip, e.cidr); strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - data.iface = iface; - ret = iface_test(&h->rbtree, &data.iface); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } @@ -661,93 +548,26 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) - data.physdev = 1; + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; -} - -/* Create hash:ip type of sets */ - -static int -hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == AF_INET ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - h->ahash_max = AHASH_MAX_SIZE; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - h->rbtree = RB_ROOT; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_netiface4_tvariant : &hash_netiface6_tvariant; - - if (set->family == AF_INET) - hash_netiface4_gc_init(set); - else - hash_netiface6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? &hash_netiface4_variant : &hash_netiface6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_netiface_type __read_mostly = { .name = "hash:net,iface", .protocol = IPSET_PROTOCOL, - .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE, + .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE | + IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_TWO, - .family = AF_UNSPEC, - .revision_min = 0, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_netiface_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -756,16 +576,20 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING, - .len = IPSET_MAXNAMELEN - 1 }, + .len = IFNAMSIZ - 1 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c new file mode 100644 index 00000000000..3e99987e4bf --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -0,0 +1,481 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,net"); + +/* Type specific function prefix */ +#define HTYPE hash_netnet +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variants */ + +/* Member elements */ +struct hash_netnet4_elem { + union { + __be32 ip[2]; + __be64 ipcmp; + }; + u8 nomatch; + union { + u8 cidr[2]; + u16 ccmp; + }; +}; + +/* Common functions */ + +static inline bool +hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, + const struct hash_netnet4_elem *ip2, + u32 *multi) +{ + return ip1->ipcmp == ip2->ipcmp && + ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) +{ + return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, + struct hash_netnet4_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) +{ + if (inner) { + elem->ip[1] &= ip_set_netmask(cidr); + elem->cidr[1] = cidr; + } else { + elem->ip[0] &= ip_set_netmask(cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netnet4_data_list(struct sk_buff *skb, + const struct hash_netnet4_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; +} + +static inline void +hash_netnet4_data_next(struct hash_netnet4_elem *next, + const struct hash_netnet4_elem *d) +{ + next->ipcmp = d->ipcmp; +} + +#define MTYPE hash_netnet4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); + ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); + e.ip[0] &= ip_set_netmask(e.cidr[0]); + e.ip[1] &= ip_set_netmask(e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, last; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; + u8 cidr, cidr2; + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[0] = cidr; + } + + if (tb[IPSET_ATTR_CIDR2]) { 
+ cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr2 || cidr2 > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[1] = cidr2; + } + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] && + tb[IPSET_ATTR_IP2_TO])) { + e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } + + ip2_to = ip2_from; + if (tb[IPSET_ATTR_IP2_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); + if (ret) + return ret; + if (ip2_to < ip2_from) + swap(ip2_from, ip2_to); + if (ip2_from + UINT_MAX == ip2_to) + return -IPSET_ERR_HASH_RANGE; + + } + + if (retried) + ip = ntohl(h->next.ip[0]); + + while (!after(ip, ip_to)) { + e.ip[0] = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &cidr); + e.cidr[0] = cidr; + ip2 = (retried && + ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) + : ip2_from; + while (!after(ip2, ip2_to)) { + e.ip[1] = htonl(ip2); + last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); + e.cidr[1] = cidr2; + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip2 = last2 + 1; + } + ip = last + 1; + } + return ret; +} + +/* IPv6 variants */ + +struct hash_netnet6_elem { + union nf_inet_addr ip[2]; + u8 nomatch; + union { + u8 cidr[2]; + u16 ccmp; + }; +}; + +/* Common functions */ + +static inline bool +hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, + const struct hash_netnet6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && + ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && + ip1->ccmp == ip2->ccmp; +} + +static inline int +hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) +{ + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; +} + +static inline void +hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, + struct hash_netnet6_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) +{ + if (inner) { + ip6_netmask(&elem->ip[1], cidr); + elem->cidr[1] = cidr; + } else { + ip6_netmask(&elem->ip[0], cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netnet6_data_list(struct sk_buff *skb, + const struct hash_netnet6_elem *data) +{ + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; +} + +static inline void +hash_netnet6_data_next(struct hash_netnet4_elem *next, + const struct hash_netnet6_elem *d) +{ +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_netnet6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || + ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (tb[IPSET_ATTR_CIDR2]) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + + if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || + e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + ret = adtfn(set, &e, &ext, &ext, flags); + + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static struct ip_set_type hash_netnet_type __read_mostly = { + .name = "hash:net,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_netnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netnet_init(void) +{ + return ip_set_type_register(&hash_netnet_type); +} + +static void __exit +hash_netnet_fini(void) +{ + ip_set_type_unregister(&hash_netnet_type); +} + +module_init(hash_netnet_init); +module_exit(hash_netnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 8f9de7207ec..1c645fbd09c 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,42 +20,45 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 SCTP and UDPLITE support added */ +/* 2 Range as input support for IPv4 added */ +/* 3 nomatch flag support added */ +/* 4 Counters support added */ +/* 5 Comments support added */ +#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("hash:net,port type of IP sets"); +IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); /* Type specific function prefix */ -#define TYPE hash_netport - -static bool -hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); +#define HTYPE hash_netport +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS -#define hash_netport4_same_set hash_netport_same_set -#define hash_netport6_same_set hash_netport_same_set +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. 
+ */ +#define IP_SET_HASH_WITH_NETS_PACKED -/* The type variant functions: IPv4 */ +/* IPv4 variant */ -/* Member elements without timeout */ +/* Member elements */ struct hash_netport4_elem { __be32 ip; __be16 port; u8 proto; - u8 cidr; + u8 cidr:7; + u8 nomatch:1; }; -/* Member elements with timeout support */ -struct hash_netport4_telem { - __be32 ip; - __be16 port; - u8 proto; - u8 cidr; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, @@ -68,172 +71,158 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_netport4_data_isnull(const struct hash_netport4_elem *elem) +static inline int +hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { - return elem->proto == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netport4_data_copy(struct hash_netport4_elem *dst, - const struct hash_netport4_elem *src) +hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { - dst->ip = src->ip; - dst->port = src->port; - dst->proto = src->proto; - dst->cidr = src->cidr; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) +hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { - elem->ip &= ip_set_netmask(cidr); - elem->cidr = cidr; + swap(*flags, elem->nomatch); } static inline void -hash_netport4_data_zero_out(struct hash_netport4_elem *elem) +hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { - elem->proto = 0; + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr - 1; } static bool hash_netport4_data_list(struct sk_buff *skb, const struct hash_netport4_elem *data) { - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_netport4_data_tlist(struct sk_buff *skb, - const struct hash_netport4_elem *data) +static inline void +hash_netport4_data_next(struct hash_netport4_elem *next, + const struct hash_netport4_elem *d) { - const struct hash_netport4_telem *tdata = - (const struct hash_netport4_telem *)data; - - NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))); - - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; } -#define IP_SET_HASH_WITH_PROTO -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_netport4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netport4_data_next(struct ip_set_hash *h, - const struct hash_netport4_elem *d) -{ - h->next.ip = ntohl(d->ip); - h->next.port = ntohs(d->port); -} +#include "ip_set_hash_gen.h" static int hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport4_elem data = { - .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + struct hash_netport4_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (data.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport4_elem data = { .cidr = HOST_MASK }; - u32 port, port_to, p = 0, ip = 0, ip_to, last; - u32 timeout = h->timeout; + struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to, p = 0, ip = 0, ip_to = 0, last; bool with_ports = false; + u8 cidr; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr) + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; } if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } - with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = port_to = ntohs(data.port); + port = port_to = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port_to < port) @@ -247,19 +236,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip, ip_to, data.cidr); - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr + 1); if (retried) - ip = h->next.ip; + ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - p = retried && ip == h->next.ip ? h->next.port : port; + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &cidr); + e.cidr = cidr - 1; + p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) + : port; for (; p <= port_to; p++) { - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -271,169 +261,135 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_netport_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variant */ struct hash_netport6_elem { union nf_inet_addr ip; __be16 port; u8 proto; - u8 cidr; + u8 cidr:7; + u8 nomatch:1; }; -struct hash_netport6_telem { - union nf_inet_addr ip; - __be16 port; - u8 proto; - u8 cidr; - unsigned long timeout; -}; +/* Common functions */ static inline bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto && ip1->cidr == ip2->cidr; } -static inline bool -hash_netport6_data_isnull(const struct hash_netport6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_netport6_data_copy(struct hash_netport6_elem *dst, - const struct hash_netport6_elem *src) +static inline int +hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { - memcpy(dst, src, sizeof(*dst)); + return elem->nomatch ? 
-ENOTEMPTY : 1; } static inline void -hash_netport6_data_zero_out(struct hash_netport6_elem *elem) +hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { - elem->proto = 0; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); - elem->cidr = cidr; + elem->cidr = cidr - 1; } static bool hash_netport6_data_list(struct sk_buff *skb, const struct hash_netport6_elem *data) { - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; return 0; nla_put_failure: return 1; } -static bool -hash_netport6_data_tlist(struct sk_buff *skb, - const struct hash_netport6_elem *data) +static inline void +hash_netport6_data_next(struct hash_netport4_elem *next, + const struct hash_netport6_elem *d) { - const struct hash_netport6_telem *e = - (const struct hash_netport6_telem *)data; - - NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); - NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); - NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); - NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))); - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_netport6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netport6_data_next(struct ip_set_hash *h, - const struct hash_netport6_elem *d) -{ - h->next.port = ntohs(d->port); -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport6_elem data = { - .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + struct hash_netport6_elem e = { + .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (data.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport6_elem data = { .cidr = HOST_MASK }; + struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; + u8 cidr; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -441,54 +397,58 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr) - return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr = cidr - 1; + } + ip6_netmask(&e.ip, e.cidr + 1); if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); - return 
ip_set_eexist(ret, flags) ? 0 : ret; + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) - port = h->next.port; + port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -498,85 +458,14 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - - if (!(set->family == AF_INET || set->family == AF_INET6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == AF_INET ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - h->table = ip_set_alloc( - sizeof(struct htable) - + jhash_size(hbits) * sizeof(struct hbucket)); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == AF_INET - ? &hash_netport4_tvariant : &hash_netport6_tvariant; - - if (set->family == AF_INET) - hash_netport4_gc_init(set); - else - hash_netport6_gc_init(set); - } else { - set->variant = set->family == AF_INET - ? 
&hash_netport4_variant : &hash_netport6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_netport_type __read_mostly = { .name = "hash:net,port", .protocol = IPSET_PROTOCOL, - .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_TWO, - .family = AF_UNSPEC, - .revision_min = 0, - /* 1 SCTP and UDPLITE support added */ - .revision_max = 2, /* Range as input support for IPv4 added */ + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, @@ -585,6 +474,7 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -595,6 +485,10 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c new file mode 100644 index 00000000000..c0d2ba73f8b --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -0,0 +1,587 @@ +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +/* 0 Comments support added */ +#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); +IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:net,port,net"); + +/* Type specific function prefix */ +#define HTYPE hash_netportnet +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS +#define IPSET_NET_COUNT 2 + +/* IPv4 variant */ + +/* Member elements */ +struct hash_netportnet4_elem { + union { + __be32 ip[2]; + __be64 ipcmp; + }; + __be16 port; + union { + u8 cidr[2]; + u16 ccmp; + }; + u8 nomatch:1; + u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, + const struct hash_netportnet4_elem *ip2, + u32 *multi) +{ + return ip1->ipcmp == ip2->ipcmp && + ip1->ccmp == ip2->ccmp && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) +{ + return elem->nomatch ? -ENOTEMPTY : 1; +} + +static inline void +hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, + struct hash_netportnet4_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, + u8 cidr, bool inner) +{ + if (inner) { + elem->ip[1] &= ip_set_netmask(cidr); + elem->cidr[1] = cidr; + } else { + elem->ip[0] &= ip_set_netmask(cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netportnet4_data_list(struct sk_buff *skb, + const struct hash_netportnet4_elem *data) +{ + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netportnet4_data_next(struct hash_netportnet4_elem *next, + const struct hash_netportnet4_elem *d) +{ + next->ipcmp = d->ipcmp; + next->port = d->port; +} + +#define MTYPE hash_netportnet4 +#define PF 4 +#define HOST_MASK 32 +#include "ip_set_hash_gen.h" + +static int +hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]); + e.ip[0] &= ip_set_netmask(e.cidr[0]); + e.ip[1] &= ip_set_netmask(e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; + u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; + bool with_ports = false; + u8 cidr, cidr2; + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[0] = cidr; + } + + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + e.cidr[1] = cidr; + } + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } 
else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { + e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + if (unlikely(ip + UINT_MAX == ip_to)) + return -IPSET_ERR_HASH_RANGE; + } + + port_to = port = ntohs(e.port); + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + ip2_to = ip2_from; + if (tb[IPSET_ATTR_IP2_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); + if (ret) + return ret; + if (ip2_from > ip2_to) + swap(ip2_from, ip2_to); + if (unlikely(ip2_from + UINT_MAX == ip2_to)) + return -IPSET_ERR_HASH_RANGE; + } + + if (retried) + ip = ntohl(h->next.ip[0]); + + while (!after(ip, ip_to)) { + e.ip[0] = htonl(ip); + ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); + e.cidr[0] = cidr; + p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) + : port; + for (; p <= port_to; p++) { + e.port = htons(p); + ip2 = (retried && ip == ntohl(h->next.ip[0]) && + p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) + : ip2_from; + while (!after(ip2, ip2_to)) { + e.ip[1] = htonl(ip2); + ip2_last = ip_set_range_to_cidr(ip2, ip2_to, + &cidr2); + e.cidr[1] = cidr2; + ret = adtfn(set, &e, &ext, &ext, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip2 = ip2_last + 1; + } + } + ip = ip_last + 1; + } + return ret; +} + +/* IPv6 variant */ + +struct hash_netportnet6_elem { + union nf_inet_addr ip[2]; + __be16 port; + union { + u8 cidr[2]; + u16 ccmp; + }; + u8 nomatch:1; + u8 proto; +}; + +/* Common functions */ + +static inline bool +hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, + const struct hash_netportnet6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && + ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && + ip1->ccmp == ip2->ccmp && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline int +hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) +{ + return elem->nomatch ? 
-ENOTEMPTY : 1; +} + +static inline void +hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) +{ + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); +} + +static inline void +hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) +{ + swap(*flags, elem->nomatch); +} + +static inline void +hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, + struct hash_netportnet6_elem *orig) +{ + elem->ip[1] = orig->ip[1]; +} + +static inline void +hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, + u8 cidr, bool inner) +{ + if (inner) { + ip6_netmask(&elem->ip[1], cidr); + elem->cidr[1] = cidr; + } else { + ip6_netmask(&elem->ip[0], cidr); + elem->cidr[0] = cidr; + } +} + +static bool +hash_netportnet6_data_list(struct sk_buff *skb, + const struct hash_netportnet6_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || + nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || + nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || + nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || + nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || + nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || + (flags && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return 1; +} + +static inline void +hash_netportnet6_data_next(struct hash_netportnet4_elem *next, + const struct hash_netportnet6_elem *d) +{ + next->port = d->port; +} + +#undef MTYPE +#undef PF +#undef HOST_MASK + +#define MTYPE hash_netportnet6 +#define PF 6 +#define HOST_MASK 128 +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" + +static int +hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + if (adt == IPSET_TEST) + e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &e.port, &e.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6); + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct hash_netportnet *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netportnet6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + u32 port, port_to; + bool with_ports = false; + int ret; + + e.cidr[0] = e.cidr[1] = HOST_MASK; + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO] || 
tb[IPSET_ATTR_IP2_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || + ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || + ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (tb[IPSET_ATTR_CIDR2]) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + + if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || + e.cidr[1] > HOST_MASK)) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&e.ip[0], e.cidr[0]); + ip6_netmask(&e.ip[1], e.cidr[1]); + + if (tb[IPSET_ATTR_PORT]) + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); + + if (e.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt, set) ? -ret : + ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(e.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = ntohs(h->next.port); + for (; port <= port_to; port++) { + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static struct ip_set_type hash_netportnet_type __read_mostly = { + .name = "hash:net,port,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | + IPSET_TYPE_NOMATCH, + .dimension = IPSET_DIM_THREE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_netportnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netportnet_init(void) +{ + return ip_set_type_register(&hash_netportnet_type); +} + +static void __exit +hash_netportnet_fini(void) +{ + ip_set_type_unregister(&hash_netportnet_type); +} + +module_init(hash_netportnet_init); 
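
A note on IPSET_TYPE_NOMATCH, since every hash:net* type touched in this series wires it up the same way: the userspace "nomatch" CADT flag is shifted into the upper 16 bits of the add/del flags word (flags |= (IPSET_FLAG_NOMATCH << 16)), stored per element by the *_data_set_flags() helpers, and reported at lookup time by *_do_data_match(), which returns -ENOTEMPTY instead of 1 so that a more specific entry can veto a match against a broader net already in the set. The stand-alone sketch below mimics that behaviour in plain C; the struct layout, the constants and the linear most-specific-first scan are illustrative only, not the kernel's (which hashes per prefix length and walks h->nets[] from the longest stored prefix down):

#include <stdio.h>

struct elem {
	unsigned int net;	/* network address, already masked */
	unsigned char cidr;	/* prefix length, 0..32 */
	unsigned char nomatch;	/* would be filled from (flags >> 16) */
};

/* Mirrors the hash_*_do_data_match() helpers above: a found element
 * normally means "match" (1), but a nomatch element yields a negative
 * result (the kernel uses -ENOTEMPTY). */
static int do_data_match(const struct elem *e)
{
	return e->nomatch ? -1 : 1;
}

/* Linear stand-in for the hashed lookup; the table is assumed ordered
 * most-specific-first, so the first covering entry decides. */
static int set_test(const struct elem *tbl, int n, unsigned int addr)
{
	for (int i = 0; i < n; i++) {
		unsigned int mask = tbl[i].cidr ?
			~0u << (32 - tbl[i].cidr) : 0;
		if ((addr & mask) == tbl[i].net)
			return do_data_match(&tbl[i]);
	}
	return 0;	/* no element covers addr */
}

int main(void)
{
	struct elem tbl[] = {
		{ 0x0a010000, 16, 1 },	/* 10.1.0.0/16, nomatch */
		{ 0x0a000000,  8, 0 },	/* 10.0.0.0/8 */
	};
	printf("%d\n", set_test(tbl, 2, 0x0a020304));	/*  1: 10.2.3.4 matches */
	printf("%d\n", set_test(tbl, 2, 0x0a010304));	/* -1: 10.1.3.4 excluded */
	return 0;
}

Run against the two-entry table, 10.2.3.4 is accepted via 10.0.0.0/8 while 10.1.3.4 is vetoed by the 10.1.0.0/16 nomatch entry — the same semantics the ip_set_enomatch()/ip_set_eexist() return chains in the uadt functions above translate into userspace-visible results.
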
+module_exit(hash_netportnet_fini); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 4d10819d462..3e2317f3cf6 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,208 +13,354 @@ #include <linux/errno.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_list.h> +#define IPSET_TYPE_REV_MIN 0 +/* 1 Counters support added */ +#define IPSET_TYPE_REV_MAX 2 /* Comments support added */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("list:set type of IP sets"); +IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); -/* Member elements without and with timeout */ +/* Member elements */ struct set_elem { ip_set_id_t id; }; -struct set_telem { +struct set_adt_elem { ip_set_id_t id; - unsigned long timeout; + ip_set_id_t refid; + int before; }; /* Type structure */ struct list_set { - size_t dsize; /* element size */ u32 size; /* size of set list array */ - u32 timeout; /* timeout value */ struct timer_list gc; /* garbage collection */ + struct net *net; /* namespace */ struct set_elem members[0]; /* the set members */ }; -static inline struct set_elem * -list_set_elem(const struct list_set *map, u32 id) -{ - return (struct set_elem *)((void *)map->members + id * map->dsize); -} +#define list_set_elem(set, map, id) \ + (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) -static inline struct set_telem * -list_set_telem(const struct list_set *map, u32 id) -{ - return (struct set_telem *)((void *)map->members + id * map->dsize); -} - -static inline bool -list_set_timeout(const struct list_set *map, u32 id) +static int +list_set_ktest(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { - const struct set_telem *elem = list_set_telem(map, id); + struct list_set *map = set->data; + struct set_elem *e; + u32 i, cmdflags = opt->cmdflags; + int ret; - return ip_set_timeout_test(elem->timeout); + /* Don't lookup sub-counters at all */ + opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; + if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) + opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + ret = ip_set_test(e->id, skb, par, opt); + if (ret > 0) { + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(e, set), + ext, &opt->ext, + cmdflags); + return ret; + } + } + return 0; } -static inline bool -list_set_expired(const struct list_set *map, u32 id) +static int +list_set_kadd(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { - const struct set_telem *elem = list_set_telem(map, id); + struct list_set *map = set->data; + struct set_elem *e; + u32 i; + int ret; - return ip_set_timeout_expired(elem->timeout); + for (i = 0; i 
< map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + ret = ip_set_add(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; } -/* Set list without and with timeout */ - static int -list_set_kadt(struct ip_set *set, const struct sk_buff *skb, +list_set_kdel(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; - struct set_elem *elem; + struct set_elem *e; u32 i; int ret; for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) return 0; - if (with_timeout(map->timeout) && list_set_expired(map, i)) + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - switch (adt) { - case IPSET_TEST: - ret = ip_set_test(elem->id, skb, par, opt); - if (ret > 0) - return ret; - break; - case IPSET_ADD: - ret = ip_set_add(elem->id, skb, par, opt); - if (ret == 0) - return ret; - break; - case IPSET_DEL: - ret = ip_set_del(elem->id, skb, par, opt); - if (ret == 0) - return ret; - break; - default: - break; - } + ret = ip_set_del(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; +} + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + switch (adt) { + case IPSET_TEST: + return list_set_ktest(set, skb, par, opt, &ext); + case IPSET_ADD: + return list_set_kadd(set, skb, par, opt, &ext); + case IPSET_DEL: + return list_set_kdel(set, skb, par, opt, &ext); + default: + break; } return -EINVAL; } static bool -id_eq(const struct list_set *map, u32 i, ip_set_id_t id) +id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) { - const struct set_elem *elem; + const struct list_set *map = set->data; + const struct set_elem *e; + + if (i >= map->size) + return 0; - if (i < map->size) { - elem = list_set_elem(map, i); - return elem->id == id; + e = list_set_elem(set, map, i); + return !!(e->id == id && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set)))); +} + +static int +list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, + const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(set, map, i); + + if (e->id != IPSET_INVALID_ID) { + if (i == map->size - 1) { + /* Last element replaced: e.g. 
add new,before,last */ + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + } else { + struct set_elem *x = list_set_elem(set, map, + map->size - 1); + + /* Last element pushed off */ + if (x->id != IPSET_INVALID_ID) { + ip_set_put_byindex(map->net, x->id); + ip_set_ext_destroy(set, x); + } + memmove(list_set_elem(set, map, i + 1), e, + set->dsize * (map->size - (i + 1))); + /* Extensions must be initialized to zero */ + memset(e, 0, set->dsize); + } } + e->id = d->id; + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); return 0; } -static bool -id_eq_timeout(const struct list_set *map, u32 i, ip_set_id_t id) +static int +list_set_del(struct ip_set *set, u32 i) { - const struct set_elem *elem; + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(set, map, i); - if (i < map->size) { - elem = list_set_elem(map, i); - return !!(elem->id == id && - !(with_timeout(map->timeout) && - list_set_expired(map, i))); - } + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + if (i < map->size - 1) + memmove(e, list_set_elem(set, map, i + 1), + set->dsize * (map->size - (i + 1))); + + /* Last element */ + e = list_set_elem(set, map, map->size - 1); + e->id = IPSET_INVALID_ID; return 0; } static void -list_elem_add(struct list_set *map, u32 i, ip_set_id_t id) +set_cleanup_entries(struct ip_set *set) { + struct list_set *map = set->data; struct set_elem *e; - - for (; i < map->size; i++) { - e = list_set_elem(map, i); - swap(e->id, id); - if (e->id == IPSET_INVALID_ID) - break; + u32 i = 0; + + while (i < map->size) { + e = list_set_elem(set, map, i); + if (e->id != IPSET_INVALID_ID && + ip_set_timeout_expired(ext_timeout(e, set))) + list_set_del(set, i); + /* Check element moved to position i in next loop */ + else + i++; } } -static void -list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id, - unsigned long timeout) +static int +list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - struct set_telem *e; + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + u32 i; + int ret; - for (; i < map->size; i++) { - e = list_set_telem(map, i); - swap(e->id, id); - swap(e->timeout, timeout); + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); if (e->id == IPSET_INVALID_ID) - break; + return 0; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return 1; + else if (d->before > 0) + ret = id_eq(set, i + 1, d->refid); + else + ret = i > 0 && id_eq(set, i - 1, d->refid); + return ret; } + return 0; } + static int -list_set_add(struct list_set *map, u32 i, ip_set_id_t id, - unsigned long timeout) +list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - const struct set_elem *e = list_set_elem(map, i); + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 i, ret = 0; - if (i == map->size - 1 && e->id != IPSET_INVALID_ID) - /* Last element replaced: e.g. 
add new,before,last */ - ip_set_put_byindex(e->id); - if (with_timeout(map->timeout)) - list_elem_tadd(map, i, id, ip_set_timeout_set(timeout)); - else - list_elem_add(map, i, id); + if (SET_WITH_TIMEOUT(set)) + set_cleanup_entries(set); - return 0; -} + /* Check already added element */ + for (i = 0; i < map->size; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + goto insert; + else if (e->id != d->id) + continue; -static int -list_set_del(struct list_set *map, u32 i) -{ - struct set_elem *a = list_set_elem(map, i), *b; - - ip_set_put_byindex(a->id); - - for (; i < map->size - 1; i++) { - b = list_set_elem(map, i + 1); - a->id = b->id; - if (with_timeout(map->timeout)) - ((struct set_telem *)a)->timeout = - ((struct set_telem *)b)->timeout; - a = b; - if (a->id == IPSET_INVALID_ID) - break; + if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || + (d->before < 0 && + (i == 0 || !id_eq(set, i - 1, d->refid)))) + /* Before/after doesn't match */ + return -IPSET_ERR_REF_EXIST; + if (!flag_exist) + /* Can't re-add */ + return -IPSET_ERR_EXIST; + /* Update extensions */ + ip_set_ext_destroy(set, e); + + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); + /* Set is already added to the list */ + ip_set_put_byindex(map->net, d->id); + return 0; } - /* Last element */ - a->id = IPSET_INVALID_ID; - return 0; +insert: + ret = -IPSET_ERR_LIST_FULL; + for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + ret = d->before != 0 ? -IPSET_ERR_REF_EXIST + : list_set_add(set, i, d, ext); + else if (e->id != d->refid) + continue; + else if (d->before > 0) + ret = list_set_add(set, i, d, ext); + else if (i + 1 < map->size) + ret = list_set_add(set, i + 1, d, ext); + } + + return ret; } -static void -cleanup_entries(struct list_set *map) +static int +list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - struct set_telem *e; + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; u32 i; for (i = 0; i < map->size; i++) { - e = list_set_telem(map, i); - if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) - list_set_del(map, i); + e = list_set_elem(set, map, i); + if (e->id == IPSET_INVALID_ID) + return d->before != 0 ? 
-IPSET_ERR_REF_EXIST + : -IPSET_ERR_EXIST; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return list_set_del(set, i); + else if (d->before > 0) { + if (!id_eq(set, i + 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + return list_set_del(set, i); + } else if (i == 0 || !id_eq(set, i - 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + else + return list_set_del(set, i); } + return -IPSET_ERR_EXIST; } static int @@ -222,26 +368,27 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct list_set *map = set->data; - bool with_timeout = with_timeout(map->timeout); - bool flag_exist = flags & IPSET_FLAG_EXIST; - int before = 0; - u32 timeout = map->timeout; - ip_set_id_t id, refid = IPSET_INVALID_ID; - const struct set_elem *elem; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); struct ip_set *s; - u32 i; int ret = 0; if (unlikely(!tb[IPSET_ATTR_NAME] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); - if (id == IPSET_INVALID_ID) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s); + if (e.id == IPSET_INVALID_ID) return -IPSET_ERR_NAME; /* "Loop detection" */ if (s->type->features & IPSET_TYPE_NAME) { @@ -251,115 +398,35 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); - before = f & IPSET_FLAG_BEFORE; + e.before = f & IPSET_FLAG_BEFORE; } - if (before && !tb[IPSET_ATTR_NAMEREF]) { + if (e.before && !tb[IPSET_ATTR_NAMEREF]) { ret = -IPSET_ERR_BEFORE; goto finish; } if (tb[IPSET_ATTR_NAMEREF]) { - refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), - &s); - if (refid == IPSET_INVALID_ID) { + e.refid = ip_set_get_byname(map->net, + nla_data(tb[IPSET_ATTR_NAMEREF]), + &s); + if (e.refid == IPSET_INVALID_ID) { ret = -IPSET_ERR_NAMEREF; goto finish; } - if (!before) - before = -1; - } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout) { - ret = -IPSET_ERR_TIMEOUT; - goto finish; - } - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (!e.before) + e.before = -1; } - if (with_timeout && adt != IPSET_TEST) - cleanup_entries(map); + if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) + set_cleanup_entries(set); - switch (adt) { - case IPSET_TEST: - for (i = 0; i < map->size && !ret; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID || - (before != 0 && i + 1 >= map->size)) - break; - else if (with_timeout && list_set_expired(map, i)) - continue; - else if (before > 0 && elem->id == id) - ret = id_eq_timeout(map, i + 1, refid); - else if (before < 0 && elem->id == refid) - ret = id_eq_timeout(map, i + 1, id); - else if (before == 0 && elem->id == id) - ret = 1; - } - break; - case IPSET_ADD: - for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id != id) - continue; - if (!(with_timeout && flag_exist)) { - 
ret = -IPSET_ERR_EXIST; - goto finish; - } else { - struct set_telem *e = list_set_telem(map, i); - - if ((before > 1 && - !id_eq(map, i + 1, refid)) || - (before < 0 && - (i == 0 || !id_eq(map, i - 1, refid)))) { - ret = -IPSET_ERR_EXIST; - goto finish; - } - e->timeout = ip_set_timeout_set(timeout); - ip_set_put_byindex(id); - ret = 0; - goto finish; - } - } - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) - ret = before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(map, i, id, timeout); - else if (elem->id != refid) - continue; - else if (before > 0) - ret = list_set_add(map, i, id, timeout); - else if (i + 1 < map->size) - ret = list_set_add(map, i + 1, id, timeout); - } - break; - case IPSET_DEL: - ret = -IPSET_ERR_EXIST; - for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) { - ret = before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - break; - } else if (elem->id == id && - (before == 0 || - (before > 0 && id_eq(map, i + 1, refid)))) - ret = list_set_del(map, i); - else if (elem->id == refid && - before < 0 && id_eq(map, i + 1, id)) - ret = list_set_del(map, i + 1); - } - break; - default: - break; - } + ret = adtfn(set, &e, &ext, &ext, flags); finish: - if (refid != IPSET_INVALID_ID) - ip_set_put_byindex(refid); + if (e.refid != IPSET_INVALID_ID) + ip_set_put_byindex(map->net, e.refid); if (adt != IPSET_ADD || ret) - ip_set_put_byindex(id); + ip_set_put_byindex(map->net, e.id); return ip_set_eexist(ret, flags) ? 0 : ret; } @@ -368,14 +435,15 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *elem; + struct set_elem *e; u32 i; for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id != IPSET_INVALID_ID) { - ip_set_put_byindex(elem->id); - elem->id = IPSET_INVALID_ID; + e = list_set_elem(set, map, i); + if (e->id != IPSET_INVALID_ID) { + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + e->id = IPSET_INVALID_ID; } } } @@ -385,7 +453,7 @@ list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - if (with_timeout(map->timeout)) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); list_set_flush(set); kfree(map); @@ -402,12 +470,13 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; - NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size)); - if (with_timeout(map->timeout)) - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); - NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->size * map->dsize)); + if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->size * set->dsize))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; ipset_nest_end(skb, nested); return 0; @@ -421,18 +490,20 @@ list_set_list(const struct ip_set *set, { const struct list_set *map = set->data; struct nlattr *atd, *nested; - u32 i, first = cb->args[2]; + u32 i, first = cb->args[IPSET_CB_ARG0]; const struct set_elem *e; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - for (; cb->args[2] < map->size; cb->args[2]++) { - i = 
cb->args[2]; - e = list_set_elem(map, i); + for (; cb->args[IPSET_CB_ARG0] < map->size; + cb->args[IPSET_CB_ARG0]++) { + i = cb->args[IPSET_CB_ARG0]; + e = list_set_elem(set, map, i); if (e->id == IPSET_INVALID_ID) goto finish; - if (with_timeout(map->timeout) && list_set_expired(map, i)) + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { @@ -442,29 +513,26 @@ list_set_list(const struct ip_set *set, } else goto nla_put_failure; } - NLA_PUT_STRING(skb, IPSET_ATTR_NAME, - ip_set_name_byindex(e->id)); - if (with_timeout(map->timeout)) { - const struct set_telem *te = - (const struct set_telem *) e; - NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(te->timeout))); - } + if (nla_put_string(skb, IPSET_ATTR_NAME, + ip_set_name_byindex(map->net, e->id))) + goto nla_put_failure; + if (ip_set_put_extensions(skb, set, e, true)) + goto nla_put_failure; ipset_nest_end(skb, nested); } finish: ipset_nest_end(skb, atd); /* Set listing finished */ - cb->args[2] = 0; + cb->args[IPSET_CB_ARG0] = 0; return 0; nla_put_failure: nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); if (unlikely(i == first)) { - cb->args[2] = 0; + cb->args[IPSET_CB_ARG0] = 0; return -EMSGSIZE; } + ipset_nest_end(skb, atd); return 0; } @@ -475,12 +543,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) const struct list_set *y = b->data; return x->size == y->size && - x->timeout == y->timeout; + a->timeout == b->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant list_set = { +static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, + .adt = { + [IPSET_ADD] = list_set_uadd, + [IPSET_DEL] = list_set_udel, + [IPSET_TEST] = list_set_utest, + }, .destroy = list_set_destroy, .flush = list_set_flush, .head = list_set_head, @@ -495,46 +569,44 @@ list_set_gc(unsigned long ul_set) struct list_set *map = set->data; write_lock_bh(&set->lock); - cleanup_entries(map); + set_cleanup_entries(set); write_unlock_bh(&set->lock); - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void -list_set_gc_init(struct ip_set *set) +list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) { struct list_set *map = set->data; init_timer(&map->gc); map->gc.data = (unsigned long) set; - map->gc.function = list_set_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + map->gc.function = gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } /* Create list:set type of sets */ static bool -init_list_set(struct ip_set *set, u32 size, size_t dsize, - unsigned long timeout) +init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; struct set_elem *e; u32 i; - map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL); + map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); if (!map) return false; map->size = size; - map->dsize = dsize; - map->timeout = timeout; + map->net = net; set->data = map; for (i = 0; i < size; i++) { - e = list_set_elem(map, i); + e = list_set_elem(set, map, i); e->id = IPSET_INVALID_ID; } @@ -542,12 +614,14 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize, } static int -list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +list_set_create(struct net *net, struct ip_set *set, struct nlattr 
*tb[], + u32 flags) { u32 size = IP_SET_LIST_DEFAULT_SIZE; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_SIZE]) @@ -555,18 +629,14 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + set->variant = &set_variant; + set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); + if (!init_list_set(net, set, size)) + return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { - if (!init_list_set(set, size, sizeof(struct set_telem), - ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]))) - return -ENOMEM; - - list_set_gc_init(set); - } else { - if (!init_list_set(set, size, sizeof(struct set_elem), - IPSET_NO_TIMEOUT)) - return -ENOMEM; + set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + list_set_gc_init(set, list_set_gc); } - set->variant = &list_set; return 0; } @@ -575,13 +645,14 @@ static struct ip_set_type list_set_type __read_mostly = { .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST, .dimension = IPSET_DIM_ONE, - .family = AF_UNSPEC, - .revision_min = 0, - .revision_max = 0, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, .create = list_set_create, .create_policy = { [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_NAME] = { .type = NLA_STRING, @@ -591,6 +662,9 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 4f29fa97044..04d15fdc99e 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -7,8 +7,8 @@ #define E(a, b, c, d) \ {.ip6 = { \ - __constant_htonl(a), __constant_htonl(b), \ - __constant_htonl(c), __constant_htonl(d), \ + htonl(a), htonl(b), \ + htonl(c), htonl(d), \ } } /* diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index f9871385a65..0c3b1670b0d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -28,12 +28,11 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" depends on IPV6 = y || IP_VS = IPV6 + select IP6_NF_IPTABLES ---help--- - Add IPv6 support to IPVS. This is incomplete and might be dangerous. + Add IPv6 support to IPVS. - See http://www.mindbasket.com/ipvs for more information. - - Say N if unsure. + Say Y if unsure. 
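
The list:set rework above collapses the old fixed set_elem/set_telem pair into a single variable-size element: list_set_create() now derives set->dsize from the requested extensions via ip_set_elem_len(), and the list_set_elem() macro indexes the flat members[] array by that stride, so timeout, counter and comment data sit directly behind each element. Below is a minimal stand-alone C sketch of the same layout trick; the elem/set structures, timeout_off field and set_create() helper are illustrative names for this sketch only, not the kernel API.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Fixed part of every slot; enabled extensions follow it in memory. */
    struct elem {
            unsigned int id;
    };

    struct set {
            size_t dsize;           /* per-slot stride: fixed part + extensions */
            size_t timeout_off;     /* offset of the timeout extension, 0 if off */
            unsigned char members[];
    };

    /* Same indexing idea as the kernel's list_set_elem() macro. */
    static struct elem *set_elem(struct set *s, unsigned int i)
    {
            return (struct elem *)(s->members + i * s->dsize);
    }

    static struct set *set_create(unsigned int size, int with_timeout)
    {
            size_t dsize = sizeof(struct elem);
            size_t timeout_off = 0;
            struct set *s;

            if (with_timeout) {     /* extension appended behind the fixed part */
                    timeout_off = dsize;
                    dsize += sizeof(unsigned long);
            }
            s = calloc(1, sizeof(*s) + (size_t)size * dsize);
            if (!s)
                    return NULL;
            s->dsize = dsize;
            s->timeout_off = timeout_off;
            return s;
    }

    int main(void)
    {
            struct set *s = set_create(4, 1);
            unsigned long t = 30, t2;

            if (!s)
                    return 1;
            set_elem(s, 2)->id = 42;        /* fixed part of the third slot */
            /* the kernel aligns extension offsets; memcpy sidesteps that here */
            memcpy((unsigned char *)set_elem(s, 2) + s->timeout_off,
                   &t, sizeof(t));
            memcpy(&t2, (unsigned char *)set_elem(s, 2) + s->timeout_off,
                   sizeof(t2));
            printf("id=%u timeout=%lu\n", set_elem(s, 2)->id, t2);
            free(s);
            return 0;
    }

Keeping one dsize stride per set, rather than one element type per feature combination, is what lets the later revisions (1: counters, 2: comments) bolt extra extensions onto the same array walk without touching the iteration code.
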
config IP_VS_DEBUG bool "IP virtual server debugging" @@ -250,7 +249,8 @@ comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ + NF_CONNTRACK_FTP select IP_VS_NFCT ---help--- FTP is a protocol that transfers IP address and/or port number in diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index fe6cb4304d7..dfd7b65b3d2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -31,7 +31,6 @@ #include <net/net_namespace.h> #include <net/protocol.h> #include <net/tcp.h> -#include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -59,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) module_put(app->module); } +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} /* * Allocate/initialize app incarnation and register it in proto apps. @@ -107,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, return 0; out: - kfree(inc->timeout_table); - kfree(inc); + ip_vs_app_inc_destroy(inc); return ret; } @@ -132,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) list_del(&inc->a_list); - kfree(inc->timeout_table); - kfree(inc); + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); } @@ -145,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) { int result; - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); return result; } @@ -157,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) */ void ip_vs_app_inc_put(struct ip_vs_app *inc) { - ip_vs_app_put(inc->app); atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); } @@ -181,45 +190,71 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, } -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct net *net, struct ip_vs_app *app) +/* Register application for netns */ +struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) { struct netns_ipvs *ipvs = net_ipvs(net); - /* increase the module use count */ - ip_vs_use_count_inc(); + struct ip_vs_app *a; + int err = 0; + + if (!ipvs) + return ERR_PTR(-ENOENT); mutex_lock(&__ip_vs_app_mutex); - list_add(&app->a_list, &ipvs->app_list); + list_for_each_entry(a, &ipvs->app_list, a_list) { + if (!strcmp(app->name, a->name)) { + err = -EEXIST; + goto out_unlock; + } + } + a = kmemdup(app, sizeof(*app), GFP_KERNEL); + if (!a) { + err = -ENOMEM; + goto out_unlock; + } + INIT_LIST_HEAD(&a->incs_list); + list_add(&a->a_list, &ipvs->app_list); + /* increase the module use count */ + ip_vs_use_count_inc(); +out_unlock: mutex_unlock(&__ip_vs_app_mutex); - return 0; + return err ? 
ERR_PTR(err) : a; } /* * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct ip_vs_app *inc, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_app *a, *anxt, *inc, *nxt; + + if (!ipvs) + return; mutex_lock(&__ip_vs_app_mutex); - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(net, inc); - } + list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { + if (app && strcmp(app->name, a->name)) + continue; + list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { + ip_vs_app_inc_release(net, inc); + } - list_del(&app->a_list); + list_del(&a->a_list); + kfree(a); - mutex_unlock(&__ip_vs_app_mutex); + /* decrease the module use count */ + ip_vs_use_count_dec(); + } - /* decrease the module use count */ - ip_vs_use_count_dec(); + mutex_unlock(&__ip_vs_app_mutex); } @@ -314,17 +349,17 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) * Assumes already checked proto==IPPROTO_TCP and diff!=0. */ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) + unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { vseq->previous_delta = vseq->delta; vseq->delta += diff; vseq->init_seq = seq; cp->flags |= flag; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, @@ -581,11 +616,12 @@ int __net_init ip_vs_app_net_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); + proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } void __net_exit ip_vs_app_net_cleanup(struct net *net) { - proc_net_remove(net, "ip_vs_app"); + unregister_ip_vs_app(net, NULL /* all */); + remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 29fa5badde7..610e19c0e13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -79,58 +79,28 @@ static unsigned int ip_vs_conn_rnd __read_mostly; struct ip_vs_aligned_lock { - rwlock_t l; + spinlock_t l; } __attribute__((__aligned__(SMP_CACHE_BYTES))); /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned key) +static inline void ct_write_lock_bh(unsigned int key) { - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_read_unlock(unsigned key) +static inline void ct_write_unlock_bh(unsigned int key) { - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned key) -{ - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned key) -{ - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned key) -{ - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned key) -{ - 
read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock_bh(unsigned key) -{ - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock_bh(unsigned key) -{ - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto, +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { @@ -188,7 +158,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) @@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* Hash by protocol, client address and port */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pF\n", @@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); return ret; } @@ -220,21 +190,21 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. + * returns bool success. Caller should hold conn reference. */ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; /* unhash it and decrease its reference counter */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del(&cp->c_list); + hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; atomic_dec(&cp->refcnt); ret = 1; @@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) ret = 0; spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret; + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + ret = false; + /* Decrease refcnt and unlink conn only if we are last user */ + if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } else + ret = atomic_read(&cp->refcnt) ? 
false : true; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); return ret; } @@ -257,30 +257,30 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; - struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->cport == cp->cport && p->vport == cp->vport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); return cp; } } - ct_read_unlock(hash); + rcu_read_unlock(); return NULL; } @@ -308,13 +308,12 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) static int ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, - unsigned int proto_off, int inverse, - struct ip_vs_conn_param *p) + int inverse, struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; struct net *net = skb_net(skb); - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return 1; @@ -329,12 +328,11 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) return NULL; return ip_vs_conn_in_get(&p); @@ -344,20 +342,21 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; - struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (!ip_vs_conn_net_eq(cp, p->net)) - continue; - if (p->pe_data && p->pe->ct_match) { - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) - goto out; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } continue; } @@ -367,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr is a fwmark */ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, p->vaddr, &cp->vaddr) && - p->cport == cp->cport && p->vport == cp->vport && + p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol) - goto out; + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } cp = NULL; out: - if (cp) - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -394,32 +394,32 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp, *ret=NULL; - struct hlist_node *n; /* * Check for "full" addressed entries */ hash = ip_vs_conn_hashkey_param(p, true); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->vport == cp->cport && p->cport == cp->dport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport == cp->cport && p->cport == cp->dport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); ret = cp; break; } } - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -432,12 +432,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) return NULL; return ip_vs_conn_out_get(&p); @@ -463,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->flags & IP_VS_CONN_F_NO_CPORT) { atomic_dec(&ip_vs_conn_no_cport_cnt); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* hash on new dport */ ip_vs_conn_hash(cp); @@ -548,28 +547,31 @@ static inline void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) { unsigned int conn_flags; + __u32 flags; /* if dest is NULL, then return directly */ if (!dest) return; /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); conn_flags = atomic_read(&dest->conn_flags); if (cp->protocol != IPPROTO_UDP) conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; /* Bind with the destination and its corresponding transmitter */ - if (cp->flags & IP_VS_CONN_F_SYNC) { + if (flags & IP_VS_CONN_F_SYNC) { /* if the connection is not template and is created * by sync, preserve the activity flag. 
*/ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; /* connections inherit forwarding method from dest */ - cp->flags &= ~IP_VS_CONN_F_FWD_MASK; + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); } - cp->flags |= conn_flags; + flags |= conn_flags; + cp->flags = flags; cp->dest = dest; IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " @@ -584,12 +586,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_read(&dest->refcnt)); /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) atomic_inc(&dest->activeconns); else atomic_inc(&dest->inactconns); @@ -609,18 +611,46 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) * Check if there is a destination for the connection, if so * bind the connection to the destination. */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, - cp->dport, &cp->vaddr, cp->vport, - cp->protocol, cp->fwmark, cp->flags); + rcu_read_lock(); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock_bh(&cp->lock); + if (cp->dest) { + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; + spin_unlock_bh(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + rcu_read_unlock(); } @@ -672,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; } - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); + ip_vs_dest_put(dest); } static int expire_quiescent_template(struct netns_ipvs *ipvs, @@ -734,23 +759,27 @@ int ip_vs_check_template(struct ip_vs_conn *ct) * Simply decrease the refcnt of the template, * don't restart its timer. 
*/ - atomic_dec(&ct->refcnt); + __ip_vs_conn_put(ct); return 0; } return 1; } -static void ip_vs_conn_expire(unsigned long data) +static void ip_vs_conn_rcu_free(struct rcu_head *head) { - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); - cp->timeout = 60*HZ; + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct net *net = ip_vs_conn_net(cp); + struct netns_ipvs *ipvs = net_ipvs(net); /* * do I control anybody? @@ -758,26 +787,16 @@ static void ip_vs_conn_expire(unsigned long data) if (atomic_read(&cp->n_control)) goto expire_later; - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); + del_timer(&cp->timer); /* does anybody control me? */ if (cp->control) ip_vs_control_del(cp); if (cp->flags & IP_VS_CONN_F_NFCT) { - ip_vs_conn_drop_conntrack(cp); /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -787,35 +806,41 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_drop_conntrack(cp); } - ip_vs_pe_put(cp->pe); - kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); return; } - /* hash it back to the table */ - ip_vs_conn_hash(cp); - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt), atomic_read(&cp->n_control)); + atomic_inc(&cp->refcnt); + cp->timeout = 60*HZ; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_conn_put(cp); } - +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. 
+ */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); } @@ -824,7 +849,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; @@ -832,7 +857,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, p->protocol); - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { IP_VS_ERR_RL("%s(): no memory\n", __func__); return NULL; @@ -843,13 +868,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_conn_net_set(cp, p->net); cp->af = p->af; cp->protocol = p->protocol; - ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ + ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->vaddr, p->vaddr); cp->vport = p->vport; - /* proto should only be IPPROTO_IP if d_addr is a fwmark */ - ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + ip_vs_addr_set(p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; @@ -858,6 +883,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->pe = p->pe; cp->pe_data = p->pe_data; cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; } spin_lock_init(&cp->lock); @@ -868,19 +897,30 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, */ atomic_set(&cp->refcnt, 1); + cp->control = NULL; atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); + cp->packet_xmit = NULL; + cp->app = NULL; + cp->app_data = NULL; + /* reset struct ip_vs_seq */ + cp->in_seq.delta = 0; + cp->out_seq.delta = 0; + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); /* Bind the connection with a destination server */ + cp->dest = NULL; ip_vs_bind_dest(cp, dest); /* Set its state and timeout */ cp->state = 0; + cp->old_state = 0; cp->timeout = 3*HZ; + cp->sync_endtime = jiffies & ~3UL; /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 @@ -923,27 +963,30 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_conn *cp; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *n; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + /* __ip_vs_conn_get() is not needed by + * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show + */ if (pos-- == 0) { iter->l = &ip_vs_conn_tab[idx]; return cp; } } - ct_read_unlock_bh(idx); + cond_resched_rcu(); } return NULL; } static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; iter->l = NULL; + rcu_read_lock(); return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } @@ -960,31 +1003,26 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ip_vs_conn_array(seq, 0); /* more on same hash chain? 
*/ - if ((e = cp->c_list.next)) + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) return hlist_entry(e, struct ip_vs_conn, c_list); idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - while (++idx < ip_vs_conn_tab_size) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { iter->l = &ip_vs_conn_tab[idx]; return cp; } - ct_read_unlock_bh(idx); + cond_resched_rcu(); } iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct ip_vs_iter_state *iter = seq->private; - struct hlist_head *l = iter->l; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); + rcu_read_unlock(); } static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) @@ -1057,7 +1095,7 @@ static const struct file_operations ip_vs_conn_fops = { .release = seq_release_net, }; -static const char *ip_vs_origin_name(unsigned flags) +static const char *ip_vs_origin_name(unsigned int flags) { if (flags & IP_VS_CONN_F_SYNC) return "SYNC"; @@ -1163,21 +1201,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; + rcu_read_lock(); /* * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned hash = net_random() & ip_vs_conn_tab_mask; - struct hlist_node *n; + unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; @@ -1197,6 +1230,18 @@ void ip_vs_random_dropentry(struct net *net) default: continue; } + } else if (cp->protocol == IPPROTO_SCTP) { + switch (cp->state) { + case IP_VS_SCTP_S_INIT1: + case IP_VS_SCTP_S_INIT: + break; + case IP_VS_SCTP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + default: + continue; + } } else { if (!todrop_entry(cp)) continue; @@ -1204,13 +1249,17 @@ void ip_vs_random_dropentry(struct net *net) IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(hash); + cond_resched_rcu(); } + rcu_read_unlock(); } @@ -1220,30 +1269,29 @@ void ip_vs_random_dropentry(struct net *net) static void ip_vs_conn_flush(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; struct netns_ipvs *ipvs = net_ipvs(net); flush_again: + rcu_read_lock(); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - struct hlist_node *n; - - /* - * Lock is actually needed in this loop. 
- */ - ct_write_lock_bh(idx); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (!ip_vs_conn_net_eq(cp, net)) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(idx); + cond_resched_rcu(); } + rcu_read_unlock(); /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ @@ -1261,8 +1309,8 @@ int __net_init ip_vs_conn_net_init(struct net *net) atomic_set(&ipvs->conn_count, 0); - proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); + proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); return 0; } @@ -1270,8 +1318,8 @@ void __net_exit ip_vs_conn_net_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); - proc_net_remove(net, "ip_vs_conn"); - proc_net_remove(net, "ip_vs_conn_sync"); + remove_proc_entry("ip_vs_conn", net->proc_net); + remove_proc_entry("ip_vs_conn_sync", net->proc_net); } int __init ip_vs_conn_init(void) @@ -1309,7 +1357,7 @@ int __init ip_vs_conn_init(void) INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); } /* calculate the random value for connection hash */ @@ -1320,6 +1368,8 @@ int __init ip_vs_conn_init(void) void ip_vs_conn_cleanup(void) { + /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ + rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2555816e778..e6836755c45 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif -int ip_vs_net_id __read_mostly; -#ifdef IP_VS_GENERIC_NETNS -EXPORT_SYMBOL(ip_vs_net_id); -#endif +static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); @@ -80,7 +77,7 @@ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); #define icmp_id(icmph) (((icmph)->un).echo.id) #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) -const char *ip_vs_proto_name(unsigned proto) +const char *ip_vs_proto_name(unsigned int proto) { static char buf[20]; @@ -100,7 +97,7 @@ const char *ip_vs_proto_name(unsigned proto) return "ICMPv6"; #endif default: - sprintf(buf, "IP_%d", proto); + sprintf(buf, "IP_%u", proto); return buf; } } @@ -119,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); s->ustats.inpkts++; @@ -126,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(dest->svc->stats.cpustats); + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = 
this_cpu_ptr(svc->stats.cpustats); s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); + rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.inpkts++; @@ -149,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); s->ustats.outpkts++; @@ -156,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(dest->svc->stats.cpustats); + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); + rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.outpkts++; @@ -206,7 +211,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, { ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, vport, p); - p->pe = svc->pe; + p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); @@ -222,11 +227,10 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, - struct sk_buff *skb, - __be16 src_port, __be16 dst_port, int *ignored) + struct sk_buff *skb, __be16 src_port, __be16 dst_port, + int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport = 0; /* destination port to forward */ @@ -236,20 +240,19 @@ ip_vs_sched_persist(struct ip_vs_service *svc, union nf_inet_addr snet; /* source network of the client, after masking */ - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + (__force __u32) svc->netmask); else #endif - snet.ip = iph.saddr.ip & svc->netmask; + snet.ip = iph->saddr.ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -266,8 +269,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * is created for other persistent services. */ { - int protocol = iph.protocol; - const union nf_inet_addr *vaddr = &iph.daddr; + int protocol = iph->protocol; + const union nf_inet_addr *vaddr = &iph->daddr; __be16 vport = 0; if (dst_port == svc->port) { @@ -302,12 +305,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { + struct ip_vs_scheduler *sched; + /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. 
ICMP and NF_DROP */ - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -342,14 +348,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dport = dest->port; flags = (svc->flags & IP_VS_SVC_F_ONEPACKET - && iph.protocol == IPPROTO_UDP)? + && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, - src_port, &iph.daddr, dst_port, &param); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, + src_port, &iph->daddr, dst_port, &param); cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { @@ -392,18 +398,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_proto_data *pd, int *ignored) + struct ip_vs_proto_data *pd, int *ignored, + struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; + struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; unsigned int flags; *ignored = 1; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + /* + * IPv6 frags, only the first hit here. + */ + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return NULL; @@ -423,7 +432,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { + (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); @@ -434,7 +443,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + iph); *ignored = 0; @@ -449,14 +459,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return NULL; } - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb, iph); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } flags = (svc->flags & IP_VS_SVC_F_ONEPACKET - && iph.protocol == IPPROTO_UDP)? + && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* @@ -465,9 +476,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, - &iph.saddr, pptr[0], &iph.daddr, pptr[1], - &p); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], &iph->daddr, + pptr[1], &p); cp = ip_vs_conn_new(&p, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); @@ -496,21 +507,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * no destination is available for a new connection. 
*/ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_proto_data *pd) + struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { __be16 _ports[2], *pptr; - struct ip_vs_iphdr iph; #ifdef CONFIG_SYSCTL struct net *net; struct netns_ipvs *ipvs; int unicast; #endif - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) { - ip_vs_service_put(svc); return NF_DROP; } @@ -519,10 +526,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; else #endif - unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST); + unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create @@ -532,19 +539,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && - iph.protocol == IPPROTO_UDP)? + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - ip_vs_service_put(svc); - /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], &p); + ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); @@ -559,7 +564,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pd->pp); + ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); @@ -574,12 +579,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. 
*/ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) return NF_ACCEPT; - } - - ip_vs_service_put(svc); /* * Notify the client that the destination is unreachable, and @@ -591,9 +592,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net_ = dev_net(skb_dst(skb)->dev); - skb->dev = net->loopback_dev; + skb->dev = net_->loopback_dev; } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else @@ -646,22 +647,17 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { - int err = ip_defrag(skb, user); + int err; + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); return err; } -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) -{ - /* TODO IPv6: Find out what to do here for IPv6 */ - return 0; -} -#endif - static int ip_vs_route_me_harder(int af, struct sk_buff *skb) { #ifdef CONFIG_IP_VS_IPV6 @@ -732,10 +728,19 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct ipv6hdr *iph = ipv6_hdr(skb); - unsigned int icmp_offset = sizeof(struct ipv6hdr); - struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + - icmp_offset); - struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + unsigned int icmp_offset = 0; + unsigned int offs = 0; /* header offset*/ + int protocol; + struct icmp6hdr *icmph; + struct ipv6hdr *ciph; + unsigned short fragoffs; + + ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); + icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); + offs = icmp_offset + sizeof(struct icmp6hdr); + ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); + + protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); if (inout) { iph->saddr = cp->vaddr.in6; @@ -746,10 +751,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } /* the TCP/UDP/SCTP port */ - if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr || - IPPROTO_SCTP == ciph->nexthdr) { - __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol)) { + __be16 *ports = (void *)(skb_network_header(skb) + offs); + IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, + ntohs(inout ? ports[1] : ports[0]), + ntohs(inout ? 
cp->vport : cp->dport)); if (inout) ports[1] = cp->vport; else @@ -898,51 +906,35 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); if (!cp) return NF_ACCEPT; snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, offset, ihl); + pp, ciph.len, ihl); } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum) + unsigned int hooknum, struct ip_vs_iphdr *ipvsh) { - struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; + struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset; union nf_inet_addr snet; + unsigned int writable; *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); if (ic == NULL) return NF_DROP; - IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - &iph->saddr, &iph->daddr); - /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy @@ -950,42 +942,45 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (ipvsh->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &ipvsh->saddr, &ipvsh->daddr); /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) + ciph.len = ipvsh->len + sizeof(_icmph); + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); + ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ + + pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; - /* Is the embedded protocol header present? 
*/ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, - "Checking outgoing ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); if (!cp) return NF_ACCEPT; - snet.in6 = iph->saddr; - return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, - pp, offset, sizeof(struct ipv6hdr)); + snet.in6 = ciph.saddr.in6; + writable = ciph.len; + return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, + pp, writable, sizeof(struct ipv6hdr)); } #endif @@ -1014,21 +1009,47 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +static inline bool is_new_conn(const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + return th->syn; + } + case IPPROTO_SCTP: { + sctp_chunkhdr_t *sch, schunk; + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return false; + return sch->type == SCTP_CID_INIT; + } + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - struct ip_vs_conn *cp, int ihl) + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); - if (!skb_make_writable(skb, ihl)) + if (!skb_make_writable(skb, iph->len)) goto drop; /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph)) goto drop; #ifdef CONFIG_IP_VS_IPV6 @@ -1115,17 +1136,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (!net_ipvs(net)->enable) return NF_ACCEPT; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(af, skb, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_out_icmp_v6(skb, &related, - hooknum); + hooknum, &iph); if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else #endif @@ -1135,7 +1155,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } pd = ip_vs_proto_data_get(net, iph.protocol); @@ -1145,44 +1164,35 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, - ip_vs_defrag_user(hooknum))) - return NF_STOLEN; - } - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } else + if (af == AF_INET) #endif if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); } /* * Check if the packet belongs to an existing entry */ - cp = 
pp->conn_out_get(af, skb, &iph, iph.len, 0); + cp = pp->conn_out_get(af, skb, &iph, 0); if (likely(cp)) - return handle_response(af, skb, pd, cp, iph.len); + return handle_response(af, skb, pd, cp, &iph); if (sysctl_nat_icmp_send(net) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); + pptr = frag_safe_skb_hp(skb, iph.len, + sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(net, af, iph.protocol, - &iph.saddr, - pptr[0])) { + if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST @@ -1197,9 +1207,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - struct net *net = - dev_net(skb_dst(skb)->dev); - if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, @@ -1226,11 +1233,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_out(hooknum, skb, AF_INET); + return ip_vs_out(ops->hooknum, skb, AF_INET); } /* @@ -1238,17 +1245,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_out(ops->hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1259,11 +1260,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_out(hooknum, skb, AF_INET6); + return ip_vs_out(ops->hooknum, skb, AF_INET6); } /* @@ -1271,17 +1272,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. 
*/ static unsigned int -ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_out(ops->hooknum, skb, AF_INET6); } #endif @@ -1303,7 +1298,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offset, ihl, verdict; + unsigned int offset, offset2, ihl, verdict; + bool ipip; *related = 1; @@ -1345,6 +1341,21 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) net = skb_net(skb); + /* Special case for errors for IPIP packets */ + ipip = false; + if (cih->protocol == IPPROTO_IPIP) { + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + /* Error for our IPIP must arrive at LOCAL_IN */ + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) + return NF_ACCEPT; + offset += cih->ihl * 4; + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + ipip = true; + } + pd = ip_vs_proto_data_get(net, cih->protocol); if (!pd) return NF_ACCEPT; @@ -1358,11 +1369,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking incoming ICMP for"); - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); + offset2 = offset; + ip_vs_fill_ip4hdr(cih, &ciph); + ciph.len += offset; + offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. + * For IPIP this is error for request, not for reply. + */ + cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1); if (!cp) return NF_ACCEPT; @@ -1376,11 +1390,69 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) goto out; } + if (ipip) { + __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; + + /* Update the MTU */ + if (ic->type == ICMP_DEST_UNREACH && + ic->code == ICMP_FRAG_NEEDED) { + struct ip_vs_dest *dest = cp->dest; + u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; + + /* Strip outer IP and ICMP, go to IPIP header */ + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; + offset2 -= ihl + sizeof(_icmph); + skb_reset_network_header(skb); + IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); + ipv4_update_pmtu(skb, dev_net(skb->dev), + mtu, 0, 0, 0, 0); + /* Client uses PMTUD? */ + if (!(frag_off & htons(IP_DF))) + goto ignore_ipip; + /* Prefer the resulting PMTU */ + if (dest) { + struct ip_vs_dest_dst *dest_dst; + + rcu_read_lock(); + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + rcu_read_unlock(); + } + if (mtu > 68 + sizeof(struct iphdr)) + mtu -= sizeof(struct iphdr); + info = htonl(mtu); + } + /* Strip outer IP, ICMP and IPIP, go to IP header of + * original request. 
+ */ + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; + skb_reset_network_header(skb); + IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + type, code, ntohl(info)); + icmp_send(skb, type, code, info); + /* ICMP can be shorter but anyways, account it */ + ip_vs_out_stats(cp, skb); + +ignore_ipip: + consume_skb(skb); + verdict = NF_STOLEN; + goto out; + } + /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || + IPPROTO_SCTP == cih->protocol) offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); out: __ip_vs_conn_put(cp); @@ -1389,38 +1461,24 @@ out: } #ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum, struct ip_vs_iphdr *iph) { struct net *net = NULL; - struct ipv6hdr *iph; + struct ipv6hdr _ip6h, *ip6h; struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offset, verdict; + unsigned int offs_ciph, writable, verdict; *related = 1; - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph); if (ic == NULL) return NF_DROP; - IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - &iph->saddr, &iph->daddr); - /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy @@ -1428,47 +1486,71 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. 
+ */ + if (iph->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) + ciph.len = iph->len + sizeof(_icmph); + offs_ciph = ciph.len; /* Save ip header offset */ + ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); + if (ip6h == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ + ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ + ciph.daddr.in6 = ip6h->daddr; + /* skip possible IPv6 exthdrs of contained IPv6 packet */ + ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); + if (ciph.protocol < 0) + return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ net = skb_net(skb); - pd = ip_vs_proto_data_get(net, cih->nexthdr); + pd = ip_vs_proto_data_get(net, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + /* Cannot handle fragmented embedded protocol */ + if (ciph.fragoffs) return NF_ACCEPT; - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, "Checking incoming ICMPv6 for"); - offset += sizeof(struct ipv6hdr); + /* The embedded headers contain source and dest in reverse order + * if not from localhost + */ + cp = pp->conn_in_get(AF_INET6, skb, &ciph, + (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1); - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); if (!cp) return NF_ACCEPT; + /* VS/TUN, VS/DR and LOCALNODE just let it go */ + if ((hooknum == NF_INET_LOCAL_OUT) && + (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { + __ip_vs_conn_put(cp); + return NF_ACCEPT; + } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr || - IPPROTO_SCTP == cih->nexthdr) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum); + + /* Need to mangle contained IPv6 header in ICMPv6 packet */ + writable = ciph.len; + if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || + IPPROTO_SCTP == ciph.protocol) + writable += 2 * sizeof(__u16); /* Also mangle ports */ + + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); __ip_vs_conn_put(cp); @@ -1504,7 +1586,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(af, skb, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, @@ -1513,10 +1595,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(af, skb, &iph); /* Bad... 
Do not break raw sockets */ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && @@ -1532,11 +1615,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, + &iph); if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else #endif @@ -1546,7 +1629,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (related) return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } /* Protocol supported? */ @@ -1557,12 +1639,24 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, &iph, 0); - if (unlikely(!cp)) { + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && + unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && + is_new_conn(skb, &iph)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } + + if (unlikely(!cp) && !iph.fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ int v; - if (!pp->conn_schedule(af, skb, pd, &v, &cp)) + /* Schedule and create new connection entry into &cp */ + if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) return v; } @@ -1570,11 +1664,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_in: packet continues traversal as normal"); + if (iph.fragoffs) { + /* Fragment that couldn't be mapped to a conn entry + * is missing module nf_defrag_ipv6 + */ + IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); + } return NF_ACCEPT; } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1592,7 +1692,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); + ret = cp->packet_xmit(skb, cp, pp, &iph); /* do not touch skb anymore */ else { IP_VS_DBG_RL("warning: packet_xmit is null"); @@ -1613,34 +1713,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) else pkts = atomic_add_return(1, &cp->in_pkts); - if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - cp->protocol == IPPROTO_SCTP) { - if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - (cp->old_state != cp->state && - ((cp->state == IP_VS_SCTP_S_CLOSED) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { - ip_vs_sync_conn(net, cp); - goto out; - } - } - - /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE) || - (cp->state == 
IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(net, cp); -out: - cp->old_state = cp->state; + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, pkts); ip_vs_conn_put(cp); return ret; @@ -1651,12 +1725,12 @@ out: * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_in(hooknum, skb, AF_INET); + return ip_vs_in(ops->hooknum, skb, AF_INET); } /* @@ -1664,17 +1738,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_in(ops->hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1684,12 +1752,12 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip_vs_in(hooknum, skb, AF_INET6); + return ip_vs_in(ops->hooknum, skb, AF_INET6); } /* @@ -1697,17 +1765,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_in(ops->hooknum, skb, AF_INET6); } #endif @@ -1723,42 +1785,48 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, * and send them to ip_vs_in_icmp. */ static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { int r; struct net *net; + struct netns_ipvs *ipvs; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? 
*/ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(skb, &r, hooknum); + return ip_vs_in_icmp(skb, &r, ops->hooknum); } #ifdef CONFIG_IP_VS_IPV6 static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, +ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { int r; struct net *net; + struct netns_ipvs *ipvs; + struct ip_vs_iphdr iphdr; - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp_v6(skb, &r, hooknum); + return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); } #endif @@ -1768,7 +1836,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, }, @@ -1778,7 +1846,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_remote_request4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, @@ -1786,7 +1854,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_local_reply4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, @@ -1794,7 +1862,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_local_request4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, }, @@ -1803,7 +1871,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_forward_icmp, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, }, @@ -1811,7 +1879,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply4, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, }, @@ -1820,7 +1888,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, }, @@ -1830,7 +1898,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_remote_request6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, @@ -1838,7 +1906,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, @@ -1846,7 +1914,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_local_request6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, }, @@ -1855,7 +1923,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_forward_icmp_v6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, 
.hooknum = NF_INET_FORWARD, .priority = 99, }, @@ -1863,7 +1931,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, }, @@ -1924,6 +1992,7 @@ protocol_fail: control_fail: ip_vs_estimator_net_cleanup(net); estimator_fail: + net->ipvs = NULL; return -ENOMEM; } @@ -1936,6 +2005,7 @@ static void __net_exit __ip_vs_cleanup(struct net *net) ip_vs_control_net_cleanup(net); ip_vs_estimator_net_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) @@ -1993,10 +2063,18 @@ static int __init ip_vs_init(void) goto cleanup_dev; } + ret = ip_vs_register_nl_ioctl(); + if (ret < 0) { + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_hooks; + } + pr_info("ipvs loaded.\n"); return ret; +cleanup_hooks: + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: @@ -2012,6 +2090,7 @@ exit: static void __exit ip_vs_cleanup(void) { + ip_vs_unregister_nl_ioctl(); nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b3afe189af6..581a6584ed0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -55,9 +55,6 @@ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG @@ -71,24 +68,24 @@ int ip_vs_get_debug_level(void) /* Protos */ -static void __ip_vs_del_service(struct ip_vs_service *svc); +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
*/ -static int __ip_vs_addr_is_local_v6(struct net *net, - const struct in6_addr *addr) +static bool __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) { - struct rt6_info *rt; struct flowi6 fl6 = { .daddr = *addr, }; + struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); + bool is_local; - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); - if (rt && rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) - return 1; + is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); - return 0; + dst_release(dst); + return is_local; } #endif @@ -257,36 +254,38 @@ ip_vs_use_count_dec(void) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ -static inline unsigned -ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, +static inline unsigned int +ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; + __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - addr_fold ^= ((size_t)net>>8); + ahash = ntohl(addr_fold); + ahash ^= ((size_t) net >> 8); - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) { return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } @@ -298,7 +297,7 @@ static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned hash; + unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pF\n", @@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, &svc->addr, svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* Remove it from the svc_table table */ - list_del(&svc->s_list); + hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ - list_del(&svc->f_list); + hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; @@ -361,13 +360,13 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct net *net, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for "full" 
addressed entries */ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) @@ -388,13 +387,13 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(net, fwmark); - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && net_eq(svc->net, net)) { /* HIT */ @@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) return NULL; } +/* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; struct netns_ipvs *ipvs = net_ipvs(net); - read_lock(&__ip_vs_svc_lock); - /* * Check the table hashed by fwmark first */ @@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, } out: - if (svc) - atomic_inc(&svc->usecnt); - read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), @@ -466,22 +460,35 @@ static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); - dest->svc = svc; + rcu_assign_pointer(dest->svc, svc); } -static void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) +static void ip_vs_service_free(struct ip_vs_service *svc) { - struct ip_vs_service *svc = dest->svc; + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); +} + +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; - dest->svc = NULL; + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} + +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{ if (atomic_dec_and_test(&svc->refcnt)) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); + ntohs(svc->port)); + if (do_delay) + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); + else + ip_vs_service_free(svc); } } @@ -489,11 +496,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) /* * Returns hash value for real service */ -static inline unsigned ip_vs_rs_hashkey(int af, +static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -506,17 +513,13 @@ static inline unsigned ip_vs_rs_hashkey(int af, & IP_VS_RTAB_MASK; } -/* - * Hashes ip_vs_dest in rs_table by <proto,addr,port>. - * should be called with locked tables. 
- */ -static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { - unsigned hash; + unsigned int hash; - if (!list_empty(&dest->d_list)) { - return 0; - } + if (dest->in_rs_table) + return; /* * Hash by proto,addr,port, @@ -524,65 +527,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ipvs->rs_table[hash]); - - return 1; + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; } -/* - * UNhashes ip_vs_dest from rs_table. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; } - - return 1; } -/* - * Lookup real service by <proto,addr,port> in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) { struct netns_ipvs *ipvs = net_ipvs(net); - unsigned hash; + unsigned int hash; struct ip_vs_dest *dest; - /* - * Check for "full" addressed entries - * Return the first found entry - */ + /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&ipvs->rs_lock); - list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { + rcu_read_lock(); + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { /* HIT */ - read_unlock(&ipvs->rs_lock); - return dest; + rcu_read_unlock(); + return true; } } - read_unlock(&ipvs->rs_lock); + rcu_read_unlock(); - return NULL; + return false; } -/* - * Lookup destination by {addr,port} in the given service +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, @@ -593,7 +582,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find the destination for the given service */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == svc->af) && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && (dest->port == dport)) { @@ -607,13 +596,11 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in + * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. 
- * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. + * Called under RCU lock, no refcnt is returned. */ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, @@ -626,7 +613,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) @@ -634,12 +621,31 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, dest = ip_vs_lookup_dest(svc, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); return dest; } +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed @@ -654,13 +660,14 @@ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { - struct ip_vs_dest *dest, *nxt; + struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, @@ -676,29 +683,29 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + list_del(&dest->t_list); + ip_vs_dest_hold(dest); + goto out; } } - return NULL; + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; } +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + + __ip_vs_dst_cache_reset(dest); + __ip_vs_svc_put(svc, false); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} /* * Clean up all the destinations in the trash @@ -707,19 +714,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. 
+ * be 0, so we simply release them here. */ static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); } } @@ -769,6 +775,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_service *old_svc; + struct ip_vs_scheduler *sched; int conn_flags; /* set the weight and the flags */ @@ -784,20 +792,19 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * Put the real service in rs_table if not present. * For now only for NAT! */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_hash(ipvs, dest); - write_unlock_bh(&ipvs->rs_lock); } atomic_set(&dest->conn_flags, conn_flags); /* bind the service */ - if (!dest->svc) { + old_svc = rcu_dereference_protected(dest->svc, 1); + if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); + if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); + __ip_vs_svc_put(old_svc, true); } } @@ -810,27 +817,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->l_threshold = udest->l_threshold; spin_lock_bh(&dest->dst_lock); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - if (add) - ip_vs_start_estimator(svc->net, &dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - + sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { - list_add(&dest->n_list, &svc->destinations); + ip_vs_start_estimator(svc->net, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; + if (sched->add_dest) + sched->add_dest(svc, dest); + } else { + if (sched->upd_dest) + sched->upd_dest(svc, dest); } - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); } @@ -842,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { struct ip_vs_dest *dest; - unsigned atype; + unsigned int atype, i; EnterFunction(2); @@ -869,6 +869,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, if (!dest->stats.cpustats) goto err_alloc; + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_dest_stats; + ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); + u64_stats_init(&ip_vs_dest_stats->syncp); + } + dest->af = svc->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; @@ -882,7 +888,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->persistconns, 0); atomic_set(&dest->refcnt, 1); - INIT_LIST_HEAD(&dest->d_list); + INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); @@ -924,10 +930,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct 
ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Check if the dest already exists in the list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); @@ -949,11 +955,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - __ip_vs_update_dest(svc, dest, udest, 1); ret = 0; } else { @@ -993,10 +994,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Lookup the destination list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); @@ -1009,11 +1010,11 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } - /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, + bool cleanup) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1022,38 +1023,20 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&ipvs->rs_lock); - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", - dest->vfwmark, - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port)); - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - free_percpu(dest->stats.cpustats); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ipvs->dest_trash); - atomic_inc(&dest->refcnt); - } + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash without reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = 0; + spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_dest_put(dest); } @@ -1069,14 +1052,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, /* * Remove it from the d-linked destination list. 
*/ - list_del(&dest->n_list); + list_del_rcu(&dest->n_list); svc->num_dests--; - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched->del_dest) + sched->del_dest(svc, dest); + } } @@ -1091,37 +1076,62 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); - write_unlock_bh(&__ip_vs_svc_lock); - /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, false); LeaveFunction(2); return 0; } +static void ip_vs_dest_trash_expire(unsigned long data) +{ + struct net *net = (struct net *) data; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest *dest, *next; + unsigned long now = jiffies; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + if (atomic_read(&dest->refcnt) > 0) + continue; + if (dest->idle_start) { + if (time_before(now, dest->idle_start + + IP_VS_DEST_TRASH_PERIOD)) + continue; + } else { + dest->idle_start = max(1UL, now); + continue; + } + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + spin_unlock(&ipvs->dest_trash_lock); +} /* * Add a service into the service hash table @@ -1130,7 +1140,7 @@ static int ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; + int ret = 0, i; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; @@ -1158,9 +1168,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out_err; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } } #endif @@ -1171,11 +1185,19 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, goto out_err; } svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) + if (!svc->stats.cpustats) { + ret = -ENOMEM; goto out_err; + } + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_stats; + ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); + u64_stats_init(&ip_vs_stats->syncp); + } + /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 0); atomic_set(&svc->refcnt, 0); svc->af = u->af; @@ -1189,7 +1211,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->net = net; INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ 
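The trash handling above inverts the old refcount scheme: a destination parked in ipvs->dest_trash no longer holds a reference, and ip_vs_dest_trash_expire() frees an entry only once it has stayed unreferenced for a full IP_VS_DEST_TRASH_PERIOD, with idle_start acting as a two-pass marker. A minimal single-threaded userspace sketch of that lifecycle, using illustrative names, a plain singly linked list and seconds instead of jiffies (not the kernel code):

/* trash_sketch.c - deleted "dests" park in a trash list without a
 * reference; a periodic reaper frees an entry only after it has been
 * seen idle on two scans a full TRASH_PERIOD apart. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#define TRASH_PERIOD 2		/* seconds; stand-in for IP_VS_DEST_TRASH_PERIOD */

struct dest {
	atomic_int refcnt;	/* flows still pointing at this server */
	time_t idle_start;	/* 0 until the first idle scan sees it */
	struct dest *next;	/* t_list analogue */
};

static struct dest *trash;

static void trash_expire(void)	/* ip_vs_dest_trash_expire() analogue */
{
	struct dest **pp = &trash, *d;
	time_t now = time(NULL);

	while ((d = *pp) != NULL) {
		if (atomic_load(&d->refcnt) > 0) {
			pp = &d->next;		/* revived or still in use */
		} else if (!d->idle_start) {
			d->idle_start = now;	/* first pass: start the idle clock */
			pp = &d->next;
		} else if (now - d->idle_start < TRASH_PERIOD) {
			pp = &d->next;		/* idle, but not long enough yet */
		} else {
			*pp = d->next;		/* unreferenced for a whole period */
			free(d);		/* ip_vs_dest_free() analogue */
		}
	}
}

int main(void)
{
	struct dest *d = calloc(1, sizeof(*d));

	atomic_init(&d->refcnt, 0);
	d->next = trash;
	trash = d;		/* __ip_vs_del_dest() analogue */
	trash_expire();		/* pass 1: only marks idle_start */
	sleep(TRASH_PERIOD + 1);
	trash_expire();		/* pass 2: entry is reclaimed */
	printf("trash is %s\n", trash ? "non-empty" : "empty");
	return 0;
}

The two scans a full period apart leave time for ip_vs_trash_get_dest() to revive an entry with ip_vs_dest_hold() before it can be freed.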
@@ -1199,7 +1221,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, sched = NULL; /* Bind the ct retriever */ - ip_vs_bind_pe(svc, pe); + RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Update the virtual service counters */ @@ -1215,9 +1237,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ipvs->num_services++; /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; /* Now there is a service - full throttle */ @@ -1227,15 +1247,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, out_err: if (svc != NULL) { - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - if (svc->stats.cpustats) - free_percpu(svc->stats.cpustats); - kfree(svc); + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); @@ -1279,18 +1292,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } } #endif - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + /* Bind the new scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + old_sched = sched; + goto out; + } + /* Unbind the old scheduler on success */ + ip_vs_unbind_scheduler(svc, old_sched); + } /* * Set the flags and timeout value @@ -1299,57 +1321,22 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - old_pe = svc->pe; - if (pe != old_pe) { - ip_vs_unbind_pe(svc); - ip_vs_bind_pe(svc, pe); - } + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) + rcu_assign_pointer(svc->pe, pe); -out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } - /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! 
* - We are called under _bh lock */ -static void __ip_vs_del_service(struct ip_vs_service *svc) +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; @@ -1365,27 +1352,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) ip_vs_stop_estimator(svc->net, &svc->stats); /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); - /* Unbind persistence engine */ - old_pe = svc->pe; - ip_vs_unbind_pe(svc); + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); ip_vs_pe_put(old_pe); - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, cleanup); } /* @@ -1399,14 +1379,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", - svc->fwmark, - IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); - } + __ip_vs_svc_put(svc, true); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1415,23 +1388,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ -static void ip_vs_unlink_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. 
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); + __ip_vs_del_service(svc, cleanup); } /* @@ -1441,7 +1407,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, false); return 0; } @@ -1450,19 +1416,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net) +static int ip_vs_flush(struct net *net, bool cleanup) { int idx; - struct ip_vs_service *svc, *nxt; + struct ip_vs_service *svc; + struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], - s_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1470,10 +1437,10 @@ static int ip_vs_flush(struct net *net) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1489,71 +1456,74 @@ void ip_vs_service_net_cleanup(struct net *net) EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net); + ip_vs_flush(net, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } -/* - * Release dst hold by dst_cache - */ + +/* Put all references for device (dst_cache) */ static inline void -__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { + struct ip_vs_dest_dst *dest_dst; + spin_lock_bh(&dest->dst_lock); - if (dest->dst_cache && dest->dst_cache->dev == dev) { + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } -/* - * Netdev event receiver - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to - * a device that is "unregister" it must be released. 
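ip_vs_forget_dev() relies on the detach-then-defer pattern that __ip_vs_dst_cache_reset() sets up earlier in this patch: the published dest_dst pointer is cleared first (under dst_lock), so subsequent lookups miss immediately, and the old structure is handed to call_rcu() so it is released only after every in-flight reader is done. A rough userspace analogue of that idea, with no real RCU and purely illustrative names (a to-free list stands in for the grace period):

/* detach_sketch.c - swap the cached pointer to NULL, reclaim later */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct dest_dst {
	int ifindex;			/* stand-in for the cached route */
	struct dest_dst *next_free;
};

static _Atomic(struct dest_dst *) dst_cache;
static struct dest_dst *defer_list;	/* reclaimed "after a grace period" */

static void dst_cache_reset(void)	/* __ip_vs_dst_cache_reset() analogue */
{
	struct dest_dst *old = atomic_exchange(&dst_cache, NULL);

	if (old) {	/* readers that already loaded 'old' may still use it */
		old->next_free = defer_list;
		defer_list = old;
	}
}

static void grace_period_over(void)	/* what call_rcu() arranges in the kernel */
{
	while (defer_list) {
		struct dest_dst *d = defer_list;

		defer_list = d->next_free;
		free(d);
	}
}

int main(void)
{
	struct dest_dst *d = calloc(1, sizeof(*d));

	d->ifindex = 2;
	atomic_store(&dst_cache, d);
	dst_cache_reset();	/* detach: new lookups now see NULL */
	grace_period_over();	/* reclaim once no reader can hold 'd' */
	printf("cache is %s\n", atomic_load(&dst_cache) ? "set" : "empty");
	return 0;
}

Clearing the pointer before reclaiming is what makes it safe for packet-path readers to keep using an already-loaded dest_dst for the remainder of their RCU read-side section.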
+/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; - if (event != NETDEV_UNREGISTER) + if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); EnterFunction(2); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } } - list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { - __ip_vs_dev_reset(dest, dev); + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); } + spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); return NOTIFY_DONE; @@ -1566,12 +1536,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; - write_lock_bh(&__ip_vs_svc_lock); list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); return 0; } @@ -1581,14 +1549,14 @@ static int ip_vs_zero_all(struct net *net) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } @@ -1599,8 +1567,12 @@ static int ip_vs_zero_all(struct net *net) } #ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3; + static int -proc_do_defense_mode(ctl_table *table, int write, +proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; @@ -1621,7 +1593,7 @@ proc_do_defense_mode(ctl_table *table, int write, } static int -proc_do_sync_threshold(ctl_table *table, int write, +proc_do_sync_threshold(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1632,7 +1604,8 @@ proc_do_sync_threshold(ctl_table *table, int write, memcpy(val, valp, sizeof(val)); rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + if (write && (valp[0] < 0 || valp[1] < 0 || + (valp[0] >= valp[1] && valp[1]))) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); } @@ -1640,7 +1613,7 @@ proc_do_sync_threshold(ctl_table *table, int write, } static int 
-proc_do_sync_mode(ctl_table *table, int write, +proc_do_sync_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1652,9 +1625,24 @@ proc_do_sync_mode(ctl_table *table, int write, if ((*valp < 0) || (*valp > 1)) { /* Restore the correct value */ *valp = val; - } else { - struct net *net = current->nsproxy->net_ns; - ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} + +static int +proc_do_sync_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (*valp < 1 || !is_power_of_2(*valp)) { + /* Restore the correct value */ + *valp = val; } } return rc; @@ -1718,6 +1706,30 @@ static struct ctl_table vs_vars[] = { .proc_handler = &proc_do_sync_mode, }, { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_ports, + }, + { + .procname = "sync_persist_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_qlen_max", + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, @@ -1730,6 +1742,18 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, { + .procname = "sloppy_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_sctp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "expire_quiescent_template", .maxlen = sizeof(int), .mode = 0644, @@ -1743,11 +1767,37 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_do_sync_threshold, }, { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "pmtu_disc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1846,20 +1896,13 @@ static struct ctl_table vs_vars[] = { { } }; -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); #endif #ifdef CONFIG_PROC_FS struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - struct list_head *table; + struct hlist_head *table; int bucket; }; @@ -1867,7 +1910,7 @@ struct ip_vs_iter { * Write the contents of the VS rule table to a PROCfs file. 
* (It is kept just for backward compatibility) */ -static inline const char *ip_vs_fwd_name(unsigned flags) +static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: @@ -1892,7 +1935,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; @@ -1903,7 +1946,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; @@ -1916,17 +1960,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) + __acquires(RCU) { - - read_lock_bh(&__ip_vs_svc_lock); + rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *e; + struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; @@ -1939,13 +1982,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { return svc; } } @@ -1956,13 +2000,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) return svc; } @@ -1970,9 +2016,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) + __releases(RCU) { - read_unlock_bh(&__ip_vs_svc_lock); + rcu_read_unlock(); } @@ -1990,6 +2036,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1998,18 +2045,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), 
&svc->addr.in6, ntohs(svc->port), - svc->scheduler->name); + sched->name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - svc->scheduler->name, + sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, svc->scheduler->name, + svc->fwmark, sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2020,7 +2067,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) else seq_putc(seq, '\n'); - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, @@ -2114,7 +2161,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; - struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_stats_user rates; int i; @@ -2130,10 +2177,10 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) __u64 inbytes, outbytes; do { - start = u64_stats_fetch_begin_bh(&u->syncp); + start = u64_stats_fetch_begin_irq(&u->syncp); inbytes = u->ustats.inbytes; outbytes = u->ustats.outbytes; - } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", i, u->ustats.conns, u->ustats.inpkts, @@ -2286,7 +2333,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_dest_user_kern udest; struct netns_ipvs *ipvs = net_ipvs(net); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) @@ -2330,7 +2377,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ @@ -2365,11 +2412,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) } /* Lookup the exact service by <protocol, addr, port> or fwmark */ + rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2421,11 +2470,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2444,7 +2496,7 @@ __ip_vs_get_service_entries(struct net *net, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, 
&ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2463,7 +2515,7 @@ __ip_vs_get_service_entries(struct net *net, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2492,17 +2544,20 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; + rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, get->port); + rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; + memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; @@ -2536,6 +2591,8 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) struct ip_vs_proto_data *pd; #endif + memset(u, 0, sizeof (*u)); + #ifdef CONFIG_IP_VS_PROTO_TCP pd = ip_vs_proto_data_get(net, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; @@ -2577,7 +2634,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) @@ -2677,12 +2734,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; + rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(net, AF_INET, entry->protocol, &addr, entry->port); + rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2816,17 +2875,17 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, ip_vs_copy_stats(&ustats, stats); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps); - + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; @@ -2839,6 +2898,8 @@ nla_put_failure: static 
int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; @@ -2847,23 +2908,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, if (!nl_service) return -EMSGSIZE; - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - + if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) + goto nla_put_failure; if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) + goto nla_put_failure; } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || + nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + goto nla_put_failure; } - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - if (svc->pe) - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - + sched = rcu_dereference_protected(svc->scheduler, 1); + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || + nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || + nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) goto nla_put_failure; @@ -2882,7 +2946,7 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb, { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); if (!hdr) @@ -2908,7 +2972,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -2919,7 +2983,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -2975,15 +3039,17 @@ static int ip_vs_genl_parse_service(struct net *net, } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); + usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } + rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, &usvc->addr, usvc->port); + rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ @@ -3013,7 +3079,7 @@ static int ip_vs_genl_parse_service(struct net *net, 
usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); + usvc->netmask = nla_get_be32(nla_netmask); } return 0; @@ -3038,21 +3104,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) if (!nl_dest) return -EMSGSIZE; - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - + if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, + (atomic_read(&dest->conn_flags) & + IP_VS_CONN_F_FWD_MASK)) || + nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, + atomic_read(&dest->weight)) || + nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns))) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3070,7 +3137,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) @@ -3147,7 +3214,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); + udest->port = nla_get_be16(nla_port); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -3172,8 +3239,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, return 0; } -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid) { struct nlattr *nl_daemon; @@ -3181,10 +3248,10 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, if (!nl_daemon) return -EMSGSIZE; - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - + if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3194,12 +3261,12 @@ nla_put_failure: return -EMSGSIZE; } -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - 
const char *mcast_ifn, __be32 syncid, +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid, struct netlink_callback *cb) { void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) @@ -3334,7 +3401,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(net, info->attrs); @@ -3473,21 +3540,26 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) __ip_vs_get_timeouts(net, &t); #ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, + t.tcp_timeout) || + nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout)) + goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) + goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - ip_vs_conn_tab_size); + if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, + IP_VS_VERSION_CODE) || + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size)) + goto nla_put_failure; break; } @@ -3508,7 +3580,7 @@ out: } -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { +static const struct genl_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .flags = GENL_ADMIN_PERM, @@ -3607,7 +3679,7 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = { static int __init ip_vs_genl_register(void) { return genl_register_family_with_ops(&ip_vs_genl_family, - ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops)); + ip_vs_genl_ops); } static void ip_vs_genl_unregister(void) @@ -3621,7 +3693,7 @@ static void ip_vs_genl_unregister(void) * per netns intit/exit func. 
*/ #ifdef CONFIG_SYSCTL -int __net_init ip_vs_control_net_init_sysctl(struct net *net) +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { int idx; struct netns_ipvs *ipvs = net_ipvs(net); @@ -3636,6 +3708,10 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -3654,18 +3730,33 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; + tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + ipvs->sysctl_pmtu_disc = 1; + tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; - ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, - tbl); + ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { if (!net_eq(net, &init_net)) kfree(tbl); @@ -3680,19 +3771,20 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) return 0; } -void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } #else -int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } -void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) { } +static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } #endif @@ -3702,16 +3794,17 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct net *net) { - int idx; + int i, idx; struct netns_ipvs *ipvs = net_ipvs(net); - rwlock_init(&ipvs->rs_lock); - /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) - INIT_LIST_HEAD(&ipvs->rs_table[idx]); + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, + (unsigned long) net); atomic_set(&ipvs->ftpsvc_counter, 0); 
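A few lines below, ip_vs_control_net_init() runs u64_stats_init() on every possible CPU's syncp, which is what later lets lockless readers such as ip_vs_stats_percpu_show() sample the 64-bit byte counters with the u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() pair. The underlying mechanism is a sequence counter bumped around every writer update; a rough single-writer userspace analogue (illustrative only, and without the kernel's exact memory barriers):

/* seqstats_sketch.c - seqcount-style stats: writer bumps seq around
 * each update, reader retries until it sees an even, unchanged seq. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct cpu_stats {
	atomic_uint seq;	/* u64_stats_sync analogue */
	uint64_t inbytes;
	uint64_t outbytes;
};

static void stats_add(struct cpu_stats *s, uint64_t in, uint64_t out)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* odd: write in progress */
	s->inbytes += in;
	s->outbytes += out;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even again */
}

static void stats_read(struct cpu_stats *s, uint64_t *in, uint64_t *out)
{
	unsigned int start;

	do {
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		*in = s->inbytes;	/* may tear; the retry catches it */
		*out = s->outbytes;
	} while ((start & 1) ||
		 start != atomic_load_explicit(&s->seq, memory_order_acquire));
}

int main(void)
{
	struct cpu_stats s = { 0 };
	uint64_t in, out;

	stats_add(&s, 1500, 40);
	stats_read(&s, &in, &out);
	printf("in=%llu out=%llu\n",
	       (unsigned long long)in, (unsigned long long)out);
	return 0;
}

A reader that overlaps a write observes an odd or changed sequence and simply retries, so torn 64-bit loads on 32-bit machines are discarded rather than reported.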
atomic_set(&ipvs->nullsvc_counter, 0); @@ -3720,12 +3813,18 @@ int __net_init ip_vs_control_net_init(struct net *net) if (!ipvs->tot_stats.cpustats) return -ENOMEM; + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ipvs_tot_stats; + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + u64_stats_init(&ipvs_tot_stats->syncp); + } + spin_lock_init(&ipvs->tot_stats.lock); - proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); - proc_net_fops_create(net, "ip_vs_stats_percpu", 0, - &ip_vs_stats_percpu_fops); + proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, net->proc_net, + &ip_vs_stats_percpu_fops); if (ip_vs_control_net_init_sysctl(net)) goto err; @@ -3742,29 +3841,17 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); - proc_net_remove(net, "ip_vs_stats_percpu"); - proc_net_remove(net, "ip_vs_stats"); - proc_net_remove(net, "ip_vs"); + remove_proc_entry("ip_vs_stats_percpu", net->proc_net); + remove_proc_entry("ip_vs_stats", net->proc_net); + remove_proc_entry("ip_vs", net->proc_net); free_percpu(ipvs->tot_stats.cpustats); } -int __init ip_vs_control_init(void) +int __init ip_vs_register_nl_ioctl(void) { - int idx; int ret; - EnterFunction(2); - - /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - - smp_wmb(); /* Do we really need it now ? */ - ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); @@ -3776,28 +3863,47 @@ int __init ip_vs_control_init(void) pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } - - ret = register_netdevice_notifier(&ip_vs_dst_notifier); - if (ret < 0) - goto err_notf; - - LeaveFunction(2); return 0; -err_notf: - ip_vs_genl_unregister(); err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? 
*/ + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; + + LeaveFunction(2); + return 0; +} + void ip_vs_control_cleanup(void) { EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 1c269e56200..c3b84546ea9 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -51,7 +51,7 @@ * IPVS DH bucket */ struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -64,11 +64,15 @@ struct ip_vs_dh_bucket { #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS DH entry */ -static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -85,10 +89,9 @@ static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } @@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); p = p->next; } @@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
*/ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -145,51 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - svc->sched_data = tbl; + svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); return 0; } -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; + struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + ip_vs_dh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; } -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + ip_vs_dh_reassign(s, svc); return 0; } @@ -209,27 +214,26 @@ static inline int is_overloaded(struct ip_vs_dest *dest) * Destination hashing scheduling */ static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + struct ip_vs_dh_state *s; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph->daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); @@ -248,7 +252,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, + .add_dest = 
ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; @@ -262,6 +267,7 @@ static int __init ip_vs_dh_init(void) static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 0fac6017b6f..1425e9a924c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -56,15 +56,16 @@ * Make a summary from each cpu */ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, - struct ip_vs_cpu_stats *stats) + struct ip_vs_cpu_stats __percpu *stats) { int i; + bool add = false; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; __u64 inbytes, outbytes; - if (i) { + if (add) { sum->conns += s->ustats.conns; sum->inpkts += s->ustats.inpkts; sum->outpkts += s->ustats.outpkts; @@ -76,6 +77,7 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, sum->inbytes += inbytes; sum->outbytes += outbytes; } else { + add = true; sum->conns = s->ustats.conns; sum->inpkts = s->ustats.inpkts; sum->outpkts = s->ustats.outpkts; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 538d74ee4f6..77c173282f3 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -177,7 +177,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; + unsigned int buf_len; int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -267,9 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * hopefully it will succeed on the retransmitted * packet. 
*/ + rcu_read_lock(); ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + iph->ihl * 4, start-data, end-start, buf, buf_len); + rcu_read_unlock(); if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); @@ -439,16 +442,12 @@ static int __net_init __ip_vs_ftp_init(struct net *net) struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); - app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL); - if (!app) - return -ENOMEM; - INIT_LIST_HEAD(&app->a_list); - INIT_LIST_HEAD(&app->incs_list); - ipvs->ftp_app = app; + if (!ipvs) + return -ENOENT; - ret = register_ip_vs_app(net, app); - if (ret) - goto err_exit; + app = register_ip_vs_app(net, &ip_vs_ftp); + if (IS_ERR(app)) + return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) @@ -462,9 +461,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) return 0; err_unreg: - unregister_ip_vs_app(net, app); -err_exit: - kfree(ipvs->ftp_app); + unregister_ip_vs_app(net, &ip_vs_ftp); return ret; } /* @@ -472,10 +469,7 @@ err_exit: */ static void __ip_vs_ftp_exit(struct net *net) { - struct netns_ipvs *ipvs = net_ipvs(net); - - unregister_ip_vs_app(net, ipvs->ftp_app); - kfree(ipvs->ftp_app); + unregister_ip_vs_app(net, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { @@ -483,11 +477,12 @@ static struct pernet_operations ip_vs_ftp_ops = { .exit = __ip_vs_ftp_exit, }; -int __init ip_vs_ftp_init(void) +static int __init ip_vs_ftp_init(void) { int rv; rv = register_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns on error */ return rv; } @@ -497,6 +492,7 @@ int __init ip_vs_ftp_init(void) static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 0f16283fd05..547ff33c1ef 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -90,11 +90,12 @@ * IP address and its destination server */ struct ip_vs_lblc_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest *dest; /* real server (cache) */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -102,12 +103,14 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -115,7 +118,7 @@ struct ip_vs_lblc_table { * IPVS LBLC sysctl table */ #ifdef CONFIG_SYSCTL -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", .data = NULL, @@ -127,22 +130,26 @@ static ctl_table vs_vars_table[] = { }; #endif -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +static void ip_vs_lblc_rcu_free(struct rcu_head *head) { - list_del(&en->list); - /* - * We don't kfree dest because it is referred either by its service - * or the trash dest list. 
- */ - atomic_dec(&en->dest->refcnt); + struct ip_vs_lblc_entry *en = container_of(head, + struct ip_vs_lblc_entry, + rcu_head); + + ip_vs_dest_put_and_free(en->dest); kfree(en); } +static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) +{ + hlist_del_rcu(&en->list); + call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); +} /* * Returns hash value for IPVS LBLC entry */ -static inline unsigned +static inline unsigned int ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -163,25 +170,22 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { - unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ +/* Get ip_vs_lblc_entry associated with supplied parameters. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblc_hashkey(af, addr); + unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -191,7 +195,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. + * address to a server. Called under spin lock. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, @@ -200,24 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, struct ip_vs_lblc_entry *en; en = ip_vs_lblc_get(dest->af, tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) - return NULL; + if (en) { + if (en->dest == dest) + return en; + ip_vs_lblc_del(en); + } + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); - en->lastuse = jiffies; + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; - atomic_inc(&dest->refcnt); - en->dest = dest; + ip_vs_dest_hold(dest); + en->dest = dest; - ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } + ip_vs_lblc_hash(tbl, en); return en; } @@ -226,17 +229,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. 
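The lblc conversion above moves entry freeing behind an RCU grace period: hlist_del_rcu() unlinks the node while concurrent readers may still be walking the bucket, and call_rcu() defers the actual kfree() until every such reader has finished. The general shape, as a sketch with hypothetical names:

        struct cache_entry {
                struct hlist_node node;
                struct rcu_head rcu;    /* storage for the deferred free */
        };

        static void cache_entry_rcu_free(struct rcu_head *head)
        {
                /* Recover the enclosing object from its rcu_head member. */
                struct cache_entry *e = container_of(head, struct cache_entry,
                                                     rcu);

                kfree(e);
        }

        static void cache_entry_del(struct cache_entry *e)
        {
                hlist_del_rcu(&e->node);        /* readers stay safe */
                call_rcu(&e->rcu, cache_entry_rcu_free);
        }

In the real code the callback also drops the reference on en->dest via ip_vs_dest_put_and_free(), which is exactly the kind of extra cleanup that forces call_rcu() rather than plain kfree_rcu().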
*/ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +static void ip_vs_lblc_flush(struct ip_vs_service *svc) { - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; int i; - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblc_free(en); + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblc_expiration(struct ip_vs_service *svc) @@ -252,24 +260,25 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc) static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; unsigned long now = jiffies; int i, j; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc))) continue; - ip_vs_lblc_free(en); + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -293,7 +302,8 @@ static void ip_vs_lblc_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -311,26 +321,26 @@ static void ip_vs_lblc_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; - ip_vs_lblc_free(en); + ip_vs_lblc_del(en); atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } tbl->rover = j; out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); } @@ -342,7 +352,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; @@ -353,12 +363,13 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -371,7 +382,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) } -static int 
ip_vs_lblc_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; @@ -379,14 +390,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); + ip_vs_lblc_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -408,7 +417,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { @@ -423,13 +432,13 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -457,7 +466,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -472,20 +481,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_iphdr iph; struct ip_vs_dest *dest = NULL; struct ip_vs_lblc_entry *en; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); + en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); if (en) { /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; @@ -499,14 +505,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * free up entries from the trash at any time. 
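The (__s64) casts in the weighted-least-load comparisons above guard against 32-bit overflow: the code avoids a division by cross-multiplying, the loh/doh connection-overhead values can run into the millions, and weights can be as large as 65535, so a 32-bit product can wrap and invert the comparison. Sketch of the idiom:

        /* Decide loh/w_least vs. doh/w_dest without dividing:
         * compare loh * w_dest against doh * w_least.  With overheads in
         * the millions and weights up to 65535 the product can exceed
         * 2^31, so widen to 64 bits before multiplying. */
        static bool dest_is_lighter(int loh, int w_least, int doh, int w_dest)
        {
                return (__s64)loh * w_dest > (__s64)doh * w_least;
        }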
*/ - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; + dest = en->dest; + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); @@ -516,13 +519,14 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); return dest; @@ -532,8 +536,7 @@ out: /* * IPVS LBLC Scheduler structure */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, @@ -551,20 +554,27 @@ static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblc_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblc_ctl_table == NULL) return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblc_ctl_table[0].procname = NULL; + } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; ipvs->lblc_ctl_header = - register_net_sysctl_table(net, net_vs_ctl_path, - ipvs->lblc_ctl_table); + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); @@ -614,6 +624,7 @@ static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); unregister_pernet_subsys(&ip_vs_lblc_ops); + rcu_barrier(); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index eec797f8cce..3f21a2f47de 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -89,40 +89,49 @@ */ struct ip_vs_dest_set_elem { struct list_head list; /* list link */ - struct ip_vs_dest *dest; /* destination server */ + struct ip_vs_dest *dest; /* destination server */ + struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ struct list_head list; /* destination list */ - rwlock_t lock; /* lock for this list */ }; -static struct ip_vs_dest_set_elem * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) { struct ip_vs_dest_set_elem *e; - list_for_each_entry(e, &set->list, list) { - if (e->dest == dest) - /* already existed */ - return NULL; + if (check) { + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + return; + } } e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) - return NULL; + 
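With the bucket walked under RCU, ip_vs_lblc_schedule() now takes no lock at all on its fast path; svc->sched_lock becomes a plain spinlock taken only on the insert slow path (and by the garbage-collection timer), and the tbl->dead flag keeps a late scheduler invocation from repopulating a table that ip_vs_lblc_done_svc() is tearing down. In outline, with every name below hypothetical:

        static struct dest *schedule_cached(struct svc *svc, u32 daddr)
        {
                struct tbl *t = svc->sched_data;
                struct entry *e;
                struct dest *d;

                e = cache_lookup_rcu(t, daddr);         /* lock-free fast path */
                if (e && dest_usable(e->dest))
                        return e->dest;

                d = pick_least_loaded(svc);             /* slow path */
                if (!d)
                        return NULL;

                spin_lock_bh(&svc->sched_lock);
                if (!t->dead)           /* don't repopulate a dying table */
                        cache_insert(t, daddr, d);
                spin_unlock_bh(&svc->sched_lock);

                return d;
        }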
return; - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); e->dest = dest; - list_add(&e->list, &set->list); + list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; - return e; +} + +static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_set_elem *e; + + e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); + ip_vs_dest_put_and_free(e->dest); + kfree(e); } static void @@ -135,9 +144,8 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) /* HIT */ atomic_dec(&set->size); set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); break; } } @@ -147,17 +155,10 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e, *ep; - write_lock(&set->lock); list_for_each_entry_safe(e, ep, &set->list, list) { - /* - * We don't kfree dest because it is referred either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); } - write_unlock(&set->lock); } /* get weighted least-connection node in the destination set */ @@ -167,11 +168,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) struct ip_vs_dest *dest, *least; int loh, doh; - if (set == NULL) - return NULL; - /* select the first destination server, whose weight > 0 */ - list_for_each_entry(e, &set->list, list) { + list_for_each_entry_rcu(e, &set->list, list) { least = e->dest; if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -186,14 +184,14 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* find the destination with the weighted least load */ nextstage: - list_for_each_entry(e, &set->list, list) { + list_for_each_entry_continue_rcu(e, &set->list, list) { dest = e->dest; if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) + if (((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { least = dest; loh = doh; @@ -234,12 +232,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* find the destination with the weighted most load */ nextstage: - list_for_each_entry(e, &set->list, list) { + list_for_each_entry_continue(e, &set->list, list) { dest = e->dest; doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) + if (((__s64)moh * atomic_read(&dest->weight) < + (__s64)doh * atomic_read(&most->weight)) && (atomic_read(&dest->weight) > 0)) { most = dest; moh = doh; @@ -262,11 +260,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) * IP address and its destination server set */ struct ip_vs_lblcr_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -274,12 +273,14 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + 
struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -288,7 +289,7 @@ struct ip_vs_lblcr_table { * IPVS LBLCR sysctl table */ -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", .data = NULL, @@ -302,16 +303,16 @@ static ctl_table vs_vars_table[] = { static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { - list_del(&en->list); + hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); - kfree(en); + kfree_rcu(en, rcu_head); } /* * Returns hash value for IPVS LBLCR entry */ -static inline unsigned +static inline unsigned int ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -332,25 +333,22 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { - unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ +/* Get ip_vs_lblcr_entry associated with supplied parameters. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblcr_hashkey(af, addr); + unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -360,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. + * IP address to a server. Called under spin lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, @@ -381,14 +379,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* initialize its dest set */ atomic_set(&(en->set.size), 0); INIT_LIST_HEAD(&en->set.list); - rwlock_init(&en->set.lock); + + ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); + return en; } - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest, true); return en; } @@ -397,17 +395,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - /* No locking required, only called during cleanup. 
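Two deferred-free flavors sit side by side in these schedulers: ip_vs_lblcr_free() can use kfree_rcu() because freeing the entry is the only cleanup left once the dest set has been erased, while the set elements go through call_rcu() with a callback because a reference count must be dropped before the kfree(). A sketch of the distinction (types and the put_dest() helper are hypothetical):

        struct plain_entry {
                struct hlist_node node;
                struct rcu_head rcu;
        };

        static void plain_entry_del(struct plain_entry *p)
        {
                hlist_del_rcu(&p->node);
                kfree_rcu(p, rcu);      /* no callback: cleanup is just kfree() */
        }

        struct ref_entry {
                struct hlist_node node;
                struct rcu_head rcu;
                struct dest *dest;      /* refcounted object to release */
        };

        static void ref_entry_rcu_free(struct rcu_head *head)
        {
                struct ref_entry *r = container_of(head, struct ref_entry, rcu);

                put_dest(r->dest);      /* extra cleanup first... */
                kfree(r);               /* ...then the free */
        }

        static void ref_entry_del(struct ref_entry *r)
        {
                hlist_del_rcu(&r->node);
                call_rcu(&r->rcu, ref_entry_rcu_free);
        }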
*/ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblcr_expiration(struct ip_vs_service *svc) @@ -425,13 +427,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_after(en->lastuse + sysctl_lblcr_expiration(svc), now)) continue; @@ -439,7 +442,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -463,7 +466,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -481,11 +485,11 @@ static void ip_vs_lblcr_check_expire(unsigned long data) if (goal > tbl->max_size/2) goal = tbl->max_size/2; - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -493,7 +497,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -511,7 +515,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; @@ -522,12 +526,13 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Initialize the hash buckets */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -540,7 +545,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; @@ -548,14 +553,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); + ip_vs_lblcr_flush(svc); /* release the table itself */ - kfree(tbl); + 
kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -577,7 +580,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -593,13 +596,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -627,7 +630,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -642,65 +645,56 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_lblcr_table *tbl = svc->sched_data; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest = NULL; + struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); + en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); if (en) { - /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* Get the least loaded destination */ - read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + + time_after(jiffies, en->set.lastmod + sysctl_lblcr_expiration(svc))) { - struct ip_vs_dest *m; + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); + if (dest && !is_overloaded(dest, svc)) goto out; - } /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); - read_unlock(&svc->sched_lock); return NULL; } /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) + 
spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); goto out; + } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); @@ -710,13 +704,14 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph->daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); return dest; @@ -745,20 +740,26 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblcr_ctl_table == NULL) return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblcr_ctl_table[0].procname = NULL; } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; ipvs->lblcr_ctl_header = - register_net_sysctl_table(net, net_vs_ctl_path, - ipvs->lblcr_ctl_table); + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); @@ -808,6 +809,7 @@ static void __exit ip_vs_lblcr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); unregister_pernet_subsys(&ip_vs_lblcr_ops); + rcu_barrier(); } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index f391819c0cc..2bdcb1cf212 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -26,7 +26,8 @@ * Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; unsigned int loh = 0, doh; @@ -42,7 +43,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * served, but no new connection is assigned to the server. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->weight) == 0) continue; @@ -84,6 +85,7 @@ static int __init ip_vs_lc_init(void) static void __exit ip_vs_lc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); } module_init(ip_vs_lc_init); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 022e77e1e76..5882bbfd198 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -19,8 +19,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
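The module-exit hunks in this series use two different RCU waits, and the distinction matters for unloadable modules: synchronize_rcu() only waits for current readers to leave their rcu_read_lock() sections (enough for ip_vs_lc, which queues no callbacks), while rcu_barrier() additionally waits for every pending call_rcu()/kfree_rcu() callback to run, which lblc/lblcr need because their callbacks are functions inside the module being removed. A sketch of the ordering, names hypothetical:

        static void __exit my_sched_cleanup(void)
        {
                unregister_my_scheduler(&my_scheduler);

                /* Wait out readers that may still be traversing RCU lists
                 * that point at this scheduler. */
                synchronize_rcu();

                /* If call_rcu()/kfree_rcu() callbacks were queued whose
                 * code or data lives in this module, wait for them too,
                 * or the module text could be freed under them. */
                rcu_barrier();
        }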
* * * Authors: @@ -63,6 +62,7 @@ #include <net/ip_vs.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_zones.h> @@ -82,7 +82,7 @@ void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); struct nf_conntrack_tuple new_tuple; if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || @@ -97,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; + /* Applications may adjust TCP seqs */ + if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && + !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) + return; + /* * The connection is not yet in the hashtable, so we update it. * CIP->VIP will remain the same, so leave the tuple in diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 984d9c137d8..961a6de9bb2 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -40,7 +40,7 @@ #include <net/ip_vs.h> -static inline unsigned int +static inline int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { /* @@ -55,10 +55,11 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; + int loh = 0, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -75,7 +76,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD || !atomic_read(&dest->weight)) @@ -91,8 +92,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { + ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight))) { least = dest; loh = doh; } @@ -133,6 +134,7 @@ static int __init ip_vs_nq_init(void) static void __exit ip_vs_nq_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); } module_init(ip_vs_nq_init); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 5cf859ccb31..1a82b29ce8e 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -13,20 +13,8 @@ /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_pe_lock); - -/* Bind a service with a pe */ -void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) -{ - svc->pe = pe; -} - -/* Unbind a service from its pe */ -void ip_vs_unbind_pe(struct ip_vs_service *svc) -{ - svc->pe = NULL; -} +/* semaphore for IPVS PEs. 
*/ +static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) @@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); - spin_lock_bh(&ip_vs_pe_lock); - - list_for_each_entry(pe, &ip_vs_pe, n_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { @@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) } if (strcmp(pe_name, pe->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_pe_lock); + rcu_read_unlock(); return pe; } if (pe->module) module_put(pe->module); } + rcu_read_unlock(); - spin_unlock_bh(&ip_vs_pe_lock); return NULL; } @@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_pe_lock); - - if (!list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - ip_vs_use_count_dec(); - pr_err("%s(): [%s] pe already linked\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. */ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { - spin_unlock_bh(&ip_vs_pe_lock); + mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); @@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) } } /* Add it into the d-linked pe list */ - list_add(&pe->n_list, &ip_vs_pe); - spin_unlock_bh(&ip_vs_pe_lock); + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); @@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { - spin_lock_bh(&ip_vs_pe_lock); - if (list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - pr_err("%s(): [%s] pe is not in the list. 
failed\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ - list_del(&pe->n_list); - spin_unlock_bh(&ip_vs_pe_lock); + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 1aa5cac748c..bed5f704252 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, const char *callid, size_t callid_len, int *idx) { - size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); memcpy(buf + *idx, callid, len); buf[*idx+len] = '\0'; *idx += len + 1; @@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff, if (ret > 0) break; if (!ret) - return 0; + return -EINVAL; dataoff += *matchoff; } - /* Empty callid is useless */ - if (!*matchlen) - return -EINVAL; - /* Too large is useless */ if (*matchlen > IP_VS_PEDATA_MAXLEN) return -EINVAL; @@ -73,18 +70,20 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) const char *dptr; int retc; - ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(p->af, skb, &iph); /* Only useful with UDP */ if (iph.protocol != IPPROTO_UDP) return -EINVAL; - - /* No Data ? */ + /* todo: IPv6 fragments: + * I think this only should be done for the first fragment. /HS + */ dataoff = iph.len + sizeof(struct udphdr); + if (dataoff >= skb->len) return -EINVAL; - - if ((retc=skb_linearize(skb)) < 0) + retc = skb_linearize(skb); + if (retc < 0) return retc; dptr = skb->data + dataoff; datalen = skb->len - dataoff; @@ -164,6 +163,7 @@ static int __init ip_vs_sip_init(void) static void __exit ip_vs_sip_cleanup(void) { unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); } module_init(ip_vs_sip_init); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 85312939695..939f7fbe9b4 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -25,7 +25,6 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> -#include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> @@ -49,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; @@ -60,9 +59,6 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } -#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \ - defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \ - defined(CONFIG_IP_VS_PROTO_ESP) /* * register an ipvs protocols netns related data */ @@ -70,9 +66,9 @@ static int register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) { struct netns_ipvs *ipvs = net_ipvs(net); - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = - kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); if (!pd) return -ENOMEM; @@ -82,12 +78,18 @@ register_ip_vs_proto_netns(struct net *net, 
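The ip_vs_pe.c rewrite above is the standard RCU-protected registration list: lookups traverse the list with list_for_each_entry_rcu() under rcu_read_lock() and pin a hit with try_module_get(), while writers serialize against each other with a mutex (registration can sleep, so the old spinlock bought nothing) and publish or retract entries with list_add_rcu()/list_del_rcu(). Condensed sketch, names hypothetical:

        static LIST_HEAD(pe_list);
        static DEFINE_MUTEX(pe_mutex);  /* serializes writers only */

        static struct pe *pe_lookup(const char *name)
        {
                struct pe *pe;

                rcu_read_lock();
                list_for_each_entry_rcu(pe, &pe_list, list) {
                        if (!strcmp(pe->name, name)) {
                                rcu_read_unlock();
                                return pe;      /* real code takes a module ref */
                        }
                }
                rcu_read_unlock();
                return NULL;
        }

        static void pe_register(struct pe *pe)
        {
                mutex_lock(&pe_mutex);
                list_add_rcu(&pe->list, &pe_list);      /* publish to readers */
                mutex_unlock(&pe_mutex);
        }

        static void pe_unregister(struct pe *pe)
        {
                mutex_lock(&pe_mutex);
                list_del_rcu(&pe->list);        /* readers may still see it
                                                 * until a grace period ends */
                mutex_unlock(&pe_mutex);
        }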
struct ip_vs_protocol *pp) ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ - if (pp->init_netns != NULL) - pp->init_netns(net, pd); + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink and free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } return 0; } -#endif /* * unregister an ipvs protocol @@ -95,7 +97,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { @@ -118,7 +120,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data **pd_p; - unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { @@ -140,7 +142,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) @@ -154,11 +156,11 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -struct ip_vs_proto_data * +static struct ip_vs_proto_data * __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) @@ -197,7 +199,7 @@ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) int * ip_vs_create_timeout_table(int *table, int size) { - return kmemdup(table, size, GFP_ATOMIC); + return kmemdup(table, size, GFP_KERNEL); } @@ -278,17 +280,17 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) - sprintf(buf, "%pI6->%pI6 frag", &ih->saddr, &ih->daddr); + sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) - sprintf(buf, "TRUNCATED %pI6->%pI6", + sprintf(buf, "TRUNCATED %pI6c->%pI6c", &ih->saddr, &ih->daddr); else - sprintf(buf, "%pI6:%u->%pI6:%u", + sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } @@ -317,22 +319,35 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, */ int __net_init ip_vs_protocol_net_init(struct net *net) { + int i, ret; + static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP - register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); + &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP - register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); + &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP - register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); + &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH - register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); + &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP - 
register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); + &ip_vs_protocol_esp, #endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct net *net) diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 5b8eb8b12c3..5de3dd312c0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -57,7 +57,7 @@ ah_esp_conn_fill_param_proto(struct net *net, int af, static struct ip_vs_conn * ah_esp_conn_in_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, unsigned int proto_off, + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn *cp; @@ -85,9 +85,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, static struct ip_vs_conn * ah_esp_conn_out_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) + const struct ip_vs_iphdr *iph, int inverse) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; @@ -110,7 +108,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { /* * AH/ESP is only related traffic. Pass the packet to IP stack. diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1fbf7a2816f..2f7ea756404 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -10,36 +10,42 @@ static int sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { struct net *net; struct ip_vs_service *svc; + struct netns_ipvs *ipvs; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; - struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph); - if (sh == NULL) + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh == NULL) { + *verdict = NF_DROP; return 0; + } - sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t), + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), sizeof(_schunkh), &_schunkh); - if (sch == NULL) + if (sch == NULL) { + *verdict = NF_DROP; return 0; + } + net = skb_net(skb); - if ((sch->type == SCTP_CID_INIT) && - (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, - &iph.daddr, sh->dest))) { + ipvs = net_ipvs(net); + rcu_read_lock(); + if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, sh->dest))) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -47,106 +53,119 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
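Collapsing the register calls in ip_vs_protocol_net_init() into an array plus a loop also fixes the error handling: previously the return values were ignored, whereas now the first failing registration unwinds whatever was already registered. The shape of the pattern (assuming, as the diff implies, that the cleanup routine tolerates a partially-initialized table):

        static int init_all(struct net *net)
        {
                int i, ret;

                for (i = 0; i < ARRAY_SIZE(protos); i++) {
                        ret = register_one(net, protos[i]);
                        if (ret < 0)
                                goto cleanup;
                }
                return 0;

        cleanup:
                /* Must cope with only protos[0..i-1] being registered. */
                cleanup_all(net);
                return ret;
        }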
*/ - *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pd); - else { - ip_vs_service_put(svc); + *verdict = ip_vs_leave(svc, skb, pd, iph); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } +static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph, + unsigned int sctphoff) +{ + sctph->checksum = sctp_compute_cksum(skb, sctphoff); + skb->ip_summed = CHECKSUM_UNNECESSARY; +} + static int -sctp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { sctp_sctphdr_t *sctph; - unsigned int sctphoff; - struct sk_buff *iter; - __be32 crc32; + unsigned int sctphoff = iph->len; + bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - sctphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - sctphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + ret = ip_vs_app_pkt_out(cp, skb); + if (ret == 0) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; - sctph->source = cp->vport; - /* Calculate the checksum */ - crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); - skb_walk_frags(skb, iter) - crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), - crc32); - crc32 = sctp_end_cksum(crc32); - sctph->checksum = crc32; + /* Only update csum if we really have to */ + if (sctph->source != cp->vport || payload_csum || + skb->ip_summed == CHECKSUM_PARTIAL) { + sctph->source = cp->vport; + sctp_nat_csum(skb, sctph, sctphoff); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } return 1; } static int -sctp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { sctp_sctphdr_t *sctph; - unsigned int sctphoff; - struct sk_buff *iter; - __be32 crc32; + unsigned int sctphoff = iph->len; + bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - sctphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - sctphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_in(cp, skb)) + ret = ip_vs_app_pkt_in(cp, skb); + if (ret == 0) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; - sctph->dest = cp->dport; - /* Calculate the checksum */ - crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); - skb_walk_frags(skb, iter) - crc32 = 
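The rewritten SNAT/DNAT handlers only recompute the SCTP checksum when something actually invalidated it: CRC32c over the whole packet is expensive, and rewriting a port to the value it already holds, with no payload mangling, leaves the existing checksum valid. Roughly the logic of sctp_nat_csum() and its call site (a sketch, not the exact kernel code):

        static void nat_sctp_port(struct sk_buff *skb, struct sctphdr *sctph,
                                  unsigned int sctphoff, __be16 new_port,
                                  bool payload_csum)
        {
                if (sctph->source != new_port || payload_csum ||
                    skb->ip_summed == CHECKSUM_PARTIAL) {
                        sctph->source = new_port;
                        /* full CRC32c walk, then tell the stack it's done */
                        sctph->checksum = sctp_compute_cksum(skb, sctphoff);
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
                } else {
                        /* nothing changed; the old checksum still holds */
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
                }
        }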
sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), - crc32); - crc32 = sctp_end_cksum(crc32); - sctph->checksum = crc32; + /* Only update csum if we really have to */ + if (sctph->dest != cp->dport || payload_csum || + (skb->ip_summed == CHECKSUM_PARTIAL && + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { + sctph->dest = cp->dport; + sctp_nat_csum(skb, sctph, sctphoff); + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } return 1; } @@ -156,10 +175,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int sctphoff; struct sctphdr *sh, _sctph; - struct sk_buff *iter; - __le32 cmp; - __le32 val; - __u32 tmp; + __le32 cmp, val; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) @@ -173,13 +189,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 0; cmp = sh->checksum; - - tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); - skb_walk_frags(skb, iter) - tmp = sctp_update_cksum((__u8 *) iter->data, - skb_headlen(iter), tmp); - - val = sctp_end_cksum(tmp); + val = sctp_compute_cksum(skb, sctphoff); if (val != cmp) { /* CRC failure, dump it. */ @@ -190,710 +200,159 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 1; } -struct ipvs_sctp_nextstate { - int next_state; -}; enum ipvs_sctp_event_t { - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_SER, - IP_VS_SCTP_EVE_INIT_CLI, - IP_VS_SCTP_EVE_INIT_SER, - IP_VS_SCTP_EVE_INIT_ACK_CLI, - IP_VS_SCTP_EVE_INIT_ACK_SER, - IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, - IP_VS_SCTP_EVE_COOKIE_ECHO_SER, - IP_VS_SCTP_EVE_COOKIE_ACK_CLI, - IP_VS_SCTP_EVE_COOKIE_ACK_SER, - IP_VS_SCTP_EVE_ABORT_CLI, - IP_VS_SCTP_EVE__ABORT_SER, - IP_VS_SCTP_EVE_SHUT_CLI, - IP_VS_SCTP_EVE_SHUT_SER, - IP_VS_SCTP_EVE_SHUT_ACK_CLI, - IP_VS_SCTP_EVE_SHUT_ACK_SER, - IP_VS_SCTP_EVE_SHUT_COM_CLI, - IP_VS_SCTP_EVE_SHUT_COM_SER, - IP_VS_SCTP_EVE_LAST + IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ + IP_VS_SCTP_INIT, + IP_VS_SCTP_INIT_ACK, + IP_VS_SCTP_COOKIE_ECHO, + IP_VS_SCTP_COOKIE_ACK, + IP_VS_SCTP_SHUTDOWN, + IP_VS_SCTP_SHUTDOWN_ACK, + IP_VS_SCTP_SHUTDOWN_COMPLETE, + IP_VS_SCTP_ERROR, + IP_VS_SCTP_ABORT, + IP_VS_SCTP_EVENT_LAST }; -static enum ipvs_sctp_event_t sctp_events[255] = { - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_INIT_CLI, - IP_VS_SCTP_EVE_INIT_ACK_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_ABORT_CLI, - IP_VS_SCTP_EVE_SHUT_CLI, - IP_VS_SCTP_EVE_SHUT_ACK_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, - IP_VS_SCTP_EVE_COOKIE_ACK_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_DATA_CLI, - IP_VS_SCTP_EVE_SHUT_COM_CLI, +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { + [SCTP_CID_DATA] = IP_VS_SCTP_DATA, + [SCTP_CID_INIT] = IP_VS_SCTP_INIT, + [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, + [SCTP_CID_SACK] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, + [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, + [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, + [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, + [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, + [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, + [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, + [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, + [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, + [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, }; -static struct ipvs_sctp_nextstate - sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = { - /* - * STATE 
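Indexing a designated-initializer array by SCTP chunk type replaces a chain of per-chunk branches, and because IP_VS_SCTP_DATA is deliberately defined as 0, every chunk type not listed in sctp_events[] falls through to the data event. A lookup therefore only has to guard the array bound, something like:

        static inline int sctp_event_of(unsigned char cid)
        {
                /* Unlisted slots are zero-initialized, i.e. IP_VS_SCTP_DATA;
                 * out-of-range chunk ids are treated the same way here
                 * (an assumption of this sketch). */
                if (cid >= ARRAY_SIZE(sctp_events))
                        return IP_VS_SCTP_DATA;
                return sctp_events[cid];
        }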
: IP_VS_SCTP_S_NONE - */ - /*next state *//*event */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }, - }, - /* - * STATE : IP_VS_SCTP_S_INIT_CLI - * Cient sent INIT and is waiting for reply from server(In ECHO_WAIT) - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ECHO_CLI */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_ECHO_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_INIT_SER - * Server sent INIT and waiting for INIT ACK from the client - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_INIT_ACK_CLI - * Client sent INIT 
ACK and waiting for ECHO from the server - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK has been resent by the client, let us stay is in - * the same state - */ - {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK sent by the server, close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * ECHO by client, it should not happen, close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO by server, this is what we are expecting, move to ECHO_SER - */ - {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, it should not happen, close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * Unexpected COOKIE ACK from server, staty in the same state - */ - {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_INIT_ACK_SER - * Server sent INIT ACK and waiting for ECHO from the client - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * Unexpected INIT_ACK by the client, let us close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK resent by the server, let us move to same state - */ - {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client send the ECHO, this is what we are expecting, - * move to ECHO_CLI - */ - {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO received from the server, Not sure what to do, - * let us close it - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, let us stay in the same state - */ - {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * COOKIE ACK from server, hmm... this should not happen, lets close - * the connection. 
- */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_ECHO_CLI - * Cient sent ECHO and waiting COOKEI ACK from the Server - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK has been by the client, let us close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client resent the ECHO, let us stay in the same state - */ - {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO received from the server, Not sure what to do, - * let us close it - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, this shoud not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * COOKIE ACK from server, this is what we are awaiting,lets move to - * ESTABLISHED. - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_ECHO_SER - * Server sent ECHO and waiting COOKEI ACK from the client - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. 
Stay in the same state - */ - {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - /* - * INIT_ACK has been by the server, let us close the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent the ECHO, not sure what to do, let's close the - * connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - /* - * ECHO resent by the server, stay in the same state - */ - {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, this is what we are expecting, let's move - * to ESTABLISHED. - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - /* - * COOKIE ACK from server, this should not happen, lets close the - * connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_ESTABLISHED - * Association established - */ - {{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. 
So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this should not happen, let's close - * the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_SHUT_CLI - * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ - - {{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. 
So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN resent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this should not happen, let's close - * the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server sent SHUTDOWN ACK, this is what we are expecting, let's move - * to SHUDOWN_ACK_SER - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_SHUT_SER - * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ - - {{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. 
So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN resent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN resent from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this is what we are expecting, let's - * move to SHUT_ACK_CLI - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server sent SHUTDOWN ACK, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - - /* - * State : IP_VS_SCTP_S_SHUT_ACK_CLI - * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ - - {{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. 
So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN sent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN sent from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client resent SHUDTDOWN_ACK, let's stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server sent SHUTDOWN ACK, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this should not happen, let's close the - * connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - /* - * SHUTDOWN COMPLETE from server this is what we are expecting. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - - /* - * State : IP_VS_SCTP_S_SHUT_ACK_SER - * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client - */ - /* - * We received the data chuck, keep the state unchanged. I assume - * that still data chuncks can be received by both the peers in - * SHUDOWN state - */ +/* SCTP States: + * See RFC 2960, 4. SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. + * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ - {{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, - /* - * We have got an INIT from client. From the spec.“Upon receipt of - * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with - * an INIT ACK using the same parameters it sent in its original - * INIT chunk (including its Initiate Tag, unchanged”). - */ - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - /* - * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, - * “If an INIT ACK is received by an endpoint in any state other - * than the COOKIE-WAIT state, the endpoint should discard the - * INIT ACK chunk”. 
Stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - /* - * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the - * peer and peer shall move to the ESTABISHED. if it doesn't handle - * it will send ERROR chunk. So, stay in the same state - */ - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - /* - * COOKIE ACK from client, not sure what to do stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - /* - * SHUTDOWN sent from the client, move to SHUDDOWN_CLI - */ - {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - /* - * SHUTDOWN sent from the server, move to SHUTDOWN_SER - */ - {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, - /* - * client sent SHUDTDOWN_ACK, this should not happen let's close - * the connection. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - /* - * Server resent SHUTDOWN ACK, stay in the same state - */ - {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - /* - * SHUTDOWN COM from client, this what we are expecting, let's close - * the connection - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - /* - * SHUTDOWN COMPLETE from server this should not happen. - */ - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - }, - /* - * State : IP_VS_SCTP_S_CLOSED - */ - {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, - {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, - {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, - {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } - } +#define sNO IP_VS_SCTP_S_NONE +#define sI1 IP_VS_SCTP_S_INIT1 +#define sIN IP_VS_SCTP_S_INIT +#define sCS IP_VS_SCTP_S_COOKIE_SENT +#define sCR IP_VS_SCTP_S_COOKIE_REPLIED +#define sCW IP_VS_SCTP_S_COOKIE_WAIT +#define sCO IP_VS_SCTP_S_COOKIE +#define sCE IP_VS_SCTP_S_COOKIE_ECHOED +#define sES IP_VS_SCTP_S_ESTABLISHED +#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT +#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ IP_VS_SCTP_S_REJECTED +#define sCL IP_VS_SCTP_S_CLOSED + +static const __u8 sctp_states + [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { + { /* INPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, 
+/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* OUTPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, +/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, +/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* INPUT-ONLY */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, }; -/* - * Timeout table[state] - */ +#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) + +/* Timeout table[state] */ static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { - [IP_VS_SCTP_S_NONE] = 2 * HZ, - [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_INIT_ACK_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_INIT_ACK_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_ECHO_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_ECHO_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_ACK_CLI] = 1 * 60 * HZ, - [IP_VS_SCTP_S_SHUT_ACK_SER] = 1 * 60 * HZ, - [IP_VS_SCTP_S_CLOSED] = 10 * HZ, - [IP_VS_SCTP_S_LAST] = 2 * HZ, + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_WAIT] 
= IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_LAST] = 2 * HZ, }; static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { - [IP_VS_SCTP_S_NONE] = "NONE", - [IP_VS_SCTP_S_INIT_CLI] = "INIT_CLI", - [IP_VS_SCTP_S_INIT_SER] = "INIT_SER", - [IP_VS_SCTP_S_INIT_ACK_CLI] = "INIT_ACK_CLI", - [IP_VS_SCTP_S_INIT_ACK_SER] = "INIT_ACK_SER", - [IP_VS_SCTP_S_ECHO_CLI] = "COOKIE_ECHO_CLI", - [IP_VS_SCTP_S_ECHO_SER] = "COOKIE_ECHO_SER", - [IP_VS_SCTP_S_ESTABLISHED] = "ESTABISHED", - [IP_VS_SCTP_S_SHUT_CLI] = "SHUTDOWN_CLI", - [IP_VS_SCTP_S_SHUT_SER] = "SHUTDOWN_SER", - [IP_VS_SCTP_S_SHUT_ACK_CLI] = "SHUTDOWN_ACK_CLI", - [IP_VS_SCTP_S_SHUT_ACK_SER] = "SHUTDOWN_ACK_SER", - [IP_VS_SCTP_S_CLOSED] = "CLOSED", - [IP_VS_SCTP_S_LAST] = "BUG!" + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT1] = "INIT1", + [IP_VS_SCTP_S_INIT] = "INIT", + [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", + [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", + [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", + [IP_VS_SCTP_S_COOKIE] = "COOKIE", + [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", + [IP_VS_SCTP_S_REJECTED] = "REJECTED", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + [IP_VS_SCTP_S_LAST] = "BUG!", }; @@ -913,7 +372,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, sctp_chunkhdr_t _sctpch, *sch; unsigned char chunk_type; int event, next_state; - int ihl; + int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -921,8 +380,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ihl = ip_hdrlen(skb); #endif - sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), - sizeof(_sctpch), &_sctpch); + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; @@ -940,25 +399,30 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { - sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + - sch->length), sizeof(_sctpch), &_sctpch); - if (sch) { - if (sch->type == SCTP_CID_ABORT) + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } - event = sctp_events[chunk_type]; + event = (chunk_type < sizeof(sctp_events)) ? 
+ sctp_events[chunk_type] : IP_VS_SCTP_DATA; - /* - * If the direction is IP_VS_DIR_OUTPUT, this event is from server + /* Update direction to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected */ - if (direction == IP_VS_DIR_OUTPUT) - event++; - /* - * get next state - */ - next_state = sctp_states_table[cp->state][event].next_state; + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (direction == IP_VS_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + direction = IP_VS_DIR_INPUT_ONLY; + } + + next_state = sctp_states[direction][event][cp->state]; if (next_state != cp->state) { struct ip_vs_dest *dest = cp->dest; @@ -999,9 +463,9 @@ static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_sctp_state(pd, cp, direction, skb); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) @@ -1021,30 +485,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc) hash = sctp_app_hashkey(port); - spin_lock_bh(&ipvs->sctp_app_lock); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->sctp_app_lock); return ret; } static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); - spin_lock_bh(&ipvs->sctp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->sctp_app_lock); + list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) @@ -1060,12 +519,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - spin_lock(&ipvs->sctp_app_lock); - list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -1081,7 +540,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); out: return result; } @@ -1090,14 +549,16 @@ out: * timeouts is netns related now. 
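A rough userspace model of the direction handling above may help: set_sctp_state() now folds the connection's NOOUTPUT flag into the direction before indexing sctp_states[direction][event][cp->state]. The sketch below compiles stand-alone; the enum values, flag constant and helper name are illustrative stand-ins, not the kernel API.

#include <stdio.h>

enum { DIR_INPUT, DIR_OUTPUT, DIR_INPUT_ONLY };  /* IP_VS_DIR_* stand-ins */

#define F_NOOUTPUT 0x1   /* stand-in for IP_VS_CONN_F_NOOUTPUT */

/* A connection marked NOOUTPUT is classified as INPUT_ONLY until the
 * first reply packet is seen, at which point the flag is cleared and
 * normal INPUT/OUTPUT classification resumes.
 */
static int classify_direction(unsigned int *flags, int direction)
{
	if (*flags & F_NOOUTPUT) {
		if (direction == DIR_OUTPUT)
			*flags &= ~F_NOOUTPUT;   /* first reply seen */
		else
			direction = DIR_INPUT_ONLY;
	}
	return direction;
}

int main(void)
{
	unsigned int flags = F_NOOUTPUT;

	printf("%d\n", classify_direction(&flags, DIR_INPUT));  /* 2: INPUT_ONLY */
	printf("%d\n", classify_direction(&flags, DIR_OUTPUT)); /* 1: flag cleared */
	printf("%d\n", classify_direction(&flags, DIR_INPUT));  /* 0: plain INPUT */
	return 0;
}

The resulting direction then selects one of the three sub-tables (INPUT, OUTPUT, INPUT-ONLY) shown earlier in this hunk.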
* --------------------------------------------- */ -static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); - spin_lock_init(&ipvs->sctp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index ef8641f7af8..e3a697234a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -33,33 +33,34 @@ static int tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; - struct ip_vs_iphdr iph; + struct netns_ipvs *ipvs; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } net = skb_net(skb); + ipvs = net_ipvs(net); /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ - if (th->syn && - (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, - &iph.daddr, th->dest))) { + rcu_read_lock(); + if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, th->dest))) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -68,18 +69,17 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
*/ - *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pd); - else { - ip_vs_service_put(svc); + *verdict = ip_vs_leave(svc, skb, pd, iph); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -128,20 +128,18 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph, static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; - unsigned int tcphoff; + unsigned int tcphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - tcphoff = ip_hdrlen(skb); oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ @@ -208,20 +206,18 @@ tcp_snat_handler(struct sk_buff *skb, static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; - unsigned int tcphoff; + unsigned int tcphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - tcphoff = ip_hdrlen(skb); oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ @@ -407,7 +403,7 @@ static struct tcp_states_t tcp_states [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ @@ -421,7 +417,7 @@ static struct tcp_states_t tcp_states [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; @@ -430,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ @@ -444,7 +440,7 @@ static struct tcp_states_t tcp_states_dos [] = { /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, 
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; @@ -563,9 +559,9 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, if (th == NULL) return; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) @@ -586,18 +582,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) hash = tcp_app_hashkey(port); - spin_lock_bh(&ipvs->tcp_app_lock); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->tcp_app_lock); return ret; } @@ -605,13 +599,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock_bh(&ipvs->tcp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->tcp_app_lock); + list_del_rcu(&inc->p_list); } @@ -630,12 +621,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&ipvs->tcp_app_lock); - list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -652,7 +643,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); out: return result; @@ -666,26 +657,28 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. 
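The sNO+ack transitions changed above (sCL replaced by sES in all three ack rows) pair with the new scheduling test in tcp_conn_schedule(): when the sloppy TCP sysctl is enabled, a mid-stream ACK may create a connection, and the table change then moves that picked-up connection straight to ESTABLISHED on a backup director. A minimal sketch of the test, with plain-int arguments standing in for the TCP header flags and the sysctl value:

#include <stdio.h>

/* Models the condition in tcp_conn_schedule():
 *   (th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst
 */
static int may_schedule(int syn, int rst, int sloppy_tcp)
{
	return (syn || sloppy_tcp) && !rst;
}

int main(void)
{
	printf("%d\n", may_schedule(1, 0, 0));  /* SYN: schedule (old behaviour) */
	printf("%d\n", may_schedule(0, 0, 0));  /* plain ACK, strict mode: no */
	printf("%d\n", may_schedule(0, 0, 1));  /* plain ACK, sloppy mode: yes */
	printf("%d\n", may_schedule(0, 1, 1));  /* RST never schedules */
	return 0;
}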
* --------------------------------------------- */ -static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); - spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; pd->tcp_state_table = tcp_states; + return 0; } static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f4b7262896b..b62a3c0ff9b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -30,23 +30,23 @@ static int udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - int *verdict, struct ip_vs_conn **cpp) + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) { struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; - struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } net = skb_net(skb); - svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, - &iph.daddr, uh->dest); + rcu_read_lock(); + svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, uh->dest); if (svc) { int ignored; @@ -55,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -64,18 +64,17 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
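Both the TCP and UDP conn_schedule paths above now pin the service lookup with rcu_read_lock() instead of the old ip_vs_service_get()/ip_vs_service_put() reference counting. A compilable sketch of the pattern, using no-op mocks for the RCU primitives and a toy lookup (service_find() here is a stand-in, not the kernel's ip_vs_service_find()):

#include <stdio.h>

/* No-op mocks so this control-flow sketch compiles in userspace; in
 * the kernel these are the real rcu_read_lock()/rcu_read_unlock().
 */
static void rcu_read_lock(void)   { }
static void rcu_read_unlock(void) { }

struct service { int port; };

static struct service the_svc = { 80 };

/* Stand-in for ip_vs_service_find(): the returned pointer is only
 * guaranteed to stay valid while the RCU read side is held.
 */
static struct service *service_find(int port)
{
	return port == the_svc.port ? &the_svc : NULL;
}

int main(void)
{
	struct service *svc;

	rcu_read_lock();            /* replaces the get/put refcount pair */
	svc = service_find(80);
	if (svc)
		printf("schedule on service port %d\n", svc->port);
	rcu_read_unlock();          /* all uses of svc happen before this */
	return 0;
}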
*/ - *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) - *verdict = ip_vs_leave(svc, skb, pd); - else { - ip_vs_service_put(svc); + *verdict = ip_vs_leave(svc, skb, pd, iph); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -125,20 +124,18 @@ udp_partial_csum_update(int af, struct udphdr *uhdr, static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; - unsigned int udphoff; + unsigned int udphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - udphoff = ip_hdrlen(skb); oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ @@ -210,20 +207,18 @@ udp_snat_handler(struct sk_buff *skb, static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; - unsigned int udphoff; + unsigned int udphoff = iph->len; int oldlen; int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; #endif - udphoff = ip_hdrlen(skb); oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ @@ -364,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) hash = udp_app_hashkey(port); - - spin_lock_bh(&ipvs->udp_app_lock); list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->udp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->udp_app_lock); return ret; } @@ -385,12 +377,9 @@ static void udp_unregister_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); - struct netns_ipvs *ipvs = net_ipvs(net); - spin_lock_bh(&ipvs->udp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->udp_app_lock); + list_del_rcu(&inc->p_list); } @@ -408,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&ipvs->udp_app_lock); - list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -430,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); out: return result; @@ -467,14 +456,16 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs 
*ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); - spin_lock_init(&ipvs->udp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index c49b388d108..176b87c35e3 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) { - svc->sched_data = &svc->destinations; + struct list_head *p; + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + /* dest is already unlinked, so p->prev is not valid but + * p->next is valid, use it to reach previous entry. + */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -46,38 +55,44 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc) * Round-Robin Scheduling */ static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { - struct list_head *p, *q; - struct ip_vs_dest *dest; + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); + do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. 
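The ip_vs_rr_del_dest() hunk above relies on a subtle property of the unlink: the removed destination's own next pointer still points into the live list, so p->next->prev lands on the live predecessor even though p->prev is no longer valid. A self-contained model of the cursor fix, using a plain C doubly linked list instead of the kernel's list.h:

#include <stdio.h>

struct node { struct node *prev, *next; int id; };

int main(void)
{
	struct node head, a, b, c;
	struct node *cursor;

	/* circular list: head <-> a <-> b <-> c <-> head */
	head.id = -1; a.id = 0; b.id = 1; c.id = 2;
	head.next = &a; a.next = &b; b.next = &c; c.next = &head;
	head.prev = &c; c.prev = &b; b.prev = &a; a.prev = &head;

	cursor = &b;                /* svc->sched_data analogue points at b */

	/* list_del_rcu()-style unlink of b: the neighbours are repaired,
	 * but b->next is left pointing into the live list.
	 */
	b.prev->next = b.next;
	b.next->prev = b.prev;

	/* the del_dest fix: b.prev is no longer trustworthy, but b.next
	 * is valid, so b.next->prev reaches the live predecessor (a)
	 */
	if (cursor == &b)
		cursor = b.next->prev;

	printf("cursor moved to id %d\n", cursor->id);   /* prints 0 (a) */
	return 0;
}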
+ */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), @@ -94,7 +109,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, .schedule = ip_vs_rr_schedule, }; @@ -106,6 +122,7 @@ static int __init ip_vs_rr_init(void) static void __exit ip_vs_rr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); } module_init(ip_vs_rr_init); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 08dbdd5bc18..4dbcda6258b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err); */ static LIST_HEAD(ip_vs_schedulers); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_sched_lock); +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); /* @@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, { int ret; - svc->scheduler = scheduler; - if (scheduler->init_service) { ret = scheduler->init_service(svc); if (ret) { @@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, return ret; } } - + rcu_assign_pointer(svc->scheduler, scheduler); return 0; } @@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, /* * Unbind a service with its scheduler */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) { - struct ip_vs_scheduler *sched = svc->scheduler; + struct ip_vs_scheduler *cur_sched; - if (!sched) - return 0; + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - pr_err("%s(): done error\n", __func__); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can not be set to NULL */ } @@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return sched; } if (sched->module) module_put(sched->module); } - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return NULL; } @@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - svc->scheduler->name, svc->fwmark, - svc->fwmark, msg); + sched->name, 
svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { - IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } @@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (!list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_err("%s(): [%s] scheduler is not in the list. failed\n", __func__, scheduler->name); return -EINVAL; @@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 89ead246ed3..e446b9fa742 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -44,7 +44,7 @@ #include <net/ip_vs.h> -static inline unsigned int +static inline int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { /* @@ -59,10 +59,11 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -79,7 +80,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. 
*/ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -94,12 +95,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -134,6 +135,7 @@ static int __init ip_vs_sed_init(void) static void __exit ip_vs_sed_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); + synchronize_rcu(); } module_init(ip_vs_sed_init); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 069e8d4d5c0..cc65b2f42cd 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -48,12 +48,16 @@ #include <net/ip_vs.h> +#include <net/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> + /* * IPVS SH bucket */ struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -66,11 +70,24 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) +struct ip_vs_sh_state { + struct rcu_head rcu_head; + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; +}; + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} /* * Returns hash value for IPVS SH entry */ -static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, unsigned int offset) { __be32 addr_fold = addr->ip; @@ -79,7 +96,8 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; + return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + IP_VS_SH_TAB_MASK; } @@ -87,38 +105,91 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) { - return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; + unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; } +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. 
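The fallback walk described above can be modelled in a few lines of standalone C. The table size, hash mix and dest layout below are simplified stand-ins; only the probing order (offsets starting at the initial hash, stopping on an empty bucket) mirrors ip_vs_sh_get_fallback():

#include <stdio.h>
#include <stdbool.h>

#define TAB_SIZE 8
#define TAB_MASK (TAB_SIZE - 1)

struct dest { int id; bool unavailable; };

static struct dest *buckets[TAB_SIZE];

/* Simplified stand-in for ip_vs_sh_hashkey(): fold address, port and a
 * probe offset with the same multiplier the kernel table uses.
 */
static unsigned int hashkey(unsigned int addr, unsigned int port,
			    unsigned int offset)
{
	return (offset + (port + addr) * 2654435761UL) & TAB_MASK;
}

static struct dest *sh_get_fallback(unsigned int addr, unsigned int port)
{
	unsigned int ihash = hashkey(addr, port, 0);
	unsigned int offset, roffset, hash;
	struct dest *d = buckets[ihash];

	if (!d || !d->unavailable)
		return d;                /* first choice is fine (or empty) */

	/* deterministic "random" walk: restart the probe at the original
	 * hash and rehash with each offset until a usable dest turns up
	 */
	for (offset = 0; offset < TAB_SIZE; offset++) {
		roffset = (offset + ihash) % TAB_SIZE;
		hash = hashkey(addr, port, roffset);
		d = buckets[hash];
		if (!d)
			break;           /* empty bucket ends the search */
		if (!d->unavailable)
			return d;
	}
	return NULL;
}

int main(void)
{
	struct dest d0 = { 0, true }, d1 = { 1, false };
	unsigned int i;
	struct dest *d;

	for (i = 0; i < TAB_SIZE; i++)
		buckets[i] = (i & 1) ? &d1 : &d0;   /* odd buckets usable */

	d = sh_get_fallback(0x0a000001, 5060);
	printf("%s\n", d ? "picked fallback dest" : "no dest");
	return 0;
}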
+ */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* first try the dest it's supposed to go to */ + ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + dest = rcu_dereference(s->buckets[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + /* if the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; + hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); + dest = rcu_dereference(s->buckets[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, "SH: selected unavailable " + "server %s:%d (offset %d), reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + /* * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; int d_count; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", i, IP_VS_DBG_ADDR(svc->af, &dest->addr), @@ -140,16 +211,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
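Each bucket update in ip_vs_sh_reassign() above follows a drop-old, hold-new, publish sequence on the __rcu dest pointer. A toy refcount model of one such update; plain assignment stands in for RCU_INIT_POINTER()/rcu_dereference_protected(), which is a fair simplification here because the kernel writes the buckets under the service lock:

#include <stdio.h>

struct dest { int refcnt; int id; };

static void dest_hold(struct dest *d) { d->refcnt++; }
static void dest_put(struct dest *d)  { d->refcnt--; }

/* One bucket update: drop the reference pinned by the old cached dest,
 * take a reference on the new one, then publish it in the slot.
 */
static void bucket_update(struct dest **slot, struct dest *new_dest)
{
	struct dest *old = *slot;

	if (old)
		dest_put(old);
	if (new_dest)
		dest_hold(new_dest);
	*slot = new_dest;
}

int main(void)
{
	struct dest a = { 1, 0 }, b = { 1, 1 };  /* one base ref each */
	struct dest *bucket = NULL;

	bucket_update(&bucket, &a);
	bucket_update(&bucket, &b);              /* a's extra ref dropped */
	printf("a.refcnt=%d b.refcnt=%d bucket=%d\n",
	       a.refcnt, b.refcnt, bucket->id);  /* 1, 2, 1 */
	return 0;
}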
*/ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -158,63 +231,84 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - svc->sched_data = tbl; + svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); return 0; } -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; + struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + ip_vs_sh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; } -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + ip_vs_sh_reassign(s, svc); return 0; } -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. 
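ip_vs_sh_reassign(), quoted above, is now the single rebuild path, invoked from add_dest/del_dest/upd_dest: it walks the destination list and stamps consecutive buckets, giving each server a run of buckets proportional to its weight (the weight advance sits just past the quoted hunk), taking a reference per bucket and publishing the pointers with RCU_INIT_POINTER(). The fill order alone, minus the refcount and RCU machinery, models as follows (a sketch; arrays stand in for the kernel's linked list):

#include <stddef.h>

struct dest {
	int weight;
};

/* Fill n buckets from m dests round-robin, repeating each dest for
 * "weight" consecutive buckets so higher-weight servers own a larger
 * share of the hash space. */
static void sh_reassign(struct dest *buckets[], size_t n,
			struct dest *dests[], size_t m)
{
	size_t d = 0;
	int d_count = 0;

	for (size_t i = 0; i < n; i++) {
		if (m == 0) {
			buckets[i] = NULL;	/* empty service */
			continue;
		}
		buckets[i] = dests[d];
		if (++d_count >= dests[d]->weight) {
			d = (d + 1) % m;	/* next destination */
			d_count = 0;
		}
	}
}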
- */ -static inline int is_overloaded(struct ip_vs_dest *dest) +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - return dest->flags & IP_VS_DEST_F_OVERLOAD; + __be16 port; + struct tcphdr _tcph, *th; + struct udphdr _udph, *uh; + sctp_sctphdr_t _sctph, *sh; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (unlikely(th == NULL)) + return 0; + port = th->source; + break; + case IPPROTO_UDP: + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (unlikely(uh == NULL)) + return 0; + port = uh->source; + break; + case IPPROTO_SCTP: + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(sh == NULL)) + return 0; + port = sh->source; + break; + default: + port = 0; + } + + return port; } @@ -222,28 +316,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest) * Source Hashing scheduling */ static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + struct ip_vs_sh_state *s; + __be16 port = 0; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { + if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) + port = ip_vs_sh_get_port(skb, iph); + + s = (struct ip_vs_sh_state *) svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) + dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + else + dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + + if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), + IP_VS_DBG_ADDR(svc->af, &iph->saddr), IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); @@ -262,7 +360,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; @@ -276,6 +376,7 @@ static int __init ip_vs_sh_init(void) static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 8a0d6d6889f..db801263ee9 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -196,6 +196,7 @@ struct ip_vs_sync_thread_data { struct net *net; struct socket *sock; char *buf; + int id; }; /* Version 0 definition of packet sizes */ @@ -245,7 +246,7 @@ struct ip_vs_sync_thread_data { struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; - __u16 size; + __be16 size; /* ip_vs_sync_conn entries start here */ }; @@ -254,7 +255,7 @@ struct ip_vs_sync_mesg_v0 { struct ip_vs_sync_mesg { __u8 reserved; /* must be zero */ __u8 syncid; - __u16 size; + __be16 size; __u8 nr_conns; __s8 version; /* SYNC_PROTO_VER */ __u16 
spare; @@ -271,13 +272,6 @@ struct ip_vs_sync_buff { unsigned char *end; }; -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), -}; - /* * Copy of struct ip_vs_seq * From unaligned network order to aligned host order @@ -300,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) put_unaligned_be32(ho->previous_delta, &no->previous_delta); } -static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_lock); - if (list_empty(&ipvs->sync_queue)) { + if (list_empty(&ms->sync_queue)) { sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); } else { - sb = list_entry(ipvs->sync_queue.next, - struct ip_vs_sync_buff, + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; } spin_unlock_bh(&ipvs->sync_lock); @@ -334,10 +332,10 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs) kfree(sb); return NULL; } - sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ipvs->master_syncid; - sb->mesg->size = sizeof(struct ip_vs_sync_mesg); + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); @@ -353,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct netns_ipvs *ipvs) +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) { - struct ip_vs_sync_buff *sb = ipvs->sync_buff; + struct ip_vs_sync_buff *sb = ms->sync_buff; spin_lock(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ipvs->sync_queue); - else + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) + wake_up_process(ms->master_thread); + } else ip_vs_sync_buff_release(sb); spin_unlock(&ipvs->sync_lock); } @@ -370,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs) * than the specified time or the specified time is zero. 
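ip_vs_sh_get_port(), quoted in the ip_vs_sh.c hunk above, relies on TCP, UDP and SCTP all storing the source port in the first two bytes of their headers, and on skb_header_pointer() to copy those bytes out safely even when the header is paged or fragmented. The user-space analogue is a bounds-checked memcpy (a sketch; names hypothetical):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <netinet/in.h>

/* Pull the L4 source port out of a raw packet; l4off is the transport
 * header offset (iph->len in the kernel code). Returns 0 for truncated
 * headers or port-less protocols, like the kernel helper. */
static uint16_t get_source_port(const uint8_t *pkt, size_t pktlen,
				size_t l4off, uint8_t protocol)
{
	uint16_t port_be;

	switch (protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		if (l4off + sizeof(port_be) > pktlen)
			return 0;
		memcpy(&port_be, pkt + l4off, sizeof(port_be));
		return port_be;		/* kept in network byte order */
	default:
		return 0;
	}
}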
*/ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_buff_lock); - if (ipvs->sync_buff && - time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { - sb = ipvs->sync_buff; - ipvs->sync_buff = NULL; + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); } else sb = NULL; spin_unlock_bh(&ipvs->sync_buff_lock); return sb; } -/* - * Switch mode from sending version 0 or 1 - * - must handle sync_buf - */ -void ip_vs_sync_switch_mode(struct net *net, int mode) +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(net); - - if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) - return; - if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) - return; - - spin_lock_bh(&ipvs->sync_buff_lock); - /* Buffer empty ? then let buf_create do the job */ - if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { - kfree(ipvs->sync_buff); - ipvs->sync_buff = NULL; - } else { - spin_lock_bh(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&ipvs->sync_buff->list, - &ipvs->sync_queue); - else - ip_vs_sync_buff_release(ipvs->sync_buff); - spin_unlock_bh(&ipvs->sync_lock); - } - spin_unlock_bh(&ipvs->sync_buff_lock); + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; } /* @@ -435,22 +418,121 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; mesg->syncid = ipvs->master_syncid; - mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; sb->firstuse = jiffies; return sb; } +/* Check if connection is controlled by persistence */ +static inline bool in_persistence(struct ip_vs_conn *cp) +{ + for (cp = cp->control; cp; cp = cp->control) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + return true; + } + return false; +} + +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. 
Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) + return 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | + (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | + (1 << IP_VS_SCTP_S_CLOSED)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. + */ + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { + int retries = orig & 3; + + if (retries >= sysctl_sync_retries(ipvs)) + return 0; + if (time_before(now, orig - cp->timeout + + (sync_refresh_period >> 3))) + return 0; + n |= retries + 1; + } + } + sync_period = sysctl_sync_period(ipvs); + if (sync_period > 0) { + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && + pkts % sync_period != sysctl_sync_threshold(ipvs)) + return 0; + } else if (sync_refresh_period <= 0 && + pkts != sysctl_sync_threshold(ipvs)) + return 0; + +set: + cp->old_state = cp->state; + n = cmpxchg(&cp->sync_endtime, orig, n); + return n == orig || force; +} + /* * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. 
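select_master_thread_id(), shown above, spreads connections across the sync threads by hashing the connection pointer itself: the low bits below sizeof(*cp) are mostly allocator alignment, so they are shifted away (1 + ilog2(sizeof(*cp))) before masking with threads_mask. A given connection therefore always syncs through the same thread, preserving per-connection ordering. A user-space model (the struct size is a stand-in):

#include <stdint.h>
#include <stddef.h>

struct conn {
	char pad[192];	/* only sizeof() matters for the shift */
};

static unsigned int ilog2_sz(size_t x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

/* Strip the low-entropy pointer bits, then mask with (threads - 1),
 * assuming a power-of-two thread count. */
static unsigned int select_thread(const struct conn *cp,
				  unsigned int threads_mask)
{
	return ((uintptr_t)cp >> (1 + ilog2_sz(sizeof(*cp)))) & threads_mask;
}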
*/ -void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) +static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, + int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; int len; if (unlikely(cp->af != AF_INET)) @@ -459,21 +541,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return; - spin_lock(&ipvs->sync_buff_lock); - if (!ipvs->sync_buff) { - ipvs->sync_buff = - ip_vs_sync_buff_create_v0(ipvs); - if (!ipvs->sync_buff) { - spin_unlock(&ipvs->sync_buff_lock); + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + return; + + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + buff = ms->sync_buff; + if (buff) { + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + /* Send buffer if it is for v1 */ + if (!m->nr_conns) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + } + } + if (!buff) { + buff = ip_vs_sync_buff_create_v0(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; } len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; - s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + s = (struct ip_vs_sync_conn_v0 *) buff->head; /* copy members */ s->reserved = 0; @@ -493,19 +595,25 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) } m->nr_conns++; - m->size += len; - ipvs->sync_buff->head += len; + m->size = htons(ntohs(m->size) + len); + buff->head += len; /* check if there is a space for next one */ - if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + if (buff->head + FULL_CONN_SIZE > buff->end) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(net, cp->control); + cp = cp->control; + if (cp) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + ip_vs_sync_conn(net, cp->control, pkts); + } } /* @@ -513,23 +621,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) * Called by ip_vs_in. 
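In ip_vs_sync_conn_needed() above, cp->sync_endtime does double duty: the upper bits carry the next-refresh deadline rounded down to a multiple of 4 jiffies (the & ~3UL), and the two low bits count retries inside the current refresh period; cmpxchg() publishes the update so racing CPUs cannot double-count. A simplified lock-free model of just the packing, with C11 atomics (the force/state-change paths are omitted):

#include <stdatomic.h>
#include <stdbool.h>

/* Try to arm a new deadline with the retry counter bumped; returns
 * false once max_retries is exhausted for this refresh period. Losing
 * the compare-exchange means another CPU already refreshed the word. */
static bool bump_sync_retry(_Atomic unsigned long *endtime,
			    unsigned long now, unsigned long timeout,
			    int max_retries)
{
	unsigned long orig = atomic_load(endtime);
	unsigned long n = (now + timeout) & ~3UL;	/* low 2 bits clear */
	int retries = (int)(orig & 3);

	if (retries >= max_retries)
		return false;
	n |= (unsigned long)(retries + 1);
	return atomic_compare_exchange_strong(endtime, &orig, n);
}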
* Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; __u8 *p; unsigned int len, pe_name_len, pad; /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp); + ip_vs_sync_conn_v0(net, cp, pkts); return; } /* Do not sync ONE PACKET */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) goto control; sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + /* Sanity checks */ pe_name_len = 0; if (cp->pe_data_len) { @@ -540,7 +654,14 @@ sloop: pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); } - spin_lock(&ipvs->sync_buff_lock); + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -559,28 +680,33 @@ sloop: /* check if there is a space for this one */ pad = 0; - if (ipvs->sync_buff) { - pad = (4 - (size_t)ipvs->sync_buff->head) & 3; - if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; pad = 0; } } - if (!ipvs->sync_buff) { - ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); - if (!ipvs->sync_buff) { - spin_unlock(&ipvs->sync_buff_lock); + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; + m = buff->mesg; } - m = ipvs->sync_buff->mesg; - p = ipvs->sync_buff->head; - ipvs->sync_buff->head += pad + len; - m->size += pad + len; + p = buff->head; + buff->head += pad + len; + m->size = htons(ntohs(m->size) + pad + len); /* Add ev. padding from prev. sync_conn */ while (pad--) *(p++) = 0; @@ -637,23 +763,17 @@ sloop: } } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); control: /* synchronize its controller if it has */ cp = cp->control; if (!cp) return; - /* - * Reduce sync rate for templates - * i.e only increment in_pkts for Templates. 
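Two details above keep the v1 message well-formed as entries are appended: each variable-length entry starts on a 4-byte boundary, with the padding derived directly from the cursor as (4 - (size_t)head) & 3, and the now-__be16 size field is grown through an ntohs()/htons() round trip rather than converted once before sending. A user-space model of the append step (a sketch; the size field is assumed at offset 2, as in struct ip_vs_sync_mesg):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct sync_buf {
	unsigned char *base;	/* message start; size field at offset 2 */
	unsigned char *head;	/* append cursor */
	unsigned char *end;
};

/* Append one variable-length v1 entry: pad the cursor to a 4-byte
 * boundary first, then grow the big-endian size field in place. */
static int append_entry(struct sync_buf *b, const void *entry, uint16_t len)
{
	unsigned int pad = (4 - (uintptr_t)b->head) & 3;
	uint16_t size;

	if (b->head + pad + len > b->end)
		return -1;		/* caller queues buffer, starts anew */
	memset(b->head, 0, pad);	/* zero the alignment padding */
	memcpy(b->head + pad, entry, len);
	b->head += pad + len;

	memcpy(&size, b->base + 2, sizeof(size));
	size = htons(ntohs(size) + pad + len);
	memcpy(b->base + 2, &size, sizeof(size));
	return 0;
}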
- */ - if (cp->flags & IP_VS_CONN_F_TEMPLATE) { - int pkts = atomic_add_return(1, &cp->in_pkts); - - if (pkts % sysctl_sync_period(ipvs) != 1) - return; - } + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); goto sloop; } @@ -731,66 +851,46 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, else cp = ip_vs_ct_in_get(param); - if (cp && param->pe_data) /* Free pe_data */ + if (cp) { + /* Free pe_data */ kfree(param->pe_data); - if (!cp) { + + dest = cp->dest; + spin_lock_bh(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); + } else { /* * Find the appropriate destination for the connection. * If it is not found the connection will remain unbound * but still handled. */ + rcu_read_lock(); dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); - /* Set the approprite ativity flag */ - if (protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); - if (dest) - atomic_dec(&dest->refcnt); + rcu_read_unlock(); if (!cp) { if (param->pe_data) kfree(param->pe_data); IP_VS_DBG(2, "BACKUP, add new conn. 
failed\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } } if (opt) @@ -839,7 +939,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); for (i=0; i<m->nr_conns; i++) { - unsigned flags, state; + unsigned int flags, state; if (p + SIMPLE_CONN_SIZE > buffer+buflen) { IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); @@ -1088,10 +1188,8 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, IP_VS_DBG(2, "BACKUP, message header too short\n"); return; } - /* Convert size back to host byte order */ - m2->size = ntohs(m2->size); - if (buflen != m2->size) { + if (buflen != ntohs(m2->size)) { IP_VS_DBG(2, "BACKUP, bogus message size\n"); return; } @@ -1109,7 +1207,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, for (i=0; i<nr_conns; i++) { union ip_vs_sync_conn *s; - unsigned size; + unsigned int size; int retc; p = msg_end; @@ -1149,6 +1247,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, /* + * Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ + lock_sock(sk); + if (mode) { + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, + sysctl_wmem_max); + sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } else { + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, + sysctl_rmem_max); + sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +/* * Setup loopback of outgoing multicasts on a sending socket */ static void set_mcast_loop(struct sock *sk, u_char loop) @@ -1298,9 +1418,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net) +static struct socket *make_send_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1324,6 +1450,9 @@ static struct socket *make_send_sock(struct net *net) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { @@ -1349,9 +1478,15 @@ error: /* * Set up receiving 
multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net) +static struct socket *make_receive_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1368,7 +1503,10 @@ static struct socket *make_receive_sock(struct net *net) */ sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; + sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, sizeof(struct sockaddr)); @@ -1411,18 +1549,19 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) return len; } -static void +static int ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) { int msize; + int ret; - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); + msize = ntohs(msg->size); - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - pr_err("ip_vs_send_async error\n"); + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; } static int @@ -1438,48 +1577,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) iov.iov_base = buffer; iov.iov_len = (size_t)buflen; - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); if (len < 0) - return -1; + return len; LeaveFunction(7); return len; } +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ms->master_thread); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid); + "syncid = %d, id = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); - while (!kthread_should_stop()) { - while ((sb = sb_dequeue(ipvs))) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; } - - /* check if entries stay in ipvs->sync_buff for 2 seconds */ - sb = 
get_curr_sync_buff(ipvs, 2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + /* (Ab)use interruptible sleep to avoid increasing + * the load avg. + */ + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop()); + if (unlikely(kthread_should_stop())) + goto done; } - - schedule_timeout_interruptible(HZ); + ip_vs_sync_buff_release(sb); } +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + /* clean up the sync_buff queue */ - while ((sb = sb_dequeue(ipvs))) + while ((sb = sb_dequeue(ipvs, ms))) ip_vs_sync_buff_release(sb); + __set_current_state(TASK_RUNNING); /* clean up the current sync_buff */ - sb = get_curr_sync_buff(ipvs, 0); + sb = get_curr_sync_buff(ipvs, ms, 0); if (sb) ip_vs_sync_buff_release(sb); @@ -1498,8 +1679,8 @@ static int sync_thread_backup(void *data) int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid); + "syncid = %d, id = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1511,15 +1692,12 @@ static int sync_thread_backup(void *data) len = ip_vs_receive(tinfo->sock, tinfo->buf, ipvs->recv_mesg_maxlen); if (len <= 0) { - pr_err("receiving message error\n"); + if (len != -EAGAIN) + pr_err("receiving message error\n"); break; } - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); ip_vs_process_message(tinfo->net, tinfo->buf, len); - local_bh_enable(); } } @@ -1535,86 +1713,142 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; + struct task_struct **array = NULL, *task; struct socket *sock; struct netns_ipvs *ipvs = net_ipvs(net); - char *name, *buf = NULL; + char *name; int (*threadfn)(void *data); + int id, count; int result = -ENOMEM; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; if (state == IP_VS_STATE_MASTER) { - if (ipvs->master_thread) + if (ipvs->ms) return -EEXIST; strlcpy(ipvs->master_mcast_ifn, mcast_ifn, sizeof(ipvs->master_mcast_ifn)); ipvs->master_syncid = syncid; - realtask = &ipvs->master_thread; - name = "ipvs_master:%d"; + name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; - sock = make_send_sock(net); } else if (state == IP_VS_STATE_BACKUP) { - if (ipvs->backup_thread) + if (ipvs->backup_threads) return -EEXIST; strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, sizeof(ipvs->backup_mcast_ifn)); ipvs->backup_syncid = syncid; - realtask = &ipvs->backup_thread; - name = "ipvs_backup:%d"; + name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(net); } else { return -EINVAL; } - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; - set_sync_mesg_maxlen(net, state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; + 
ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } else { + array = kzalloc(count * sizeof(struct task_struct *), + GFP_KERNEL); + if (!array) + goto out; } + set_sync_mesg_maxlen(net, state); - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->net = net; - tinfo->sock = sock; - tinfo->buf = buf; + tinfo = NULL; + for (id = 0; id < count; id++) { + if (state == IP_VS_STATE_MASTER) + sock = make_send_sock(net, id); + else + sock = make_receive_sock(net, id); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto outtinfo; + } + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outsocket; + tinfo->net = net; + tinfo->sock = sock; + if (state == IP_VS_STATE_BACKUP) { + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto outtinfo; + } else { + tinfo->buf = NULL; + } + tinfo->id = id; - task = kthread_run(threadfn, tinfo, name, ipvs->gen); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + tinfo = NULL; + if (state == IP_VS_STATE_MASTER) + ipvs->ms[id].master_thread = task; + else + array[id] = task; } /* mark as active */ - *realtask = task; + + if (state == IP_VS_STATE_BACKUP) + ipvs->backup_threads = array; + spin_lock_bh(&ipvs->sync_buff_lock); ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); /* increase the module use count */ ip_vs_use_count_inc(); return 0; -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); outsocket: sk_release_kernel(sock->sk); + +outtinfo: + if (tinfo) { + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + } + count = id; + while (count-- > 0) { + if (state == IP_VS_STATE_MASTER) + kthread_stop(ipvs->ms[count].master_thread); + else + kthread_stop(array[count]); + } + kfree(array); + out: + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } return result; } @@ -1622,38 +1856,60 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + struct task_struct **array; + int id; int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!ipvs->master_thread) + if (!ipvs->ms) return -ESRCH; - pr_info("stopping master sync thread %d ...\n", - task_pid_nr(ipvs->master_thread)); - /* * The lock synchronizes with sb_queue_tail(), so that we don't * add sync buffers to the queue, when we are already in * progress of stopping the master sync daemon. 
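make_send_sock() and make_receive_sock() now take the thread id and use IP_VS_SYNC_PORT + id, so each of the up to sync_ports threads gets a private multicast stream and backup receivers pair one-to-one with master senders. A user-space sketch of the receive side (8848 and 224.0.0.81 are the IPVS defaults; joining the group with IP_ADD_MEMBERSHIP is omitted for brevity):

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

#define SYNC_PORT	8848		/* IP_VS_SYNC_PORT */
#define SYNC_GROUP	"224.0.0.81"	/* IP_VS_SYNC_GROUP */

static int make_receive_sock(int id)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	/* user-space spelling of sk->sk_reuse = SK_CAN_REUSE */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family      = AF_INET;
	addr.sin_port        = htons(SYNC_PORT + id);	/* port per thread */
	addr.sin_addr.s_addr = inet_addr(SYNC_GROUP);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}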
*/ - spin_lock_bh(&ipvs->sync_lock); + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ipvs->sync_lock); - retc = kthread_stop(ipvs->master_thread); - ipvs->master_thread = NULL; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ms->master_thread)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(ms->master_thread); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!ipvs->backup_thread) + if (!ipvs->backup_threads) return -ESRCH; - pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(ipvs->backup_thread)); - ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - retc = kthread_stop(ipvs->backup_thread); - ipvs->backup_thread = NULL; + array = ipvs->backup_threads; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(array[id])); + ret = kthread_stop(array[id]); + if (retc >= 0) + retc = ret; + } + kfree(array); + ipvs->backup_threads = NULL; } /* decrease the module use count */ @@ -1670,13 +1926,8 @@ int __net_init ip_vs_sync_net_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); - INIT_LIST_HEAD(&ipvs->sync_queue); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); - - ipvs->sync_mcast_addr.sin_family = AF_INET; - ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); - ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index bc1bfc48a17..b5b4650d50a 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -31,10 +31,11 @@ * Weighted Least Connection scheduling */ static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); @@ -51,7 +52,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -66,12 +67,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. 
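Both stop loops above iterate ids from threads_mask down to 0, and start_sync_thread() computed threads_mask as count - 1, a scheme that addresses every thread exactly once only when count is a power of two (the sync_ports sysctl is expected to keep it one). A defensive sketch of choosing such a count (assumption: rounding up to the next power of two is acceptable):

/* Choose a power-of-two thread count in [1, max_threads]; the mask
 * used for thread selection and teardown is then simply count - 1. */
static unsigned int pick_thread_count(unsigned int requested,
				      unsigned int max_threads)
{
	unsigned int count = 1;

	while (count < requested && count * 2 <= max_threads)
		count *= 2;
	return count;
}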
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } @@ -106,6 +107,7 @@ static int __init ip_vs_wlc_init(void) static void __exit ip_vs_wlc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); + synchronize_rcu(); } module_init(ip_vs_wlc_init); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index fd0d4e09876..0546cd572d6 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -29,14 +29,45 @@ #include <net/ip_vs.h> +/* The WRR algorithm depends on some caclulations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced with di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di but we run in parallel with + * weight changes, it is possible some dest weight to be reduced + * below di, bad if it is the only available dest. + * + * So, we modify how mw is calculated, now it is reduced with (di - 1), + * so that last cw is 1 to catch such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... 
+ * last pass: cw = 1 + * + */ + /* * current destination pointer for weighted round-robin scheduling */ struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ + struct ip_vs_dest *cl; /* current dest or head */ int cw; /* current weight */ int mw; /* maximum weight */ int di; /* decreasing interval */ + struct rcu_head rcu_head; }; @@ -84,40 +115,45 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) /* * Allocate the mark variable for WRR scheduling */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); if (mark == NULL) return -ENOMEM; - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; svc->sched_data = mark; return 0; } -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) { + struct ip_vs_wrr_mark *mark = svc->sched_data; + /* * Release the mark variable */ - kfree(svc->sched_data); - - return 0; + kfree_rcu(mark, rcu_head); } -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { struct ip_vs_wrr_mark *mark = svc->sched_data; - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -126,82 +162,82 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) * Weighted Round-Robin Scheduling */ static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) { - struct ip_vs_dest *dest; + struct ip_vs_dest *dest, *last, *stop = NULL; struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; + bool last_pass = false, restarted = false; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - ip_vs_scheduler_err(svc, - "no destination available: " - "no destinations present"); - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. 
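A quick numeric check makes the endpoint of the ladder visible: with mw reduced by (di - 1), the final pass always runs at cw = 1, so a destination whose weight was concurrently dropped below di remains reachable on the last pass (a sketch; the weights are hypothetical):

#include <stdio.h>

int main(void)
{
	/* e.g. weights {4, 8, 12}: di = gcd = 4, max weight = 12 */
	int max_weight = 12, di = 4;
	int mw = max_weight - (di - 1);		/* 9 */

	/* pass 1: cw = 9 (only weight 12 matches)
	 * pass 2: cw = 5 (weights 8 and 12 match)
	 * pass 3: cw = 1 (every weight >= 1 matches) */
	for (int cw = mw; cw > 0; cw -= di)
		printf("pass: cw = %d\n", cw);
	return 0;
}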
- */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - ip_vs_scheduler_err(svc, - "no destination available"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - ip_vs_scheduler_err(svc, - "no destination available: " - "all destinations are overloaded"); - goto out; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. + */ + stop = last; } } +found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + mark->cl = dest; out: - write_unlock(&svc->sched_lock); + spin_unlock_bh(&svc->sched_lock); return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; } @@ -212,7 +248,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, .schedule = ip_vs_wrr_schedule, }; @@ -224,6 +262,7 @@ static int __init ip_vs_wrr_init(void) static void __exit ip_vs_wrr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); } module_init(ip_vs_wrr_init); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 7fd66dec859..73ba1cc7a88 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -17,6 +17,8 @@ * - not all connections have destination server, for example, * connections in backup server when fwmark is used * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback * LOCAL_OUT rules: * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) * - skb->pkt_type is not set yet @@ -49,158 +51,246 @@ enum { IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to * local */ + IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to 
saddr */ + IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ + IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ }; +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ + return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ + kfree(dest_dst); +} + /* * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, - u32 dst_cookie) +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, + struct dst_entry *dst, u32 dst_cookie) { - struct dst_entry *old_dst; + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, + lockdep_is_held(&dest->dst_lock)); - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dest->dst_cookie = dst_cookie; - dst_release(old_dst); + if (dest_dst) { + dest_dst->dst_cache = dst; + dest_dst->dst_cookie = dst_cookie; + } + rcu_assign_pointer(dest->dest_dst, dest_dst); + + if (old) + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest) { - struct dst_entry *dst = dest->dst_cache; + struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); + struct dst_entry *dst; - if (!dst) + if (!dest_dst) return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && - dst->ops->check(dst, dest->dst_cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); + dst = dest_dst->dst_cache; + if (dst->obsolete && + dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; + return dest_dst; +} + +static inline bool +__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) +{ + if (IP6CB(skb)->frag_max_size) { + /* frag_max_size tell us that, this packet have been + * defragmented by netfilter IPv6 conntrack module. + */ + if (IP6CB(skb)->frag_max_size > mtu) + return true; /* largest fragment violate MTU */ } - dst_hold(dst); - return dst; + else if (skb->len > mtu && !skb_is_gso(skb)) { + return true; /* Packet size violate MTU size */ + } + return false; +} + +/* Get route to daddr, update *saddr, optionally bind route to saddr */ +static struct rtable *do_output_route4(struct net *net, __be32 daddr, + int rt_mode, __be32 *saddr) +{ + struct flowi4 fl4; + struct rtable *rt; + int loop = 0; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; + +retry: + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + /* Invalid saddr ? 
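__ip_vs_dst_set()/__ip_vs_dst_check() above replace the locked per-destination route cache with an RCU-published struct ip_vs_dest_dst: readers dereference the pointer locklessly on the transmit fast path, while writers swap in a fresh object under dest->dst_lock and retire the old one through call_rcu(). The publish/retire shape, approximated in user space with C11 atomics (the immediate free below stands in for the grace period; real RCU defers it until readers drain):

#include <stdatomic.h>
#include <stdlib.h>

struct dest_dst {
	void	     *dst_cache;
	unsigned int  dst_cookie;
};

static _Atomic(struct dest_dst *) dest_dst_cache;

/* Reader fast path: one acquire load, no lock. */
static struct dest_dst *cache_get(void)
{
	return atomic_load_explicit(&dest_dst_cache, memory_order_acquire);
}

/* Writer: publish the new entry, retire the old one. */
static void cache_set(struct dest_dst *fresh)
{
	struct dest_dst *old = atomic_exchange_explicit(&dest_dst_cache,
					fresh, memory_order_acq_rel);

	free(old);	/* placeholder for call_rcu(&old->rcu_head, ...) */
}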
*/ + if (PTR_ERR(rt) == -EINVAL && *saddr && + rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { + *saddr = 0; + flowi4_update_output(&fl4, 0, 0, daddr, 0); + goto retry; + } + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); + return NULL; + } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + ip_rt_put(rt); + *saddr = fl4.saddr; + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + loop++; + goto retry; + } + *saddr = fl4.saddr; + return rt; } /* Get route to destination or remote server */ -static struct rtable * +static int __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr) + __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ struct rtable *ort; /* Original route */ - int local; + struct iphdr *iph; + __be16 df; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos))) { - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = dest->addr.ip; - fl4.flowi4_tos = rtos; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &dest->addr.ip); - return NULL; + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); + if (!rt) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); - dest->dst_saddr.ip = fl4.saddr; - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " - "rtos=%X\n", - &dest->addr.ip, &dest->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt), rtos); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); } daddr = dest->addr.ip; if (ret_saddr) - *ret_saddr = dest->dst_saddr.ip; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.ip; } else { - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = daddr; - fl4.flowi4_tos = rtos; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) { - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &daddr); - return NULL; - } + __be32 saddr = htonl(INADDR_ANY); + + noref = 0; + + /* For such unconfigured boxes avoid many route lookups + * for performance reasons because we do not remember saddr + */ + rt_mode &= ~IP_VS_RT_MODE_CONNECT; + rt = do_output_route4(net, daddr, rt_mode, &saddr); + if (!rt) + goto err_unreach; if (ret_saddr) - *ret_saddr = fl4.saddr; + *ret_saddr = saddr; } - local = rt->rt_flags & RTCF_LOCAL; + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", (rt->rt_flags & RTCF_LOCAL) ? 
"local":"non-local", &daddr); - ip_rt_put(rt); - return NULL; - } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " - "requires NAT method, dest: %pI4\n", - &ip_hdr(skb)->daddr, &daddr); - ip_rt_put(rt); - return NULL; + goto err_put; } - if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " - "to non-local address, dest: %pI4\n", - &ip_hdr(skb)->saddr, &daddr); - ip_rt_put(rt); - return NULL; + iph = ip_hdr(skb); + if (likely(!local)) { + if (unlikely(ipv4_is_loopback(iph->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI4 to non-local address, dest: %pI4\n", + &iph->saddr, &daddr); + goto err_put; + } + } else { + ort = skb_rtable(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !(ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to " + "local requires NAT method, dest: %pI4\n", + &iph->daddr, &daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; } - return rt; -} - -/* Reroute packet to local IPv4 stack after DNAT */ -static int -__ip_vs_reroute_locally(struct sk_buff *skb) -{ - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - struct net *net = dev_net(dev); - struct iphdr *iph = ip_hdr(skb); - - if (rt_is_input_route(rt)) { - unsigned long orefdst = skb->_skb_refdst; - - if (ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) - return 0; - refdst_drop(orefdst); + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + df = iph->frag_off & htons(IP_DF); } else { - struct flowi4 fl4 = { - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), - .flowi4_mark = skb->mark, - }; - - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return 0; - if (!(rt->rt_flags & RTCF_LOCAL)) { - ip_rt_put(rt); - return 0; + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; } - /* Drop old route. */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ort = skb_rtable(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + /* MTU check allowed? */ + df = sysctl_pmtu_disc(ipvs) ? 
iph->frag_off & htons(IP_DF) : 0; + } + + /* MTU checking */ + if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + goto err_put; } - return 1; + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #ifdef CONFIG_IP_VS_IPV6 @@ -247,131 +337,196 @@ out_err: /* * Get route to destination or remote server */ -static struct rt6_info * +static int __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, - int do_xfrm, int rt_mode) + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { struct net *net = dev_net(skb_dst(skb)->dev); + struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct rt6_info *ort; /* Original route */ struct dst_entry *dst; - int local; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); - if (!rt) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { u32 cookie; + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, - &dest->dst_saddr.in6, + &dest_dst->dst_saddr.in6, do_xfrm); if (!dst) { - spin_unlock(&dest->dst_lock); - return NULL; + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } rt = (struct rt6_info *) dst; cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest->dst_saddr.in6, + &dest->addr.in6, &dest_dst->dst_saddr.in6, atomic_read(&rt->dst.__refcnt)); } if (ret_saddr) - *ret_saddr = dest->dst_saddr.in6; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.in6; } else { + noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) - return NULL; + goto err_unreach; rt = (struct rt6_info *) dst; } local = __ip_vs_is_local_route6(rt); if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", local ? 
"local":"non-local", daddr); - dst_release(&rt->dst); - return NULL; + goto err_put; } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = (struct rt6_info *) skb_dst(skb)) && - __ip_vs_is_local_route6(ort))) { - IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " - "requires NAT method, dest: %pI6\n", - &ipv6_hdr(skb)->daddr, daddr); - dst_release(&rt->dst); - return NULL; + if (likely(!local)) { + if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI6c to non-local address, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err_put; + } + } else { + ort = (struct rt6_info *) skb_dst(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !__ip_vs_is_local_route6(ort)) { + IP_VS_DBG_RL("Redirect from non-local address %pI6c " + "to local requires NAT method, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->daddr, daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; } - if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " - "to non-local address, dest: %pI6\n", - &ipv6_hdr(skb)->saddr, daddr); - dst_release(&rt->dst); - return NULL; + + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + ort = (struct rt6_info *) skb_dst(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } - return rt; + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #endif -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) { - struct dst_entry *old_dst; + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset(skb); + skb_forward_csum(skb); + } + return ret; +} - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, 
skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; } -#define IP_VS_XMIT_TUNNEL(skb, cp) \ -({ \ - int __ret = NF_ACCEPT; \ - \ - (skb)->ipvs_property = 1; \ - if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ - __ret = ip_vs_confirm_conntrack(skb); \ - if (__ret == NF_ACCEPT) { \ - nf_reset(skb); \ - skb_forward_csum(skb); \ - } \ - __ret; \ -}) - -#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - else \ - ip_vs_update_conntrack(skb, cp, 1); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) - -#define IP_VS_XMIT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} /* @@ -379,10 +534,10 @@ do { \ */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { /* we do not touch skb and do not need pskb ptr */ - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } @@ -393,54 +548,31 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - int mtu; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), - IP_VS_RT_MODE_NON_LOCAL, NULL))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, + NULL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
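[Annotation] The hunk above converts the IP_VS_XMIT* macros into static inline helpers that compute an NF verdict instead of returning from the caller behind the call site's back. A minimal user-space sketch of that refactor follows; the helper names and the standalone NF_ACCEPT/NF_STOLEN defines are illustrative, not the kernel API.

    #include <stdio.h>

    #define NF_ACCEPT 1
    #define NF_STOLEN 5

    static void send_packet(const char *pkt)
    {
        printf("sent: %s\n", pkt);
    }

    /* Old style: a macro that returns from the *caller* when local != 0,
     * hiding control flow at the call site (kept only for contrast). */
    #define XMIT_OLD(pkt, local)        \
    do {                                \
        if (local)                      \
            return NF_ACCEPT;           \
        send_packet(pkt);               \
    } while (0)

    /* New style: an inline helper computes the verdict and the caller
     * decides what to do with it, as ip_vs_send_or_cont() now does. */
    static inline int xmit_new(const char *pkt, int local)
    {
        if (local)
            return NF_ACCEPT;   /* not sent, caller continues */
        send_packet(pkt);
        return NF_STOLEN;       /* sent, caller must not touch pkt */
    }

    int main(void)
    {
        int rc = xmit_new("hello", 0);

        printf("verdict: %s\n", rc == NF_STOLEN ? "stolen" : "accept");
        return 0;
    }

The win is that the caller can propagate the verdict (as ip_vs_nat_xmit() now returns rc) rather than having a macro silently return on its behalf.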
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -448,58 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - struct ipv6hdr *iph = ipv6_hdr(skb); - int mtu; - EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, - IP_VS_RT_MODE_NON_LOCAL))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu && !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -511,32 +612,33 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - int local; + int local, rc, was_input; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT 
traffic * to local address when connection is sync-ed @@ -544,63 +646,37 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, - "ip_vs_nat_xmit(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error_put; + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) + goto tx_error; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - if (!local) { - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -608,52 +684,50 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. 
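[Annotation] The ip_vs_nat_xmit() hunks above collapse the old tx_error_icmp/tx_error_put exits into a single tx_error label, now that the route is attached to the skb and released in one place. A small stand-alone sketch of that single-exit goto idiom, with hypothetical resource names standing in for the skb and route state:

    #include <stdio.h>
    #include <stdlib.h>

    static int xmit(size_t len)
    {
        char *data = malloc(len);
        char *hdr = NULL;
        int rc = -1;

        if (!data)
            goto out;
        hdr = malloc(32);
        if (!hdr)
            goto free_data;     /* one label per acquired resource */
        if (len > 1500) {
            fprintf(stderr, "frag needed\n");
            goto free_hdr;      /* every failure funnels one exit path */
        }
        printf("sent %zu bytes\n", len);
        rc = 0;                 /* success shares the same cleanup */
    free_hdr:
        free(hdr);
    free_data:
        free(data);
    out:
        return rc;
    }

    int main(void)
    {
        return xmit(100) == 0 && xmit(9000) == -1 ? 0 : 1;
    }

Fewer labels means fewer chances to unbalance the route refcount, which is exactly the class of bug the tx_error_put removals eliminate.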
*/ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; - int local; + int local, rc; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; - p = skb_header_pointer(skb, sizeof(struct ipv6hdr), - sizeof(_pt), &_pt); + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR)))) - goto tx_error_icmp; - local = __ip_vs_is_local_route6(rt); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -661,13 +735,13 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif @@ -678,44 +752,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu && !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? 
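[Annotation] With the per-caller MTU checks deleted above, the DF/MTU decision lives only in __ip_vs_get_out_rt(): an ICMP_FRAG_NEEDED is warranted only when DF is set, the packet exceeds the path MTU, and the skb is not GSO. A hedged stand-alone rendering of that predicate; the struct fields below are illustrative, not struct sk_buff:

    #include <stdbool.h>
    #include <stdio.h>

    /* Mirrors the patched check:
     *   unlikely(df && skb->len > mtu && !skb_is_gso(skb)) */
    struct fake_skb {
        unsigned int len;
        bool gso;       /* hardware will segment, so skip the check */
        bool df;        /* IP_DF from the header, or forced off when
                         * the pmtu_disc sysctl is disabled */
    };

    static bool frag_needed(const struct fake_skb *skb, unsigned int mtu)
    {
        return skb->df && skb->len > mtu && !skb->gso;
    }

    int main(void)
    {
        struct fake_skb a = { .len = 2000, .gso = false, .df = true  };
        struct fake_skb b = { .len = 2000, .gso = true,  .df = true  };
        struct fake_skb c = { .len = 2000, .gso = false, .df = false };

        printf("%d %d %d\n",
               frag_needed(&a, 1500),   /* 1: send ICMP too-big */
               frag_needed(&b, 1500),   /* 0: GSO exempt */
               frag_needed(&c, 1500));  /* 0: DF clear */
        return 0;
    }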
*/ - dst_release(&rt->dst); - } - IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -723,22 +774,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; -tx_error_icmp: - dst_link_failure(skb); tx_error: LeaveFunction(10); kfree_skb(skb); + rcu_read_unlock(); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -764,66 +812,52 @@ tx_error_put: */ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; + __be16 df; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(tos), IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL, - &saddr))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_CONNECT | + IP_VS_RT_MODE_TUNNEL, &saddr); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } + rt = skb_rtable(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - if (mtu < 68) { - IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error_put; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF) && - mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } + /* Copy DF, reset fragment offset and MF */ + df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } - kfree_skb(skb); + + if (!new_skb) + goto tx_error; + consume_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); } @@ -837,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. 
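[Annotation] The tunnel path above reallocates headroom only when it is short or the skb is cloned (the skb_shared() test is dropped), then pushes the outer IP header into the reserved space. A user-space sketch of reserve-then-push on a toy buffer; buf_alloc() and buf_push() are made-up stand-ins for the sk_buff helpers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Toy buffer with reserved headroom in front of the payload,
     * loosely modelled on sk_buff; not a kernel API. */
    struct buf {
        unsigned char *head;  /* start of allocation */
        unsigned char *data;  /* start of payload */
        size_t len;           /* payload length */
    };

    static struct buf *buf_alloc(size_t headroom, const void *payload,
                                 size_t len)
    {
        struct buf *b = malloc(sizeof(*b));

        if (!b)
            return NULL;
        b->head = malloc(headroom + len);
        if (!b->head) {
            free(b);
            return NULL;
        }
        b->data = b->head + headroom;   /* reserve space for headers */
        memcpy(b->data, payload, len);
        b->len = len;
        return b;
    }

    /* Analogue of skb_push(): extend the packet toward the front. */
    static unsigned char *buf_push(struct buf *b, size_t hdrlen)
    {
        if ((size_t)(b->data - b->head) < hdrlen)
            return NULL;    /* would need skb_realloc_headroom() */
        b->data -= hdrlen;
        b->len += hdrlen;
        return b->data;
    }

    int main(void)
    {
        struct buf *b = buf_alloc(20, "payload", 7);
        unsigned char *iph = b ? buf_push(b, 20) : NULL;

        if (iph) {
            memset(iph, 0, 20);   /* fill in the outer "IP header" */
            printf("packet is now %zu bytes\n", b->len);
        }
        if (b) {
            free(b->head);
            free(b);
        }
        return 0;
    }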
*/ @@ -853,36 +883,33 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ @@ -890,59 +917,38 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *old_iph = ipv6_hdr(skb); struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, - &saddr, 1, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } + rt = (struct rt6_info *) skb_dst(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); - if (mtu < IPV6_MIN_MTU) { - IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, - IPV6_MIN_MTU); - goto tx_error_put; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && - !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->dst); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } - kfree_skb(skb); + + if (!new_skb) + goto tx_error; + consume_skb(skb); skb = new_skb; old_iph = ipv6_hdr(skb); } @@ -953,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. 
*/ @@ -972,27 +974,24 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip6_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -1003,60 +1002,38 @@ tx_error_put: */ int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL, NULL))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
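[Annotation] Every transmit routine in this file now brackets route lookup and send with rcu_read_lock()/rcu_read_unlock(), so a cached dest_dst can be freed only after a grace period. A minimal sketch of the same discipline using userspace RCU, assuming liburcu is installed (compile with -lurcu); the route struct and names are invented:

    #include <stdio.h>
    #include <stdlib.h>
    #include <urcu.h>

    struct route { int mtu; };

    static struct route *cached_route;   /* analogue of dest->dest_dst */

    static void xmit_one(void)
    {
        struct route *rt;

        rcu_read_lock();    /* route cannot be freed while this is held */
        rt = rcu_dereference(cached_route);
        if (rt)
            printf("xmit over route with mtu %d\n", rt->mtu);
        rcu_read_unlock();
    }

    int main(void)
    {
        struct route *rt = malloc(sizeof(*rt)), *old;

        rcu_register_thread();
        rt->mtu = 1500;
        rcu_assign_pointer(cached_route, rt);

        xmit_one();

        /* Updater side: unpublish, wait a grace period, then free. */
        old = cached_route;
        rcu_assign_pointer(cached_route, NULL);
        synchronize_rcu();
        free(old);

        rcu_unregister_thread();
        return 0;
    }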
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } ip_send_check(ip_hdr(skb)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1064,62 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1132,13 +1083,13 @@ tx_error: */ int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset, unsigned int hooknum) + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *iph) { struct rtable *rt; /* Route to the other host */ - int mtu; int rc; int local; - int rt_mode; + int rt_mode, was_input; EnterFunction(10); @@ -1147,7 +1098,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); + rc = cp->packet_xmit(skb, cp, pp, iph); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -1158,16 +1109,17 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * mangle and send the packet here (only for VS/NAT) */ + was_input = rt_is_input_route(skb_rtable(skb)); /* LOCALNODE from FORWARD hook is not supported */ rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(ip_hdr(skb)->tos), - rt_mode, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic @@ -1176,87 +1128,57 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp(skb, pp, cp, 0); - if (!local) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); goto out; - tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset, unsigned int hooknum) + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; int rc; int local; int rt_mode; @@ -1268,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); + rc = cp->packet_xmit(skb, cp, pp, ipvsh); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -1284,11 +1206,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, rt_mode))) - goto tx_error_icmp; - - local = __ip_vs_is_local_route6(rt); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1296,13 +1219,13 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; + goto tx_error; } } #endif @@ -1313,58 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (skb->len > mtu && !skb_is_gso(skb)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp_v6(skb, pp, cp, 0); - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? 
*/ - dst_release(&rt->dst); - } - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); goto out; -tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index f4f8cda0598..a4b5e2a435a 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -39,21 +39,23 @@ static struct ctl_table acct_sysctl_table[] = { unsigned int seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { - struct nf_conn_counter *acct; + struct nf_conn_acct *acct; + struct nf_conn_counter *counter; acct = nf_conn_acct_find(ct); if (!acct) return 0; + counter = acct->counter; return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)atomic64_read(&acct[dir].packets), - (unsigned long long)atomic64_read(&acct[dir].bytes)); + (unsigned long long)atomic64_read(&counter[dir].packets), + (unsigned long long)atomic64_read(&counter[dir].bytes)); }; EXPORT_SYMBOL_GPL(seq_print_acct); static struct nf_ct_ext_type acct_extend __read_mostly = { - .len = sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]), - .align = __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]), + .len = sizeof(struct nf_conn_acct), + .align = __alignof__(struct nf_conn_acct), .id = NF_CT_EXT_ACCT, }; @@ -69,8 +71,12 @@ static int nf_conntrack_acct_init_sysctl(struct net *net) table[0].data = &net->ct.sysctl_acct; - net->ct.acct_sysctl_header = register_net_sysctl_table(net, - nf_net_netfilter_sysctl_path, table); + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter", + table); if (!net->ct.acct_sysctl_header) { printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); goto out_register; @@ -102,36 +108,26 @@ static void nf_conntrack_acct_fini_sysctl(struct net *net) } #endif -int nf_conntrack_acct_init(struct net *net) +int nf_conntrack_acct_pernet_init(struct net *net) { - int ret; - net->ct.sysctl_acct = nf_ct_acct; + return nf_conntrack_acct_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&acct_extend); - if (ret < 0) { - printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); - goto out_extend_register; - } - } +void nf_conntrack_acct_pernet_fini(struct net *net) +{ + nf_conntrack_acct_fini_sysctl(net); +} - ret = nf_conntrack_acct_init_sysctl(net); +int nf_conntrack_acct_init(void) +{ + int ret = nf_ct_extend_register(&acct_extend); if (ret < 0) - goto out_sysctl; - - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&acct_extend); -out_extend_register: + pr_err("nf_conntrack_acct: Unable to register extension\n"); return ret; } -void nf_conntrack_acct_fini(struct net *net) +void nf_conntrack_acct_fini(void) { - nf_conntrack_acct_fini_sysctl(net); - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&acct_extend); + nf_ct_extend_unregister(&acct_extend); } diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 13fd2c55e32..b8b95f4027c 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ 
b/net/netfilter/nf_conntrack_amanda.c @@ -2,6 +2,7 @@ * * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> * based on HW's ip_conntrack_irc.c as well as other modules + * (C) 2006 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -40,6 +41,7 @@ MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned int matchoff, unsigned int matchlen, struct nf_conntrack_expect *exp) @@ -107,8 +109,7 @@ static int amanda_help(struct sk_buff *skb, /* No data? */ dataoff = protoff + sizeof(struct udphdr); if (dataoff >= skb->len) { - if (net_ratelimit()) - printk(KERN_ERR "amanda_help: skblen = %u\n", skb->len); + net_err_ratelimited("amanda_help: skblen = %u\n", skb->len); return NF_ACCEPT; } @@ -145,6 +146,7 @@ static int amanda_help(struct sk_buff *skb, exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -156,10 +158,12 @@ static int amanda_help(struct sk_buff *skb, nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook); if (nf_nat_amanda && ct->status & IPS_NAT_MASK) - ret = nf_nat_amanda(skb, ctinfo, off - dataoff, - len, exp); - else if (nf_ct_expect_related(exp) != 0) + ret = nf_nat_amanda(skb, ctinfo, protoff, + off - dataoff, len, exp); + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index fa4b82c8ae8..1f4f954c4b4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -5,6 +5,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -38,14 +39,19 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -54,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, const struct nlattr *attr) __read_mostly; EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); -DEFINE_SPINLOCK(nf_conntrack_lock); -EXPORT_SYMBOL_GPL(nf_conntrack_lock); +__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; +EXPORT_SYMBOL_GPL(nf_conntrack_locks); + +__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); + +static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) +{ + 
h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + spin_unlock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_unlock(&nf_conntrack_locks[h2]); +} + +/* return true if we need to recompute hashes (in case hash table was resized) */ +static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, + unsigned int h2, unsigned int sequence) +{ + h1 %= CONNTRACK_LOCKS; + h2 %= CONNTRACK_LOCKS; + if (h1 <= h2) { + spin_lock(&nf_conntrack_locks[h1]); + if (h1 != h2) + spin_lock_nested(&nf_conntrack_locks[h2], + SINGLE_DEPTH_NESTING); + } else { + spin_lock(&nf_conntrack_locks[h2]); + spin_lock_nested(&nf_conntrack_locks[h1], + SINGLE_DEPTH_NESTING); + } + if (read_seqcount_retry(&net->ct.generation, sequence)) { + nf_conntrack_double_unlock(h1, h2); + return true; + } + return false; +} + +static void nf_conntrack_all_lock(void) +{ + int i; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_nested(&nf_conntrack_locks[i], i); +} + +static void nf_conntrack_all_unlock(void) +{ + int i; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_unlock(&nf_conntrack_locks[i]); +} unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -186,6 +243,50 @@ clean_from_lists(struct nf_conn *ct) nf_ct_remove_expectations(ct); } +/* must be called with local_bh_disable */ +static void nf_ct_add_to_dying_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) dying list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->dying); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* add this conntrack to the (per cpu) unconfirmed list */ + ct->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->unconfirmed); + spin_unlock(&pcpu->lock); +} + +/* must be called with local_bh_disable */ +static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) +{ + struct ct_pcpu *pcpu; + + /* We overload first tuple to link into unconfirmed or dying list.*/ + pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); + + spin_lock(&pcpu->lock); + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&pcpu->lock); +} + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -197,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); - /* To make sure we don't get any weird locking issues here: - * destroy_conntrack() MUST NOT be called with a write lock - * to nf_conntrack_lock!!! -HW */ rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -207,21 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct) rcu_read_unlock(); - spin_lock_bh(&nf_conntrack_lock); + local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, - * too. */ + * too. + */ nf_ct_remove_expectations(ct); - /* We overload first tuple to link into unconfirmed list. 
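[Annotation] nf_conntrack_double_lock() above replaces the global nf_conntrack_lock with an array of per-bucket locks and always takes the lower-indexed lock first, so two CPUs grabbing the pair (h1,h2) and (h2,h1) cannot deadlock; the kernel's spin_lock_nested() is only there to keep lockdep quiet. A compact pthreads rendering of the ordering rule, with the lock count and names chosen for illustration:

    #include <pthread.h>
    #include <stdio.h>

    #define NLOCKS 1024

    static pthread_mutex_t locks[NLOCKS];

    /* Hash both indices into the lock array and always acquire the
     * lower index first, as nf_conntrack_double_lock() does. */
    static void double_lock(unsigned int h1, unsigned int h2)
    {
        h1 %= NLOCKS;
        h2 %= NLOCKS;
        if (h1 > h2) {
            unsigned int t = h1; h1 = h2; h2 = t;
        }
        pthread_mutex_lock(&locks[h1]);
        if (h1 != h2)
            pthread_mutex_lock(&locks[h2]);
    }

    static void double_unlock(unsigned int h1, unsigned int h2)
    {
        h1 %= NLOCKS;
        h2 %= NLOCKS;
        pthread_mutex_unlock(&locks[h1]);
        if (h1 != h2)
            pthread_mutex_unlock(&locks[h2]);   /* order is irrelevant */
    }

    int main(void)
    {
        for (int i = 0; i < NLOCKS; i++)
            pthread_mutex_init(&locks[i], NULL);

        /* Two buckets: a conntrack's original and reply tuple hashes. */
        double_lock(77, 4141);
        printf("both buckets locked, safe to unlink the entry\n");
        double_unlock(77, 4141);
        return 0;
    }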
*/ - if (!nf_ct_is_confirmed(ct)) { - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - } + nf_ct_del_from_dying_or_unconfirmed_list(ct); NF_CT_STAT_INC(net, delete); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (ct->master) nf_ct_put(ct->master); @@ -230,84 +325,114 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_conntrack_free(ct); } -void nf_ct_delete_from_lists(struct nf_conn *ct) +static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + unsigned int hash, reply_hash; + u16 zone = nf_ct_zone(ct); + unsigned int sequence; nf_ct_helper_destroy(ct); - spin_lock_bh(&nf_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. */ - NF_CT_STAT_INC(net, delete_list); + + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + clean_from_lists(ct); - spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + + nf_ct_add_to_dying_list(ct); + + NF_CT_STAT_INC(net, delete_list); + local_bh_enable(); } -EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); static void death_by_event(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ - ct->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + ecache->timeout.expires = jiffies + + (prandom_u32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ecache->timeout); return; } /* we've got the event delivered, now it's dying */ set_bit(IPS_DYING_BIT, &ct->status); - spin_lock(&nf_conntrack_lock); - hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - spin_unlock(&nf_conntrack_lock); nf_ct_put(ct); } -void nf_ct_insert_dying_list(struct nf_conn *ct) +static void nf_ct_dying_timeout(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); - /* add this conntrack to the dying list */ - spin_lock_bh(&nf_conntrack_lock); - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.dying); - spin_unlock_bh(&nf_conntrack_lock); /* set a new timer to retry event delivery */ - setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); - ct->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); + ecache->timeout.expires = jiffies + + (prandom_u32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ecache->timeout); } -EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); -static void death_by_timeout(unsigned long ul_conntrack) +bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { - struct nf_conn *ct = (void *)ul_conntrack; struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_to_ns(ktime_get_real()); - if (!test_bit(IPS_DYING_BIT, &ct->status) && - unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 
0)) { + if (!nf_ct_is_dying(ct) && + unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, + portid, report) < 0)) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); - nf_ct_insert_dying_list(ct); - return; + nf_ct_dying_timeout(ct); + return false; } set_bit(IPS_DYING_BIT, &ct->status); nf_ct_delete_from_lists(ct); nf_ct_put(ct); + return true; +} +EXPORT_SYMBOL_GPL(nf_ct_delete); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); +} + +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, + const struct nf_conntrack_tuple *tuple, + u16 zone) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + /* A conntrack can be recreated with the equal tuple, + * so we need to check that the conntrack is confirmed + */ + return nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone && + nf_ct_is_confirmed(ct); } /* * Warning : * - Caller must take a reference on returned object * and recheck nf_ct_tuple_equal(tuple, &h->tuple) - * OR - * - Caller must lock nf_conntrack_lock before calling this function */ static struct nf_conntrack_tuple_hash * ____nf_conntrack_find(struct net *net, u16 zone, @@ -323,8 +448,7 @@ ____nf_conntrack_find(struct net *net, u16 zone, local_bh_disable(); begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { + if (nf_ct_key_equal(h, tuple, zone)) { NF_CT_STAT_INC(net, found); local_bh_enable(); return h; @@ -345,15 +469,6 @@ begin: return NULL; } -struct nf_conntrack_tuple_hash * -__nf_conntrack_find(struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) -{ - return ____nf_conntrack_find(net, zone, tuple, - hash_conntrack_raw(tuple, zone)); -} -EXPORT_SYMBOL_GPL(__nf_conntrack_find); - /* Find a connection corresponding to a tuple. 
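[Annotation] nf_ct_key_equal() and the __nf_conntrack_find_get() retry that follows exist because, with SLAB_DESTROY_BY_RCU semantics, an entry can be freed and recycled for a different tuple while a reader holds only the RCU read lock; the reader must take a reference with atomic_inc_not_zero() and then revalidate the key. A C11-atomics sketch of take-ref-then-revalidate, with field and function names invented:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct entry {
        atomic_int refcnt;   /* 0 means dead or being recycled */
        int key;
    };

    /* Take a reference only if the object is still live. */
    static bool get_ref(struct entry *e)
    {
        int old = atomic_load(&e->refcnt);

        while (old != 0) {
            if (atomic_compare_exchange_weak(&e->refcnt, &old, old + 1))
                return true;
        }
        return false;        /* lost the race: entry already dying */
    }

    static struct entry *lookup(struct entry *e, int key)
    {
        /* (An RCU read lock would be held around this in the kernel.) */
        if (e->key != key)
            return NULL;
        if (!get_ref(e))
            return NULL;
        /* Re-check after taking the reference: the slot may have been
         * recycled for another key in between (nf_ct_key_equal()). */
        if (e->key != key) {
            atomic_fetch_sub(&e->refcnt, 1);
            return NULL;     /* the kernel caller retries the lookup */
        }
        return e;
    }

    int main(void)
    {
        struct entry e = { .key = 42 };

        atomic_init(&e.refcnt, 1);
        printf("found=%d refcnt=%d\n", lookup(&e, 42) != NULL,
               atomic_load(&e.refcnt));
        return 0;
    }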
*/ static struct nf_conntrack_tuple_hash * __nf_conntrack_find_get(struct net *net, u16 zone, @@ -371,8 +486,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || - nf_ct_zone(ct) != zone)) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { nf_ct_put(ct); goto begin; } @@ -394,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, - unsigned int repl_hash) + unsigned int reply_hash) { struct net *net = nf_ct_net(ct); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &net->ct.hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[repl_hash]); + &net->ct.hash[reply_hash]); } int nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; u16 zone; + unsigned int sequence; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - spin_lock_bh(&nf_conntrack_lock); + local_bh_disable(); + do { + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) @@ -427,32 +545,57 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; add_timer(&ct->timeout); - nf_conntrack_get(&ct->ct_general); - __nf_conntrack_hash_insert(ct, hash, repl_hash); + smp_wmb(); + /* The caller holds a reference to this object */ + atomic_set(&ct->ct_general.use, 2); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); - spin_unlock_bh(&nf_conntrack_lock); - + local_bh_enable(); return 0; out: + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); return -EEXIST; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ + struct ct_pcpu *pcpu; + + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + + /* add this conntrack to the (per cpu) tmpl list */ + local_bh_disable(); + tmpl->cpu = smp_processor_id(); + pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); + + spin_lock(&pcpu->lock); + /* Overload tuple linked list to put us in template list. 
*/ + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &pcpu->tmpl); + spin_unlock_bh(&pcpu->lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { - unsigned int hash, repl_hash; + unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; @@ -461,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; u16 zone; + unsigned int sequence; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -473,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; zone = nf_ct_zone(ct); - /* reuse the hash saved before */ - hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); - repl_hash = hash_conntrack(net, zone, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + local_bh_disable(); + + do { + sequence = read_seqcount_begin(&net->ct.generation); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + reply_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related - connections for unconfirmed conns. But packet copies and - REJECT will give spurious warnings here. */ + * connections for unconfirmed conns. But packet copies and + * REJECT will give spurious warnings here. + */ /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ /* No external references means no one else could have - confirmed us. */ + * confirmed us. + */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); - - spin_lock_bh(&nf_conntrack_lock); - /* We have to check the DYING flag inside the lock to prevent a race against nf_ct_get_next_corpse() possibly called from user context, else we insert an already 'dead' hash, blocking further use of that particular connection -JM */ if (unlikely(nf_ct_is_dying(ct))) { - spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_double_unlock(hash, reply_hash); + local_bh_enable(); return NF_ACCEPT; } @@ -509,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb) &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - /* Remove from unconfirmed list */ - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + nf_ct_del_from_dying_or_unconfirmed_list(ct); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in @@ -530,7 +679,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) tstamp = nf_conn_tstamp_find(ct); if (tstamp) { if (skb->tstamp.tv64 == 0) - __net_timestamp((struct sk_buff *)skb); + __net_timestamp(skb); tstamp->start = ktime_to_ns(skb->tstamp); } @@ -539,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) * guarantee that no other CPU can find the conntrack before the above * stores are visible. 
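[Annotation] The hash computations above sit in a read_seqcount_begin()/read_seqcount_retry() loop so that a concurrent table resize, which bumps net->ct.generation, forces the bucket indices to be recomputed against the new size before the per-bucket locks are trusted. A self-contained C11 sketch of such a sequence-counter read loop; names are invented, and the seq_cst atomics below stand in for the explicit barriers a real seqcount uses:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint seq;      /* even = stable, odd = write in progress */
    static unsigned int htable_size = 1024;

    static unsigned int read_begin(void)
    {
        unsigned int s;

        while ((s = atomic_load(&seq)) & 1)
            ;                    /* writer active, spin */
        return s;
    }

    static int read_retry(unsigned int s)
    {
        return atomic_load(&seq) != s;   /* table resized meanwhile? */
    }

    static void resize_table(unsigned int new_size)
    {
        atomic_fetch_add(&seq, 1);   /* enter write section (odd) */
        htable_size = new_size;
        atomic_fetch_add(&seq, 1);   /* leave write section (even) */
    }

    int main(void)
    {
        unsigned int s, hash, tuple = 123456;

        do {
            s = read_begin();
            hash = tuple % htable_size;  /* like hash_conntrack() */
        } while (read_retry(s));         /* recompute if resized */

        printf("bucket %u of %u\n", hash, htable_size);
        resize_table(2048);
        return 0;
    }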
*/ - __nf_conntrack_hash_insert(ct, hash, repl_hash); + __nf_conntrack_hash_insert(ct, hash, reply_hash); + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); help = nfct_help(ct); if (help && help->helper) @@ -552,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: + nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); return NF_DROP; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -596,48 +747,54 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; struct nf_conn *ct = NULL, *tmp; struct hlist_nulls_node *n; - unsigned int i, cnt = 0; + unsigned int i = 0, cnt = 0; int dropped = 0; + unsigned int hash, sequence; + spinlock_t *lockp; - rcu_read_lock(); - for (i = 0; i < net->ct.htable_size; i++) { + local_bh_disable(); +restart: + sequence = read_seqcount_begin(&net->ct.generation); + hash = hash_bucket(_hash, net); + for (; i < net->ct.htable_size; i++) { + lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (read_seqcount_retry(&net->ct.generation, sequence)) { + spin_unlock(lockp); + goto restart; + } hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) + if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && + !nf_ct_is_dying(tmp) && + atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; + break; + } cnt++; } - if (ct != NULL) { - if (likely(!nf_ct_is_dying(ct) && - atomic_inc_not_zero(&ct->ct_general.use))) - break; - else - ct = NULL; - } + hash = (hash + 1) % net->ct.htable_size; + spin_unlock(lockp); - if (cnt >= NF_CT_EVICTION_RANGE) + if (ct || cnt >= NF_CT_EVICTION_RANGE) break; - hash = (hash + 1) % net->ct.htable_size; } - rcu_read_unlock(); + local_bh_enable(); if (!ct) return dropped; if (del_timer(&ct->timeout)) { - death_by_timeout((unsigned long)ct); - /* Check if we indeed killed this entry. Reliable event - delivery may have inserted it into the dying list. */ - if (test_bit(IPS_DYING_BIT, &ct->status)) { + if (nf_ct_delete(ct, 0, 0)) { dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); } @@ -680,12 +837,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { - if (!early_drop(net, hash_bucket(hash, net))) { + if (!early_drop(net, hash)) { atomic_dec(&net->ct.count); - if (net_ratelimit()) - printk(KERN_WARNING - "nf_conntrack: table full, dropping" - " packet.\n"); + net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); } } @@ -725,15 +879,15 @@ __nf_conntrack_alloc(struct net *net, u16 zone, nf_ct_zone->id = zone; } #endif - /* - * changes to lookup keys must be done before setting refcnt to 1 + /* Because we use RCU lookups, we set ct_general.use to zero before + * this is inserted in any list. 
*/ - smp_wmb(); - atomic_set(&ct->ct_general.use, 1); + atomic_set(&ct->ct_general.use, 0); return ct; #ifdef CONFIG_NF_CONNTRACK_ZONES out_free: + atomic_dec(&net->ct.count); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); return ERR_PTR(-ENOMEM); #endif @@ -752,13 +906,20 @@ void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + /* A freed object has refcnt == 0, that's + * the golden rule for SLAB_DESTROY_BY_RCU + */ + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + nf_ct_ext_destroy(ct); - atomic_dec(&net->ct.count); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + smp_mb__before_atomic(); + atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); + /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static struct nf_conntrack_tuple_hash * @@ -773,8 +934,10 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_ecache *ecache; - struct nf_conntrack_expect *exp; + struct nf_conntrack_expect *exp = NULL; u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + struct nf_conn_timeout *timeout_ext; + unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { pr_debug("Can't invert tuple.\n"); @@ -786,52 +949,73 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (!l4proto->new(ct, skb, dataoff)) { + if (tmpl && nfct_synproxy(tmpl)) { + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); + } + + timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; + if (timeout_ext) + timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); + else + timeouts = l4proto->get_timeouts(net); + + if (!l4proto->new(ct, skb, dataoff, timeouts)) { nf_conntrack_free(ct); pr_debug("init conntrack: can't track with proto module\n"); return NULL; } + if (timeout_ext) + nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC); + nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + nf_ct_labels_ext_add(ct); ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, ecache ? ecache->expmask : 0, GFP_ATOMIC); - spin_lock_bh(&nf_conntrack_lock); - exp = nf_ct_find_expectation(net, zone, tuple); - if (exp) { - pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", - ct, exp); - /* Welcome, Mr. Bond. We've been expecting you... */ - __set_bit(IPS_EXPECTED_BIT, &ct->status); - ct->master = exp->master; - if (exp->helper) { - help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); - if (help) - rcu_assign_pointer(help->helper, exp->helper); - } + local_bh_disable(); + if (net->ct.expect_count) { + spin_lock(&nf_conntrack_expect_lock); + exp = nf_ct_find_expectation(net, zone, tuple); + if (exp) { + pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + ct, exp); + /* Welcome, Mr. Bond. We've been expecting you... 
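Per-flow timeout selection, as introduced above: a template attached by the CT target can carry a timeout extension, and only when it is absent do flows fall back to the protocol's global policy. A trivial sketch of the selection with stand-in types (the kernel uses struct nf_conn_timeout and NF_CT_TIMEOUT_EXT_DATA() here):

struct timeout_ext { unsigned int timeouts[8]; };   /* illustrative */

static unsigned int *pick_timeouts(struct timeout_ext *tmpl_ext,
                                   unsigned int *proto_defaults)
{
        /* A template attached by the CT target may carry its own policy;
         * otherwise every flow of this l4 protocol shares the defaults. */
        return tmpl_ext ? tmpl_ext->timeouts : proto_defaults;
}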
*/ + __set_bit(IPS_EXPECTED_BIT, &ct->status); + /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ + ct->master = exp->master; + if (exp->helper) { + help = nf_ct_helper_ext_add(ct, exp->helper, + GFP_ATOMIC); + if (help) + rcu_assign_pointer(help->helper, exp->helper); + } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = exp->master->mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK - ct->secmark = exp->master->secmark; + ct->secmark = exp->master->secmark; #endif - nf_conntrack_get(&ct->master->ct_general); - NF_CT_STAT_INC(net, expect_new); - } else { + NF_CT_STAT_INC(net, expect_new); + } + spin_unlock(&nf_conntrack_expect_lock); + } + if (!exp) { __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); NF_CT_STAT_INC(net, new); } - /* Overload tuple linked list to put us in unconfirmed list. */ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.unconfirmed); + /* Now it is inserted into the unconfirmed list, bump refcount */ + nf_conntrack_get(&ct->ct_general); + nf_ct_add_to_unconfirmed_list(ct); - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (exp) { if (exp->expectfn) @@ -913,6 +1097,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, enum ip_conntrack_info ctinfo; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts; unsigned int dataoff; u_int8_t protonum; int set_reply = 0; @@ -977,7 +1162,10 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, NF_CT_ASSERT(skb->nfct); - ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum); + /* Decide what timeout policy we want to apply to this flow. */ + timeouts = nf_ct_timeout_lookup(net, ct, l4proto); + + ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ @@ -1075,12 +1263,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, acct: if (do_acct) { - struct nf_conn_counter *acct; + struct nf_conn_acct *acct; acct = nf_conn_acct_find(ct); if (acct) { - atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len, &acct[CTINFO2DIR(ctinfo)].bytes); + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); } } } @@ -1092,13 +1282,15 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, int do_acct) { if (do_acct) { - struct nf_conn_counter *acct; + struct nf_conn_acct *acct; acct = nf_conn_acct_find(ct); if (acct) { - atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets); + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); atomic64_add(skb->len - skb_network_offset(skb), - &acct[CTINFO2DIR(ctinfo)].bytes); + &counter[CTINFO2DIR(ctinfo)].bytes); } } @@ -1130,8 +1322,9 @@ static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port); - NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port); + if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || + nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) + goto nla_put_failure; return 0; nla_put_failure: @@ -1166,7 +1359,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. 
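Note the fast path guarding the expectation lookup above: nf_conntrack_expect_lock is taken only when net->ct.expect_count is nonzero, so setups without helpers never contend on it. Modelled in userspace (names are illustrative; the racy unlocked check is by design, the lookup under the lock is authoritative):

#include <pthread.h>
#include <stdatomic.h>

static atomic_int expect_count;
static pthread_mutex_t expect_lock = PTHREAD_MUTEX_INITIALIZER;

/* A new flow touches the shared expectation lock only if at least one
 * expectation exists anywhere in this namespace. */
static void *find_expectation_fast(int tuple, void *(*find)(int))
{
        void *exp = NULL;

        if (atomic_load(&expect_count) != 0) {
                pthread_mutex_lock(&expect_lock);
                exp = find(tuple);
                pthread_mutex_unlock(&expect_lock);
        }
        return exp;
}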
*/ -static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -1192,31 +1385,48 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; + int cpu; + spinlock_t *lockp; - spin_lock_bh(&nf_conntrack_lock); for (; *bucket < net->ct.htable_size; (*bucket)++) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; + local_bh_disable(); + spin_lock(lockp); + if (*bucket < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } + } + spin_unlock(lockp); + local_bh_enable(); + } + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) - goto found; + set_bit(IPS_DYING_BIT, &ct->status); } + spin_unlock_bh(&pcpu->lock); } - hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - spin_unlock_bh(&nf_conntrack_lock); return NULL; found: atomic_inc(&ct->ct_general.use); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock(lockp); + local_bh_enable(); return ct; } void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), - void *data) + void *data, u32 portid, int report) { struct nf_conn *ct; unsigned int bucket = 0; @@ -1224,7 +1434,8 @@ void nf_ct_iterate_cleanup(struct net *net, while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); + nf_ct_delete(ct, portid, report); + /* ... else the timer will get him soon. */ nf_ct_put(ct); @@ -1232,30 +1443,6 @@ void nf_ct_iterate_cleanup(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); -struct __nf_ct_flush_report { - u32 pid; - int report; -}; - -static int kill_report(struct nf_conn *i, void *data) -{ - struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; - struct nf_conn_tstamp *tstamp; - - tstamp = nf_conn_tstamp_find(i); - if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); - - /* If we fail to deliver the event, death_by_timeout() will retry */ - if (nf_conntrack_event_report(IPCT_DESTROY, i, - fr->pid, fr->report) < 0) - return 1; - - /* Avoid the delivery of the destroy event in death_by_timeout(). 
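With kill_report removed below, flushing becomes nf_ct_iterate_cleanup() with the always-true kill_all predicate, and the netlink portid/report pair rides along so nf_ct_delete() can attribute the destroy events. A reduced userspace model of the predicate-driven walk:

struct conn { int dying; };

static int kill_all(struct conn *c, void *data)
{
        (void)c; (void)data;
        return 1;                       /* flush: every entry matches */
}

/* The walker owns locking and deletion; callers only supply the
 * predicate plus the netlink attribution for events. */
static void iterate_cleanup(struct conn *tbl, int n,
                            int (*iter)(struct conn *, void *), void *data,
                            unsigned int portid, int report)
{
        int i;

        (void)portid; (void)report;     /* real code: nf_ct_delete(ct, portid, report) */
        for (i = 0; i < n; i++)
                if (iter(&tbl[i], data))
                        tbl[i].dying = 1;
}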
*/ - set_bit(IPS_DYING_BIT, &i->status); - return 1; -} - static int kill_all(struct nf_conn *i, void *data) { return 1; @@ -1271,13 +1458,9 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { - struct __nf_ct_flush_report fr = { - .pid = pid, - .report = report, - }; - nf_ct_iterate_cleanup(net, kill_report, &fr); + nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report); } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); @@ -1286,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; + int cpu; - spin_lock_bh(&nf_conntrack_lock); - hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - /* never fails to remove them, no listeners at this point */ - nf_ct_kill(ct); + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&pcpu->lock); } - spin_unlock_bh(&nf_conntrack_lock); } static int untrack_refs(void) @@ -1308,55 +1496,79 @@ static int untrack_refs(void) return cnt; } -static void nf_conntrack_cleanup_init_net(void) +void nf_conntrack_cleanup_start(void) { + RCU_INIT_POINTER(ip_ct_attach, NULL); +} + +void nf_conntrack_cleanup_end(void) +{ + RCU_INIT_POINTER(nf_ct_destroy, NULL); while (untrack_refs() > 0) schedule(); - nf_conntrack_helper_fini(); - nf_conntrack_proto_fini(); #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif + nf_conntrack_proto_fini(); + nf_conntrack_seqadj_fini(); + nf_conntrack_labels_fini(); + nf_conntrack_helper_fini(); + nf_conntrack_timeout_fini(); + nf_conntrack_ecache_fini(); + nf_conntrack_tstamp_fini(); + nf_conntrack_acct_fini(); + nf_conntrack_expect_fini(); } -static void nf_conntrack_cleanup_net(struct net *net) +/* + * Mishearing the voices in his head, our hero wonders how he's + * supposed to kill the mall. + */ +void nf_conntrack_cleanup_net(struct net *net) { - i_see_dead_people: - nf_ct_iterate_cleanup(net, kill_all, NULL); - nf_ct_release_dying_list(net); - if (atomic_read(&net->ct.count) != 0) { - schedule(); - goto i_see_dead_people; - } + LIST_HEAD(single); - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); - nf_conntrack_ecache_fini(net); - nf_conntrack_tstamp_fini(net); - nf_conntrack_acct_fini(net); - nf_conntrack_expect_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); - free_percpu(net->ct.stat); + list_add(&net->exit_list, &single); + nf_conntrack_cleanup_net_list(&single); } -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void nf_conntrack_cleanup(struct net *net) +void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { - if (net_eq(net, &init_net)) - RCU_INIT_POINTER(ip_ct_attach, NULL); + int busy; + struct net *net; - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... */ + /* + * This makes sure all current packets have passed through + * netfilter framework. Roll on, two-stage module + * delete... 
+ */ synchronize_net(); +i_see_dead_people: + busy = 0; + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); + nf_ct_release_dying_list(net); + if (atomic_read(&net->ct.count) != 0) + busy = 1; + } + if (busy) { + schedule(); + goto i_see_dead_people; + } - nf_conntrack_cleanup_net(net); - - if (net_eq(net, &init_net)) { - RCU_INIT_POINTER(nf_ct_destroy, NULL); - nf_conntrack_cleanup_init_net(); + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_proto_pernet_fini(net); + nf_conntrack_helper_pernet_fini(net); + nf_conntrack_ecache_pernet_fini(net); + nf_conntrack_tstamp_pernet_fini(net); + nf_conntrack_acct_pernet_fini(net); + nf_conntrack_expect_pernet_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); + free_percpu(net->ct.stat); + free_percpu(net->ct.pcpu_lists); } } @@ -1386,7 +1598,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) { - int i, bucket; + int i, bucket, rc; unsigned int hashsize, old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; @@ -1399,7 +1611,9 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!nf_conntrack_htable_size) return param_set_uint(val, kp); - hashsize = simple_strtoul(val, NULL, 0); + rc = kstrtouint(val, 0, &hashsize); + if (rc) + return rc; if (!hashsize) return -EINVAL; @@ -1407,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; + local_bh_disable(); + nf_conntrack_all_lock(); + write_seqcount_begin(&init_net.ct.generation); + /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections * created because of a false negative won't make it into the hash - * though since that required taking the lock. + * though since that required taking the locks. */ - spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < init_net.ct.htable_size; i++) { while (!hlist_nulls_empty(&init_net.ct.hash[i])) { h = hlist_nulls_entry(init_net.ct.hash[i].first, @@ -1429,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; init_net.ct.hash = hash; - spin_unlock_bh(&nf_conntrack_lock); + + write_seqcount_end(&init_net.ct.generation); + nf_conntrack_all_unlock(); + local_bh_enable(); nf_ct_free_hashtable(old_hash, old_size); return 0; @@ -1448,10 +1669,13 @@ void nf_ct_untracked_status_or(unsigned long bits) } EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); -static int nf_conntrack_init_init_net(void) +int nf_conntrack_init_start(void) { int max_factor = 8; - int ret, cpu; + int i, ret, cpu; + + for (i = 0; i < CONNTRACK_LOCKS; i++) + spin_lock_init(&nf_conntrack_locks[i]); /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. 
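The sizing comment above can be made concrete; a small program reproducing the quoted i386 figure (memory size and hlist_head width are the comment's example values, not probed):

#include <stdio.h>

int main(void)
{
        unsigned long mem = 32UL << 20;  /* 32 MiB, the comment's example */
        unsigned long hlist_head = 4;    /* sizeof(struct hlist_head) on i386 */
        unsigned long max_factor = 8;

        unsigned long buckets = mem / 16384 / hlist_head;
        printf("htable_size=%lu nf_conntrack_max=%lu\n",
               buckets, max_factor * buckets);   /* 512 and 4096 */
        return 0;
}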
*/ @@ -1476,19 +1700,47 @@ static int nf_conntrack_init_init_net(void) NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); - ret = nf_conntrack_proto_init(); + ret = nf_conntrack_expect_init(); if (ret < 0) - goto err_proto; + goto err_expect; + + ret = nf_conntrack_acct_init(); + if (ret < 0) + goto err_acct; + + ret = nf_conntrack_tstamp_init(); + if (ret < 0) + goto err_tstamp; + + ret = nf_conntrack_ecache_init(); + if (ret < 0) + goto err_ecache; + + ret = nf_conntrack_timeout_init(); + if (ret < 0) + goto err_timeout; ret = nf_conntrack_helper_init(); if (ret < 0) goto err_helper; + ret = nf_conntrack_labels_init(); + if (ret < 0) + goto err_labels; + + ret = nf_conntrack_seqadj_init(); + if (ret < 0) + goto err_seqadj; + #ifdef CONFIG_NF_CONNTRACK_ZONES ret = nf_ct_extend_register(&nf_ct_zone_extend); if (ret < 0) goto err_extend; #endif + ret = nf_conntrack_proto_init(); + if (ret < 0) + goto err_proto; + /* Set up fake conntrack: to never be deleted, not in any hashes */ for_each_possible_cpu(cpu) { struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); @@ -1499,78 +1751,117 @@ static int nf_conntrack_init_init_net(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); return 0; +err_proto: #ifdef CONFIG_NF_CONNTRACK_ZONES + nf_ct_extend_unregister(&nf_ct_zone_extend); err_extend: - nf_conntrack_helper_fini(); #endif + nf_conntrack_seqadj_fini(); +err_seqadj: + nf_conntrack_labels_fini(); +err_labels: + nf_conntrack_helper_fini(); err_helper: - nf_conntrack_proto_fini(); -err_proto: + nf_conntrack_timeout_fini(); +err_timeout: + nf_conntrack_ecache_fini(); +err_ecache: + nf_conntrack_tstamp_fini(); +err_tstamp: + nf_conntrack_acct_fini(); +err_acct: + nf_conntrack_expect_fini(); +err_expect: return ret; } +void nf_conntrack_init_end(void) +{ + /* For use by REJECT target */ + RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); + RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); +} + /* * We need to use special "null" values, not used in hash table */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) #define DYING_NULLS_VAL ((1<<30)+1) +#define TEMPLATE_NULLS_VAL ((1<<30)+2) -static int nf_conntrack_init_net(struct net *net) +int nf_conntrack_init_net(struct net *net) { - int ret; + int ret = -ENOMEM; + int cpu; atomic_set(&net->ct.count, 0); - INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); - net->ct.stat = alloc_percpu(struct ip_conntrack_stat); - if (!net->ct.stat) { - ret = -ENOMEM; + seqcount_init(&net->ct.generation); + + net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); + if (!net->ct.pcpu_lists) goto err_stat; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_init(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) + goto err_pcpu_lists; + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); - if (!net->ct.slabname) { - ret = -ENOMEM; + if (!net->ct.slabname) goto err_slabname; - } net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, sizeof(struct nf_conn), 0, SLAB_DESTROY_BY_RCU, NULL); if (!net->ct.nf_conntrack_cachep) { printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - ret = -ENOMEM; goto err_cache; } net->ct.htable_size = nf_conntrack_htable_size; 
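nf_conntrack_init_start() now brings up eight subsystems and unwinds them in exact reverse order on failure, as the err_* labels below show. The idiom, reduced to three stages with stub initializers:

static int  expect_init(void) { return 0; }
static void expect_fini(void) { }
static int  acct_init(void)   { return 0; }
static void acct_fini(void)   { }
static int  helper_init(void) { return 0; }

/* Each err_* label tears down exactly what was initialized before the
 * failing step, so a failure at any depth leaves no residue. */
static int init_start(void)
{
        int ret;

        ret = expect_init();
        if (ret < 0)
                goto err_expect;
        ret = acct_init();
        if (ret < 0)
                goto err_acct;
        ret = helper_init();
        if (ret < 0)
                goto err_helper;
        return 0;

err_helper:
        acct_fini();
err_acct:
        expect_fini();
err_expect:
        return ret;
}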
net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); if (!net->ct.hash) { - ret = -ENOMEM; printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_hash; } - ret = nf_conntrack_expect_init(net); + ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; - ret = nf_conntrack_acct_init(net); + ret = nf_conntrack_acct_pernet_init(net); if (ret < 0) goto err_acct; - ret = nf_conntrack_tstamp_init(net); + ret = nf_conntrack_tstamp_pernet_init(net); if (ret < 0) goto err_tstamp; - ret = nf_conntrack_ecache_init(net); + ret = nf_conntrack_ecache_pernet_init(net); if (ret < 0) goto err_ecache; - + ret = nf_conntrack_helper_pernet_init(net); + if (ret < 0) + goto err_helper; + ret = nf_conntrack_proto_pernet_init(net); + if (ret < 0) + goto err_proto; return 0; +err_proto: + nf_conntrack_helper_pernet_fini(net); +err_helper: + nf_conntrack_ecache_pernet_fini(net); err_ecache: - nf_conntrack_tstamp_fini(net); + nf_conntrack_tstamp_pernet_fini(net); err_tstamp: - nf_conntrack_acct_fini(net); + nf_conntrack_acct_pernet_fini(net); err_acct: - nf_conntrack_expect_fini(net); + nf_conntrack_expect_pernet_fini(net); err_expect: nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); err_hash: @@ -1579,41 +1870,8 @@ err_cache: kfree(net->ct.slabname); err_slabname: free_percpu(net->ct.stat); +err_pcpu_lists: + free_percpu(net->ct.pcpu_lists); err_stat: return ret; } - -s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); -EXPORT_SYMBOL_GPL(nf_ct_nat_offset); - -int nf_conntrack_init(struct net *net) -{ - int ret; - - if (net_eq(net, &init_net)) { - ret = nf_conntrack_init_init_net(); - if (ret < 0) - goto out_init_net; - } - ret = nf_conntrack_init_net(net); - if (ret < 0) - goto out_net; - - if (net_eq(net, &init_net)) { - /* For use by REJECT target */ - RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); - RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); - - /* Howto get NAT offsets */ - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); - } - return 0; - -out_net: - if (net_eq(net, &init_net)) - nf_conntrack_cleanup_init_net(); -out_init_net: - return ret; -} diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 14af6329bdd..1df17614656 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -1,8 +1,10 @@ /* Event cache for netfilter. 
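The single nf_conntrack_init(net) entry point is gone: global state is set up once through nf_conntrack_init_start()/init_end(), while each namespace runs nf_conntrack_init_net(). A hedged sketch of how this is typically wired through pernet_operations; the actual registration lives in nf_conntrack_standalone.c, outside this hunk, and the pernet function names here are assumptions:

static struct pernet_operations nf_conntrack_net_ops = {
        .init       = nf_conntrack_pernet_init,   /* assumed per-net entry */
        .exit_batch = nf_conntrack_pernet_exit,   /* batched exit: receives the
                                                     net_exit_list used above */
};

static int __init sketch_module_init(void)
{
        int ret = nf_conntrack_init_start();      /* global state, once */

        if (ret < 0)
                return ret;
        ret = register_pernet_subsys(&nf_conntrack_net_ops);
        if (ret < 0)
                return ret;
        nf_conntrack_init_end();                  /* publish attach/destroy hooks */
        return 0;
}

The .exit_batch callback is what makes nf_conntrack_cleanup_net_list() pay for one synchronize_net() and one i_see_dead_people loop across all dying namespaces instead of once per namespace.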
*/ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> +/* + * (C) 2005 Harald Welte <laforge@gnumonks.org> + * (C) 2005 Patrick McHardy <kaber@trash.net> + * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,9 +34,11 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); void nf_ct_deliver_cached_events(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned long events; + unsigned long events, missed; struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; + struct nf_ct_event item; + int ret; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); @@ -47,31 +51,32 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) events = xchg(&e->cache, 0); - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) { - struct nf_ct_event item = { - .ct = ct, - .pid = 0, - .report = 0 - }; - int ret; - /* We make a copy of the missed event cache without taking - * the lock, thus we may send missed events twice. However, - * this does not harm and it happens very rarely. */ - unsigned long missed = e->missed; - - if (!((events | missed) & e->ctmask)) - goto out_unlock; - - ret = notify->fcn(events | missed, &item); - if (unlikely(ret < 0 || missed)) { - spin_lock_bh(&ct->lock); - if (ret < 0) - e->missed |= events; - else - e->missed &= ~missed; - spin_unlock_bh(&ct->lock); - } - } + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) + goto out_unlock; + + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. 
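The reworked delivery path claims pending events with xchg() so concurrent event markers are never lost, folds in previously missed events, and falls back to ct->lock only to update the missed mask. A self-contained model using C11 atomics in place of the spinlock:

#include <stdatomic.h>

static atomic_ulong cache;        /* pending events, set by markers */
static atomic_ulong missed;       /* events a listener failed to take */

static void deliver(int (*notify)(unsigned long events))
{
        unsigned long ev = atomic_exchange(&cache, 0);  /* claim atomically */
        unsigned long mi = atomic_load(&missed);

        if (!(ev | mi))
                return;
        if (notify(ev | mi) < 0)
                atomic_fetch_or(&missed, ev);    /* keep for a later retry */
        else
                atomic_fetch_and(&missed, ~mi);  /* those made it out */
}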
*/ + missed = e->missed; + + if (!((events | missed) & e->ctmask)) + goto out_unlock; + + item.ct = ct; + item.portid = 0; + item.report = 0; + + ret = notify->fcn(events | missed, &item); + + if (likely(ret >= 0 && !missed)) + goto out_unlock; + + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); out_unlock: rcu_read_unlock(); @@ -81,7 +86,7 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); int nf_conntrack_register_notifier(struct net *net, struct nf_ct_event_notifier *new) { - int ret = 0; + int ret; struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); @@ -92,8 +97,7 @@ int nf_conntrack_register_notifier(struct net *net, goto out_unlock; } rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); - mutex_unlock(&nf_ct_ecache_mutex); - return ret; + ret = 0; out_unlock: mutex_unlock(&nf_ct_ecache_mutex); @@ -118,7 +122,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); int nf_ct_expect_register_notifier(struct net *net, struct nf_exp_event_notifier *new) { - int ret = 0; + int ret; struct nf_exp_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); @@ -129,8 +133,7 @@ int nf_ct_expect_register_notifier(struct net *net, goto out_unlock; } rcu_assign_pointer(net->ct.nf_expect_event_cb, new); - mutex_unlock(&nf_ct_ecache_mutex); - return ret; + ret = 0; out_unlock: mutex_unlock(&nf_ct_ecache_mutex); @@ -195,9 +198,12 @@ static int nf_conntrack_event_init_sysctl(struct net *net) table[0].data = &net->ct.sysctl_events; table[1].data = &net->ct.sysctl_events_retry_timeout; + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + net->ct.event_sysctl_header = - register_net_sysctl_table(net, - nf_net_netfilter_sysctl_path, table); + register_net_sysctl(net, "net/netfilter", table); if (!net->ct.event_sysctl_header) { printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); goto out_register; @@ -229,38 +235,27 @@ static void nf_conntrack_event_fini_sysctl(struct net *net) } #endif /* CONFIG_SYSCTL */ -int nf_conntrack_ecache_init(struct net *net) +int nf_conntrack_ecache_pernet_init(struct net *net) { - int ret; - net->ct.sysctl_events = nf_ct_events; net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + return nf_conntrack_event_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&event_extend); - if (ret < 0) { - printk(KERN_ERR "nf_ct_event: Unable to register " - "event extension.\n"); - goto out_extend_register; - } - } +void nf_conntrack_ecache_pernet_fini(struct net *net) +{ + nf_conntrack_event_fini_sysctl(net); +} - ret = nf_conntrack_event_init_sysctl(net); +int nf_conntrack_ecache_init(void) +{ + int ret = nf_ct_extend_register(&event_extend); if (ret < 0) - goto out_sysctl; - - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&event_extend); -out_extend_register: + pr_err("nf_ct_event: Unable to register event extension.\n"); return ret; } -void nf_conntrack_ecache_fini(struct net *net) +void nf_conntrack_ecache_fini(void) { - nf_conntrack_event_fini_sysctl(net); - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&event_extend); + nf_ct_extend_unregister(&event_extend); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4147ba3f653..f87e8f68ad4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 
2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (c) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -40,7 +41,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, - u32 pid, int report) + u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -54,7 +55,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del(&exp->lnode); master_help->expecting[exp->class]--; - nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); + nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); @@ -65,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) { struct nf_conntrack_expect *exp = (void *)ul_expect; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } @@ -90,14 +91,13 @@ __nf_ct_expect_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; - struct hlist_node *n; unsigned int h; if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && nf_ct_zone(i->master) == zone) return i; @@ -130,14 +130,13 @@ nf_ct_find_expectation(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; - struct hlist_node *n; unsigned int h; if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && nf_ct_zone(i->master) == zone) { @@ -156,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone, if (!nf_ct_is_confirmed(exp->master)) return NULL; + /* Avoid race with other CPUs, that for exp->master ct, is + * about to invoke ->destroy(), or nf_ct_delete() via timeout + * or early_drop(). + * + * The atomic_inc_not_zero() check tells: If that fails, we + * know that the ct is being destroyed. If it succeeds, we + * can be sure the ct cannot disappear underneath. + */ + if (unlikely(nf_ct_is_dying(exp->master) || + !atomic_inc_not_zero(&exp->master->ct_general.use))) + return NULL; + if (exp->flags & NF_CT_EXPECT_PERMANENT) { atomic_inc(&exp->use); return exp; @@ -163,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone, nf_ct_unlink_expect(exp); return exp; } + /* Undo exp->master refcnt increase, if del_timer() failed */ + nf_ct_put(exp->master); return NULL; } @@ -172,18 +185,20 @@ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; - struct hlist_node *n, *next; + struct hlist_node *next; /* Optimization: most connection never expect any others. 
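These expectation hunks also absorb the hlist API change in which hlist_for_each_entry{,_rcu,_safe}() dropped their separate struct hlist_node cursor; the entry pointer itself now walks the list, as in this lookup taken from the hunk above:

struct nf_conntrack_expect *i;

/* New form: no struct hlist_node *n cursor any more; the macro
 * advances the entry pointer directly. */
hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
        if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
            nf_ct_zone(i->master) == zone)
                return i;
}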
*/ if (!help) return; - hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); @@ -218,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); @@ -294,6 +309,11 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; + +#ifdef CONFIG_NF_NAT_NEEDED + memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); + memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); +#endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); @@ -331,7 +351,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); helper = rcu_dereference_protected(master_help->helper, - lockdep_is_held(&nf_conntrack_lock)); + lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; @@ -348,9 +368,8 @@ static void evict_oldest_expect(struct nf_conn *master, { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; - struct hlist_node *n; - hlist_for_each_entry(exp, n, &master_help->expectations, lnode) { + hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } @@ -361,23 +380,6 @@ static void evict_oldest_expect(struct nf_conn *master, } } -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ - struct nf_conn_help *master_help = nfct_help(i->master); - const struct nf_conntrack_expect_policy *p; - - if (!del_timer(&i->timeout)) - return 0; - - p = &rcu_dereference_protected( - master_help->helper, - lockdep_is_held(&nf_conntrack_lock) - )->expect_policy[i->class]; - i->timeout.expires = jiffies + p->timeout * HZ; - add_timer(&i->timeout); - return 1; -} - static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) { const struct nf_conntrack_expect_policy *p; @@ -386,7 +388,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); - struct hlist_node *n; + struct hlist_node *next; unsigned int h; int ret = 1; @@ -395,12 +397,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. 
*/ - if (refresh_timer(i)) { - ret = 0; - goto out; + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_ct_expect_put(i); + break; } } else if (expect_clash(i, expect)) { ret = -EBUSY; @@ -409,7 +411,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } /* Will be over limit? */ helper = rcu_dereference_protected(master_help->helper, - lockdep_is_held(&nf_conntrack_lock)); + lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && @@ -424,21 +426,19 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } if (net->ct.expect_count >= nf_ct_expect_max) { - if (net_ratelimit()) - printk(KERN_WARNING - "nf_conntrack: expectation table full\n"); + net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } -int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 pid, int report) +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + u32 portid, int report) { int ret; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect); if (ret <= 0) goto out; @@ -446,11 +446,11 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, ret = nf_ct_expect_insert(expect); if (ret < 0) goto out; - spin_unlock_bh(&nf_conntrack_lock); - nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); + spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; out: - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); @@ -590,7 +590,8 @@ static int exp_proc_init(struct net *net) #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; - proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops); + proc = proc_create("nf_conntrack_expect", 0440, net->proc_net, + &exp_file_ops); if (!proc) return -ENOMEM; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ @@ -600,59 +601,56 @@ static int exp_proc_init(struct net *net) static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS - proc_net_remove(net, "nf_conntrack_expect"); + remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); -int nf_conntrack_expect_init(struct net *net) +int nf_conntrack_expect_pernet_init(struct net *net) { int err = -ENOMEM; - if (net_eq(net, &init_net)) { - if (!nf_ct_expect_hsize) { - nf_ct_expect_hsize = net->ct.htable_size / 256; - if (!nf_ct_expect_hsize) - nf_ct_expect_hsize = 1; - } - nf_ct_expect_max = nf_ct_expect_hsize * 4; - } - net->ct.expect_count = 0; net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (net->ct.expect_hash == NULL) goto err1; - if (net_eq(net, &init_net)) { - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", - sizeof(struct nf_conntrack_expect), - 0, 0, NULL); - if (!nf_ct_expect_cachep) - goto err2; - } - err = exp_proc_init(net); if (err < 0) - goto err3; + goto err2; return 0; - -err3: - if (net_eq(net, &init_net)) - kmem_cache_destroy(nf_ct_expect_cachep); err2: nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); err1: return err; } -void nf_conntrack_expect_fini(struct net *net) +void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - if (net_eq(net, &init_net)) { - rcu_barrier(); /* 
Wait for call_rcu() before destroy */ - kmem_cache_destroy(nf_ct_expect_cachep); - } nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } + +int nf_conntrack_expect_init(void) +{ + if (!nf_ct_expect_hsize) { + nf_ct_expect_hsize = nf_conntrack_htable_size / 256; + if (!nf_ct_expect_hsize) + nf_ct_expect_hsize = 1; + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL); + if (!nf_ct_expect_cachep) + return -ENOMEM; + return 0; +} + +void nf_conntrack_expect_fini(void) +{ + rcu_barrier(); /* Wait for call_rcu() before destroy */ + kmem_cache_destroy(nf_ct_expect_cachep); +} diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 641ff5f9671..1a9545965c0 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -44,7 +44,8 @@ void __nf_ct_ext_destroy(struct nf_conn *ct) EXPORT_SYMBOL(__nf_ct_ext_destroy); static void * -nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp) +nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, + size_t var_alloc_len, gfp_t gfp) { unsigned int off, len; struct nf_ct_ext_type *t; @@ -54,8 +55,8 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp) t = rcu_dereference(nf_ct_ext_types[id]); BUG_ON(t == NULL); off = ALIGN(sizeof(struct nf_ct_ext), t->align); - len = off + t->len; - alloc_size = t->alloc_size; + len = off + t->len + var_alloc_len; + alloc_size = t->alloc_size + var_alloc_len; rcu_read_unlock(); *ext = kzalloc(alloc_size, gfp); @@ -68,7 +69,8 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp) return (void *)(*ext) + off; } -void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) +void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, + size_t var_alloc_len, gfp_t gfp) { struct nf_ct_ext *old, *new; int i, newlen, newoff; @@ -79,7 +81,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) old = ct->ext; if (!old) - return nf_ct_ext_create(&ct->ext, id, gfp); + return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp); if (__nf_ct_ext_exist(old, id)) return NULL; @@ -89,7 +91,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) BUG_ON(t == NULL); newoff = ALIGN(old->len, t->align); - newlen = newoff + t->len; + newlen = newoff + t->len + var_alloc_len; rcu_read_unlock(); new = __krealloc(old, newlen, gfp); @@ -117,7 +119,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) memset((void *)new + newoff, 0, newlen - newoff); return (void *)new + newoff; } -EXPORT_SYMBOL(__nf_ct_ext_add); +EXPORT_SYMBOL(__nf_ct_ext_add_length); static void update_alloc_size(struct nf_ct_ext_type *type) { diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 8c5c95c6d34..b8a0924064e 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -48,15 +49,20 @@ module_param(loose, bool, 0600); unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, enum ip_conntrack_info 
ctinfo, enum nf_ct_ftp_type type, + unsigned int protoff, unsigned int matchoff, unsigned int matchlen, struct nf_conntrack_expect *exp); EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); -static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); -static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); +static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, - char); + char, unsigned int *); static struct ftp_search { const char *pattern; @@ -64,7 +70,7 @@ static struct ftp_search { char skip; char term; enum nf_ct_ftp_type ftptype; - int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); } search[IP_CT_DIR_MAX][2] = { [IP_CT_DIR_ORIGINAL] = { { @@ -88,10 +94,8 @@ static struct ftp_search { { .pattern = "227 ", .plen = sizeof("227 ") - 1, - .skip = '(', - .term = ')', .ftptype = NF_CT_FTP_PASV, - .getnum = try_rfc959, + .getnum = try_rfc1123, }, { .pattern = "229 ", @@ -130,8 +134,9 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[], i++; else { /* Unexpected character; true if it's the - terminator and we're finished. */ - if (*data == term && i == array_size - 1) + terminator (or we don't care about one) + and we're finished. */ + if ((*data == term || !term) && i == array_size - 1) return len; pr_debug("Char %u (got %u nums) `%u' unexpected\n", @@ -146,7 +151,8 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[], /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ static int try_rfc959(const char *data, size_t dlen, - struct nf_conntrack_man *cmd, char term) + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) { int length; u_int32_t array[6]; @@ -161,6 +167,33 @@ static int try_rfc959(const char *data, size_t dlen, return length; } +/* + * From RFC 1123: + * The format of the 227 reply to a PASV command is not + * well standardized. In particular, an FTP client cannot + * assume that the parentheses shown on page 40 of RFC-959 + * will be present (and in fact, Figure 3 on page 43 omits + * them). Therefore, a User-FTP program that interprets + * the PASV reply must scan the reply for the first digit + * of the host and port numbers. 
+ */ +static int try_rfc1123(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) +{ + int i; + for (i = 0; i < dlen; i++) + if (isdigit(data[i])) + break; + + if (i == dlen) + return 0; + + *offset += i; + + return try_rfc959(data + i, dlen - i, cmd, 0, offset); +} + /* Grab port: number up to delimiter */ static int get_port(const char *data, int start, size_t dlen, char delim, __be16 *port) @@ -189,7 +222,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim, /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, - char term) + char term, unsigned int *offset) { char delim; int length; @@ -237,7 +270,8 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, /* Returns 0, or length of numbers: |||6446| */ static int try_epsv_response(const char *data, size_t dlen, - struct nf_conntrack_man *cmd, char term) + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) { char delim; @@ -259,9 +293,10 @@ static int find_pattern(const char *data, size_t dlen, unsigned int *numlen, struct nf_conntrack_man *cmd, int (*getnum)(const char *, size_t, - struct nf_conntrack_man *, char)) + struct nf_conntrack_man *, char, + unsigned int *)) { - size_t i; + size_t i = plen; pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); if (dlen == 0) @@ -291,16 +326,18 @@ static int find_pattern(const char *data, size_t dlen, pr_debug("Pattern matches!\n"); /* Now we've found the constant string, try to skip to the 'skip' character */ - for (i = plen; data[i] != skip; i++) - if (i == dlen - 1) return -1; + if (skip) { + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; - /* Skip over the last character */ - i++; + /* Skip over the last character */ + i++; + } pr_debug("Skipped up to `%c'!\n", skip); *numoff = i; - *numlen = getnum(data + i, dlen - i, cmd, term); + *numlen = getnum(data + i, dlen - i, cmd, term, numoff); if (!*numlen) return -1; @@ -358,7 +395,7 @@ static int help(struct sk_buff *skb, u32 seq; int dir = CTINFO2DIR(ctinfo); unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); - struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info; + struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; union nf_inet_addr *daddr; struct nf_conntrack_man cmd = {}; @@ -395,6 +432,12 @@ static int help(struct sk_buff *skb, /* Look up to see if we're just after a \n. */ if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* We're picking up this, clear flags and let it continue */ + if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) { + ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP; + goto skip_nl_seq; + } + /* Now if this ends in \n, update ftp info. */ pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n", ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", @@ -405,6 +448,7 @@ static int help(struct sk_buff *skb, goto out_update_nl; } +skip_nl_seq: /* Initialize IP/IPv6 addr to expected address (it's not mentioned in EPSV responses) */ cmd.l3num = nf_ct_l3num(ct); @@ -427,8 +471,8 @@ static int help(struct sk_buff *skb, connection tracking, not packet filtering. However, it is necessary for accurate tracking in this case. 
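The effect of try_rfc1123() on 227 replies, with two illustrative but RFC-conformant forms, plus a minimal standalone model of the scan-for-first-digit step it performs before delegating to try_rfc959():

#include <ctype.h>
#include <stddef.h>

/* Both illustrative replies parse to 192.168.1.2, port 4*256+31 = 1055:
 *   "227 Entering Passive Mode (192,168,1,2,4,31)."   parenthesised, RFC 959
 *   "227 Entering Passive Mode 192,168,1,2,4,31"      bare, tolerated per RFC 1123
 * try_rfc1123() finds the first digit, then calls try_rfc959() with
 * term == 0 so no closing ')' is demanded. */
static int first_digit(const char *data, size_t dlen)
{
        size_t i;

        for (i = 0; i < dlen; i++)
                if (isdigit((unsigned char)data[i]))
                        return (int)i;
        return -1;      /* no digit at all: reply is unparseable */
}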
*/ - pr_debug("conntrack_ftp: partial %s %u+%u\n", - search[dir][i].pattern, ntohl(th->seq), datalen); + nf_ct_helper_log(skb, ct, "partial matching of `%s'", + search[dir][i].pattern); ret = NF_DROP; goto out; } else if (found == 0) { /* No match */ @@ -442,6 +486,7 @@ static int help(struct sk_buff *skb, exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -489,12 +534,13 @@ static int help(struct sk_buff *skb, nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook); if (nf_nat_ftp && ct->status & IPS_NAT_MASK) ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype, - matchoff, matchlen, exp); + protoff, matchoff, matchlen, exp); else { /* Can't expect this? Best to drop packet now. */ - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - else + } else ret = NF_ACCEPT; } @@ -511,8 +557,20 @@ out_update_nl: return ret; } +static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) +{ + struct nf_ct_ftp_master *ftp = nfct_help_data(ct); + + /* This conntrack has been injected from user-space, always pick up + * sequence tracking. Otherwise, the first FTP command after the + * failover breaks. + */ + ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP; + ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP; + return 0; +} + static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; -static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")] __read_mostly; static const struct nf_conntrack_expect_policy ftp_exp_policy = { .max_expected = 1, @@ -541,7 +599,6 @@ static void nf_conntrack_ftp_fini(void) static int __init nf_conntrack_ftp_init(void) { int i, j = -1, ret = 0; - char *tmpname; ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) @@ -556,17 +613,17 @@ static int __init nf_conntrack_ftp_init(void) ftp[i][0].tuple.src.l3num = PF_INET; ftp[i][1].tuple.src.l3num = PF_INET6; for (j = 0; j < 2; j++) { + ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master); ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; ftp[i][j].expect_policy = &ftp_exp_policy; ftp[i][j].me = THIS_MODULE; ftp[i][j].help = help; - tmpname = &ftp_names[i][j][0]; + ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr; if (ports[i] == FTP_PORT) - sprintf(tmpname, "ftp"); + sprintf(ftp[i][j].name, "ftp"); else - sprintf(tmpname, "ftp-%d", ports[i]); - ftp[i][j].name = tmpname; + sprintf(ftp[i][j].name, "ftp-%d", ports[i]); pr_debug("nf_ct_ftp: registering helper for pf: %d " "port: %d\n", diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 722291f8af7..3a3a60b126e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -2,6 +2,7 @@ * H.323 connection tracking helper * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * This source code is licensed under General Public License version 2. 
* @@ -49,12 +50,12 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " "(determined by routing information)"); /* Hooks for NAT */ -int (*set_h245_addr_hook) (struct sk_buff *skb, +int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, union nf_inet_addr *addr, __be16 port) __read_mostly; -int (*set_h225_addr_hook) (struct sk_buff *skb, +int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, union nf_inet_addr *addr, __be16 port) @@ -62,16 +63,17 @@ int (*set_h225_addr_hook) (struct sk_buff *skb, int (*set_sig_addr_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) __read_mostly; int (*set_ras_addr_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) __read_mostly; int (*nat_rtp_rtcp_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, __be16 port, __be16 rtp_port, @@ -80,24 +82,28 @@ int (*nat_rtp_rtcp_hook) (struct sk_buff *skb, int (*nat_t120_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; int (*nat_h245_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; int (*nat_callforwarding_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; int (*nat_q931_hook) (struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int idx, __be16 port, struct nf_conntrack_expect *exp) __read_mostly; @@ -114,7 +120,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned char **data, int *datalen, int *dataoff) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); const struct tcphdr *th; struct tcphdr _tcph; @@ -251,6 +257,7 @@ static int get_h245_addr(struct nf_conn *ct, const unsigned char *data, /****************************************************************************/ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr) { @@ -270,9 +277,8 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, return 0; /* RTP port is even */ - port &= htons(~1); - rtp_port = port; - rtcp_port = htons(ntohs(port) + 1); + rtp_port = port & ~htons(1); + rtcp_port = port | htons(1); /* Create expect for RTP */ if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL) @@ -296,9 +302,10 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn 
*ct, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, rtp_port, rtp_exp, rtcp_exp); } else { /* Conntrack only */ if (nf_ct_expect_related(rtp_exp) == 0) { @@ -325,6 +332,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, static int expect_t120(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H245_TransportAddress *taddr) { @@ -354,9 +362,10 @@ static int expect_t120(struct sk_buff *skb, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_t120 = rcu_dereference(nat_t120_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr, + ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp) == 0) { @@ -375,6 +384,7 @@ static int expect_t120(struct sk_buff *skb, static int process_h245_channel(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, H2250LogicalChannelParameters *channel) { @@ -382,7 +392,7 @@ static int process_h245_channel(struct sk_buff *skb, if (channel->options & eH2250LogicalChannelParameters_mediaChannel) { /* RTP */ - ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, &channel->mediaChannel); if (ret < 0) return -1; @@ -391,7 +401,7 @@ static int process_h245_channel(struct sk_buff *skb, if (channel-> options & eH2250LogicalChannelParameters_mediaControlChannel) { /* RTCP */ - ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, &channel->mediaControlChannel); if (ret < 0) return -1; @@ -403,6 +413,7 @@ static int process_h245_channel(struct sk_buff *skb, /****************************************************************************/ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, OpenLogicalChannel *olc) { @@ -413,7 +424,8 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, if (olc->forwardLogicalChannelParameters.multiplexParameters.choice == eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) { - ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, + ret = process_h245_channel(skb, ct, ctinfo, + protoff, data, dataoff, &olc-> forwardLogicalChannelParameters. multiplexParameters. @@ -431,7 +443,8 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) { ret = - process_h245_channel(skb, ct, ctinfo, data, dataoff, + process_h245_channel(skb, ct, ctinfo, + protoff, data, dataoff, &olc-> reverseLogicalChannelParameters. multiplexParameters. 
@@ -449,7 +462,7 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, t120.choice == eDataProtocolCapability_separateLANStack && olc->separateStack.networkAddress.choice == eNetworkAccessParameters_networkAddress_localAreaAddress) { - ret = expect_t120(skb, ct, ctinfo, data, dataoff, + ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff, &olc->separateStack.networkAddress. localAreaAddress); if (ret < 0) @@ -462,7 +475,7 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, OpenLogicalChannelAck *olca) { H2250LogicalChannelAckParameters *ack; @@ -478,7 +491,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, choice == eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) { - ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, + ret = process_h245_channel(skb, ct, ctinfo, + protoff, data, dataoff, &olca-> reverseLogicalChannelParameters. multiplexParameters. @@ -497,7 +511,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, if (ack->options & eH2250LogicalChannelAckParameters_mediaChannel) { /* RTP */ - ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, + protoff, data, dataoff, &ack->mediaChannel); if (ret < 0) return -1; @@ -506,7 +521,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, if (ack->options & eH2250LogicalChannelAckParameters_mediaControlChannel) { /* RTCP */ - ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + ret = expect_rtp_rtcp(skb, ct, ctinfo, + protoff, data, dataoff, &ack->mediaControlChannel); if (ret < 0) return -1; @@ -516,7 +532,7 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, if ((olca->options & eOpenLogicalChannelAck_separateStack) && olca->separateStack.networkAddress.choice == eNetworkAccessParameters_networkAddress_localAreaAddress) { - ret = expect_t120(skb, ct, ctinfo, data, dataoff, + ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff, &olca->separateStack.networkAddress. localAreaAddress); if (ret < 0) @@ -529,14 +545,15 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, MultimediaSystemControlMessage *mscm) { switch (mscm->choice) { case eMultimediaSystemControlMessage_request: if (mscm->request.choice == eRequestMessage_openLogicalChannel) { - return process_olc(skb, ct, ctinfo, data, dataoff, + return process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &mscm->request.openLogicalChannel); } pr_debug("nf_ct_h323: H.245 Request %d\n", @@ -545,7 +562,8 @@ static int process_h245(struct sk_buff *skb, struct nf_conn *ct, case eMultimediaSystemControlMessage_response: if (mscm->response.choice == eResponseMessage_openLogicalChannelAck) { - return process_olca(skb, ct, ctinfo, data, dataoff, + return process_olca(skb, ct, ctinfo, + protoff, data, dataoff, &mscm->response. 
openLogicalChannelAck); } @@ -596,7 +614,8 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff, } /* Process H.245 signal */ - if (process_h245(skb, ct, ctinfo, &data, dataoff, &mscm) < 0) + if (process_h245(skb, ct, ctinfo, protoff, + &data, dataoff, &mscm) < 0) goto drop; } @@ -605,8 +624,7 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - if (net_ratelimit()) - pr_info("nf_ct_h245: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process H.245 message"); return NF_DROP; } @@ -619,6 +637,7 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = { static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { .name = "H.245", .me = THIS_MODULE, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_UNSPEC, .tuple.dst.protonum = IPPROTO_UDP, .help = h245_help, @@ -660,7 +679,7 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data, /****************************************************************************/ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr) { int dir = CTINFO2DIR(ctinfo); @@ -689,9 +708,10 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_h245 = rcu_dereference(nat_h245_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr, + ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp) == 0) { @@ -734,7 +754,8 @@ static int callforward_do_filter(const union nf_inet_addr *src, flowi4_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, flowi4_to_flowi(&fl2), false)) { - if (rt1->rt_gateway == rt2->rt_gateway && + if (rt_nexthop(rt1, fl1.daddr) == + rt_nexthop(rt2, fl2.daddr) && rt1->dst.dev == rt2->dst.dev) ret = 1; dst_release(&rt2->dst); @@ -757,8 +778,8 @@ static int callforward_do_filter(const union nf_inet_addr *src, flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2), false)) { - if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, - sizeof(rt1->rt6i_gateway)) && + if (ipv6_addr_equal(rt6_nexthop(rt1), + rt6_nexthop(rt2)) && rt1->dst.dev == rt2->dst.dev) ret = 1; dst_release(&rt2->dst); @@ -777,6 +798,7 @@ static int callforward_do_filter(const union nf_inet_addr *src, static int expect_callforwarding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr) { @@ -812,9 +834,11 @@ static int expect_callforwarding(struct sk_buff *skb, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* Need NAT */ - ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff, + ret = nat_callforwarding(skb, ct, ctinfo, + protoff, data, dataoff, taddr, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp) == 0) { @@ -832,6 +856,7 @@ static int expect_callforwarding(struct sk_buff *skb, /****************************************************************************/ static int 
process_setup(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Setup_UUIE *setup) { @@ -845,7 +870,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_q931: Setup\n"); if (setup->options & eSetup_UUIE_h245Address) { - ret = expect_h245(skb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &setup->h245Address); if (ret < 0) return -1; @@ -853,14 +878,15 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, set_h225_addr = rcu_dereference(set_h225_addr_hook); if ((setup->options & eSetup_UUIE_destCallSignalAddress) && - (set_h225_addr) && ct->status & IPS_NAT_MASK && + (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->destCallSignalAddress, &addr, &port) && memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) { pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n", &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3, ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); - ret = set_h225_addr(skb, data, dataoff, + ret = set_h225_addr(skb, protoff, data, dataoff, &setup->destCallSignalAddress, &ct->tuplehash[!dir].tuple.src.u3, ct->tuplehash[!dir].tuple.src.u.tcp.port); @@ -869,14 +895,15 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, } if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && - (set_h225_addr) && ct->status & IPS_NAT_MASK && + (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->sourceCallSignalAddress, &addr, &port) && memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) { pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n", &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3, ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); - ret = set_h225_addr(skb, data, dataoff, + ret = set_h225_addr(skb, protoff, data, dataoff, &setup->sourceCallSignalAddress, &ct->tuplehash[!dir].tuple.dst.u3, ct->tuplehash[!dir].tuple.dst.u.tcp.port); @@ -886,7 +913,8 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, if (setup->options & eSetup_UUIE_fastStart) { for (i = 0; i < setup->fastStart.count; i++) { - ret = process_olc(skb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &setup->fastStart.item[i]); if (ret < 0) return -1; @@ -900,6 +928,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, static int process_callproceeding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, CallProceeding_UUIE *callproc) { @@ -909,7 +938,7 @@ static int process_callproceeding(struct sk_buff *skb, pr_debug("nf_ct_q931: CallProceeding\n"); if (callproc->options & eCallProceeding_UUIE_h245Address) { - ret = expect_h245(skb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &callproc->h245Address); if (ret < 0) return -1; @@ -917,7 +946,8 @@ static int process_callproceeding(struct sk_buff *skb, if (callproc->options & eCallProceeding_UUIE_fastStart) { for (i = 0; i < callproc->fastStart.count; i++) { - ret = process_olc(skb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &callproc->fastStart.item[i]); if (ret < 0) return -1; @@ -930,6 +960,7 @@ static int process_callproceeding(struct sk_buff 
*skb, /****************************************************************************/ static int process_connect(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Connect_UUIE *connect) { @@ -939,7 +970,7 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_q931: Connect\n"); if (connect->options & eConnect_UUIE_h245Address) { - ret = expect_h245(skb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &connect->h245Address); if (ret < 0) return -1; @@ -947,7 +978,8 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct, if (connect->options & eConnect_UUIE_fastStart) { for (i = 0; i < connect->fastStart.count; i++) { - ret = process_olc(skb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &connect->fastStart.item[i]); if (ret < 0) return -1; @@ -960,6 +992,7 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Alerting_UUIE *alert) { @@ -969,7 +1002,7 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_q931: Alerting\n"); if (alert->options & eAlerting_UUIE_h245Address) { - ret = expect_h245(skb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &alert->h245Address); if (ret < 0) return -1; @@ -977,7 +1010,8 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, if (alert->options & eAlerting_UUIE_fastStart) { for (i = 0; i < alert->fastStart.count; i++) { - ret = process_olc(skb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &alert->fastStart.item[i]); if (ret < 0) return -1; @@ -990,6 +1024,7 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_facility(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Facility_UUIE *facility) { @@ -1000,15 +1035,15 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct, if (facility->reason.choice == eFacilityReason_callForwarded) { if (facility->options & eFacility_UUIE_alternativeAddress) - return expect_callforwarding(skb, ct, ctinfo, data, - dataoff, + return expect_callforwarding(skb, ct, ctinfo, + protoff, data, dataoff, &facility-> alternativeAddress); return 0; } if (facility->options & eFacility_UUIE_h245Address) { - ret = expect_h245(skb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &facility->h245Address); if (ret < 0) return -1; @@ -1016,7 +1051,8 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct, if (facility->options & eFacility_UUIE_fastStart) { for (i = 0; i < facility->fastStart.count; i++) { - ret = process_olc(skb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &facility->fastStart.item[i]); if (ret < 0) return -1; @@ -1029,6 +1065,7 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_progress(struct sk_buff *skb, 
struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, int dataoff, Progress_UUIE *progress) { @@ -1038,7 +1075,7 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_q931: Progress\n"); if (progress->options & eProgress_UUIE_h245Address) { - ret = expect_h245(skb, ct, ctinfo, data, dataoff, + ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff, &progress->h245Address); if (ret < 0) return -1; @@ -1046,7 +1083,8 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct, if (progress->options & eProgress_UUIE_fastStart) { for (i = 0; i < progress->fastStart.count; i++) { - ret = process_olc(skb, ct, ctinfo, data, dataoff, + ret = process_olc(skb, ct, ctinfo, + protoff, data, dataoff, &progress->fastStart.item[i]); if (ret < 0) return -1; @@ -1059,7 +1097,8 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, Q931 *q931) + unsigned int protoff, unsigned char **data, int dataoff, + Q931 *q931) { H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu; int i; @@ -1067,28 +1106,29 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct, switch (pdu->h323_message_body.choice) { case eH323_UU_PDU_h323_message_body_setup: - ret = process_setup(skb, ct, ctinfo, data, dataoff, + ret = process_setup(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.setup); break; case eH323_UU_PDU_h323_message_body_callProceeding: - ret = process_callproceeding(skb, ct, ctinfo, data, dataoff, + ret = process_callproceeding(skb, ct, ctinfo, + protoff, data, dataoff, &pdu->h323_message_body. 
callProceeding); break; case eH323_UU_PDU_h323_message_body_connect: - ret = process_connect(skb, ct, ctinfo, data, dataoff, + ret = process_connect(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.connect); break; case eH323_UU_PDU_h323_message_body_alerting: - ret = process_alerting(skb, ct, ctinfo, data, dataoff, + ret = process_alerting(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.alerting); break; case eH323_UU_PDU_h323_message_body_facility: - ret = process_facility(skb, ct, ctinfo, data, dataoff, + ret = process_facility(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.facility); break; case eH323_UU_PDU_h323_message_body_progress: - ret = process_progress(skb, ct, ctinfo, data, dataoff, + ret = process_progress(skb, ct, ctinfo, protoff, data, dataoff, &pdu->h323_message_body.progress); break; default: @@ -1102,7 +1142,8 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct, if (pdu->options & eH323_UU_PDU_h245Control) { for (i = 0; i < pdu->h245Control.count; i++) { - ret = process_h245(skb, ct, ctinfo, data, dataoff, + ret = process_h245(skb, ct, ctinfo, + protoff, data, dataoff, &pdu->h245Control.item[i]); if (ret < 0) return -1; @@ -1147,7 +1188,8 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff, } /* Process Q.931 signal */ - if (process_q931(skb, ct, ctinfo, &data, dataoff, &q931) < 0) + if (process_q931(skb, ct, ctinfo, protoff, + &data, dataoff, &q931) < 0) goto drop; } @@ -1156,8 +1198,7 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - if (net_ratelimit()) - pr_info("nf_ct_q931: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process Q.931 message"); return NF_DROP; } @@ -1172,6 +1213,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { { .name = "Q.931", .me = THIS_MODULE, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET, .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, @@ -1230,7 +1272,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, /****************************************************************************/ static int set_expect_timeout(struct nf_conntrack_expect *exp, - unsigned timeout) + unsigned int timeout) { if (!exp || !del_timer(&exp->timeout)) return 0; @@ -1244,10 +1286,10 @@ static int set_expect_timeout(struct nf_conntrack_expect *exp, /****************************************************************************/ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned char **data, + unsigned int protoff, unsigned char **data, TransportAddress *taddr, int count) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int ret = 0; int i; @@ -1279,8 +1321,10 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nat_q931 = rcu_dereference(nat_q931_hook); - if (nat_q931 && ct->status & IPS_NAT_MASK) { /* Need NAT */ - ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp); + if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { /* Need NAT */ + ret = nat_q931(skb, ct, ctinfo, protoff, data, + taddr, i, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1300,6 +1344,7 @@ static 
int expect_q931(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_grq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, GatekeeperRequest *grq) { typeof(set_ras_addr_hook) set_ras_addr; @@ -1307,8 +1352,9 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_ras: GRQ\n"); set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) /* NATed */ - return set_ras_addr(skb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) /* NATed */ + return set_ras_addr(skb, ct, ctinfo, protoff, data, &grq->rasAddress, 1); return 0; } @@ -1316,6 +1362,7 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, GatekeeperConfirm *gcf) { int dir = CTINFO2DIR(ctinfo); @@ -1360,23 +1407,25 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, RegistrationRequest *rrq) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int ret; typeof(set_ras_addr_hook) set_ras_addr; pr_debug("nf_ct_ras: RRQ\n"); - ret = expect_q931(skb, ct, ctinfo, data, + ret = expect_q931(skb, ct, ctinfo, protoff, data, rrq->callSignalAddress.item, rrq->callSignalAddress.count); if (ret < 0) return -1; set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) { - ret = set_ras_addr(skb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, protoff, data, rrq->rasAddress.item, rrq->rasAddress.count); if (ret < 0) @@ -1395,9 +1444,10 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, RegistrationConfirm *rcf) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int ret; struct nf_conntrack_expect *exp; @@ -1406,8 +1456,9 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_ras: RCF\n"); set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(skb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, protoff, data, rcf->callSignalAddress.item, rcf->callSignalAddress.count); if (ret < 0) @@ -1425,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_refresh(ct, skb, info->timeout * HZ); /* Set expect timeout */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, info->sig_port[!dir]); if 
(exp) { @@ -1435,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_dump_tuple(&exp->tuple); set_expect_timeout(exp, info->timeout); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; @@ -1444,9 +1495,10 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, UnregistrationRequest *urq) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int ret; typeof(set_sig_addr_hook) set_sig_addr; @@ -1454,8 +1506,9 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_ras: URQ\n"); set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(skb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, protoff, data, urq->callSignalAddress.item, urq->callSignalAddress.count); if (ret < 0) @@ -1476,9 +1529,10 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, AdmissionRequest *arq) { - const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + const struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); __be16 port; union nf_inet_addr addr; @@ -1492,9 +1546,10 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, &addr, &port) && !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && port == info->sig_port[dir] && + nf_ct_l3num(ct) == NFPROTO_IPV4 && set_h225_addr && ct->status & IPS_NAT_MASK) { /* Answering ARQ */ - return set_h225_addr(skb, data, 0, + return set_h225_addr(skb, protoff, data, 0, &arq->destCallSignalAddress, &ct->tuplehash[!dir].tuple.dst.u3, info->sig_port[!dir]); @@ -1504,9 +1559,10 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, get_h225_addr(ct, *data, &arq->srcCallSignalAddress, &addr, &port) && !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && - set_h225_addr && ct->status & IPS_NAT_MASK) { + set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { /* Calling ARQ */ - return set_h225_addr(skb, data, 0, + return set_h225_addr(skb, protoff, data, 0, &arq->srcCallSignalAddress, &ct->tuplehash[!dir].tuple.dst.u3, port); @@ -1518,6 +1574,7 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, AdmissionConfirm *acf) { int dir = CTINFO2DIR(ctinfo); @@ -1536,8 +1593,9 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) { /* Answering ACF */ set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) - return set_sig_addr(skb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 
&& + ct->status & IPS_NAT_MASK) + return set_sig_addr(skb, ct, ctinfo, protoff, data, &acf->destCallSignalAddress, 1); return 0; } @@ -1565,6 +1623,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, LocationRequest *lrq) { typeof(set_ras_addr_hook) set_ras_addr; @@ -1572,8 +1631,9 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_ras: LRQ\n"); set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) - return set_ras_addr(skb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) + return set_ras_addr(skb, ct, ctinfo, protoff, data, &lrq->replyAddress, 1); return 0; } @@ -1581,6 +1641,7 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, LocationConfirm *lcf) { int dir = CTINFO2DIR(ctinfo); @@ -1620,6 +1681,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_irr(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, InfoRequestResponse *irr) { int ret; @@ -1629,16 +1691,18 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_ras: IRR\n"); set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && ct->status & IPS_NAT_MASK) { - ret = set_ras_addr(skb, ct, ctinfo, data, + if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, protoff, data, &irr->rasAddress, 1); if (ret < 0) return -1; } set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(skb, ct, ctinfo, data, + if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, protoff, data, irr->callSignalAddress.item, irr->callSignalAddress.count); if (ret < 0) @@ -1651,38 +1715,39 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int process_ras(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned char **data, RasMessage *ras) { switch (ras->choice) { case eRasMessage_gatekeeperRequest: - return process_grq(skb, ct, ctinfo, data, + return process_grq(skb, ct, ctinfo, protoff, data, &ras->gatekeeperRequest); case eRasMessage_gatekeeperConfirm: - return process_gcf(skb, ct, ctinfo, data, + return process_gcf(skb, ct, ctinfo, protoff, data, &ras->gatekeeperConfirm); case eRasMessage_registrationRequest: - return process_rrq(skb, ct, ctinfo, data, + return process_rrq(skb, ct, ctinfo, protoff, data, &ras->registrationRequest); case eRasMessage_registrationConfirm: - return process_rcf(skb, ct, ctinfo, data, + return process_rcf(skb, ct, ctinfo, protoff, data, &ras->registrationConfirm); case eRasMessage_unregistrationRequest: - return process_urq(skb, ct, ctinfo, data, + return 
process_urq(skb, ct, ctinfo, protoff, data, &ras->unregistrationRequest); case eRasMessage_admissionRequest: - return process_arq(skb, ct, ctinfo, data, + return process_arq(skb, ct, ctinfo, protoff, data, &ras->admissionRequest); case eRasMessage_admissionConfirm: - return process_acf(skb, ct, ctinfo, data, + return process_acf(skb, ct, ctinfo, protoff, data, &ras->admissionConfirm); case eRasMessage_locationRequest: - return process_lrq(skb, ct, ctinfo, data, + return process_lrq(skb, ct, ctinfo, protoff, data, &ras->locationRequest); case eRasMessage_locationConfirm: - return process_lcf(skb, ct, ctinfo, data, + return process_lcf(skb, ct, ctinfo, protoff, data, &ras->locationConfirm); case eRasMessage_infoRequestResponse: - return process_irr(skb, ct, ctinfo, data, + return process_irr(skb, ct, ctinfo, protoff, data, &ras->infoRequestResponse); default: pr_debug("nf_ct_ras: RAS message %d\n", ras->choice); @@ -1722,7 +1787,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff, } /* Process RAS message */ - if (process_ras(skb, ct, ctinfo, &data, &ras) < 0) + if (process_ras(skb, ct, ctinfo, protoff, &data, &ras) < 0) goto drop; accept: @@ -1731,8 +1796,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - if (net_ratelimit()) - pr_info("nf_ct_ras: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process RAS message"); return NF_DROP; } @@ -1746,6 +1810,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { { .name = "RAS", .me = THIS_MODULE, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET, .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, @@ -1755,6 +1820,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { { .name = "RAS", .me = THIS_MODULE, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET6, .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, @@ -1833,4 +1899,6 @@ MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); MODULE_DESCRIPTION("H.323 connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_h323"); -MODULE_ALIAS_NFCT_HELPER("h323"); +MODULE_ALIAS_NFCT_HELPER("RAS"); +MODULE_ALIAS_NFCT_HELPER("Q.931"); +MODULE_ALIAS_NFCT_HELPER("H.245"); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index bbe23baa19b..5b3eae7d4c9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -28,12 +29,80 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_log.h> static DEFINE_MUTEX(nf_ct_helper_mutex); -static struct hlist_head *nf_ct_helper_hash __read_mostly; -static unsigned int nf_ct_helper_hsize __read_mostly; +struct hlist_head *nf_ct_helper_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_helper_hash); +unsigned int nf_ct_helper_hsize __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; +static bool 
nf_ct_auto_assign_helper __read_mostly = true; +module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); +MODULE_PARM_DESC(nf_conntrack_helper, + "Enable automatic conntrack helper assignment (default 1)"); + +#ifdef CONFIG_SYSCTL +static struct ctl_table helper_sysctl_table[] = { + { + .procname = "nf_conntrack_helper", + .data = &init_net.ct.sysctl_auto_assign_helper, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_auto_assign_helper; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.helper_sysctl_header = + register_net_sysctl(net, "net/netfilter", table); + + if (!net->ct.helper_sysctl_header) { + pr_err("nf_conntrack_helper: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.helper_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.helper_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ @@ -48,14 +117,13 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_helper *helper; struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - struct hlist_node *n; unsigned int h; if (!nf_ct_helper_count) return NULL; h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { + hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) return helper; } @@ -66,11 +134,10 @@ struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; - struct hlist_node *n; unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { - hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { + hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { if (!strcmp(h->name, name) && h->tuple.src.l3num == l3num && h->tuple.dst.protonum == protonum) @@ -100,11 +167,14 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); -struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) +struct nf_conn_help * +nf_ct_helper_ext_add(struct nf_conn *ct, + struct nf_conntrack_helper *helper, gfp_t gfp) { struct nf_conn_help *help; - help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); + help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER, + helper->data_len, gfp); if (help) INIT_HLIST_HEAD(&help->expectations); else @@ -118,17 +188,38 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; + struct net *net = nf_ct_net(ct); int ret = 0; + /* We already got a helper explicitly attached. 
The function + * nf_conntrack_alter_reply - in case NAT is in use - asks us to look + * the helper up again. Since now the user is in full control of + * making consistent helper configurations, skip this automatic + * re-lookup, otherwise we'll lose the helper. + */ + if (test_bit(IPS_HELPER_BIT, &ct->status)) + return 0; + if (tmpl != NULL) { help = nfct_help(tmpl); - if (help != NULL) + if (help != NULL) { helper = help->helper; + set_bit(IPS_HELPER_BIT, &ct->status); + } } help = nfct_help(ct); - if (helper == NULL) + if (net->ct.sysctl_auto_assign_helper && helper == NULL) { helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (unlikely(!net->ct.auto_assign_helper_warned && helper)) { + pr_info("nf_conntrack: automatic helper " + "assignment is deprecated and it will " + "be removed soon. Use the iptables CT target " + "to attach helpers instead.\n"); + net->ct.auto_assign_helper_warned = true; + } + } + if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); @@ -136,13 +227,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } if (help == NULL) { - help = nf_ct_helper_ext_add(ct, flags); + help = nf_ct_helper_ext_add(ct, helper, flags); if (help == NULL) { ret = -ENOMEM; goto out; } } else { - memset(&help->help, 0, sizeof(help->help)); + /* We only allow helper re-assignment of the same sort since + * we cannot reallocate the helper extension area. + */ + struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); + + if (tmp && tmp->help != helper->help) { + RCU_INIT_POINTER(help->helper, NULL); + goto out; + } } rcu_assign_pointer(help->helper, helper); @@ -151,16 +250,14 @@ out: } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* appropriate ct lock must be held by the caller */ static inline int unhelp(struct nf_conntrack_tuple_hash *i, const struct nf_conntrack_helper *me) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); - if (help && rcu_dereference_protected( - help->helper, - lockdep_is_held(&nf_conntrack_lock) - ) == me) { + if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } @@ -181,8 +278,91 @@ void nf_ct_helper_destroy(struct nf_conn *ct) } } +static LIST_HEAD(nf_ct_helper_expectfn_list); + +void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) +{ + spin_lock_bh(&nf_conntrack_expect_lock); + list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); + +void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) +{ + spin_lock_bh(&nf_conntrack_expect_lock); + list_del_rcu(&n->head); + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_name(const char *name) +{ + struct nf_ct_helper_expectfn *cur; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { + if (!strcmp(cur->name, name)) { + found = true; + break; + } + } + rcu_read_unlock(); + return found ?
cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_symbol(const void *symbol) +{ + struct nf_ct_helper_expectfn *cur; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { + if (cur->expectfn == symbol) { + found = true; + break; + } + } + rcu_read_unlock(); + return found ? cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); + +__printf(3, 4) +void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, + const char *fmt, ...) +{ + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + /* Called from the helper function, this call never fails */ + help = nfct_help(ct); + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + + nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, + "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_log); + int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { + int ret = 0; + struct nf_conntrack_helper *cur; unsigned int h = helper_hash(&me->tuple); BUG_ON(me->expect_policy == NULL); @@ -190,11 +370,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); mutex_lock(&nf_ct_helper_mutex); + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 && + cur->tuple.src.l3num == me->tuple.src.l3num && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; +out: mutex_unlock(&nf_ct_helper_mutex); - - return 0; + return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); @@ -203,18 +391,20 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; - const struct hlist_node *n, *next; + const struct hlist_node *next; const struct hlist_nulls_node *nn; unsigned int i; + int cpu; /* Get rid of expectations */ + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, n, next, + hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, - lockdep_is_held(&nf_conntrack_lock) + lockdep_is_held(&nf_conntrack_expect_lock) ) == me || exp->helper == me) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); @@ -222,14 +412,27 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } } } + spin_unlock_bh(&nf_conntrack_expect_lock); /* Get rid of expecteds, set helpers to NULL. 
*/ - hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) - unhelp(h, me); - for (i = 0; i < net->ct.htable_size; i++) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) unhelp(h, me); + spin_unlock_bh(&pcpu->lock); + } + local_bh_disable(); + for (i = 0; i < net->ct.htable_size; i++) { + spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + if (i < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + unhelp(h, me); + } + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); } + local_bh_enable(); } void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) @@ -247,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) synchronize_rcu(); rtnl_lock(); - spin_lock_bh(&nf_conntrack_lock); for_each_net(net) __nf_conntrack_helper_unregister(me, net); - spin_unlock_bh(&nf_conntrack_lock); rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); @@ -261,24 +462,37 @@ static struct nf_ct_ext_type helper_extend __read_mostly = { .id = NF_CT_EXT_HELPER, }; -int nf_conntrack_helper_init(void) +int nf_conntrack_helper_pernet_init(struct net *net) { - int err; + net->ct.auto_assign_helper_warned = false; + net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; + return nf_conntrack_helper_init_sysctl(net); +} + +void nf_conntrack_helper_pernet_fini(struct net *net) +{ + nf_conntrack_helper_fini_sysctl(net); +} +int nf_conntrack_helper_init(void) +{ + int ret; nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); + nf_ct_helper_hash = + nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; - err = nf_ct_extend_register(&helper_extend); - if (err < 0) - goto err1; + ret = nf_ct_extend_register(&helper_extend); + if (ret < 0) { + pr_err("nf_ct_helper: Unable to register helper extension.\n"); + goto out_extend; + } return 0; - -err1: +out_extend: nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); - return err; + return ret; } void nf_conntrack_helper_fini(void) diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 4f9390b9869..0fd2976db7e 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -1,6 +1,7 @@ /* IRC extension for IP connection tracking, Version 1.21 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> * based on RR's ip_conntrack_ftp.c + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -33,6 +34,7 @@ static DEFINE_SPINLOCK(irc_buffer_lock); unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned int matchoff, unsigned int matchlen, struct nf_conntrack_expect *exp) __read_mostly; @@ -185,16 +187,16 @@ static int help(struct sk_buff *skb, unsigned int protoff, tuple = &ct->tuplehash[dir].tuple; if (tuple->src.u3.ip != dcc_ip && tuple->dst.u3.ip != dcc_ip) { - if (net_ratelimit()) - printk(KERN_WARNING - "Forged DCC command from %pI4: %pI4:%u\n", - &tuple->src.u3.ip, - &dcc_ip, dcc_port); + net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", + &tuple->src.u3.ip, + &dcc_ip, dcc_port); continue; } exp = 
nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, + "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -207,12 +209,15 @@ static int help(struct sk_buff *skb, unsigned int protoff, nf_nat_irc = rcu_dereference(nf_nat_irc_hook); if (nf_nat_irc && ct->status & IPS_NAT_MASK) - ret = nf_nat_irc(skb, ctinfo, + ret = nf_nat_irc(skb, ctinfo, protoff, addr_beg_p - ib_ptr, addr_end_p - addr_beg_p, exp); - else if (nf_ct_expect_related(exp) != 0) + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, + "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); goto out; } @@ -223,7 +228,6 @@ static int help(struct sk_buff *skb, unsigned int protoff, } static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; -static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly; static struct nf_conntrack_expect_policy irc_exp_policy; static void nf_conntrack_irc_fini(void); @@ -231,7 +235,6 @@ static void nf_conntrack_irc_fini(void); static int __init nf_conntrack_irc_init(void) { int i, ret; - char *tmpname; if (max_dcc_channels < 1) { printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n"); @@ -257,12 +260,10 @@ static int __init nf_conntrack_irc_init(void) irc[i].me = THIS_MODULE; irc[i].help = help; - tmpname = &irc_names[i][0]; if (ports[i] == IRC_PORT) - sprintf(tmpname, "irc"); + sprintf(irc[i].name, "irc"); else - sprintf(tmpname, "irc-%u", i); - irc[i].name = tmpname; + sprintf(irc[i].name, "irc-%u", i); ret = nf_conntrack_helper_register(&irc[i]); if (ret) { diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c new file mode 100644 index 00000000000..bb53f120e79 --- /dev/null +++ b/net/netfilter/nf_conntrack_labels.c @@ -0,0 +1,108 @@ +/* + * test/set flag bits stored in conntrack extension area. + * + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
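The drop paths in the IRC helper above now report through nf_ct_helper_log(), whose implementation (in the nf_conntrack_helper.c hunks earlier) expands a printf-style reason via struct va_format before handing it to nf_log_packet(). A userspace analogue of that format-once-then-log pattern (toy_log_packet() and helper_log() are invented names for illustration, not kernel APIs):

#include <stdarg.h>
#include <stdio.h>

/* Stand-in for nf_log_packet(): here it just prints the message. */
static void toy_log_packet(const char *helper, const char *reason)
{
	fprintf(stderr, "nf_ct_%s: dropping packet: %s\n", helper, reason);
}

/* Analogue of nf_ct_helper_log(): expand the varargs once, then
 * pass the finished reason string to the lower-level logger. */
static void helper_log(const char *helper, const char *fmt, ...)
{
	char reason[128];
	va_list args;

	va_start(args, fmt);
	vsnprintf(reason, sizeof(reason), fmt, args);
	va_end(args);

	toy_log_packet(helper, reason);
}

int main(void)
{
	helper_log("irc", "cannot %s expectation", "alloc");
	return 0;
}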
+ */ + +#include <linux/export.h> +#include <linux/types.h> + +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +static unsigned int label_bits(const struct nf_conn_labels *l) +{ + unsigned int longs = l->words; + return longs * BITS_PER_LONG; +} + +bool nf_connlabel_match(const struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return false; + + return bit < label_bits(labels) && test_bit(bit, labels->bits); +} +EXPORT_SYMBOL_GPL(nf_connlabel_match); + +int nf_connlabel_set(struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels || bit >= label_bits(labels)) + return -ENOSPC; + + if (test_bit(bit, labels->bits)) + return 0; + + if (!test_and_set_bit(bit, labels->bits)) + nf_conntrack_event_cache(IPCT_LABEL, ct); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabel_set); + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) +static void replace_u32(u32 *address, u32 mask, u32 new) +{ + u32 old, tmp; + + do { + old = *address; + tmp = (old & mask) ^ new; + } while (cmpxchg(address, old, tmp) != old); +} + +int nf_connlabels_replace(struct nf_conn *ct, + const u32 *data, + const u32 *mask, unsigned int words32) +{ + struct nf_conn_labels *labels; + unsigned int size, i; + u32 *dst; + + labels = nf_ct_labels_find(ct); + if (!labels) + return -ENOSPC; + + size = labels->words * sizeof(long); + if (size < (words32 * sizeof(u32))) + words32 = size / sizeof(u32); + + dst = (u32 *) labels->bits; + if (words32) { + for (i = 0; i < words32; i++) + replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); + } + + size /= sizeof(u32); + for (i = words32; i < size; i++) /* pad */ + replace_u32(&dst[i], 0, 0); + + nf_conntrack_event_cache(IPCT_LABEL, ct); + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_replace); +#endif + +static struct nf_ct_ext_type labels_extend __read_mostly = { + .len = sizeof(struct nf_conn_labels), + .align = __alignof__(struct nf_conn_labels), + .id = NF_CT_EXT_LABELS, +}; + +int nf_conntrack_labels_init(void) +{ + return nf_ct_extend_register(&labels_extend); +} + +void nf_conntrack_labels_fini(void) +{ + nf_ct_extend_unregister(&labels_extend); +} diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b49da6c925b..300ed1eec72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -4,7 +4,7 @@ * (C) 2001 by Jay Schulist <jschlst@samba.org> * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org> * (C) 2003 by Patrick Mchardy <kaber@trash.net> - * (C) 2005-2011 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial connection tracking via netlink development funded and * generally made possible by Network Robots, Inc. 
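replace_u32() in the new nf_conntrack_labels.c above retries cmpxchg() until one 32-bit word is atomically rewritten as (old & mask) ^ new; since nf_connlabels_replace() passes ~mask[i], bits inside the caller's mask get replaced while the rest survive, the XOR acting as a masked OR as long as the new bits stay within the mask. A single-threaded userspace sketch of the same arithmetic using the GCC/Clang __atomic builtins (illustrative, not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

/* Keep the bits selected by mask, XOR the new bits on top, and
 * retry if another thread changed the word in between. */
static void replace_u32(uint32_t *address, uint32_t mask, uint32_t new_bits)
{
	uint32_t old = __atomic_load_n(address, __ATOMIC_RELAXED);
	uint32_t tmp;

	do {
		tmp = (old & mask) ^ new_bits;
	} while (!__atomic_compare_exchange_n(address, &old, tmp, 0,
					      __ATOMIC_RELAXED,
					      __ATOMIC_RELAXED));
}

int main(void)
{
	uint32_t labels    = 0xf0f0f0f0;
	uint32_t user_mask = 0x0000ffff;	/* bits the caller wants replaced */
	uint32_t data      = 0x00001234;	/* confined to user_mask */

	/* Mirror the kernel call site, which inverts the user mask. */
	replace_u32(&labels, ~user_mask, data);
	printf("%08x\n", labels);	/* prints f0f01234 */
	return 0;
}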
(www.networkrobots.com) @@ -37,15 +37,18 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_labels.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_helper.h> #endif #include <linux/netfilter/nfnetlink.h> @@ -66,7 +69,8 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; - NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum); + if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) + goto nla_put_failure; if (likely(l4proto->tuple_to_nlattr)) ret = l4proto->tuple_to_nlattr(skb, tuple); @@ -110,22 +114,24 @@ ctnetlink_dump_tuples(struct sk_buff *skb, struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; + rcu_read_lock(); l3proto = __nf_ct_l3proto_find(tuple->src.l3num); ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); - if (unlikely(ret < 0)) - return ret; - - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); - + if (ret >= 0) { + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, + tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); + } + rcu_read_unlock(); return ret; } static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { - NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status)); + if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) + goto nla_put_failure; return 0; nla_put_failure: @@ -140,7 +146,8 @@ ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) if (timeout < 0) timeout = 0; - NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout)); + if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) + goto nla_put_failure; return 0; nla_put_failure: @@ -189,7 +196,8 @@ ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED); if (!nest_helper) goto nla_put_failure; - NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name); + if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) + goto nla_put_failure; if (helper->to_nlattr) helper->to_nlattr(skb, ct); @@ -203,18 +211,29 @@ nla_put_failure: } static int -dump_counters(struct sk_buff *skb, u64 pkts, u64 bytes, - enum ip_conntrack_dir dir) +dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, + enum ip_conntrack_dir dir, int type) { - enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + enum ctattr_type attr = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nf_conn_counter *counter = acct->counter; struct nlattr *nest_count; + u64 pkts, bytes; + + if (type == IPCTNL_MSG_CT_GET_CTRZERO) { + pkts = atomic64_xchg(&counter[dir].packets, 0); + bytes = atomic64_xchg(&counter[dir].bytes, 0); + } else { + pkts = atomic64_read(&counter[dir].packets); + bytes = atomic64_read(&counter[dir].bytes); + } - nest_count = nla_nest_start(skb, type | NLA_F_NESTED); + nest_count = nla_nest_start(skb, attr | NLA_F_NESTED); if (!nest_count) goto nla_put_failure; - NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)); - NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)); + if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) || + nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes))) + goto nla_put_failure; nla_nest_end(skb, nest_count); @@ -225,24 +244,19 @@ nla_put_failure: } static int -ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, - enum ip_conntrack_dir dir, int type) +ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type) { - struct nf_conn_counter *acct; - u64 pkts, bytes; + struct nf_conn_acct *acct = nf_conn_acct_find(ct); - acct = nf_conn_acct_find(ct); if (!acct) return 0; - if (type == IPCTNL_MSG_CT_GET_CTRZERO) { - pkts = atomic64_xchg(&acct[dir].packets, 0); - bytes = atomic64_xchg(&acct[dir].bytes, 0); - } else { - pkts = atomic64_read(&acct[dir].packets); - bytes = atomic64_read(&acct[dir].bytes); - } - return dump_counters(skb, pkts, bytes, dir); + if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) + return -1; + if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) + return -1; + + return 0; } static int @@ -259,11 +273,10 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_count) goto nla_put_failure; - NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)); - if (tstamp->stop != 0) { - NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP, - cpu_to_be64(tstamp->stop)); - } + if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) || + (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, + cpu_to_be64(tstamp->stop)))) + goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; @@ -276,7 +289,8 @@ nla_put_failure: static inline int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) { - NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark)); + if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) + goto nla_put_failure; return 0; nla_put_failure: @@ -303,7 +317,8 @@ ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx); + if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; @@ -315,6 +330,40 @@ nla_put_failure: #define ctnetlink_dump_secctx(a, b) (0) #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS +static int ctnetlink_label_size(const struct nf_conn *ct) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return 0; + return nla_total_size(labels->words * sizeof(long)); +} + +static int +ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int len, i; + + if (!labels) + return 0; + + len = labels->words * sizeof(long); + i = 0; + do { + if (labels->bits[i] != 0) + return nla_put(skb, CTA_LABELS, len, labels->bits); + i++; + } while (i < labels->words); + + return 0; +} +#else +#define 
ctnetlink_dump_labels(a, b) (0) +#define ctnetlink_label_size(a) (0) +#endif + #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) static inline int @@ -338,9 +387,8 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NF_NAT_NEEDED static int -dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) { struct nlattr *nest_parms; @@ -348,12 +396,13 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) if (!nest_parms) goto nla_put_failure; - NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS, - htonl(natseq->correction_pos)); - NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, - htonl(natseq->offset_before)); - NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER, - htonl(natseq->offset_after)); + if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, + htonl(seq->correction_pos)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, + htonl(seq->offset_before)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, + htonl(seq->offset_after))) + goto nla_put_failure; nla_nest_end(skb, nest_parms); @@ -364,32 +413,30 @@ nla_put_failure: } static inline int -ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) { - struct nf_nat_seq *natseq; - struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *seq; - if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; - natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) return -1; - natseq = &nat->seq[IP_CT_DIR_REPLY]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + seq = &seqadj->seq[IP_CT_DIR_REPLY]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) return -1; return 0; } -#else -#define ctnetlink_dump_nat_seq_adj(a, b) (0) -#endif static inline int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { - NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct)); + if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) + goto nla_put_failure; return 0; nla_put_failure: @@ -399,7 +446,8 @@ nla_put_failure: static inline int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { - NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))); + if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) + goto nla_put_failure; return 0; nla_put_failure: @@ -407,16 +455,16 @@ nla_put_failure: } static int -ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, +ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - unsigned int flags = pid ? NLM_F_MULTI : 0, event; + unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW); - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -439,22 +487,23 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct)) - NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); + if (nf_ct_zone(ct) && + nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, type) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, type) < 0 || + ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -466,7 +515,6 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NF_CONNTRACK_EVENTS static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) { @@ -486,7 +534,7 @@ ctnetlink_proto_size(const struct nf_conn *ct) } static inline size_t -ctnetlink_counters_size(const struct nf_conn *ct) +ctnetlink_acct_size(const struct nf_conn *ct) { if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) return 0; @@ -535,7 +583,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ - + ctnetlink_counters_size(ct) + + ctnetlink_acct_size(ct) + ctnetlink_timestamp_size(ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ @@ -549,10 +597,15 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) + + ctnetlink_label_size(ct) ; } +#ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { @@ -592,7 +645,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto errout; type |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); + nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -616,8 +669,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct)) - NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); + if (nf_ct_zone(ct) && + nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; @@ -626,10 +680,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (events & (1 << IPCT_DESTROY)) { - if (ctnetlink_dump_counters(skb, ct, - IP_CT_DIR_ORIGINAL, type) < 0 || - ctnetlink_dump_counters(skb, ct, - IP_CT_DIR_REPLY, type) < 0 || + if 
(ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0) goto nla_put_failure; } else { @@ -649,13 +700,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif + if (events & (1 << IPCT_LABEL) && + ctnetlink_dump_labels(skb, ct) < 0) + goto nla_put_failure; if (events & (1 << IPCT_RELATED) && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; - if (events & (1 << IPCT_NATSEQADJ) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + if (events & (1 << IPCT_SEQADJ) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; } @@ -667,7 +721,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - err = nfnetlink_send(skb, net, item->pid, group, item->report, + err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); if (err == -ENOBUFS || err == -EAGAIN) return -ENOBUFS; @@ -691,9 +745,18 @@ static int ctnetlink_done(struct netlink_callback *cb) { if (cb->args[1]) nf_ct_put((struct nf_conn *)cb->args[1]); + if (cb->data) + kfree(cb->data); return 0; } +struct ctnetlink_dump_filter { + struct { + u_int32_t val; + u_int32_t mask; + } mark; +}; + static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { @@ -703,11 +766,24 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_nulls_node *n; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; + int res; + spinlock_t *lockp; + +#ifdef CONFIG_NF_CONNTRACK_MARK + const struct ctnetlink_dump_filter *filter = cb->data; +#endif - spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; + + local_bh_disable(); for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: + lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; + spin_lock(lockp); + if (cb->args[0] >= net->ct.htable_size) { + spin_unlock(lockp); + goto out; + } hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -723,23 +799,34 @@ restart: continue; cb->args[1] = 0; } - if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - NFNL_MSG_TYPE( - cb->nlh->nlmsg_type), - ct) < 0) { +#ifdef CONFIG_NF_CONNTRACK_MARK + if (filter && !((ct->mark & filter->mark.mask) == + filter->mark.val)) { + continue; + } +#endif + rcu_read_lock(); + res = + ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; + spin_unlock(lockp); goto out; } } + spin_unlock(lockp); if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: - spin_unlock_bh(&nf_conntrack_lock); + local_bh_enable(); if (last) nf_ct_put(last); @@ -753,7 +840,9 @@ ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) struct nf_conntrack_l3proto *l3proto; int ret = 0; - nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); + ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); + if (ret < 0) + return ret; rcu_read_lock(); l3proto = __nf_ct_l3proto_find(tuple->src.l3num); @@ -820,7 +909,9 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], memset(tuple, 0, sizeof(*tuple)); - nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); + err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); + if (err < 0) + return err; if 
(!tb[CTA_TUPLE_IP]) return -EINVAL; @@ -863,21 +954,29 @@ ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) } static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { - [CTA_HELP_NAME] = { .type = NLA_NUL_STRING }, + [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, }; static inline int -ctnetlink_parse_help(const struct nlattr *attr, char **helper_name) +ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, + struct nlattr **helpinfo) { + int err; struct nlattr *tb[CTA_HELP_MAX+1]; - nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); + err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); + if (err < 0) + return err; if (!tb[CTA_HELP_NAME]) return -EINVAL; *helper_name = nla_data(tb[CTA_HELP_NAME]); + if (tb[CTA_HELP_INFO]) + *helpinfo = tb[CTA_HELP_INFO]; + return 0; } @@ -893,7 +992,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_ID] = { .type = NLA_U32 }, [CTA_NAT_DST] = { .type = NLA_NESTED }, [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, + [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED }, + [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED }, [CTA_ZONE] = { .type = NLA_U16 }, + [CTA_MARK_MASK] = { .type = NLA_U32 }, + [CTA_LABELS] = { .type = NLA_BINARY, + .len = NF_CT_LABELS_MAX_SIZE }, + [CTA_LABELS_MASK] = { .type = NLA_BINARY, + .len = NF_CT_LABELS_MAX_SIZE }, }; static int @@ -921,7 +1027,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, else { /* Flush the whole table */ nf_conntrack_flush_report(net, - NETLINK_CB(skb).pid, + NETLINK_CB(skb).portid, nlmsg_report(nlh)); return 0; } @@ -943,21 +1049,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, } } - if (del_timer(&ct->timeout)) { - if (nf_conntrack_event_report(IPCT_DESTROY, ct, - NETLINK_CB(skb).pid, - nlmsg_report(nlh)) < 0) { - nf_ct_delete_from_lists(ct); - /* we failed to report the event, try later */ - nf_ct_insert_dying_list(ct); - nf_ct_put(ct); - return 0; - } - /* death_by_timeout would report the event again */ - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_delete_from_lists(ct); - nf_ct_put(ct); - } + if (del_timer(&ct->timeout)) + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); + nf_ct_put(ct); return 0; @@ -978,9 +1072,28 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, u16 zone; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) - return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, - ctnetlink_done, 0); + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_table, + .done = ctnetlink_done, + }; +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + struct ctnetlink_dump_filter *filter; + + filter = kzalloc(sizeof(struct ctnetlink_dump_filter), + GFP_ATOMIC); + if (filter == NULL) + return -ENOMEM; + + filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); + filter->mark.mask = + ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + c.data = filter; + } +#endif + return netlink_dump_start(ctnl, skb, nlh, &c); + } err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) @@ -1010,14 +1123,14 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, } rcu_read_lock(); - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), ct); rcu_read_unlock(); nf_ct_put(ct); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, 
MSG_DONTWAIT); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (err < 0) goto out; @@ -1030,6 +1143,123 @@ out: return err == -EAGAIN ? -ENOBUFS : err; } +static int ctnetlink_done_list(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); + return 0; +} + +static int +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) +{ + struct nf_conn *ct, *last; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + int res; + int cpu; + struct hlist_nulls_head *list; + struct net *net = sock_net(skb->sk); + + if (cb->args[2]) + return 0; + + last = (struct nf_conn *)cb->args[1]; + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + struct ct_pcpu *pcpu; + + if (!cpu_possible(cpu)) + continue; + + pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + spin_lock_bh(&pcpu->lock); + list = dying ? &pcpu->dying : &pcpu->unconfirmed; +restart: + hlist_nulls_for_each_entry(h, n, list, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (l3proto && nf_ct_l3num(ct) != l3proto) + continue; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } + rcu_read_lock(); + res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { + if (!atomic_inc_not_zero(&ct->ct_general.use)) + continue; + cb->args[0] = cpu; + cb->args[1] = (unsigned long)ct; + spin_unlock_bh(&pcpu->lock); + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + spin_unlock_bh(&pcpu->lock); + } + cb->args[2] = 1; +out: + if (last) + nf_ct_put(last); + + return skb->len; +} + +static int +ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ctnetlink_dump_list(skb, cb, true); +} + +static int +ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_dying, + .done = ctnetlink_done_list, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return -EOPNOTSUPP; +} + +static int +ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ctnetlink_dump_list(skb, cb, false); +} + +static int +ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_unconfirmed, + .done = ctnetlink_done_list, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return -EOPNOTSUPP; +} + #ifdef CONFIG_NF_NAT_NEEDED static int ctnetlink_parse_nat_setup(struct nf_conn *ct, @@ -1037,18 +1267,19 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, const struct nlattr *attr) { typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; + int err; parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); if (!parse_nat_setup) { #ifdef CONFIG_MODULES rcu_read_unlock(); - nfnl_unlock(); - if (request_module("nf-nat-ipv4") < 0) { - nfnl_lock(); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + if (request_module("nf-nat") < 0) { + nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); if (nfnetlink_parse_nat_setup_hook) return -EAGAIN; 
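
The CTA_MARK/CTA_MARK_MASK dump filter introduced above is driven entirely by attributes on the GET request, so any netlink client can ask the kernel to pre-filter the dump. A minimal userspace sketch, assuming libmnl is available (dump_by_mark() and the empty callback are hypothetical names for illustration; NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET and the CTA_MARK* constants are the real uapi definitions; error handling is abbreviated):

#include <stdint.h>
#include <time.h>
#include <arpa/inet.h>          /* htonl() */
#include <sys/socket.h>         /* AF_UNSPEC */
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static int dump_cb(const struct nlmsghdr *nlh, void *data)
{
	/* each IPCTNL_MSG_CT_NEW reply lands here; parse CTA_* as needed */
	return MNL_CB_OK;
}

/* hypothetical helper: dump all conntrack entries whose
 * (mark & mask) == val, filtered inside the kernel */
static int dump_by_mark(uint32_t val, uint32_t mask)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct mnl_socket *nl;
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfh;
	unsigned int seq = time(NULL), portid;
	int ret;

	nl = mnl_socket_open(NETLINK_NETFILTER);
	if (nl == NULL || mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0)
		return -1;
	portid = mnl_socket_get_portid(nl);

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	nlh->nlmsg_seq = seq;

	nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg));
	nfh->nfgen_family = AF_UNSPEC;
	nfh->version = NFNETLINK_V0;
	nfh->res_id = 0;

	/* both attributes must be present, or the kernel skips the filter */
	mnl_attr_put_u32(nlh, CTA_MARK, htonl(val));
	mnl_attr_put_u32(nlh, CTA_MARK_MASK, htonl(mask));

	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0)
		return -1;

	while ((ret = mnl_socket_recvfrom(nl, buf, sizeof(buf))) > 0) {
		ret = mnl_cb_run(buf, ret, seq, portid, dump_cb, NULL);
		if (ret <= MNL_CB_STOP)
			break;
	}
	mnl_socket_close(nl);
	return ret < 0 ? -1 : 0;
}

Because the kernel evaluates (ct->mark & mask) == val while walking the hash buckets, non-matching entries are never serialized at all, which is cheaper than dumping the whole table and filtering in userspace.
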
@@ -1056,7 +1287,23 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, return -EOPNOTSUPP; } - return parse_nat_setup(ct, manip, attr); + err = parse_nat_setup(ct, manip, attr); + if (err == -EAGAIN) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) { + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + rcu_read_lock(); + return -EOPNOTSUPP; + } + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + rcu_read_lock(); +#else + err = -EOPNOTSUPP; +#endif + } + return err; } #endif @@ -1087,27 +1334,25 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) } static int -ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_NAT_NEEDED int ret; - if (cda[CTA_NAT_DST]) { - ret = ctnetlink_parse_nat_setup(ct, - NF_NAT_MANIP_DST, - cda[CTA_NAT_DST]); - if (ret < 0) - return ret; - } - if (cda[CTA_NAT_SRC]) { - ret = ctnetlink_parse_nat_setup(ct, - NF_NAT_MANIP_SRC, - cda[CTA_NAT_SRC]); - if (ret < 0) - return ret; - } - return 0; + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + return ret; #else + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; return -EOPNOTSUPP; #endif } @@ -1118,13 +1363,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) struct nf_conntrack_helper *helper; struct nf_conn_help *help = nfct_help(ct); char *helpname = NULL; + struct nlattr *helpinfo = NULL; int err; /* don't change helper of sibling connections */ if (ct->master) return -EBUSY; - err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) return err; @@ -1142,14 +1388,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) nf_ct_protonum(ct)); if (helper == NULL) { #ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); return -EOPNOTSUPP; } - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper) @@ -1159,20 +1405,17 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) } if (help) { - if (help->helper == helper) + if (help->helper == helper) { + /* update private helper data if allowed. 
*/ + if (helper->from_nlattr) + helper->from_nlattr(helpinfo, ct); return 0; - if (help->helper) + } else return -EBUSY; - /* need to zero data of old helper */ - memset(&help->help, 0, sizeof(help->help)); - } else { - /* we cannot set a helper for an existing conntrack */ - return -EOPNOTSUPP; } - rcu_assign_pointer(help->helper, helper); - - return 0; + /* we cannot set a helper for an existing conntrack */ + return -EOPNOTSUPP; } static inline int @@ -1203,7 +1446,9 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[] struct nf_conntrack_l4proto *l4proto; int err = 0; - nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); + err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); + if (err < 0) + return err; rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); @@ -1214,63 +1459,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[] return err; } -#ifdef CONFIG_NF_NAT_NEEDED -static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { - [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 }, +static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { + [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; static inline int -change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) { - struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + int err; + struct nlattr *cda[CTA_SEQADJ_MAX+1]; - nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); + err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); + if (err < 0) + return err; - if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + if (!cda[CTA_SEQADJ_CORRECTION_POS]) return -EINVAL; - natseq->correction_pos = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + seq->correction_pos = + ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); - if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) return -EINVAL; - natseq->offset_before = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + seq->offset_before = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); - if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + if (!cda[CTA_SEQADJ_OFFSET_AFTER]) return -EINVAL; - natseq->offset_after = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + seq->offset_after = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); return 0; } static int -ctnetlink_change_nat_seq_adj(struct nf_conn *ct, - const struct nlattr * const cda[]) +ctnetlink_change_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) { + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); int ret = 0; - struct nf_conn_nat *nat = nfct_nat(ct); - if (!nat) + if (!seqadj) return 0; - if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], - cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (cda[CTA_SEQ_ADJ_ORIG]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) return ret; ct->status |= IPS_SEQ_ADJUST; } - if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], - cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (cda[CTA_SEQ_ADJ_REPLY]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], + cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) return ret; @@ -1279,7 
+1526,31 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct, return 0; } + +static int +ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + size_t len = nla_len(cda[CTA_LABELS]); + const void *mask = cda[CTA_LABELS_MASK]; + + if (len & (sizeof(u32)-1)) /* must be multiple of u32 */ + return -EINVAL; + + if (mask) { + if (nla_len(cda[CTA_LABELS_MASK]) == 0 || + nla_len(cda[CTA_LABELS_MASK]) != len) + return -EINVAL; + mask = nla_data(cda[CTA_LABELS_MASK]); + } + + len /= sizeof(u32); + + return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len); +#else + return -EOPNOTSUPP; #endif +} static int ctnetlink_change_conntrack(struct nf_conn *ct, @@ -1320,13 +1591,17 @@ ctnetlink_change_conntrack(struct nf_conn *ct, ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_LABELS]) { + err = ctnetlink_attach_labels(ct, cda); if (err < 0) return err; } -#endif return 0; } @@ -1356,8 +1631,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; - - err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); + struct nlattr *helpinfo = NULL; + + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) goto err2; @@ -1386,11 +1662,14 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } else { struct nf_conn_help *help; - help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC); if (help == NULL) { err = -ENOMEM; goto err2; } + /* set private helper data if allowed. */ + if (helper->from_nlattr) + helper->from_nlattr(helpinfo, ct); /* not in hash table yet so not strictly necessary */ RCU_INIT_POINTER(help->helper, helper); @@ -1402,15 +1681,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } - if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { - err = ctnetlink_change_nat(ct, cda); - if (err < 0) - goto err2; - } + err = ctnetlink_setup_nat(ct, cda); + if (err < 0) + goto err2; nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + nf_ct_labels_ext_add(ct); + /* we must add conntrack extensions before confirmation. 
*/ ct->status |= IPS_CONFIRMED; @@ -1420,13 +1699,11 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) goto err2; } -#endif memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { @@ -1518,6 +1795,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; + if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) + return -EINVAL; + ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) @@ -1529,13 +1809,17 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, else events = IPCT_NEW; + if (cda[CTA_LABELS] && + ctnetlink_attach_labels(ct, cda) == 0) + events |= (1 << IPCT_LABEL); + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | events, - ct, NETLINK_CB(skb).pid, + ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); } @@ -1547,17 +1831,18 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | + (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK), - ct, NETLINK_CB(skb).pid, + ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } } @@ -1566,6 +1851,394 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; } +static int +ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, + __u16 cpu, const struct ip_conntrack_stat *st) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(cpu); + + if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || + nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || + nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) || + nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || + nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || + nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) || + nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) || + nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || + nla_put_be32(skb, CTA_STATS_INSERT_FAILED, + htonl(st->insert_failed)) || + nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) || + nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || + nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || + nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, + htonl(st->search_restart))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int cpu; + struct net *net = sock_net(skb->sk); + + if (cb->args[0] == nr_cpu_ids) + return 0; + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + const struct ip_conntrack_stat *st; + + if (!cpu_possible(cpu)) + continue; + + st = per_cpu_ptr(net->ct.stat, cpu); + if (ctnetlink_ct_stat_cpu_fill_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + cpu, st) < 0) + break; + } + cb->args[0] = cpu; + + return skb->len; +} + +static int +ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_ct_stat_cpu_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return 0; +} + +static int +ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + struct net *net) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; + unsigned int nr_conntracks = atomic_read(&net->ct.count); + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct sk_buff *skb2; + int err; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + sock_net(skb->sk)); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (err < 0) + goto out; + + return 0; + +free: + kfree_skb(skb2); +out: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, + [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, +}; + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask); + +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +static size_t +ctnetlink_nfqueue_build_size(const struct nf_conn *ct) +{ + return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + + nla_total_size(0) /* CTA_PROTOINFO */ + + nla_total_size(0) /* CTA_HELP */ + + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + + ctnetlink_secctx_size(ct) +#ifdef CONFIG_NF_NAT_NEEDED + + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + + ctnetlink_proto_size(ct) + ; +} + +static int +ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + rcu_read_lock(); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); 
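+	/* the reply-direction tuple below gets its own NLA_F_NESTED
+	 * attribute and is dumped under the rcu_read_lock() taken at
+	 * the top of this function, same as the original direction */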
+ + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct)) { + if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + goto nla_put_failure; + } + + if (ctnetlink_dump_id(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_status(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_timeout(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0) + goto nla_put_failure; +#endif + if (ct->master && ctnetlink_dump_master(skb, ct) < 0) + goto nla_put_failure; + + if ((ct->status & IPS_SEQ_ADJUST) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_MARK + if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) + goto nla_put_failure; +#endif + if (ctnetlink_dump_labels(skb, ct) < 0) + goto nla_put_failure; + rcu_read_unlock(); + return 0; + +nla_put_failure: + rcu_read_unlock(); + return -ENOSPC; +} + +static int +ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) +{ + int err; + + if (cda[CTA_TIMEOUT]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_STATUS]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_HELP]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_LABELS]) { + err = ctnetlink_attach_labels(ct, cda); + if (err < 0) + return err; + } +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (cda[CTA_MARK]) { + u32 mask = 0, mark, newmark; + if (cda[CTA_MARK_MASK]) + mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + + mark = ntohl(nla_get_be32(cda[CTA_MARK])); + newmark = (ct->mark & mask) ^ mark; + if (newmark != ct->mark) + ct->mark = newmark; + } +#endif + return 0; +} + +static int +ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) +{ + struct nlattr *cda[CTA_MAX+1]; + int ret; + + ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); + if (ret < 0) + return ret; + + spin_lock_bh(&nf_conntrack_expect_lock); + ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_unlock_bh(&nf_conntrack_expect_lock); + + return ret; +} + +static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + int err; + + err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, + nf_ct_l3num(ct)); + if (err < 0) + return err; + + return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, + nf_ct_l3num(ct)); +} + +static int +ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) +{ + struct nlattr *cda[CTA_EXPECT_MAX+1]; + struct nf_conntrack_tuple tuple, mask; + struct nf_conntrack_helper *helper = NULL; + struct nf_conntrack_expect *exp; + int err; + + err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy); + if (err < 0) + return err; + + err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = 
__nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) + return -EOPNOTSUPP; + } + + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, + helper, &tuple, &mask); + if (IS_ERR(exp)) + return PTR_ERR(exp); + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) { + nf_ct_expect_put(exp); + return err; + } + + return 0; +} + +static struct nfq_ct_hook ctnetlink_nfqueue_hook = { + .build_size = ctnetlink_nfqueue_build_size, + .build = ctnetlink_nfqueue_build, + .parse = ctnetlink_nfqueue_parse, + .attach_expect = ctnetlink_nfqueue_attach_expect, + .seq_adjust = nf_ct_tcp_seqadj_set, +}; +#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ + /*********************************************************************** * EXPECT ***********************************************************************/ @@ -1610,14 +2283,16 @@ ctnetlink_exp_dump_mask(struct sk_buff *skb, if (!nest_parms) goto nla_put_failure; + rcu_read_lock(); l3proto = __nf_ct_l3proto_find(tuple->src.l3num); ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); - - if (unlikely(ret < 0)) - goto nla_put_failure; - - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + if (ret >= 0) { + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, + tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); + } + rcu_read_unlock(); + if (unlikely(ret < 0)) goto nla_put_failure; @@ -1629,6 +2304,8 @@ nla_put_failure: return -1; } +static const union nf_inet_addr any_addr; + static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) @@ -1636,6 +2313,11 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, struct nf_conn *master = exp->master; long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; struct nf_conn_help *help; +#ifdef CONFIG_NF_NAT_NEEDED + struct nlattr *nest_parms; + struct nf_conntrack_tuple nat_tuple = {}; +#endif + struct nf_ct_helper_expectfn *expfn; if (timeout < 0) timeout = 0; @@ -1649,17 +2331,45 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, CTA_EXPECT_MASTER) < 0) goto nla_put_failure; - NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); - NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); - NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); +#ifdef CONFIG_NF_NAT_NEEDED + if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || + exp->saved_proto.all) { + nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir))) + goto nla_put_failure; + + nat_tuple.src.l3num = nf_ct_l3num(master); + nat_tuple.src.u3 = exp->saved_addr; + nat_tuple.dst.protonum = nf_ct_protonum(master); + nat_tuple.src.u = exp->saved_proto; + + if (ctnetlink_exp_dump_tuple(skb, &nat_tuple, + CTA_EXPECT_NAT_TUPLE) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + } +#endif + if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || + nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || + nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || + nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) + goto nla_put_failure; help = nfct_help(master); if (help) { struct nf_conntrack_helper *helper; helper = rcu_dereference(help->helper); - if (helper) - NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + if (helper && + nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) + goto nla_put_failure; } + expfn = 
nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); + if (expfn != NULL && + nla_put_string(skb, CTA_EXPECT_FN, expfn->name)) + goto nla_put_failure; return 0; @@ -1668,15 +2378,15 @@ nla_put_failure: } static int -ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - unsigned int flags = pid ? NLM_F_MULTI : 0; + unsigned int flags = portid ? NLM_F_MULTI : 0; event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -1727,7 +2437,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) goto errout; type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); + nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -1742,7 +2452,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC); + nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); return 0; nla_put_failure: @@ -1768,14 +2478,13 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - struct hlist_node *n; u_int8_t l3proto = nfmsg->nfgen_family; rcu_read_lock(); last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; @@ -1785,7 +2494,7 @@ restart: cb->args[1] = 0; } if (ctnetlink_exp_fill_info(skb, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { @@ -1808,16 +2517,91 @@ out: return skb->len; } -static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { - [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, - [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, - [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, - [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, - [CTA_EXPECT_ID] = { .type = NLA_U32 }, - [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, - [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, - [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, -}; +static int +ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nf_conn *ct = cb->data; + struct nf_conn_help *help = nfct_help(ct); + u_int8_t l3proto = nfmsg->nfgen_family; + + if (cb->args[0]) + return 0; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; +restart: + hlist_for_each_entry(exp, &help->expectations, lnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 
0; + goto restart; + } + cb->args[0] = 1; +out: + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); + + return skb->len; +} + +static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int err; + struct net *net = sock_net(ctnl); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + u16 zone = 0; + struct netlink_dump_control c = { + .dump = ctnetlink_exp_ct_dump_table, + .done = ctnetlink_exp_done, + }; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + } + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + c.data = ct; + + err = netlink_dump_start(ctnl, skb, nlh, &c); + nf_ct_put(ct); + + return err; +} static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, @@ -1834,9 +2618,15 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { - return netlink_dump_start(ctnl, skb, nlh, - ctnetlink_exp_dump_table, - ctnetlink_exp_done, 0); + if (cda[CTA_EXPECT_MASTER]) + return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda); + else { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_dump_table, + .done = ctnetlink_exp_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } } err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); @@ -1873,14 +2663,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, } rcu_read_lock(); - err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); nf_ct_expect_put(exp); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (err < 0) goto out; @@ -1902,7 +2692,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct hlist_node *n, *next; + struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; unsigned int i; u16 zone; @@ -1932,13 +2722,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } /* after list removal, usage count == 1 */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid, + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_expect_put(exp); } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); /* have to put what we 'get' above. 
* after this line usage count == 0 */ nf_ct_expect_put(exp); @@ -1947,38 +2737,38 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conn_help *m_help; /* delete all expectations for this helper */ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, n, next, + hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, - NETLINK_CB(skb).pid, + NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_expect_put(exp); } } } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } else { /* This basically means we have to flush everything*/ - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, n, next, + hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, - NETLINK_CB(skb).pid, + NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_expect_put(exp); } } } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } return 0; @@ -1987,48 +2777,79 @@ static int ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { - return -EOPNOTSUPP; + if (cda[CTA_EXPECT_TIMEOUT]) { + if (!del_timer(&x->timeout)) + return -ETIME; + + x->timeout.expires = jiffies + + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; + add_timer(&x->timeout); + } + return 0; } +static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { + [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, +}; + static int -ctnetlink_create_expect(struct net *net, u16 zone, - const struct nlattr * const cda[], - u_int8_t u3, - u32 pid, int report) +ctnetlink_parse_expect_nat(const struct nlattr *attr, + struct nf_conntrack_expect *exp, + u_int8_t u3) { - struct nf_conntrack_tuple tuple, mask, master_tuple; - struct nf_conntrack_tuple_hash *h = NULL; - struct nf_conntrack_expect *exp; - struct nf_conn *ct; - struct nf_conn_help *help; - int err = 0; +#ifdef CONFIG_NF_NAT_NEEDED + struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; + struct nf_conntrack_tuple nat_tuple = {}; + int err; - /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + + if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) + return -EINVAL; + + err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, + &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); if (err < 0) return err; - /* Look for master conntrack of this expectation */ - h = nf_conntrack_find_get(net, zone, &master_tuple); - if (!h) - return -ENOENT; - ct = nf_ct_tuplehash_to_ctrack(h); - exp = nf_ct_expect_alloc(ct); - if (!exp) { - err = -ENOMEM; - goto out; + exp->saved_addr = nat_tuple.src.u3; + exp->saved_proto = nat_tuple.src.u; + exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static struct nf_conntrack_expect * 
+ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + u_int32_t class = 0; + struct nf_conntrack_expect *exp; + struct nf_conn_help *help; + int err; + + if (cda[CTA_EXPECT_CLASS] && helper) { + class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); + if (class > helper->expect_class_max) + return ERR_PTR(-EINVAL); } + exp = nf_ct_expect_alloc(ct); + if (!exp) + return ERR_PTR(-ENOMEM); + help = nfct_help(ct); if (!help) { if (!cda[CTA_EXPECT_TIMEOUT]) { err = -EINVAL; - goto out; + goto err_out; } exp->timeout.expires = jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; @@ -2045,20 +2866,105 @@ ctnetlink_create_expect(struct net *net, u16 zone, } else exp->flags = 0; } + if (cda[CTA_EXPECT_FN]) { + const char *name = nla_data(cda[CTA_EXPECT_FN]); + struct nf_ct_helper_expectfn *expfn; - exp->class = 0; - exp->expectfn = NULL; - exp->master = ct; - exp->helper = NULL; - memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); - memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3)); - exp->mask.src.u.all = mask.src.u.all; + expfn = nf_ct_helper_expectfn_find_by_name(name); + if (expfn == NULL) { + err = -EINVAL; + goto err_out; + } + exp->expectfn = expfn->expectfn; + } else + exp->expectfn = NULL; - err = nf_ct_expect_related_report(exp, pid, report); + exp->class = class; + exp->master = ct; + exp->helper = helper; + exp->tuple = *tuple; + exp->mask.src.u3 = mask->src.u3; + exp->mask.src.u.all = mask->src.u.all; + + if (cda[CTA_EXPECT_NAT]) { + err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], + exp, nf_ct_l3num(ct)); + if (err < 0) + goto err_out; + } + return exp; +err_out: nf_ct_expect_put(exp); + return ERR_PTR(err); +} -out: - nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); +static int +ctnetlink_create_expect(struct net *net, u16 zone, + const struct nlattr * const cda[], + u_int8_t u3, u32 portid, int report) +{ + struct nf_conntrack_tuple tuple, mask, master_tuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nf_conntrack_helper *helper = NULL; + struct nf_conntrack_expect *exp; + struct nf_conn *ct; + int err; + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = nf_conntrack_find_get(net, zone, &master_tuple); + if (!h) + return -ENOENT; + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = __nf_conntrack_helper_find(helpname, u3, + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto err_ct; + } + helper = __nf_conntrack_helper_find(helpname, u3, + nf_ct_protonum(ct)); + if (helper) { + err = -EAGAIN; + goto err_ct; + } +#endif + err = -EOPNOTSUPP; + goto err_ct; + } + } + + exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto err_ct; + } + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) + goto err_exp; + + return 0; +err_exp: + nf_ct_expect_put(exp); +err_ct: + nf_ct_put(ct); 
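+	/* err carries the first failure here: tuple parsing, helper
+	 * lookup/autoload, expectation allocation or
+	 * nf_ct_expect_related_report() */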
return err; } @@ -2088,16 +2994,16 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - spin_lock_bh(&nf_conntrack_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = __nf_ct_expect_find(net, zone, &tuple); if (!exp) { - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { err = ctnetlink_create_expect(net, zone, cda, u3, - NETLINK_CB(skb).pid, + NETLINK_CB(skb).portid, nlmsg_report(nlh)); } return err; @@ -2106,11 +3012,84 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, err = -EEXIST; if (!(nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return err; } +static int +ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, + const struct ip_conntrack_stat *st) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0, event; + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(cpu); + + if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || + nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || + nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int cpu; + struct net *net = sock_net(skb->sk); + + if (cb->args[0] == nr_cpu_ids) + return 0; + + for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { + const struct ip_conntrack_stat *st; + + if (!cpu_possible(cpu)) + continue; + + st = per_cpu_ptr(net->ct.stat, cpu); + if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + cpu, st) < 0) + break; + } + cb->args[0] = cpu; + + return skb->len; +} + +static int +ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_stat_cpu_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return 0; +} + #ifdef CONFIG_NF_CONNTRACK_EVENTS static struct nf_ct_event_notifier ctnl_notifier = { .fcn = ctnetlink_conntrack_event, @@ -2134,6 +3113,10 @@ static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, .attr_count = CTA_MAX, .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu }, + [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct }, + [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying }, + [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed }, }; static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { @@ -2146,6 +3129,7 @@ static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, + [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu }, }; static const struct 
nfnetlink_subsystem ctnl_subsys = { @@ -2231,11 +3215,15 @@ static int __init ctnetlink_init(void) goto err_unreg_subsys; } - if (register_pernet_subsys(&ctnetlink_net_ops)) { + ret = register_pernet_subsys(&ctnetlink_net_ops); + if (ret < 0) { pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } - +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT + /* setup interaction between nf_queue and nf_conntrack_netlink. */ + RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook); +#endif return 0; err_unreg_exp_subsys: @@ -2253,6 +3241,9 @@ static void __exit ctnetlink_exit(void) unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); +#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT + RCU_INIT_POINTER(nfq_ct_hook, NULL); +#endif } module_init(ctnetlink_init); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 31d56b23b9e..825c3e3f830 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -11,10 +11,12 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * * Limitations: * - We blindly assume that control connections are always * established in PNS->PAC direction. This is a violation - * of RFFC2673 + * of RFC 2637 * - We can only support one single call within each session * TODO: * - testing of incoming PPTP calls @@ -45,14 +47,14 @@ static DEFINE_SPINLOCK(nf_pptp_lock); int (*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, + unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound); int (*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, + unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); @@ -174,7 +176,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, static void pptp_destroy_siblings(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - const struct nf_conn_help *help = nfct_help(ct); + const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_conntrack_tuple t; nf_ct_gre_keymap_destroy(ct); @@ -182,16 +184,16 @@ static void pptp_destroy_siblings(struct nf_conn *ct) /* try original (pns->pac) tuple */ memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t)); t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id; - t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id; + t.src.u.gre.key = ct_pptp_info->pns_call_id; + t.dst.u.gre.key = ct_pptp_info->pac_call_id; if (!destroy_sibling_or_exp(net, ct, &t)) pr_debug("failed to timeout original pns->pac ct/exp\n"); /* try reply (pac->pns) tuple */ memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t)); t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id; - t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id; + t.src.u.gre.key = ct_pptp_info->pac_call_id; + t.dst.u.gre.key = ct_pptp_info->pns_call_id; if (!destroy_sibling_or_exp(net, ct, &t)) pr_debug("failed to timeout reply pac->pns ct/exp\n"); } @@ -262,14 +264,14 @@ out_unexpect_orig: } static inline int -pptp_inbound_pkt(struct sk_buff *skb, +pptp_inbound_pkt(struct sk_buff 
*skb, unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq, unsigned int reqlen, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + struct nf_ct_pptp_master *info = nfct_help_data(ct); u_int16_t msg; __be16 cid = 0, pcid = 0; typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; @@ -376,7 +378,8 @@ pptp_inbound_pkt(struct sk_buff *skb, nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound); if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_inbound(skb, ct, ctinfo, ctlh, pptpReq); + return nf_nat_pptp_inbound(skb, ct, ctinfo, + protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: @@ -389,14 +392,14 @@ invalid: } static inline int -pptp_outbound_pkt(struct sk_buff *skb, +pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq, unsigned int reqlen, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + struct nf_ct_pptp_master *info = nfct_help_data(ct); u_int16_t msg; __be16 cid = 0, pcid = 0; typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; @@ -471,7 +474,8 @@ pptp_outbound_pkt(struct sk_buff *skb, nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound); if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_outbound(skb, ct, ctinfo, ctlh, pptpReq); + return nf_nat_pptp_outbound(skb, ct, ctinfo, + protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: @@ -506,7 +510,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, { int dir = CTINFO2DIR(ctinfo); - const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + const struct nf_ct_pptp_master *info = nfct_help_data(ct); const struct tcphdr *tcph; struct tcphdr _tcph; const struct pptp_pkt_hdr *pptph; @@ -570,11 +574,11 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, * established from PNS->PAC. 
However, RFC makes no guarantee */ if (dir == IP_CT_DIR_ORIGINAL) /* client -> server (PNS -> PAC) */ - ret = pptp_outbound_pkt(skb, ctlh, pptpReq, reqlen, ct, + ret = pptp_outbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct, ctinfo); else /* server -> client (PAC -> PNS) */ - ret = pptp_inbound_pkt(skb, ctlh, pptpReq, reqlen, ct, + ret = pptp_inbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct, ctinfo); pr_debug("sstate: %d->%d, cstate: %d->%d\n", oldsstate, info->sstate, oldcstate, info->cstate); @@ -592,6 +596,7 @@ static const struct nf_conntrack_expect_policy pptp_exp_policy = { static struct nf_conntrack_helper pptp __read_mostly = { .name = "pptp", .me = THIS_MODULE, + .data_len = sizeof(struct nf_ct_pptp_master), .tuple.src.l3num = AF_INET, .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT), .tuple.dst.protonum = IPPROTO_TCP, @@ -600,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = { .expect_policy = &pptp_exp_policy, }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ - nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { - .exit = nf_conntrack_pptp_net_exit, -}; - static int __init nf_conntrack_pptp_init(void) { - int rv; - - rv = nf_conntrack_helper_register(&pptp); - if (rv < 0) - return rv; - rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); - if (rv < 0) - nf_conntrack_helper_unregister(&pptp); - return rv; + return nf_conntrack_helper_register(&pptp); } static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 5701c8dd783..b65d5864b6d 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,7 +22,6 @@ #include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> -#include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l3proto.h> @@ -36,28 +36,32 @@ static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL static int -nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, - struct ctl_table *table, unsigned int *users) +nf_ct_register_sysctl(struct net *net, + struct ctl_table_header **header, + const char *path, + struct ctl_table *table) { if (*header == NULL) { - *header = register_sysctl_paths(path, table); + *header = register_net_sysctl(net, path, table); if (*header == NULL) return -ENOMEM; } - if (users != NULL) - (*users)++; + return 0; } static void nf_ct_unregister_sysctl(struct ctl_table_header **header, - struct ctl_table *table, unsigned int *users) + struct ctl_table **table, + unsigned int users) { - if (users != NULL && --*users > 0) + if (users > 0) return; - unregister_sysctl_table(*header); + unregister_net_sysctl_table(*header); + kfree(*table); *header = NULL; + *table = NULL; } #endif @@ -88,12 +92,6 @@ nf_ct_l3proto_find_get(u_int16_t l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); -void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) -{ - 
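/*
 * The trivial nf_ct_l3proto_put() wrapper goes away here; callers are
 * expected to drop the reference with module_put() directly. For the
 * l4 side, an explicit find_get/put pair is added below, built on the
 * usual RCU lookup plus try_module_get() idiom with the always
 * built-in generic tracker as fallback (sketch):
 *
 *	rcu_read_lock();
 *	p = __nf_ct_l4proto_find(l3num, l4num);
 *	if (!try_module_get(p->me))
 *		p = &nf_conntrack_l4proto_generic;	// owner is unloading
 *	rcu_read_unlock();
 *	...
 *	nf_ct_l4proto_put(p);	// module_put(p->me) when done
 */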
module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); - int nf_ct_l3proto_try_module_get(unsigned short l3proto) { @@ -127,6 +125,27 @@ void nf_ct_l3proto_module_put(unsigned short l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); +struct nf_conntrack_l4proto * +nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) +{ + struct nf_conntrack_l4proto *p; + + rcu_read_lock(); + p = __nf_ct_l4proto_find(l3num, l4num); + if (!try_module_get(p->me)) + p = &nf_conntrack_l4proto_generic; + rcu_read_unlock(); + + return p; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); + +void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) +{ + module_put(p->me); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); + static int kill_l3proto(struct nf_conn *i, void *data) { return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; @@ -140,30 +159,55 @@ static int kill_l4proto(struct nf_conn *i, void *data) nf_ct_l3num(i) == l4proto->l3proto; } -static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) +static struct nf_ip_net *nf_ct_l3proto_net(struct net *net, + struct nf_conntrack_l3proto *l3proto) { - int err = 0; + if (l3proto->l3proto == PF_INET) + return &net->ct.nf_ct_proto; + else + return NULL; +} -#ifdef CONFIG_SYSCTL - if (l3proto->ctl_table != NULL) { - err = nf_ct_register_sysctl(&l3proto->ctl_table_header, +static int nf_ct_l3proto_register_sysctl(struct net *net, + struct nf_conntrack_l3proto *l3proto) +{ + int err = 0; + struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); + /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */ + if (in == NULL) + return 0; + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + if (in->ctl_table != NULL) { + err = nf_ct_register_sysctl(net, + &in->ctl_table_header, l3proto->ctl_table_path, - l3proto->ctl_table, NULL); + in->ctl_table); + if (err < 0) { + kfree(in->ctl_table); + in->ctl_table = NULL; + } } #endif return err; } -static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto) +static void nf_ct_l3proto_unregister_sysctl(struct net *net, + struct nf_conntrack_l3proto *l3proto) { -#ifdef CONFIG_SYSCTL - if (l3proto->ctl_table_header != NULL) - nf_ct_unregister_sysctl(&l3proto->ctl_table_header, - l3proto->ctl_table, NULL); + struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); + + if (in == NULL) + return; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + if (in->ctl_table_header != NULL) + nf_ct_unregister_sysctl(&in->ctl_table_header, + &in->ctl_table, + 0); #endif } -int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; struct nf_conntrack_l3proto *old; @@ -182,10 +226,6 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) goto out_unlock; } - ret = nf_ct_l3proto_register_sysctl(proto); - if (ret < 0) - goto out_unlock; - if (proto->nlattr_tuple_size) proto->nla_size = 3 * proto->nlattr_tuple_size(); @@ -194,13 +234,27 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; + } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_pernet_register(struct net *net, + struct nf_conntrack_l3proto *proto) { - struct net *net; + int ret = 0; + + if (proto->init_net) { + ret = proto->init_net(net); + if (ret < 0) + return ret; + } 
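/*
 * Registration is now split in two: nf_ct_l3proto_register() wires a
 * protocol into the global table exactly once, while this pernet
 * variant runs per network namespace and only sets up namespace-local
 * state (the init_net callback plus the sysctl tree). A protocol
 * module would typically pair them like this (sketch, hypothetical
 * "foo" names, error unwinding trimmed):
 *
 *	static struct pernet_operations foo_net_ops = {
 *		.init = foo_net_init,	// calls nf_ct_l3proto_pernet_register()
 *		.exit = foo_net_exit,	// calls nf_ct_l3proto_pernet_unregister()
 *	};
 *
 *	ret = register_pernet_subsys(&foo_net_ops);
 *	if (ret == 0)
 *		ret = nf_ct_l3proto_register(&foo_l3proto);
 */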
+ return nf_ct_l3proto_register_sysctl(net, proto); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); + +void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{ BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); @@ -209,68 +263,102 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) ) != proto); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); - nf_ct_l3proto_unregister_sysctl(proto); mutex_unlock(&nf_ct_proto_mutex); synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); + +void nf_ct_l3proto_pernet_unregister(struct net *net, + struct nf_conntrack_l3proto *proto) +{ + nf_ct_l3proto_unregister_sysctl(net, proto); /* Remove all contrack entries for this protocol */ - rtnl_lock(); - for_each_net(net) - nf_ct_iterate_cleanup(net, kill_l3proto, proto); - rtnl_unlock(); + nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); -static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) +static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, + struct nf_conntrack_l4proto *l4proto) +{ + if (l4proto->get_net_proto) { + /* statically built-in protocols use static per-net */ + return l4proto->get_net_proto(net); + } else if (l4proto->net_id) { + /* ... and loadable protocols use dynamic per-net */ + return net_generic(net, *l4proto->net_id); + } + return NULL; +} + +static +int nf_ct_l4proto_register_sysctl(struct net *net, + struct nf_proto_net *pn, + struct nf_conntrack_l4proto *l4proto) { int err = 0; #ifdef CONFIG_SYSCTL - if (l4proto->ctl_table != NULL) { - err = nf_ct_register_sysctl(l4proto->ctl_table_header, - nf_net_netfilter_sysctl_path, - l4proto->ctl_table, - l4proto->ctl_table_users); - if (err < 0) - goto out; + if (pn->ctl_table != NULL) { + err = nf_ct_register_sysctl(net, + &pn->ctl_table_header, + "net/netfilter", + pn->ctl_table); + if (err < 0) { + if (!pn->users) { + kfree(pn->ctl_table); + pn->ctl_table = NULL; + } + } } #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->ctl_compat_table != NULL) { - err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header, - nf_net_ipv4_netfilter_sysctl_path, - l4proto->ctl_compat_table, NULL); + if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) { + if (err < 0) { + nf_ct_kfree_compat_sysctl_table(pn); + goto out; + } + err = nf_ct_register_sysctl(net, + &pn->ctl_compat_header, + "net/ipv4/netfilter", + pn->ctl_compat_table); if (err == 0) goto out; - nf_ct_unregister_sysctl(l4proto->ctl_table_header, - l4proto->ctl_table, - l4proto->ctl_table_users); + + nf_ct_kfree_compat_sysctl_table(pn); + nf_ct_unregister_sysctl(&pn->ctl_table_header, + &pn->ctl_table, + pn->users); } -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ out: +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ return err; } -static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto) +static +void nf_ct_l4proto_unregister_sysctl(struct net *net, + struct nf_proto_net *pn, + struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_SYSCTL - if (l4proto->ctl_table_header != NULL && - *l4proto->ctl_table_header != NULL) - nf_ct_unregister_sysctl(l4proto->ctl_table_header, - l4proto->ctl_table, - l4proto->ctl_table_users); + if (pn->ctl_table_header != NULL) + nf_ct_unregister_sysctl(&pn->ctl_table_header, + &pn->ctl_table, + pn->users); + #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if 
(l4proto->ctl_compat_table_header != NULL) - nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header, - l4proto->ctl_compat_table, NULL); + if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL) + nf_ct_unregister_sysctl(&pn->ctl_compat_header, + &pn->ctl_compat_table, + 0); #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ } /* FIXME: Allow NULL functions and sub in pointers to generic for them. --RR */ -int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto) { int ret = 0; @@ -312,10 +400,6 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) goto out_unlock; } - ret = nf_ct_l4proto_register_sysctl(l4proto); - if (ret < 0) - goto out_unlock; - l4proto->nla_size = 0; if (l4proto->nlattr_size) l4proto->nla_size += l4proto->nlattr_size(); @@ -324,17 +408,40 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); - out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); -void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_pernet_register(struct net *net, + struct nf_conntrack_l4proto *l4proto) { - struct net *net; + int ret = 0; + struct nf_proto_net *pn = NULL; + if (l4proto->init_net) { + ret = l4proto->init_net(net, l4proto->l3proto); + if (ret < 0) + goto out; + } + + pn = nf_ct_l4proto_net(net, l4proto); + if (pn == NULL) + goto out; + + ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto); + if (ret < 0) + goto out; + + pn->users++; +out: + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); + +void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +{ BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); @@ -344,28 +451,63 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) ) != l4proto); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], &nf_conntrack_l4proto_generic); - nf_ct_l4proto_unregister_sysctl(l4proto); mutex_unlock(&nf_ct_proto_mutex); synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); + +void nf_ct_l4proto_pernet_unregister(struct net *net, + struct nf_conntrack_l4proto *l4proto) +{ + struct nf_proto_net *pn = NULL; + + pn = nf_ct_l4proto_net(net, l4proto); + if (pn == NULL) + return; + + pn->users--; + nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); /* Remove all contrack entries for this protocol */ - rtnl_lock(); - for_each_net(net) - nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); - rtnl_unlock(); + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0); } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); -int nf_conntrack_proto_init(void) +int nf_conntrack_proto_pernet_init(struct net *net) { - unsigned int i; int err; + struct nf_proto_net *pn = nf_ct_l4proto_net(net, + &nf_conntrack_l4proto_generic); - err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + err = nf_conntrack_l4proto_generic.init_net(net, + nf_conntrack_l4proto_generic.l3proto); if (err < 0) return err; + err = nf_ct_l4proto_register_sysctl(net, + pn, + &nf_conntrack_l4proto_generic); + if (err < 0) + return err; + + pn->users++; + return 0; +} + +void nf_conntrack_proto_pernet_fini(struct net *net) +{ + struct nf_proto_net *pn = nf_ct_l4proto_net(net, 
+ &nf_conntrack_l4proto_generic); + + pn->users--; + nf_ct_l4proto_unregister_sysctl(net, + pn, + &nf_conntrack_l4proto_generic); +} +int nf_conntrack_proto_init(void) +{ + unsigned int i; for (i = 0; i < AF_MAX; i++) rcu_assign_pointer(nf_ct_l3protos[i], &nf_conntrack_l3proto_generic); @@ -375,9 +517,6 @@ int nf_conntrack_proto_init(void) void nf_conntrack_proto_fini(void) { unsigned int i; - - nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); - /* free l3proto protocol tables */ for (i = 0; i < PF_MAX; i++) kfree(nf_ct_protos[i]); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index d6dde6dc09e..cb372f96f10 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -387,12 +387,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = /* this module per-net specifics */ static int dccp_net_id __read_mostly; struct dccp_net { + struct nf_proto_net pn; int dccp_loose; unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -#ifdef CONFIG_SYSCTL - struct ctl_table_header *sysctl_header; - struct ctl_table *sysctl_table; -#endif }; static inline struct dccp_net *dccp_pernet(struct net *net) @@ -423,7 +420,7 @@ static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, } static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); struct dccp_net *dn; @@ -431,7 +428,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const char *msg; u_int8_t state; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; @@ -459,7 +456,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, out_invalid: if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); + nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, + NULL, "%s", msg); return false; } @@ -472,18 +470,23 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) ntohl(dhack->dccph_ack_nr_low); } +static unsigned int *dccp_get_timeouts(struct net *net) +{ + return dccp_pernet(net)->dccp_timeout; +} + static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int hooknum) + u_int8_t pf, unsigned int hooknum, + unsigned int *timeouts) { struct net *net = nf_ct_net(ct); - struct dccp_net *dn; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); type = dh->dccph_type; @@ -540,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid packet ignored "); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid state transition "); return -NF_ACCEPT; } @@ -559,8 +562,7 @@ static int dccp_packet(struct nf_conn *ct, const 
struct sk_buff *skb, if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - dn = dccp_pernet(net); - nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; } @@ -575,7 +577,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, unsigned int cscov; const char *msg; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); if (dh == NULL) { msg = "nf_ct_dccp: short packet "; goto out_invalid; @@ -612,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, out_invalid: if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); return -NF_ACCEPT; } @@ -639,11 +641,12 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; - NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); - NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); - NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq)); + if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || + nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || + nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, + cpu_to_be64(ct->proto.dccp.handshake_seq))) + goto nla_put_failure; nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); return 0; @@ -702,8 +705,62 @@ static int dccp_nlattr_size(void) return nla_total_size(0) /* CTA_PROTOINFO_DCCP */ + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1); } + #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + struct dccp_net *dn = dccp_pernet(net); + unsigned int *timeouts = data; + int i; + + /* set default DCCP timeouts. */ + for (i=0; i<CT_DCCP_MAX; i++) + timeouts[i] = dn->dccp_timeout[i]; + + /* there's a 1:1 mapping between attributes and protocol states. 
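 * That mapping is what lets the loop below index timeouts[] directly
 * by attribute number: CTA_TIMEOUT_DCCP_REQUEST lines up with
 * CT_DCCP_REQUEST, and so on up to TIMEWAIT. If the two enums ever
 * drifted apart the table would be silently corrupted, so a build-time
 * check could pin the assumption (not part of this patch, just a
 * defensive sketch):
 *
 *	BUILD_BUG_ON(CTA_TIMEOUT_DCCP_REQUEST != CT_DCCP_REQUEST);
 *	BUILD_BUG_ON(CTA_TIMEOUT_DCCP_TIMEWAIT != CT_DCCP_TIMEWAIT);
 *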
*/ + for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { + if (tb[i]) { + timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; + } + } + return 0; +} + +static int +dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + int i; + + for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { + if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) + goto nla_put_failure; + } + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { + [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL /* template, data assigned later */ static struct ctl_table dccp_sysctl_table[] = { @@ -759,6 +816,55 @@ static struct ctl_table dccp_sysctl_table[] = { }; #endif /* CONFIG_SYSCTL */ +static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, + struct dccp_net *dn) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(dccp_sysctl_table, + sizeof(dccp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; + pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; + pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; + pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; + pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; + pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; + pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; + pn->ctl_table[7].data = &dn->dccp_loose; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + pn->ctl_table[0].procname = NULL; +#endif + return 0; +} + +static int dccp_init_net(struct net *net, u_int16_t proto) +{ + struct dccp_net *dn = dccp_pernet(net); + struct nf_proto_net *pn = &dn->pn; + + if (!pn->users) { + /* default values */ + dn->dccp_loose = 1; + dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + } + + return dccp_kmemdup_sysctl_table(net, pn, dn); +} + static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, @@ -767,6 +873,7 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, + .get_timeouts = dccp_get_timeouts, .error = dccp_error, .print_tuple = dccp_print_tuple, .print_conntrack = dccp_print_conntrack, @@ -779,6 +886,17 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = dccp_timeout_nlattr_to_obj, + .obj_to_nlattr = dccp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_DCCP_MAX, + .obj_size = sizeof(unsigned 
int) * CT_DCCP_MAX, + .nla_policy = dccp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &dccp_net_id, + .init_net = dccp_init_net, }; static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { @@ -789,6 +907,7 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, + .get_timeouts = dccp_get_timeouts, .error = dccp_error, .print_tuple = dccp_print_tuple, .print_conntrack = dccp_print_conntrack, @@ -801,55 +920,43 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = dccp_timeout_nlattr_to_obj, + .obj_to_nlattr = dccp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_DCCP_MAX, + .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, + .nla_policy = dccp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &dccp_net_id, + .init_net = dccp_init_net, }; static __net_init int dccp_net_init(struct net *net) { - struct dccp_net *dn = dccp_pernet(net); - - /* default values */ - dn->dccp_loose = 1; - dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; - -#ifdef CONFIG_SYSCTL - dn->sysctl_table = kmemdup(dccp_sysctl_table, - sizeof(dccp_sysctl_table), GFP_KERNEL); - if (!dn->sysctl_table) - return -ENOMEM; - - dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; - dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; - dn->sysctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; - dn->sysctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; - dn->sysctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; - dn->sysctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; - dn->sysctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; - dn->sysctl_table[7].data = &dn->dccp_loose; - - dn->sysctl_header = register_net_sysctl_table(net, - nf_net_netfilter_sysctl_path, dn->sysctl_table); - if (!dn->sysctl_header) { - kfree(dn->sysctl_table); - return -ENOMEM; + int ret = 0; + ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4); + if (ret < 0) { + pr_err("nf_conntrack_dccp4: pernet registration failed.\n"); + goto out; + } + ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6); + if (ret < 0) { + pr_err("nf_conntrack_dccp6: pernet registration failed.\n"); + goto cleanup_dccp4; } -#endif - return 0; +cleanup_dccp4: + nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); +out: + return ret; } static __net_exit void dccp_net_exit(struct net *net) { - struct dccp_net *dn = dccp_pernet(net); -#ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(dn->sysctl_header); - kfree(dn->sysctl_table); -#endif + nf_ct_l4proto_pernet_unregister(net, &dccp_proto6); + nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); } static struct pernet_operations dccp_net_ops = { @@ -861,34 +968,34 @@ static struct pernet_operations dccp_net_ops = { static int __init nf_conntrack_proto_dccp_init(void) { - int err; + int ret; - err = register_pernet_subsys(&dccp_net_ops); - if (err < 0) - goto err1; + ret = register_pernet_subsys(&dccp_net_ops); + if (ret < 0) + goto out_pernet; - err = 
nf_conntrack_l4proto_register(&dccp_proto4); - if (err < 0) - goto err2; + ret = nf_ct_l4proto_register(&dccp_proto4); + if (ret < 0) + goto out_dccp4; - err = nf_conntrack_l4proto_register(&dccp_proto6); - if (err < 0) - goto err3; - return 0; + ret = nf_ct_l4proto_register(&dccp_proto6); + if (ret < 0) + goto out_dccp6; -err3: - nf_conntrack_l4proto_unregister(&dccp_proto4); -err2: + return 0; +out_dccp6: + nf_ct_l4proto_unregister(&dccp_proto4); +out_dccp4: unregister_pernet_subsys(&dccp_net_ops); -err1: - return err; +out_pernet: + return ret; } static void __exit nf_conntrack_proto_dccp_fini(void) { + nf_ct_l4proto_unregister(&dccp_proto6); + nf_ct_l4proto_unregister(&dccp_proto4); unregister_pernet_subsys(&dccp_net_ops); - nf_conntrack_l4proto_unregister(&dccp_proto6); - nf_conntrack_l4proto_unregister(&dccp_proto4); } module_init(nf_conntrack_proto_dccp_init); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e2091d0c7a2..d25f2937764 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -14,6 +14,11 @@ static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; +static inline struct nf_generic_net *generic_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.generic; +} + static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) @@ -40,31 +45,77 @@ static int generic_print_tuple(struct seq_file *s, return 0; } +static unsigned int *generic_get_timeouts(struct net *net) +{ + return &(generic_pernet(net)->timeout); +} + /* Returns verdict for packet, or -1 for invalid. */ -static int packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - u_int8_t pf, - unsigned int hooknum) +static int generic_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeout) { - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ -static bool new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) +static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { return true; } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_generic_net *gn = generic_pernet(net); + + if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ; + else { + /* Set default generic timeout. 
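 * CTA_TIMEOUT_GENERIC_TIMEOUT is optional on the wire, so an absent
 * attribute inherits the namespace default instead of zeroing the
 * policy. The conversion below is equivalent to (sketch):
 *
 *	*timeout = tb[CTA_TIMEOUT_GENERIC_TIMEOUT]
 *		? ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ
 *		: gn->timeout;
 *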
*/ + *timeout = gn->timeout; + } + + return 0; +} + +static int +generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { + [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static struct ctl_table_header *generic_sysctl_header; static struct ctl_table generic_sysctl_table[] = { { .procname = "nf_conntrack_generic_timeout", - .data = &nf_ct_generic_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -75,7 +126,6 @@ static struct ctl_table generic_sysctl_table[] = { static struct ctl_table generic_compat_sysctl_table[] = { { .procname = "ip_conntrack_generic_timeout", - .data = &nf_ct_generic_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -85,6 +135,62 @@ static struct ctl_table generic_compat_sysctl_table[] = { #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ +static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_generic_net *gn) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(generic_sysctl_table, + sizeof(generic_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &gn->timeout; +#endif + return 0; +} + +static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_generic_net *gn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table, + sizeof(generic_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &gn->timeout; +#endif +#endif + return 0; +} + +static int generic_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_generic_net *gn = generic_pernet(net); + struct nf_proto_net *pn = &gn->pn; + + gn->timeout = nf_ct_generic_timeout; + + ret = generic_kmemdup_compat_sysctl_table(pn, gn); + if (ret < 0) + return ret; + + ret = generic_kmemdup_sysctl_table(pn, gn); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + + return ret; +} + +static struct nf_proto_net *generic_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.generic.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = { .l3proto = PF_UNSPEC, @@ -93,13 +199,18 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, - .packet = packet, - .new = new, -#ifdef CONFIG_SYSCTL - .ctl_table_header = &generic_sysctl_header, - .ctl_table = generic_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = generic_compat_sysctl_table, -#endif -#endif + .packet = generic_packet, + .get_timeouts = generic_get_timeouts, + .new = generic_new, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = generic_timeout_nlattr_to_obj, + .obj_to_nlattr = generic_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_GENERIC_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = generic_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = generic_init_net, + .get_net_proto = 
generic_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index f0338791b82..d5665739e3b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -21,6 +21,7 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> @@ -41,18 +42,33 @@ #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> -#define GRE_TIMEOUT (30 * HZ) -#define GRE_STREAM_TIMEOUT (180 * HZ) +enum grep_conntrack { + GRE_CT_UNREPLIED, + GRE_CT_REPLIED, + GRE_CT_MAX +}; + +static unsigned int gre_timeouts[GRE_CT_MAX] = { + [GRE_CT_UNREPLIED] = 30*HZ, + [GRE_CT_REPLIED] = 180*HZ, +}; static int proto_gre_net_id __read_mostly; struct netns_proto_gre { + struct nf_proto_net nf; rwlock_t keymap_lock; struct list_head keymap_list; + unsigned int gre_timeouts[GRE_CT_MAX]; }; -void nf_ct_gre_keymap_flush(struct net *net) +static inline struct netns_proto_gre *gre_pernet(struct net *net) +{ + return net_generic(net, proto_gre_net_id); +} + +static void nf_ct_gre_keymap_flush(struct net *net) { - struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km, *tmp; write_lock_bh(&net_gre->keymap_lock); @@ -62,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net) } write_unlock_bh(&net_gre->keymap_lock); } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush); static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) @@ -77,7 +92,7 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, /* look up the source key for a given tuple */ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) { - struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km; __be16 key = 0; @@ -101,11 +116,11 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t) { struct net *net = nf_ct_net(ct); - struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); - struct nf_conn_help *help = nfct_help(ct); + struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_ct_gre_keymap **kmp, *km; - kmp = &help->help.ct_pptp_info.keymap[dir]; + kmp = &ct_pptp_info->keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ read_lock_bh(&net_gre->keymap_lock); @@ -142,20 +157,20 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); void nf_ct_gre_keymap_destroy(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); - struct nf_conn_help *help = nfct_help(ct); + struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); enum ip_conntrack_dir dir; pr_debug("entering for ct %p\n", ct); write_lock_bh(&net_gre->keymap_lock); for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { - if (help->help.ct_pptp_info.keymap[dir]) { + if (ct_pptp_info->keymap[dir]) { pr_debug("removing %p from list\n", - help->help.ct_pptp_info.keymap[dir]); - list_del(&help->help.ct_pptp_info.keymap[dir]->list); - kfree(help->help.ct_pptp_info.keymap[dir]); - help->help.ct_pptp_info.keymap[dir] = NULL; + ct_pptp_info->keymap[dir]); + 
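/*
 * Note the conversion pattern used throughout this file: the old
 * fixed-size union member (help->help.ct_pptp_info) becomes
 * nfct_help_data(ct), i.e. helper-private state sized by the helper's
 * .data_len and carved out of the conntrack extension area. A helper
 * declares and retrieves it roughly like this (sketch, hypothetical
 * "foo" helper):
 *
 *	static struct nf_conntrack_helper foo __read_mostly = {
 *		.name		= "foo",
 *		.data_len	= sizeof(struct foo_master),
 *		...
 *	};
 *
 *	struct foo_master *info = nfct_help_data(ct);	// typed view
 */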
list_del(&ct_pptp_info->keymap[dir]->list); + kfree(ct_pptp_info->keymap[dir]); + ct_pptp_info->keymap[dir] = NULL; } } write_unlock_bh(&net_gre->keymap_lock); @@ -227,13 +242,19 @@ static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) (ct->proto.gre.stream_timeout / HZ)); } +static unsigned int *gre_get_timeouts(struct net *net) +{ + return gre_pernet(net)->gre_timeouts; +} + /* Returns verdict for packet, and may modify conntrack */ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum) + unsigned int hooknum, + unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ @@ -252,15 +273,15 @@ static int gre_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { pr_debug(": "); nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); /* initialize to sane value. Ideally a conntrack helper * (e.g. in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; - ct->proto.gre.timeout = GRE_TIMEOUT; + ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; + ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; return true; } @@ -278,6 +299,68 @@ static void gre_destroy(struct nf_conn *ct) nf_ct_gre_keymap_destroy(master); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct netns_proto_gre *net_gre = gre_pernet(net); + + /* set default timeouts for GRE. 
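 * These per-net defaults supersede the compile-time GRE_TIMEOUT and
 * GRE_STREAM_TIMEOUT constants removed above: gre_init_net() copies
 * the static gre_timeouts[] template into each namespace, and the
 * packet path receives the table through ->get_timeouts() rather than
 * literals; gre_new() then seeds the per-connection values from it:
 *
 *	ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
 *	ct->proto.gre.timeout        = timeouts[GRE_CT_UNREPLIED];
 *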
*/ + timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; + timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { + timeouts[GRE_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_GRE_REPLIED]) { + timeouts[GRE_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ; + } + return 0; +} + +static int +gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED, + htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED, + htonl(timeouts[GRE_CT_REPLIED] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { + [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +static int gre_init_net(struct net *net, u_int16_t proto) +{ + struct netns_proto_gre *net_gre = gre_pernet(net); + int i; + + rwlock_init(&net_gre->keymap_lock); + INIT_LIST_HEAD(&net_gre->keymap_list); + for (i = 0; i < GRE_CT_MAX; i++) + net_gre->gre_timeouts[i] = gre_timeouts[i]; + + return 0; +} + /* protocol helper struct */ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .l3proto = AF_INET, @@ -287,6 +370,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .invert_tuple = gre_invert_tuple, .print_tuple = gre_print_tuple, .print_conntrack = gre_print_conntrack, + .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, .destroy = gre_destroy, @@ -297,20 +381,31 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = gre_timeout_nlattr_to_obj, + .obj_to_nlattr = gre_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_GRE_MAX, + .obj_size = sizeof(unsigned int) * GRE_CT_MAX, + .nla_policy = gre_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &proto_gre_net_id, + .init_net = gre_init_net, }; static int proto_gre_net_init(struct net *net) { - struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); - - rwlock_init(&net_gre->keymap_lock); - INIT_LIST_HEAD(&net_gre->keymap_list); - - return 0; + int ret = 0; + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4); + if (ret < 0) + pr_err("nf_conntrack_gre4: pernet registration failed.\n"); + return ret; } static void proto_gre_net_exit(struct net *net) { + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4); nf_ct_gre_keymap_flush(net); } @@ -323,20 +418,26 @@ static struct pernet_operations proto_gre_net_ops = { static int __init nf_ct_proto_gre_init(void) { - int rv; - - rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); - if (rv < 0) - return rv; - rv = register_pernet_subsys(&proto_gre_net_ops); - if (rv < 0) - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); - return rv; + int ret; + + ret = register_pernet_subsys(&proto_gre_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); + if (ret < 0) + goto out_gre4; + + return 0; +out_gre4: + unregister_pernet_subsys(&proto_gre_net_ops); 
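/*
 * The init order is deliberate: the pernet subsystem is registered
 * before the l4proto, because once nf_ct_l4proto_register() succeeds
 * packets may hit the tracker in any namespace, and the per-net keymap
 * list and timeout table must already exist by then. The unwind runs
 * in reverse, and module exit mirrors it (sketch of the pairing):
 *
 *	init:	register_pernet_subsys();    nf_ct_l4proto_register();
 *	exit:	nf_ct_l4proto_unregister();  unregister_pernet_subsys();
 */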
+out_pernet: + return ret; } static void __exit nf_ct_proto_gre_fini(void) { - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4); unregister_pernet_subsys(&proto_gre_net_ops); } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index afa69136061..1314d33f6bc 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -1,6 +1,9 @@ /* * Connection tracking protocol helper module for SCTP. * + * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> + * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> + * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. * @@ -127,6 +130,17 @@ static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { } }; +static int sctp_net_id __read_mostly; +struct sctp_net { + struct nf_proto_net pn; + unsigned int timeouts[SCTP_CONNTRACK_MAX]; +}; + +static inline struct sctp_net *sctp_pernet(struct net *net) +{ + return net_generic(net, sctp_net_id); +} + static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { @@ -279,13 +293,19 @@ static int sctp_new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } +static unsigned int *sctp_get_timeouts(struct net *net) +{ + return sctp_pernet(net)->timeouts; +} + /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum) + unsigned int hooknum, + unsigned int *timeouts) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -370,7 +390,7 @@ static int sctp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); - nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && dir == IP_CT_DIR_REPLY && @@ -390,7 +410,7 @@ out: /* Called when a new connection for this protocol found. */ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { enum sctp_conntrack new_state; const struct sctphdr *sh; @@ -476,15 +496,12 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - NLA_PUT_U8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state); - - NLA_PUT_BE32(skb, - CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, - ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]); - - NLA_PUT_BE32(skb, - CTA_PROTOINFO_SCTP_VTAG_REPLY, - ct->proto.sctp.vtag[IP_CT_DIR_REPLY]); + if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) || + nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || + nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, + ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) + goto nla_put_failure; spin_unlock_bh(&ct->lock); @@ -543,55 +560,100 @@ static int sctp_nlattr_size(void) } #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct sctp_net *sn = sctp_pernet(net); + int i; + + /* set default SCTP timeouts. 
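 * As with DCCP above, attributes and states are aligned one-to-one,
 * so a policy may override any subset of the tracked SCTP states;
 * slots without an attribute keep the per-net defaults copied in
 * first. E.g. a policy carrying only CTA_TIMEOUT_SCTP_ESTABLISHED
 * effectively yields (sketch):
 *
 *	timeouts[SCTP_CONNTRACK_ESTABLISHED] = ntohl(given) * HZ;
 *	// every other timeouts[i] stays at sn->timeouts[i]
 *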
*/ + for (i=0; i<SCTP_CONNTRACK_MAX; i++) + timeouts[i] = sn->timeouts[i]; + + /* there's a 1:1 mapping between attributes and protocol states. */ + for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { + if (tb[i]) { + timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; + } + } + return 0; +} + +static int +sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + int i; + + for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { + if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) + goto nla_put_failure; + } + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { + [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + + #ifdef CONFIG_SYSCTL -static unsigned int sctp_sysctl_table_users; -static struct ctl_table_header *sctp_sysctl_header; static struct ctl_table sctp_sysctl_table[] = { { .procname = "nf_conntrack_sctp_timeout_closed", - .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_cookie_wait", - .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_cookie_echoed", - .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_established", - .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_shutdown_sent", - .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_shutdown_recd", - .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -603,49 +665,42 @@ static struct ctl_table sctp_sysctl_table[] = { static struct ctl_table sctp_compat_sysctl_table[] = { { .procname = "ip_conntrack_sctp_timeout_closed", - .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_established", - .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], 
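/*
 * The static .data pointers disappear from these templates on purpose:
 * each namespace kmemdup()s the table and re-points every entry at its
 * own sctp_net timeouts, so a global .data would be shared across
 * namespaces. The fixup pattern, as in sctp_kmemdup_sysctl_table()
 * further down (sketch):
 *
 *	pn->ctl_table = kmemdup(sctp_sysctl_table,
 *				sizeof(sctp_sysctl_table), GFP_KERNEL);
 *	if (pn->ctl_table == NULL)
 *		return -ENOMEM;
 *	pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
 *	// ... one assignment per row ...
 */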
.maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -655,6 +710,80 @@ static struct ctl_table sctp_compat_sysctl_table[] = { #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif +static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct sctp_net *sn) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(sctp_sysctl_table, + sizeof(sctp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; + pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; + pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; + pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; + pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; + pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; + pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; +#endif + return 0; +} + +static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct sctp_net *sn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table, + sizeof(sctp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; + pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; + pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; + pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; + pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; + pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; + pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; +#endif +#endif + return 0; +} + +static int sctp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct sctp_net *sn = sctp_pernet(net); + struct nf_proto_net *pn = &sn->pn; + + if (!pn->users) { + int i; + + for (i = 0; i < SCTP_CONNTRACK_MAX; i++) + sn->timeouts[i] = sctp_timeouts[i]; + } + + if (proto == AF_INET) { + ret = sctp_kmemdup_compat_sysctl_table(pn, sn); + if (ret < 0) + return ret; + + ret = sctp_kmemdup_sysctl_table(pn, sn); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + } else + ret = sctp_kmemdup_sysctl_table(pn, sn); + + return ret; +} + static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, @@ -664,6 +793,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .print_tuple = sctp_print_tuple, .print_conntrack = sctp_print_conntrack, .packet = sctp_packet, + .get_timeouts = sctp_get_timeouts, .new = sctp_new, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -675,14 +805,17 @@ static struct nf_conntrack_l4proto 
nf_conntrack_l4proto_sctp4 __read_mostly = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &sctp_sysctl_table_users, - .ctl_table_header = &sctp_sysctl_header, - .ctl_table = sctp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = sctp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = sctp_timeout_nlattr_to_obj, + .obj_to_nlattr = sctp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_SCTP_MAX, + .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, + .nla_policy = sctp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &sctp_net_id, + .init_net = sctp_init_net, }; static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { @@ -694,6 +827,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .print_tuple = sctp_print_tuple, .print_conntrack = sctp_print_conntrack, .packet = sctp_packet, + .get_timeouts = sctp_get_timeouts, .new = sctp_new, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -704,41 +838,85 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = sctp_timeout_nlattr_to_obj, + .obj_to_nlattr = sctp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_SCTP_MAX, + .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, + .nla_policy = sctp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &sctp_sysctl_table_users, - .ctl_table_header = &sctp_sysctl_header, - .ctl_table = sctp_sysctl_table, -#endif + .net_id = &sctp_net_id, + .init_net = sctp_init_net, }; -static int __init nf_conntrack_proto_sctp_init(void) +static int sctp_net_init(struct net *net) { - int ret; + int ret = 0; - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4); - if (ret) { - pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n"); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4); + if (ret < 0) { + pr_err("nf_conntrack_sctp4: pernet registration failed.\n"); goto out; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6); - if (ret) { - pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n"); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6); + if (ret < 0) { + pr_err("nf_conntrack_sctp6: pernet registration failed.\n"); goto cleanup_sctp4; } + return 0; +cleanup_sctp4: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); +out: return ret; +} + +static void sctp_net_exit(struct net *net) +{ + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); +} + +static struct pernet_operations sctp_net_ops = { + .init = sctp_net_init, + .exit = sctp_net_exit, + .id = &sctp_net_id, + .size = sizeof(struct sctp_net), +}; - cleanup_sctp4: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); - out: +static int __init nf_conntrack_proto_sctp_init(void) +{ + int ret; + + ret = register_pernet_subsys(&sctp_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4); + if (ret < 0) + goto out_sctp4; + + ret = 
nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6); + if (ret < 0) + goto out_sctp6; + + return 0; +out_sctp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +out_sctp4: + unregister_pernet_subsys(&sctp_net_ops); +out_pernet: return ret; } static void __exit nf_conntrack_proto_sctp_fini(void) { - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); + unregister_pernet_subsys(&sctp_net_ops); } module_init(nf_conntrack_proto_sctp_init); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 97b9f3ebf28..44d1ea32570 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,5 +1,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -25,6 +27,8 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> @@ -64,13 +68,7 @@ static const char *const tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -/* RFC1122 says the R2 limit should be at least 100 seconds. - Linux uses 15 packets as limit, which corresponds - to ~13-30min depending on RTO. */ -static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; -static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly = 5 MINS; - -static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { +static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, @@ -80,6 +78,11 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ + [TCP_CONNTRACK_RETRANS] = 5 MINS, + [TCP_CONNTRACK_UNACK] = 5 MINS, }; #define sNO TCP_CONNTRACK_NONE @@ -159,21 +162,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ -/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open - * sSR -> sIG - * sES -> sIG Error: SYNs in window outside the SYN_SENT state - * are errors. Receiver will reply with RST - * and close the connection. - * Or we are not in sync and hold a dead connection. 
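Before the per-state annotations resume below, the rewritten synack row is easier to read against how the table is consumed: tcp_conntracks is indexed by direction, flag-derived segment index, and current state, and each cell names the successor state or an sIV/sIG verdict. A runnable slice of that lookup, reduced to four states and the original direction only (the kernel table carries a leading direction dimension this sketch drops):

#include <stdio.h>

/* a four-state slice of the conntrack state list, plus the two verdicts */
enum { sNO, sSS, sSR, sES, STATE_N, sIV = 0xfe, sIG = 0xff };
enum { IDX_SYN, IDX_SYNACK, IDX_N };

static const unsigned char next_state[IDX_N][STATE_N] = {
    [IDX_SYN]    = { [sNO] = sSS, [sSS] = sSS, [sSR] = sIG, [sES] = sIG },
    /* the patched row: SYN/ACK is invalid everywhere except when it is
     * a retransmission seen while already in SYN_RECV */
    [IDX_SYNACK] = { [sNO] = sIV, [sSS] = sIV, [sSR] = sSR, [sES] = sIV },
};

int main(void)
{
    /* a late retransmitted SYN/ACK in simultaneous open stays in sSR
     * instead of being ignored (sIG), matching the hunk above */
    printf("%u\n", (unsigned)next_state[IDX_SYNACK][sSR]);  /* 2 == sSR */
    return 0;
}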
- * sFW -> sIG - * sCW -> sIG - * sLA -> sIG - * sTW -> sIG - * sCL -> sIG + * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open + * sES -> sIV Invalid SYN/ACK packets sent by the client + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV + * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, @@ -271,6 +271,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { } }; +static inline struct nf_tcp_net *tcp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.tcp; +} + static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { @@ -492,21 +497,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -#ifdef CONFIG_NF_NAT_NEEDED -static inline s16 nat_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); - - return get_offset != NULL ? get_offset(ct, dir, seq) : 0; -} -#define NAT_OFFSET(pf, ct, dir, seq) \ - (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0) -#else -#define NAT_OFFSET(pf, ct, dir, seq) 0 -#endif - static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp *state, enum ip_conntrack_dir dir, @@ -517,12 +507,13 @@ static bool tcp_in_window(const struct nf_conn *ct, u_int8_t pf) { struct net *net = nf_ct_net(ct); + struct nf_tcp_net *tn = tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - s16 receiver_offset; - bool res; + s32 receiver_offset; + bool res, in_recv_win; /* * Get the required data from the packet. @@ -536,7 +527,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ - receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1); + receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; @@ -585,8 +576,8 @@ static bool tcp_in_window(const struct nf_conn *ct, * Let's try to use the data from the packet. */ sender->td_end = end; - win <<= sender->td_scale; - sender->td_maxwin = (win == 0 ? 1 : win); + swin = win << sender->td_scale; + sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; /* * We haven't seen traffic in the other direction yet @@ -628,15 +619,9 @@ static bool tcp_in_window(const struct nf_conn *ct, ack = sack = receiver->td_end; } - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* - * Packets contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN. + * RST sent answering SYN. */ seq = end = sender->td_end; @@ -651,14 +636,18 @@ static bool tcp_in_window(const struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); + /* Is the ending sequence in the receive window (if available)? 
*/ + in_recv_win = !receiver->td_maxwin || + after(end, sender->td_end - receiver->td_maxwin - 1); + pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", before(seq, sender->td_maxend + 1), - after(end, sender->td_end - receiver->td_maxwin - 1), + (in_recv_win ? 1 : 0), before(sack, receiver->td_end + 1), after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && - after(end, sender->td_end - receiver->td_maxwin - 1) && + in_recv_win && before(sack, receiver->td_end + 1) && after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* @@ -721,13 +710,13 @@ static bool tcp_in_window(const struct nf_conn *ct, } else { res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - nf_ct_tcp_be_liberal) + tn->tcp_be_liberal) res = true; if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? - after(end, sender->td_end - receiver->td_maxwin - 1) ? + in_recv_win ? before(sack, receiver->td_end + 1) ? after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" @@ -776,7 +765,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -784,7 +773,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -797,7 +786,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -806,7 +795,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -814,15 +803,22 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } +static unsigned int *tcp_get_timeouts(struct net *net) +{ + return tcp_pernet(net)->timeouts; +} + /* Returns verdict for packet, or -1 for invalid. 
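Before the verdict function whose comment closes below, the in_recv_win factoring above deserves a standalone reading: the identical two-part test (no receiver window observed yet, or the segment end still inside it) previously appeared twice, once in the debug print and once in the acceptance check. A sketch with the sequence-space compares re-implemented locally, since before()/after() are kernel-internal:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* simplified re-implementations of the kernel's modular seq compares */
static bool seq_before(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) < 0;
}
#define seq_after(a, b) seq_before((b), (a))

static bool in_recv_win(uint32_t end, uint32_t sender_td_end,
                        uint32_t receiver_td_maxwin)
{
    /* no window information from the receiver yet: stay permissive */
    return !receiver_td_maxwin ||
           seq_after(end, sender_td_end - receiver_td_maxwin - 1);
}

int main(void)
{
    printf("%d %d\n",
           in_recv_win(1000, 1000, 0),     /* no window seen: 1 */
           in_recv_win(100, 5000, 1024));  /* far outside:    0 */
    return 0;
}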
*/ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum) + unsigned int hooknum, + unsigned int *timeouts) { struct net *net = nf_ct_net(ct); + struct nf_tcp_net *tn = tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; @@ -946,16 +942,32 @@ static int tcp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packet ignored "); + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packet ignored in " + "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: + /* Special case for SYN proxy: when the SYN to the server or + * the SYN/ACK from the server is lost, the client may transmit + * a keep-alive packet while in SYN_SENT state. This needs to + * be associated with the original conntrack entry in order to + * generate a new SYN with the correct sequence number. + */ + if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { + pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + } + /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: @@ -965,8 +977,8 @@ static int tcp_packet(struct nf_conn *ct, /* Invalid RST */ spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid RST "); + nf_log_packet(net, pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid RST "); return -NF_ACCEPT; } if (index == TCP_RST_SET @@ -1014,15 +1026,15 @@ static int tcp_packet(struct nf_conn *ct, && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && - tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans) - timeout = nf_ct_tcp_timeout_max_retrans; + if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && - tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged) - timeout = nf_ct_tcp_timeout_unacknowledged; + timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) + timeout = timeouts[TCP_CONNTRACK_UNACK]; else - timeout = tcp_timeouts[new_state]; + timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); if (new_state != old_state) @@ -1037,6 +1049,12 @@ static int tcp_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection + * pickup with loose=1. Avoid large ESTABLISHED timeout. 
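The new TCP_CONNTRACK_MAX branch earlier in this hunk encodes a very specific fingerprint: a keep-alive is an original-direction ACK whose sequence number sits exactly one below the last byte recorded for that direction. A boiled-down predicate (the struct and every name here are local stand-ins, not conntrack API):

#include <stdbool.h>
#include <stdint.h>

struct tcp_side { uint32_t td_end; };  /* stand-in for ip_ct_tcp_state */

/* mirrors the condition above: SYN proxy attached, still in SYN_SENT,
 * ACK set, both this packet and the previous one original-direction,
 * and seq == td_end - 1, the classic keep-alive probe */
static bool synproxy_keepalive(bool has_synproxy, bool in_syn_sent,
                               bool ack_set, bool dir_original,
                               bool last_dir_original,
                               const struct tcp_side *side, uint32_t seq)
{
    return has_synproxy && in_syn_sent && ack_set &&
           dir_original && last_dir_original &&
           side->td_end - 1 == seq;
}

int main(void)
{
    struct tcp_side s = { .td_end = 1001 };

    return !synproxy_keepalive(true, true, true, true, true, &s, 1000);
}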
+ */ + if (new_state == TCP_CONNTRACK_ESTABLISHED && + timeout > timeouts[TCP_CONNTRACK_UNACK]) + timeout = timeouts[TCP_CONNTRACK_UNACK]; } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) @@ -1054,11 +1072,13 @@ static int tcp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { enum tcp_conntrack new_state; const struct tcphdr *th; struct tcphdr _tcph; + struct net *net = nf_ct_net(ct); + struct nf_tcp_net *tn = tcp_pernet(net); const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; @@ -1087,7 +1107,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); - } else if (nf_ct_tcp_loose == 0) { + } else if (tn->tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { @@ -1142,21 +1162,22 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state); - - NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, - ct->proto.tcp.seen[0].td_scale); - - NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, - ct->proto.tcp.seen[1].td_scale); + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || + nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + ct->proto.tcp.seen[0].td_scale) || + nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, + ct->proto.tcp.seen[1].td_scale)) + goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[0].flags; - NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, - sizeof(struct nf_ct_tcp_flags), &tmp); + if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, + sizeof(struct nf_ct_tcp_flags), &tmp)) + goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[1].flags; - NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, - sizeof(struct nf_ct_tcp_flags), &tmp); + if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, + sizeof(struct nf_ct_tcp_flags), &tmp)) + goto nla_put_failure; spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); @@ -1239,97 +1260,194 @@ static int tcp_nlattr_tuple_size(void) } #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct nf_tcp_net *tn = tcp_pernet(net); + int i; + + /* set default TCP timeouts. 
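The function this comment opens (it continues just below) follows the same merge rule as the SCTP and UDP converters: start from the namespace's current defaults, then overwrite only the states whose attribute userspace actually supplied, so a policy can change one timeout without restating the other ten. The merge in isolation, with plain ints standing in for the nlattr table:

#include <stdio.h>

#define N_STATES 4  /* stand-in for TCP_CONNTRACK_TIMEOUT_MAX */

/* attrs[i] < 0 plays the role of "tb[i] == NULL": not supplied */
static void merge_timeouts(unsigned int out[N_STATES],
                           const unsigned int defaults[N_STATES],
                           const int attrs[N_STATES])
{
    for (int i = 0; i < N_STATES; i++)
        out[i] = attrs[i] >= 0 ? (unsigned int)attrs[i] : defaults[i];
}

int main(void)
{
    const unsigned int defs[N_STATES] = { 120, 60, 432000, 120 };  /* 5 days = 432000 s */
    const int attrs[N_STATES] = { -1, -1, 3600, -1 };  /* only ESTABLISHED set */
    unsigned int out[N_STATES];

    merge_timeouts(out, defs, attrs);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
    return 0;
}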
*/ + for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) + timeouts[i] = tn->timeouts[i]; + + if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { + timeouts[TCP_CONNTRACK_SYN_SENT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { + timeouts[TCP_CONNTRACK_SYN_RECV] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { + timeouts[TCP_CONNTRACK_ESTABLISHED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { + timeouts[TCP_CONNTRACK_FIN_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { + timeouts[TCP_CONNTRACK_CLOSE_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { + timeouts[TCP_CONNTRACK_LAST_ACK] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { + timeouts[TCP_CONNTRACK_TIME_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_CLOSE]) { + timeouts[TCP_CONNTRACK_CLOSE] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { + timeouts[TCP_CONNTRACK_SYN_SENT2] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_RETRANS]) { + timeouts[TCP_CONNTRACK_RETRANS] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_UNACK]) { + timeouts[TCP_CONNTRACK_UNACK] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; + } + return 0; +} + +static int +tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, + htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, + htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, + htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, + htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, + htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, + htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, + htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, + htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, + htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, + htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, + htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { + [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT 
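The dump path above uses the idiom this file converts to throughout: chain every put with ||, branch to a single failure label, and return -ENOSPC so the caller knows the message buffer, not the data, was the problem. The shape of it, modeled with a put that can run out of room:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct buf { size_t len, cap; };

/* stand-in for nla_put_be32(): fails once the message buffer is full */
static int put_u32(struct buf *b, uint32_t v)
{
    (void)v;
    if (b->len + sizeof(v) > b->cap)
        return -1;
    b->len += sizeof(v);
    return 0;
}

static int dump_timeouts(struct buf *b, const uint32_t *t, int n)
{
    for (int i = 0; i < n; i++)
        if (put_u32(b, t[i]))
            goto nla_put_failure;
    return 0;

nla_put_failure:
    return -ENOSPC;  /* caller is expected to grow the buffer and retry */
}

int main(void)
{
    const uint32_t t[4] = { 1, 2, 3, 4 };
    struct buf small = { 0, 8 };  /* room for two of the four values */

    printf("%d\n", dump_timeouts(&small, t, 4));  /* -ENOSPC */
    return 0;
}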
*/ + #ifdef CONFIG_SYSCTL -static unsigned int tcp_sysctl_table_users; -static struct ctl_table_header *tcp_sysctl_header; static struct ctl_table tcp_sysctl_table[] = { { .procname = "nf_conntrack_tcp_timeout_syn_sent", - .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_syn_recv", - .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_established", - .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_fin_wait", - .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close_wait", - .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_last_ack", - .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_time_wait", - .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close", - .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_max_retrans", - .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_unacknowledged", - .data = &nf_ct_tcp_timeout_unacknowledged, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_loose", - .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nf_conntrack_tcp_be_liberal", - .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nf_conntrack_tcp_max_retrans", - .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1341,91 +1459,78 @@ static struct ctl_table tcp_sysctl_table[] = { static struct ctl_table tcp_compat_sysctl_table[] = { { .procname = "ip_conntrack_tcp_timeout_syn_sent", - .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_syn_sent2", - .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_syn_recv", - .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_established", - .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_fin_wait", - .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, 
.proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close_wait", - .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_last_ack", - .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_time_wait", - .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close", - .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_max_retrans", - .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_loose", - .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_tcp_be_liberal", - .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_tcp_max_retrans", - .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1435,6 +1540,101 @@ static struct ctl_table tcp_compat_sysctl_table[] = { #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ +static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_tcp_net *tn) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(tcp_sysctl_table, + sizeof(tcp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; + pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; + pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; + pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; + pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; + pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; + pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; + pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; + pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK]; + pn->ctl_table[10].data = &tn->tcp_loose; + pn->ctl_table[11].data = &tn->tcp_be_liberal; + pn->ctl_table[12].data = &tn->tcp_max_retrans; +#endif + return 0; +} + +static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_tcp_net *tn) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, + sizeof(tcp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; + pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2]; + pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; + pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; + pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; + pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; + pn->ctl_compat_table[7].data = 
&tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; + pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; + pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; + pn->ctl_compat_table[10].data = &tn->tcp_loose; + pn->ctl_compat_table[11].data = &tn->tcp_be_liberal; + pn->ctl_compat_table[12].data = &tn->tcp_max_retrans; +#endif +#endif + return 0; +} + +static int tcp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_proto_net *pn = &tn->pn; + + if (!pn->users) { + int i; + + for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) + tn->timeouts[i] = tcp_timeouts[i]; + + tn->tcp_loose = nf_ct_tcp_loose; + tn->tcp_be_liberal = nf_ct_tcp_be_liberal; + tn->tcp_max_retrans = nf_ct_tcp_max_retrans; + } + + if (proto == AF_INET) { + ret = tcp_kmemdup_compat_sysctl_table(pn, tn); + if (ret < 0) + return ret; + + ret = tcp_kmemdup_sysctl_table(pn, tn); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + } else + ret = tcp_kmemdup_sysctl_table(pn, tn); + + return ret; +} + +static struct nf_proto_net *tcp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.tcp.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = { .l3proto = PF_INET, @@ -1445,6 +1645,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, + .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -1456,14 +1657,18 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &tcp_sysctl_table_users, - .ctl_table_header = &tcp_sysctl_header, - .ctl_table = tcp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = tcp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = tcp_timeout_nlattr_to_obj, + .obj_to_nlattr = tcp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_TCP_MAX, + .obj_size = sizeof(unsigned int) * + TCP_CONNTRACK_TIMEOUT_MAX, + .nla_policy = tcp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = tcp_init_net, + .get_net_proto = tcp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); @@ -1477,6 +1682,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, + .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -1488,10 +1694,17 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &tcp_sysctl_table_users, - .ctl_table_header = &tcp_sysctl_header, - .ctl_table = tcp_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = tcp_timeout_nlattr_to_obj, + .obj_to_nlattr = tcp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_TCP_MAX, + .obj_size = sizeof(unsigned int) * + TCP_CONNTRACK_TIMEOUT_MAX, + .nla_policy = tcp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = tcp_init_net, + .get_net_proto = tcp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git 
a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 5f35757fbff..9d7721cbce4 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -25,8 +26,15 @@ #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ; -static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ; +static unsigned int udp_timeouts[UDP_CT_MAX] = { + [UDP_CT_UNREPLIED] = 30*HZ, + [UDP_CT_REPLIED] = 180*HZ, +}; + +static inline struct nf_udp_net *udp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.udp; +} static bool udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -63,30 +71,38 @@ static int udp_print_tuple(struct seq_file *s, ntohs(tuple->dst.u.udp.port)); } +static unsigned int *udp_get_timeouts(struct net *net) +{ + return udp_pernet(net)->timeouts; +} + /* Returns verdict for packet, and may modify conntracktype */ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum) + unsigned int hooknum, + unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_REPLIED]); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); - } else - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); - + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_UNREPLIED]); + } return NF_ACCEPT; } /* Called when a new connection for this protocol found. 
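udp_packet above is the entire UDP state machine: a reply promotes the flow to a stream, which both selects the longer timeout (180 s instead of 30 s) and sets ASSURED exactly once so the event cache fires a single time. A compressed model of that logic:

#include <stdbool.h>
#include <stdio.h>

enum { UNREPLIED_T = 30, REPLIED_T = 180 };  /* seconds, as in the patch */

struct flow { bool seen_reply; bool assured; };

static unsigned int udp_refresh(struct flow *f, bool *emit_assured_event)
{
    *emit_assured_event = false;
    if (f->seen_reply) {
        if (!f->assured) {  /* test_and_set_bit() analogue */
            f->assured = true;
            *emit_assured_event = true;
        }
        return REPLIED_T;   /* some kind of UDP stream */
    }
    return UNREPLIED_T;
}

int main(void)
{
    struct flow f = { false, false };
    bool ev;

    udp_refresh(&f, &ev);                         /* 30 s, no event */
    f.seen_reply = true;
    printf("%u %d\n", udp_refresh(&f, &ev), ev);  /* 180 1 */
    printf("%u %d\n", udp_refresh(&f, &ev), ev);  /* 180 0 */
    return 0;
}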
*/ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { return true; } @@ -104,7 +120,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: short packet "); return -NF_ACCEPT; } @@ -112,7 +128,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -128,7 +144,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } @@ -136,20 +152,65 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct nf_udp_net *un = udp_pernet(net); + + /* set default timeouts for UDP. 
*/ + timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; + timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { + timeouts[UDP_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_UDP_REPLIED]) { + timeouts[UDP_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; + } + return 0; +} + +static int +udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, + htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, + htonl(timeouts[UDP_CT_REPLIED] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { + [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static unsigned int udp_sysctl_table_users; -static struct ctl_table_header *udp_sysctl_header; static struct ctl_table udp_sysctl_table[] = { { .procname = "nf_conntrack_udp_timeout", - .data = &nf_ct_udp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_udp_timeout_stream", - .data = &nf_ct_udp_timeout_stream, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -160,14 +221,12 @@ static struct ctl_table udp_sysctl_table[] = { static struct ctl_table udp_compat_sysctl_table[] = { { .procname = "ip_conntrack_udp_timeout", - .data = &nf_ct_udp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_udp_timeout_stream", - .data = &nf_ct_udp_timeout_stream, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -177,6 +236,73 @@ static struct ctl_table udp_compat_sysctl_table[] = { #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ +static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_udp_net *un) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + pn->ctl_table = kmemdup(udp_sysctl_table, + sizeof(udp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; + pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED]; +#endif + return 0; +} + +static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_udp_net *un) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table, + sizeof(udp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; + pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED]; +#endif +#endif + return 0; +} + +static int udp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_udp_net *un = udp_pernet(net); + struct nf_proto_net *pn = &un->pn; + + if (!pn->users) { + int i; + + for (i = 0; i < UDP_CT_MAX; i++) + un->timeouts[i] = udp_timeouts[i]; + } + + if (proto == AF_INET) { + ret = udp_kmemdup_compat_sysctl_table(pn, un); + if (ret < 0) + return ret; + + ret = udp_kmemdup_sysctl_table(pn, un); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + } else + ret = udp_kmemdup_sysctl_table(pn, un); + + return ret; +} + 
+static struct nf_proto_net *udp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.udp.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = { .l3proto = PF_INET, @@ -186,6 +312,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, .packet = udp_packet, + .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -194,14 +321,17 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &udp_sysctl_table_users, - .ctl_table_header = &udp_sysctl_header, - .ctl_table = udp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = udp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = udp_init_net, + .get_net_proto = udp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); @@ -214,6 +344,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, .packet = udp_packet, + .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -222,10 +353,16 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &udp_sysctl_table_users, - .ctl_table_header = &udp_sysctl_header, - .ctl_table = udp_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = udp_init_net, + .get_net_proto = udp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index f52ca118101..2750e6c69f8 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -24,8 +24,27 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_udplite_timeout __read_mostly = 30*HZ; -static unsigned int nf_ct_udplite_timeout_stream __read_mostly = 180*HZ; +enum udplite_conntrack { + UDPLITE_CT_UNREPLIED, + UDPLITE_CT_REPLIED, + UDPLITE_CT_MAX +}; + +static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { + [UDPLITE_CT_UNREPLIED] = 30*HZ, + [UDPLITE_CT_REPLIED] = 180*HZ, +}; + +static int udplite_net_id __read_mostly; +struct udplite_net { + struct nf_proto_net pn; + unsigned int timeouts[UDPLITE_CT_MAX]; +}; + +static inline struct udplite_net *udplite_pernet(struct net *net) +{ + return net_generic(net, udplite_net_id); +} static bool udplite_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -60,31 +79,38 @@ static int udplite_print_tuple(struct 
seq_file *s, ntohs(tuple->dst.u.udp.port)); } +static unsigned int *udplite_get_timeouts(struct net *net) +{ + return udplite_pernet(net)->timeouts; +} + /* Returns verdict for packet, and may modify conntracktype */ static int udplite_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum) + unsigned int hooknum, + unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, - nf_ct_udplite_timeout_stream); + timeouts[UDPLITE_CT_REPLIED]); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); - } else - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout); - + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDPLITE_CT_UNREPLIED]); + } return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { return true; } @@ -105,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: short packet "); return -NF_ACCEPT; } @@ -115,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, cscov = udplen; else if (cscov < sizeof(*hdr) || cscov > udplen) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: invalid checksum coverage "); return -NF_ACCEPT; } @@ -123,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* UDPLITE mandates checksums */ if (!hdr->check) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: checksum missing "); return -NF_ACCEPT; } @@ -133,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: bad UDPLite checksum "); return -NF_ACCEPT; } @@ -141,20 +167,65 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeouts = data; + struct udplite_net *un = udplite_pernet(net); + + /* set default timeouts for UDPlite. 
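Unlike TCP and UDP, which own fixed slots in net->ct.nf_ct_proto, UDP-Lite (like SCTP earlier) parks its per-namespace state in the generic area: the pernet_operations .id/.size pair hands back an index, and udplite_pernet() fetches the blob with net_generic(). A userspace model of that id-to-slot indirection (bounds, names, and the allocation scheme are stand-ins):

#include <stdio.h>
#include <stdlib.h>

#define MAX_GEN 8

struct net { void *gen[MAX_GEN]; };  /* per-namespace generic pointers */

static int next_id;  /* handed out once per registered subsystem */

/* model of register_pernet_subsys() + .size: allocate the blob for one ns */
static int pernet_register(struct net *net, size_t size, int *id)
{
    if (*id == 0)
        *id = ++next_id;  /* the first registration fixes the id */
    if (*id > MAX_GEN)
        return -1;
    net->gen[*id - 1] = calloc(1, size);
    return net->gen[*id - 1] ? 0 : -1;
}

static void *net_generic_model(const struct net *net, int id)
{
    return net->gen[id - 1];
}

struct udplite_state { unsigned int timeouts[2]; };  /* cf. struct udplite_net */

int main(void)
{
    struct net netns = { { NULL } };
    static int udplite_id;  /* mirrors udplite_net_id */
    struct udplite_state *un;

    if (pernet_register(&netns, sizeof(*un), &udplite_id))
        return 1;
    un = net_generic_model(&netns, udplite_id);
    un->timeouts[0] = 30;
    printf("id=%d unreplied=%u\n", udplite_id, un->timeouts[0]);
    free(un);
    return 0;
}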
*/ + timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED]; + timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) { + timeouts[UDPLITE_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) { + timeouts[UDPLITE_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ; + } + return 0; +} + +static int +udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED, + htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) || + nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED, + htonl(timeouts[UDPLITE_CT_REPLIED] / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = { + [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static unsigned int udplite_sysctl_table_users; -static struct ctl_table_header *udplite_sysctl_header; static struct ctl_table udplite_sysctl_table[] = { { .procname = "nf_conntrack_udplite_timeout", - .data = &nf_ct_udplite_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_udplite_timeout_stream", - .data = &nf_ct_udplite_timeout_stream, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -163,6 +234,40 @@ static struct ctl_table udplite_sysctl_table[] = { }; #endif /* CONFIG_SYSCTL */ +static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct udplite_net *un) +{ +#ifdef CONFIG_SYSCTL + if (pn->ctl_table) + return 0; + + pn->ctl_table = kmemdup(udplite_sysctl_table, + sizeof(udplite_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED]; + pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED]; +#endif + return 0; +} + +static int udplite_init_net(struct net *net, u_int16_t proto) +{ + struct udplite_net *un = udplite_pernet(net); + struct nf_proto_net *pn = &un->pn; + + if (!pn->users) { + int i; + + for (i = 0 ; i < UDPLITE_CT_MAX; i++) + un->timeouts[i] = udplite_timeouts[i]; + } + + return udplite_kmemdup_sysctl_table(pn, un); +} + static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = { .l3proto = PF_INET, @@ -172,6 +277,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, .packet = udplite_packet, + .get_timeouts = udplite_get_timeouts, .new = udplite_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -180,11 +286,18 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &udplite_sysctl_table_users, - .ctl_table_header = &udplite_sysctl_header, - .ctl_table = udplite_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udplite_timeout_nlattr_to_obj, + .obj_to_nlattr = udplite_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, + .obj_size = sizeof(unsigned int) * + CTA_TIMEOUT_UDPLITE_MAX, + .nla_policy = 
udplite_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &udplite_net_id, + .init_net = udplite_init_net, }; static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = @@ -196,6 +309,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, .packet = udplite_packet, + .get_timeouts = udplite_get_timeouts, .new = udplite_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -204,34 +318,85 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &udplite_sysctl_table_users, - .ctl_table_header = &udplite_sysctl_header, - .ctl_table = udplite_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udplite_timeout_nlattr_to_obj, + .obj_to_nlattr = udplite_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, + .obj_size = sizeof(unsigned int) * + CTA_TIMEOUT_UDPLITE_MAX, + .nla_policy = udplite_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .net_id = &udplite_net_id, + .init_net = udplite_init_net, +}; + +static int udplite_net_init(struct net *net) +{ + int ret = 0; + + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4); + if (ret < 0) { + pr_err("nf_conntrack_udplite4: pernet registration failed.\n"); + goto out; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6); + if (ret < 0) { + pr_err("nf_conntrack_udplite6: pernet registration failed.\n"); + goto cleanup_udplite4; + } + return 0; + +cleanup_udplite4: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); +out: + return ret; +} + +static void udplite_net_exit(struct net *net) +{ + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); +} + +static struct pernet_operations udplite_net_ops = { + .init = udplite_net_init, + .exit = udplite_net_exit, + .id = &udplite_net_id, + .size = sizeof(struct udplite_net), }; static int __init nf_conntrack_proto_udplite_init(void) { - int err; + int ret; + + ret = register_pernet_subsys(&udplite_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4); + if (ret < 0) + goto out_udplite4; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6); + if (ret < 0) + goto out_udplite6; - err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4); - if (err < 0) - goto err1; - err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6); - if (err < 0) - goto err2; return 0; -err2: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); -err1: - return err; +out_udplite6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +out_udplite4: + unregister_pernet_subsys(&udplite_net_ops); +out_pernet: + return ret; } static void __exit nf_conntrack_proto_udplite_exit(void) { - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); + unregister_pernet_subsys(&udplite_net_ops); } module_init(nf_conntrack_proto_udplite_init); diff --git a/net/netfilter/nf_conntrack_sane.c 
b/net/netfilter/nf_conntrack_sane.c index 8501823b3f9..4a2134fd3fc 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -69,13 +69,12 @@ static int help(struct sk_buff *skb, void *sb_ptr; int ret = NF_ACCEPT; int dir = CTINFO2DIR(ctinfo); - struct nf_ct_sane_master *ct_sane_info; + struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple *tuple; struct sane_request *req; struct sane_reply_net_start *reply; - ct_sane_info = &nfct_help(ct)->help.ct_sane_info; /* Until there's been traffic both ways, don't look in packets. */ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) @@ -139,6 +138,7 @@ static int help(struct sk_buff *skb, exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -152,8 +152,10 @@ static int help(struct sk_buff *skb, nf_ct_dump_tuple(&exp->tuple); /* Can't expect this? Best to drop packet now. */ - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); @@ -163,7 +165,6 @@ out: } static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; -static char sane_names[MAX_PORTS][2][sizeof("sane-65535")] __read_mostly; static const struct nf_conntrack_expect_policy sane_exp_policy = { .max_expected = 1, @@ -190,7 +191,6 @@ static void nf_conntrack_sane_fini(void) static int __init nf_conntrack_sane_init(void) { int i, j = -1, ret = 0; - char *tmpname; sane_buffer = kmalloc(65536, GFP_KERNEL); if (!sane_buffer) @@ -205,17 +205,16 @@ static int __init nf_conntrack_sane_init(void) sane[i][0].tuple.src.l3num = PF_INET; sane[i][1].tuple.src.l3num = PF_INET6; for (j = 0; j < 2; j++) { + sane[i][j].data_len = sizeof(struct nf_ct_sane_master); sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); sane[i][j].tuple.dst.protonum = IPPROTO_TCP; sane[i][j].expect_policy = &sane_exp_policy; sane[i][j].me = THIS_MODULE; sane[i][j].help = help; - tmpname = &sane_names[i][j][0]; if (ports[i] == SANE_PORT) - sprintf(tmpname, "sane"); + sprintf(sane[i][j].name, "sane"); else - sprintf(tmpname, "sane-%d", ports[i]); - sane[i][j].name = tmpname; + sprintf(sane[i][j].name, "sane-%d", ports[i]); pr_debug("nf_ct_sane: registering helper for pf: %d " "port: %d\n", diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c new file mode 100644 index 00000000000..f6e2ae91a80 --- /dev/null +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -0,0 +1,243 @@ +#include <linux/types.h> +#include <linux/netfilter.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> + +int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_conn_seqadj *seqadj; + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + seqadj = nfct_seqadj(ct); + this_way = &seqadj->seq[dir]; + this_way->offset_before = off; + this_way->offset_after = off; + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); + +int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_seqadj *this_way; + + if (off == 
0) + return 0; + + if (unlikely(!seqadj)) { + WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); + return 0; + } + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + spin_lock_bh(&ct->lock); + this_way = &seqadj->seq[dir]; + if (this_way->offset_before == this_way->offset_after || + before(this_way->correction_pos, ntohl(seq))) { + this_way->correction_pos = ntohl(seq); + this_way->offset_before = this_way->offset_after; + this_way->offset_after += off; + } + spin_unlock_bh(&ct->lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); + +void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); + +/* Adjust one found SACK option including checksum correction */ +static void nf_ct_sack_block_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct nf_ct_seqadj *seq) +{ + while (sackoff < sackend) { + struct tcp_sack_block_wire *sack; + __be32 new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - seq->offset_before, + seq->correction_pos)) + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_after); + else + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_before); + + if (after(ntohl(sack->end_seq) - seq->offset_before, + seq->correction_pos)) + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_after); + else + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_before); + + pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *tcph, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + tcph->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK && + op[1] >= 2+TCPOLEN_SACK_PERBLOCK && + ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + nf_ct_sack_block_adjust(skb, tcph, optoff + 2, + optoff+op[1], + &seqadj->seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. 
Returns 1 on success, 0 on failure */ +int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct tcphdr *tcph; + __be32 newseq, newack; + s32 seqoff, ackoff; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way, *other_way; + int res; + + this_way = &seqadj->seq[dir]; + other_way = &seqadj->seq[!dir]; + + if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + return 0; + + tcph = (void *)skb->data + protoff; + spin_lock_bh(&ct->lock); + if (after(ntohl(tcph->seq), this_way->correction_pos)) + seqoff = this_way->offset_after; + else + seqoff = this_way->offset_before; + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + ackoff = other_way->offset_after; + else + ackoff = other_way->offset_before; + + newseq = htonl(ntohl(tcph->seq) + seqoff); + newack = htonl(ntohl(tcph->ack_seq) - ackoff); + + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + + pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); + spin_unlock_bh(&ct->lock); + + return res; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); + +s32 nf_ct_seq_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way; + + if (!seqadj) + return 0; + + this_way = &seqadj->seq[dir]; + return after(seq, this_way->correction_pos) ? + this_way->offset_after : this_way->offset_before; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_offset); + +static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { + .len = sizeof(struct nf_conn_seqadj), + .align = __alignof__(struct nf_conn_seqadj), + .id = NF_CT_EXT_SEQADJ, +}; + +int nf_conntrack_seqadj_init(void) +{ + return nf_ct_extend_register(&nf_ct_seqadj_extend); +} + +void nf_conntrack_seqadj_fini(void) +{ + nf_ct_extend_unregister(&nf_ct_seqadj_extend); +} diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 93faf6a3a63..4c3ba1c8d68 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -52,60 +52,8 @@ module_param(sip_direct_media, int, 0600); MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling " "endpoints only (default 1)"); -unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, - unsigned int *datalen) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_hook); - -void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, s16 off) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook); - -unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb, - unsigned int dataoff, - const char **dptr, - unsigned int *datalen, - struct nf_conntrack_expect *exp, - unsigned int matchoff, - unsigned int matchlen) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook); - -unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, - unsigned int *datalen, - unsigned int sdpoff, - enum sdp_header_types type, - enum sdp_header_types term, - const union nf_inet_addr *addr) - __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook); - -unsigned int (*nf_nat_sdp_port_hook)(struct 
sk_buff *skb, unsigned int dataoff, - const char **dptr, - unsigned int *datalen, - unsigned int matchoff, - unsigned int matchlen, - u_int16_t port) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook); - -unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb, - unsigned int dataoff, - const char **dptr, - unsigned int *datalen, - unsigned int sdpoff, - const union nf_inet_addr *addr) - __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook); - -unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int dataoff, - const char **dptr, - unsigned int *datalen, - struct nf_conntrack_expect *rtp_exp, - struct nf_conntrack_expect *rtcp_exp, - unsigned int mediaoff, - unsigned int medialen, - union nf_inet_addr *rtp_addr) - __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook); +const struct nf_nat_sip_hooks *nf_nat_sip_hooks; +EXPORT_SYMBOL_GPL(nf_nat_sip_hooks); static int string_len(const struct nf_conn *ct, const char *dptr, const char *limit, int *shift) @@ -183,12 +131,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr, return len + digits_len(ct, dptr, limit, shift); } -static int parse_addr(const struct nf_conn *ct, const char *cp, - const char **endp, union nf_inet_addr *addr, - const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int ret = 0; + int ret; if (!ct) return 0; @@ -197,16 +145,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; @@ -219,7 +179,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -296,7 +256,7 @@ int ct_sip_parse_request(const struct nf_conn *ct, return 0; dptr += shift; - if (!parse_addr(ct, dptr, &end, addr, limit)) + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; if (end < limit && *end == ':') { end++; @@ -550,7 +510,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (ret == 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; if (*c == ':') { c++; @@ -599,7 +559,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr) + union nf_inet_addr *addr, bool delim) { const char *limit = dptr + datalen; const char *start, *end; @@ -613,7 +573,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, return 0; start += strlen(name); - if (!parse_addr(ct, start, &end, addr, limit)) + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) return 0; *matchoff = start - dptr; *matchlen = end - start; @@ -675,6 +635,47 @@ static int 
ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, return 1; } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + /* SDP header parsing: a SDP session description contains an ordered set of * headers, starting with a section containing general session parameters, * optionally followed by multiple media descriptions. @@ -684,13 +685,18 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, * be tolerant and also accept records terminated with a single newline * character". We handle both cases. */ -static const struct sip_header ct_sdp_hdrs[] = { - [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), - [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), - [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), - [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), - [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), - [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), +static const struct sip_header ct_sdp_hdrs_v4[] = { + [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), + [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), +}; + +static const struct sip_header ct_sdp_hdrs_v6[] = { + [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), + [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), }; /* Linear string search within SDP header values */ @@ -716,11 +722,14 @@ int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr, enum sdp_header_types term, unsigned int *matchoff, unsigned int *matchlen) { - const struct sip_header *hdr = &ct_sdp_hdrs[type]; - const struct sip_header *thdr = &ct_sdp_hdrs[term]; + const struct sip_header *hdrs, *hdr, *thdr; const char *start = dptr, *limit = dptr + datalen; int shift = 0; + hdrs = nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
ct_sdp_hdrs_v4 : ct_sdp_hdrs_v6; + hdr = &hdrs[type]; + thdr = &hdrs[term]; + for (dptr += dataoff; dptr < limit; dptr++) { /* Find beginning of line */ if (*dptr != '\r' && *dptr != '\n') @@ -775,8 +784,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, if (ret <= 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, NULL, addr, - dptr + *matchoff + *matchlen)) + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) return -1; return 1; } @@ -788,11 +797,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct, { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; - struct hlist_node *n, *next; + struct hlist_node *next; int found = 0; - spin_lock_bh(&nf_conntrack_lock); - hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (exp->class != SIP_EXPECT_SIGNALLING || !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || exp->tuple.dst.protonum != proto || @@ -806,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct, found = 1; break; } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); return found; } @@ -814,10 +823,10 @@ static void flush_expectations(struct nf_conn *ct, bool media) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; - struct hlist_node *n, *next; + struct hlist_node *next; - spin_lock_bh(&nf_conntrack_lock); - hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) continue; if (!del_timer(&exp->timeout)) @@ -827,10 +836,11 @@ static void flush_expectations(struct nf_conn *ct, bool media) if (!media) break; } - spin_unlock_bh(&nf_conntrack_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } -static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, +static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, union nf_inet_addr *daddr, __be16 port, enum sip_expectation_classes class, @@ -846,8 +856,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, int direct_rtp = 0, skip_expect = 0, ret = NF_DROP; u_int16_t base_port; __be16 rtp_port, rtcp_port; - typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port; - typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media; + const struct nf_nat_sip_hooks *hooks; saddr = NULL; if (sip_direct_media) { @@ -886,34 +895,35 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, exp->class != class) break; #ifdef CONFIG_NF_NAT_NEEDED - if (exp->tuple.src.l3num == AF_INET && !direct_rtp && - (exp->saved_ip != exp->tuple.dst.u3.ip || + if (!direct_rtp && + (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) || exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) && ct->status & IPS_NAT_MASK) { - daddr->ip = exp->saved_ip; - tuple.dst.u3.ip = exp->saved_ip; + *daddr = exp->saved_addr; + tuple.dst.u3 = exp->saved_addr; tuple.dst.u.udp.port = exp->saved_proto.udp.port; direct_rtp = 1; } else #endif skip_expect = 1; } while (!skip_expect); - rcu_read_unlock(); base_port = ntohs(tuple.dst.u.udp.port) & ~1; rtp_port = htons(base_port); rtcp_port = htons(base_port + 1); if (direct_rtp) { - nf_nat_sdp_port = 
rcu_dereference(nf_nat_sdp_port_hook); - if (nf_nat_sdp_port && - !nf_nat_sdp_port(skb, dataoff, dptr, datalen, + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && + !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen, mediaoff, medialen, ntohs(rtp_port))) goto err1; } - if (skip_expect) + if (skip_expect) { + rcu_read_unlock(); return NF_ACCEPT; + } rtp_exp = nf_ct_expect_alloc(ct); if (rtp_exp == NULL) @@ -927,10 +937,10 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr, IPPROTO_UDP, NULL, &rtcp_port); - nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook); - if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp) - ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen, - rtp_exp, rtcp_exp, + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp) + ret = hooks->sdp_media(skb, protoff, dataoff, dptr, + datalen, rtp_exp, rtcp_exp, mediaoff, medialen, daddr); else { if (nf_ct_expect_related(rtp_exp) == 0) { @@ -944,6 +954,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, err2: nf_ct_expect_put(rtp_exp); err1: + rcu_read_unlock(); return ret; } @@ -970,7 +981,8 @@ static const struct sdp_media_type *sdp_media_type(const char *dptr, return NULL; } -static int process_sdp(struct sk_buff *skb, unsigned int dataoff, +static int process_sdp(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq) { @@ -982,16 +994,12 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff, unsigned int caddr_len, maddr_len; unsigned int i; union nf_inet_addr caddr, maddr, rtp_addr; + const struct nf_nat_sip_hooks *hooks; unsigned int port; - enum sdp_header_types c_hdr; const struct sdp_media_type *t; int ret = NF_ACCEPT; - typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr; - typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session; - nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook); - c_hdr = nf_ct_l3num(ct) == AF_INET ? SDP_HDR_CONNECTION_IP4 : - SDP_HDR_CONNECTION_IP6; + hooks = rcu_dereference(nf_nat_sip_hooks); /* Find beginning of session description */ if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, @@ -1005,7 +1013,7 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff, * the end of the session description. */ caddr_len = 0; if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen, - c_hdr, SDP_HDR_MEDIA, + SDP_HDR_CONNECTION, SDP_HDR_MEDIA, &matchoff, &matchlen, &caddr) > 0) caddr_len = matchlen; @@ -1029,111 +1037,129 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff, port = simple_strtoul(*dptr + mediaoff, NULL, 10); if (port == 0) continue; - if (port < 1024 || port > 65535) + if (port < 1024 || port > 65535) { + nf_ct_helper_log(skb, ct, "wrong port %u", port); return NF_DROP; + } /* The media description overrides the session description. 
*/ maddr_len = 0; if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen, - c_hdr, SDP_HDR_MEDIA, + SDP_HDR_CONNECTION, SDP_HDR_MEDIA, &matchoff, &matchlen, &maddr) > 0) { maddr_len = matchlen; memcpy(&rtp_addr, &maddr, sizeof(rtp_addr)); } else if (caddr_len) memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); - else + else { + nf_ct_helper_log(skb, ct, "cannot parse SDP message"); return NF_DROP; + } - ret = set_expected_rtp_rtcp(skb, dataoff, dptr, datalen, + ret = set_expected_rtp_rtcp(skb, protoff, dataoff, + dptr, datalen, &rtp_addr, htons(port), t->class, mediaoff, medialen); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, + "cannot add expectation for voice"); return ret; + } /* Update media connection address if present */ - if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) { - ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen, - mediaoff, c_hdr, SDP_HDR_MEDIA, + if (maddr_len && hooks && ct->status & IPS_NAT_MASK) { + ret = hooks->sdp_addr(skb, protoff, dataoff, + dptr, datalen, mediaoff, + SDP_HDR_CONNECTION, + SDP_HDR_MEDIA, &rtp_addr); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, "cannot mangle SDP"); return ret; + } } i++; } /* Update session connection and owner addresses */ - nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook); - if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK) - ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff, + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && ct->status & IPS_NAT_MASK) + ret = hooks->sdp_session(skb, protoff, dataoff, + dptr, datalen, sdpoff, &rtp_addr); return ret; } -static int process_invite_response(struct sk_buff *skb, unsigned int dataoff, +static int process_invite_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq, unsigned int code) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); if ((code >= 100 && code <= 199) || (code >= 200 && code <= 299)) - return process_sdp(skb, dataoff, dptr, datalen, cseq); - else if (help->help.ct_sip_info.invite_cseq == cseq) + return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + else if (ct_sip_info->invite_cseq == cseq) flush_expectations(ct, true); return NF_ACCEPT; } -static int process_update_response(struct sk_buff *skb, unsigned int dataoff, +static int process_update_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq, unsigned int code) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); if ((code >= 100 && code <= 199) || (code >= 200 && code <= 299)) - return process_sdp(skb, dataoff, dptr, datalen, cseq); - else if (help->help.ct_sip_info.invite_cseq == cseq) + return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + else if (ct_sip_info->invite_cseq == cseq) flush_expectations(ct, true); return NF_ACCEPT; } -static int process_prack_response(struct sk_buff *skb, unsigned int dataoff, +static int process_prack_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq, unsigned int code) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, 
&ctinfo); - struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); if ((code >= 100 && code <= 199) || (code >= 200 && code <= 299)) - return process_sdp(skb, dataoff, dptr, datalen, cseq); - else if (help->help.ct_sip_info.invite_cseq == cseq) + return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); + else if (ct_sip_info->invite_cseq == cseq) flush_expectations(ct, true); return NF_ACCEPT; } -static int process_invite_request(struct sk_buff *skb, unsigned int dataoff, +static int process_invite_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); unsigned int ret; flush_expectations(ct, true); - ret = process_sdp(skb, dataoff, dptr, datalen, cseq); + ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq); if (ret == NF_ACCEPT) - help->help.ct_sip_info.invite_cseq = cseq; + ct_sip_info->invite_cseq = cseq; return ret; } -static int process_bye_request(struct sk_buff *skb, unsigned int dataoff, +static int process_bye_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq) { @@ -1148,22 +1174,23 @@ static int process_bye_request(struct sk_buff *skb, unsigned int dataoff, * signalling connections. The expectation is marked inactive and is activated * when receiving a response indicating success from the registrar. */ -static int process_register_request(struct sk_buff *skb, unsigned int dataoff, +static int process_register_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int matchoff, matchlen; struct nf_conntrack_expect *exp; union nf_inet_addr *saddr, daddr; + const struct nf_nat_sip_hooks *hooks; __be16 port; u8 proto; unsigned int expires = 0; int ret; - typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect; /* Expected connections can not register again. 
*/ if (ct->status & IPS_EXPECTED) @@ -1184,9 +1211,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff, ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, SIP_HDR_CONTACT, NULL, &matchoff, &matchlen, &daddr, &port); - if (ret < 0) + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse contact"); return NF_DROP; - else if (ret == 0) + } else if (ret == 0) return NF_ACCEPT; /* We don't support third-party registrations */ @@ -1199,8 +1227,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff, if (ct_sip_parse_numerical_param(ct, *dptr, matchoff + matchlen, *datalen, - "expires=", NULL, NULL, &expires) < 0) + "expires=", NULL, NULL, &expires) < 0) { + nf_ct_helper_log(skb, ct, "cannot parse expires"); return NF_DROP; + } if (expires == 0) { ret = NF_ACCEPT; @@ -1208,8 +1238,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff, } exp = nf_ct_expect_alloc(ct); - if (!exp) + if (!exp) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); return NF_DROP; + } saddr = NULL; if (sip_direct_signalling) @@ -1221,31 +1253,33 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff, exp->helper = nfct_help(ct)->helper; exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; - nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook); - if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK) - ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp, - matchoff, matchlen); + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && ct->status & IPS_NAT_MASK) + ret = hooks->expect(skb, protoff, dataoff, dptr, datalen, + exp, matchoff, matchlen); else { - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - else + } else ret = NF_ACCEPT; } nf_ct_expect_put(exp); store_cseq: if (ret == NF_ACCEPT) - help->help.ct_sip_info.register_cseq = cseq; + ct_sip_info->register_cseq = cseq; return ret; } -static int process_register_response(struct sk_buff *skb, unsigned int dataoff, +static int process_register_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int cseq, unsigned int code) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); union nf_inet_addr addr; __be16 port; @@ -1262,7 +1296,7 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff, * responses, so we store the sequence number of the last valid * request and compare it here. 
*/ - if (help->help.ct_sip_info.register_cseq != cseq) + if (ct_sip_info->register_cseq != cseq) return NF_ACCEPT; if (code >= 100 && code <= 199) @@ -1281,9 +1315,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff, SIP_HDR_CONTACT, &in_contact, &matchoff, &matchlen, &addr, &port); - if (ret < 0) + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse contact"); return NF_DROP; - else if (ret == 0) + } else if (ret == 0) break; /* We don't support third-party registrations */ @@ -1298,8 +1333,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff, matchoff + matchlen, *datalen, "expires=", NULL, NULL, &c_expires); - if (ret < 0) + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse expires"); return NF_DROP; + } if (c_expires == 0) break; if (refresh_signalling_expectation(ct, &addr, proto, port, @@ -1321,7 +1358,8 @@ static const struct sip_handler sip_handlers[] = { SIP_HANDLER("REGISTER", process_register_request, process_register_response), }; -static int process_sip_response(struct sk_buff *skb, unsigned int dataoff, +static int process_sip_response(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen) { enum ip_conntrack_info ctinfo; @@ -1332,15 +1370,21 @@ static int process_sip_response(struct sk_buff *skb, unsigned int dataoff, if (*datalen < strlen("SIP/2.0 200")) return NF_ACCEPT; code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); - if (!code) + if (!code) { + nf_ct_helper_log(skb, ct, "cannot get code"); return NF_DROP; + } if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, - &matchoff, &matchlen) <= 0) + &matchoff, &matchlen) <= 0) { + nf_ct_helper_log(skb, ct, "cannot parse cseq"); return NF_DROP; + } cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq) + if (!cseq) { + nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; + } matchend = matchoff + matchlen + 1; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { @@ -1352,19 +1396,37 @@ static int process_sip_response(struct sk_buff *skb, unsigned int dataoff, if (*datalen < matchend + handler->len || strnicmp(*dptr + matchend, handler->method, handler->len)) continue; - return handler->response(skb, dataoff, dptr, datalen, + return handler->response(skb, protoff, dataoff, dptr, datalen, cseq, code); } return NF_ACCEPT; } -static int process_sip_request(struct sk_buff *skb, unsigned int dataoff, +static int process_sip_request(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, const char **dptr, unsigned int *datalen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int matchoff, matchlen; unsigned int cseq, i; + union nf_inet_addr addr; + __be16 port; + + /* Many Cisco IP phones use a high source port for SIP requests, but + * listen for the response on port 5060. If we are the local + * router for one of these phones, save the port number from the + * Via: header so that nf_nat_sip can redirect the responses to + * the correct port. 
+ */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_VIA_UDP, NULL, &matchoff, + &matchlen, &addr, &port) > 0 && + port != ct->tuplehash[dir].tuple.src.u.udp.port && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3)) + ct_sip_info->forced_dport = port; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { const struct sip_handler *handler; @@ -1377,33 +1439,41 @@ static int process_sip_request(struct sk_buff *skb, unsigned int dataoff, continue; if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, - &matchoff, &matchlen) <= 0) + &matchoff, &matchlen) <= 0) { + nf_ct_helper_log(skb, ct, "cannot parse cseq"); return NF_DROP; + } cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq) + if (!cseq) { + nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; + } - return handler->request(skb, dataoff, dptr, datalen, cseq); + return handler->request(skb, protoff, dataoff, dptr, datalen, + cseq); } return NF_ACCEPT; } static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct, - unsigned int dataoff, const char **dptr, - unsigned int *datalen) + unsigned int protoff, unsigned int dataoff, + const char **dptr, unsigned int *datalen) { - typeof(nf_nat_sip_hook) nf_nat_sip; + const struct nf_nat_sip_hooks *hooks; int ret; if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0) - ret = process_sip_request(skb, dataoff, dptr, datalen); + ret = process_sip_request(skb, protoff, dataoff, dptr, datalen); else - ret = process_sip_response(skb, dataoff, dptr, datalen); + ret = process_sip_response(skb, protoff, dataoff, dptr, datalen); if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { - nf_nat_sip = rcu_dereference(nf_nat_sip_hook); - if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen)) + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks && !hooks->msg(skb, protoff, dataoff, + dptr, datalen)) { + nf_ct_helper_log(skb, ct, "cannot NAT SIP message"); ret = NF_DROP; + } } return ret; @@ -1420,7 +1490,6 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, s16 diff, tdiff = 0; int ret = NF_ACCEPT; bool term; - typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) @@ -1468,9 +1537,11 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, msglen = origlen = end - dptr; if (msglen > datalen) - return NF_DROP; + return NF_ACCEPT; - ret = process_sip_msg(skb, ct, dataoff, &dptr, &msglen); + ret = process_sip_msg(skb, ct, protoff, dataoff, + &dptr, &msglen); + /* process_sip_* functions report why this packet is dropped */ if (ret != NF_ACCEPT) break; diff = msglen - origlen; @@ -1482,9 +1553,11 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, } if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { - nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook); - if (nf_nat_sip_seq_adjust) - nf_nat_sip_seq_adjust(skb, tdiff); + const struct nf_nat_sip_hooks *hooks; + + hooks = rcu_dereference(nf_nat_sip_hooks); + if (hooks) + hooks->seq_adjust(skb, protoff, tdiff); } return ret; @@ -1511,11 +1584,10 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, if (datalen < strlen("SIP/2.0 200")) return NF_ACCEPT; - return process_sip_msg(skb, ct, dataoff, &dptr, &datalen); + return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen); } static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; -static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly; static const struct 
nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1556,7 +1628,6 @@ static void nf_conntrack_sip_fini(void) static int __init nf_conntrack_sip_init(void) { int i, j, ret; - char *tmpname; if (ports_c == 0) ports[ports_c++] = SIP_PORT; @@ -1579,17 +1650,16 @@ static int __init nf_conntrack_sip_init(void) sip[i][3].help = sip_help_tcp; for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { + sip[i][j].data_len = sizeof(struct nf_ct_sip_master); sip[i][j].tuple.src.u.udp.port = htons(ports[i]); sip[i][j].expect_policy = sip_exp_policy; sip[i][j].expect_class_max = SIP_EXPECT_MAX; sip[i][j].me = THIS_MODULE; - tmpname = &sip_names[i][j][0]; if (ports[i] == SIP_PORT) - sprintf(tmpname, "sip"); + sprintf(sip[i][j].name, "sip"); else - sprintf(tmpname, "sip-%u", i); - sip[i][j].name = tmpname; + sprintf(sip[i][j].name, "sip-%u", i); pr_debug("port #%u: %u\n", i, ports[i]); diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index 6e545e26289..87b95a2c270 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_snmp.h> #define SNMP_PORT 161 diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 885f5ab9bc2..f641751dba9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -366,7 +367,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); + pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops); if (!pde) goto out_nf_conntrack; @@ -377,7 +378,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net) return 0; out_stat_nf_conntrack: - proc_net_remove(net, "nf_conntrack"); + remove_proc_entry("nf_conntrack", net->proc_net); out_nf_conntrack: return -ENOMEM; } @@ -385,7 +386,7 @@ out_nf_conntrack: static void nf_conntrack_standalone_fini_proc(struct net *net) { remove_proc_entry("nf_conntrack", net->proc_net_stat); - proc_net_remove(net, "nf_conntrack"); + remove_proc_entry("nf_conntrack", net->proc_net); } #else static int nf_conntrack_standalone_init_proc(struct net *net) @@ -407,7 +408,7 @@ static int log_invalid_proto_max = 255; static struct ctl_table_header *nf_ct_netfilter_header; -static ctl_table nf_ct_sysctl_table[] = { +static struct ctl_table nf_ct_sysctl_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, @@ -457,7 +458,7 @@ static ctl_table nf_ct_sysctl_table[] = { #define NET_NF_CONNTRACK_MAX 2089 -static ctl_table nf_ct_netfilter_table[] = { +static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, @@ -468,22 +469,10 @@ static ctl_table nf_ct_netfilter_table[] = { { } }; -static struct ctl_path nf_ct_path[] = { - { .procname = "net", }, - { } -}; - static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct ctl_table *table; - if (net_eq(net, &init_net)) { - nf_ct_netfilter_header = - 
register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); - if (!nf_ct_netfilter_header) - goto out; - } - table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) @@ -494,8 +483,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; - net->ct.sysctl_header = register_net_sysctl_table(net, - nf_net_netfilter_sysctl_path, table); + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) goto out_unregister_netfilter; @@ -504,10 +496,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) out_unregister_netfilter: kfree(table); out_kmemdup: - if (net_eq(net, &init_net)) - unregister_sysctl_table(nf_ct_netfilter_header); -out: - printk(KERN_ERR "nf_conntrack: can't register to sysctl.\n"); return -ENOMEM; } @@ -515,8 +503,6 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct ctl_table *table; - if (net_eq(net, &init_net)) - unregister_sysctl_table(nf_ct_netfilter_header); table = net->ct.sysctl_header->ctl_table_arg; unregister_net_sysctl_table(net->ct.sysctl_header); kfree(table); @@ -532,51 +518,91 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net) } #endif /* CONFIG_SYSCTL */ -static int nf_conntrack_net_init(struct net *net) +static int nf_conntrack_pernet_init(struct net *net) { int ret; - ret = nf_conntrack_init(net); + ret = nf_conntrack_init_net(net); if (ret < 0) goto out_init; + ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; + net->ct.sysctl_checksum = 1; net->ct.sysctl_log_invalid = 0; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) goto out_sysctl; + return 0; out_sysctl: nf_conntrack_standalone_fini_proc(net); out_proc: - nf_conntrack_cleanup(net); + nf_conntrack_cleanup_net(net); out_init: return ret; } -static void nf_conntrack_net_exit(struct net *net) +static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { - nf_conntrack_standalone_fini_sysctl(net); - nf_conntrack_standalone_fini_proc(net); - nf_conntrack_cleanup(net); + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + nf_conntrack_standalone_fini_sysctl(net); + nf_conntrack_standalone_fini_proc(net); + } + nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { - .init = nf_conntrack_net_init, - .exit = nf_conntrack_net_exit, + .init = nf_conntrack_pernet_init, + .exit_batch = nf_conntrack_pernet_exit, }; static int __init nf_conntrack_standalone_init(void) { - return register_pernet_subsys(&nf_conntrack_net_ops); + int ret = nf_conntrack_init_start(); + if (ret < 0) + goto out_start; + +#ifdef CONFIG_SYSCTL + nf_ct_netfilter_header = + register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); + if (!nf_ct_netfilter_header) { + pr_err("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto out_sysctl; + } +#endif + + ret = register_pernet_subsys(&nf_conntrack_net_ops); + if (ret < 0) + goto out_pernet; + + nf_conntrack_init_end(); + return 0; + +out_pernet: +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(nf_ct_netfilter_header); +out_sysctl: +#endif + nf_conntrack_cleanup_end(); +out_start: + return ret; } static void __exit nf_conntrack_standalone_fini(void) { + nf_conntrack_cleanup_start(); 
unregister_pernet_subsys(&nf_conntrack_net_ops); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(nf_ct_netfilter_header); +#endif + nf_conntrack_cleanup_end(); } module_init(nf_conntrack_standalone_init); diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 75466fd72f4..e68ab4fbd71 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -1,5 +1,5 @@ /* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -60,8 +60,10 @@ static int tftp_help(struct sk_buff *skb, nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); exp = nf_ct_expect_alloc(ct); - if (exp == NULL) + if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); return NF_DROP; + } tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), @@ -74,8 +76,10 @@ static int tftp_help(struct sk_buff *skb, nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook); if (nf_nat_tftp && ct->status & IPS_NAT_MASK) ret = nf_nat_tftp(skb, ctinfo, exp); - else if (nf_ct_expect_related(exp) != 0) + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); break; case TFTP_OPCODE_DATA: @@ -92,7 +96,6 @@ static int tftp_help(struct sk_buff *skb, } static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; -static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly; static const struct nf_conntrack_expect_policy tftp_exp_policy = { .max_expected = 1, @@ -112,7 +115,6 @@ static void nf_conntrack_tftp_fini(void) static int __init nf_conntrack_tftp_init(void) { int i, j, ret; - char *tmpname; if (ports_c == 0) ports[ports_c++] = TFTP_PORT; @@ -129,12 +131,10 @@ static int __init nf_conntrack_tftp_init(void) tftp[i][j].me = THIS_MODULE; tftp[i][j].help = tftp_help; - tmpname = &tftp_names[i][j][0]; if (ports[i] == TFTP_PORT) - sprintf(tmpname, "tftp"); + sprintf(tftp[i][j].name, "tftp"); else - sprintf(tmpname, "tftp-%u", i); - tftp[i][j].name = tmpname; + sprintf(tftp[i][j].name, "tftp-%u", i); ret = nf_conntrack_helper_register(&tftp[i][j]); if (ret) { diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c new file mode 100644 index 00000000000..93da609d9d2 --- /dev/null +++ b/net/netfilter/nf_conntrack_timeout.c @@ -0,0 +1,51 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
+ */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +struct ctnl_timeout * +(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); + +void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); + +static struct nf_ct_ext_type timeout_extend __read_mostly = { + .len = sizeof(struct nf_conn_timeout), + .align = __alignof__(struct nf_conn_timeout), + .id = NF_CT_EXT_TIMEOUT, +}; + +int nf_conntrack_timeout_init(void) +{ + int ret = nf_ct_extend_register(&timeout_extend); + if (ret < 0) + pr_err("nf_ct_timeout: Unable to register timeout extension.\n"); + return ret; +} + +void nf_conntrack_timeout_fini(void) +{ + nf_ct_extend_unregister(&timeout_extend); +} diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index e8d27afbbdb..7a394df0deb 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -51,8 +51,12 @@ static int nf_conntrack_tstamp_init_sysctl(struct net *net) table[0].data = &net->ct.sysctl_tstamp; - net->ct.tstamp_sysctl_header = register_net_sysctl_table(net, - nf_net_netfilter_sysctl_path, table); + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + + net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter", + table); if (!net->ct.tstamp_sysctl_header) { printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); goto out_register; @@ -84,37 +88,27 @@ static void nf_conntrack_tstamp_fini_sysctl(struct net *net) } #endif -int nf_conntrack_tstamp_init(struct net *net) +int nf_conntrack_tstamp_pernet_init(struct net *net) { - int ret; - net->ct.sysctl_tstamp = nf_ct_tstamp; + return nf_conntrack_tstamp_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&tstamp_extend); - if (ret < 0) { - printk(KERN_ERR "nf_ct_tstamp: Unable to register " - "extension\n"); - goto out_extend_register; - } - } +void nf_conntrack_tstamp_pernet_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); +} - ret = nf_conntrack_tstamp_init_sysctl(net); +int nf_conntrack_tstamp_init(void) +{ + int ret; + ret = nf_ct_extend_register(&tstamp_extend); if (ret < 0) - goto out_sysctl; - - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&tstamp_extend); -out_extend_register: + pr_err("nf_ct_tstamp: Unable to register extension\n"); return ret; } -void nf_conntrack_tstamp_fini(struct net *net) +void nf_conntrack_tstamp_fini(void) { - nf_conntrack_tstamp_fini_sysctl(net); - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&tstamp_extend); + nf_ct_extend_unregister(&tstamp_extend); } diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 770f76432ad..61a3c927e63 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,26 +13,20 @@ /* core.c */ -extern unsigned int nf_iterate(struct list_head *head, - struct sk_buff *skb, - unsigned int hook, - const struct net_device *indev, - 
const struct net_device *outdev, - struct list_head **i, - int (*okfn)(struct sk_buff *), - int hook_thresh); +unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, + unsigned int hook, const struct net_device *indev, + const struct net_device *outdev, + struct nf_hook_ops **elemp, + int (*okfn)(struct sk_buff *), int hook_thresh); /* nf_queue.c */ -extern int nf_queue(struct sk_buff *skb, - struct list_head *elem, - u_int8_t pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum); -extern int __init netfilter_queue_init(void); +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, + unsigned int hook, struct net_device *indev, + struct net_device *outdev, int (*okfn)(struct sk_buff *), + unsigned int queuenum); +int __init netfilter_queue_init(void); /* nf_log.c */ -extern int __init netfilter_log_init(void); +int __init netfilter_log_init(void); #endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 957374a234d..85296d4eac0 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,7 +16,6 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); @@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) return NULL; } -/* return EEXIST if the same logger is registred, 0 on success. */ +void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +{ + const struct nf_logger *log; + + if (pf == NFPROTO_UNSPEC) + return; + + mutex_lock(&nf_log_mutex); + log = rcu_dereference_protected(net->nf.nf_loggers[pf], + lockdep_is_held(&nf_log_mutex)); + if (log == NULL) + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); + + mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_set); + +void nf_log_unset(struct net *net, const struct nf_logger *logger) +{ + int i; + const struct nf_logger *log; + + mutex_lock(&nf_log_mutex); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = rcu_dereference_protected(net->nf.nf_loggers[i], + lockdep_is_held(&nf_log_mutex)); + if (log == logger) + RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); + } + mutex_unlock(&nf_log_mutex); + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_log_unset); + +/* return EEXIST if the same logger is registered, 0 on success. 
*/ int nf_log_register(u_int8_t pf, struct nf_logger *logger) { - const struct nf_logger *llog; int i; - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(logger->list); i++) @@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) } else { /* register at end of list to honor first register win */ list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); - llog = rcu_dereference_protected(nf_loggers[pf], - lockdep_is_held(&nf_log_mutex)); - if (llog == NULL) - rcu_assign_pointer(nf_loggers[pf], logger); } mutex_unlock(&nf_log_mutex); @@ -66,49 +94,43 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { - const struct nf_logger *c_logger; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { - c_logger = rcu_dereference_protected(nf_loggers[i], - lockdep_is_held(&nf_log_mutex)); - if (c_logger == logger) - RCU_INIT_POINTER(nf_loggers[i], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) list_del(&logger->list[i]); - } mutex_unlock(&nf_log_mutex); - - synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); -int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +int nf_log_bind_pf(struct net *net, u_int8_t pf, + const struct nf_logger *logger) { - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return -EINVAL; mutex_lock(&nf_log_mutex); if (__find_logger(pf, logger->name) == NULL) { mutex_unlock(&nf_log_mutex); return -ENOENT; } - rcu_assign_pointer(nf_loggers[pf], logger); + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); return 0; } EXPORT_SYMBOL(nf_log_bind_pf); -void nf_log_unbind_pf(u_int8_t pf) +void nf_log_unbind_pf(struct net *net, u_int8_t pf) { - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return; mutex_lock(&nf_log_mutex); - RCU_INIT_POINTER(nf_loggers[pf], NULL); + RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL); mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_log_packet(u_int8_t pf, +void nf_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -121,12 +143,12 @@ void nf_log_packet(u_int8_t pf, const struct nf_logger *logger; rcu_read_lock(); - logger = rcu_dereference(nf_loggers[pf]); + logger = rcu_dereference(net->nf.nf_loggers[pf]); if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); - logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); } rcu_read_unlock(); } @@ -135,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet); #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); + mutex_lock(&nf_log_mutex); - if (*pos >= ARRAY_SIZE(nf_loggers)) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -145,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos) static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct net *net = seq_file_net(s); + (*pos)++; - if (*pos >= ARRAY_SIZE(nf_loggers)) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -164,8 +190,9 @@ static int seq_show(struct seq_file *s, void *v) const struct nf_logger *logger; struct nf_logger *t; int ret; + struct net *net = seq_file_net(s); - logger = rcu_dereference_protected(nf_loggers[*pos], + logger = 
rcu_dereference_protected(net->nf.nf_loggers[*pos], lockdep_is_held(&nf_log_mutex)); if (!logger) @@ -199,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = { static int nflog_open(struct inode *inode, struct file *file) { - return seq_open(file, &nflog_seq_ops); + return seq_open_net(inode, file, &nflog_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations nflog_file_ops = { @@ -207,25 +235,17 @@ static const struct file_operations nflog_file_ops = { .open = nflog_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; #endif /* PROC_FS */ #ifdef CONFIG_SYSCTL -static struct ctl_path nf_log_sysctl_path[] = { - { .procname = "net", }, - { .procname = "netfilter", }, - { .procname = "nf_log", }, - { } -}; - static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; -static struct ctl_table_header *nf_log_dir_header; -static int nf_log_proc_dostring(ctl_table *table, int write, +static int nf_log_proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; @@ -233,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, size_t size = *lenp; int r = 0; int tindex = (unsigned long)table->extra1; + struct net *net = current->nsproxy->net_ns; if (write) { if (size > sizeof(buf)) @@ -241,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, return -EFAULT; if (!strcmp(buf, "NONE")) { - nf_log_unbind_pf(tindex); + nf_log_unbind_pf(net, tindex); return 0; } mutex_lock(&nf_log_mutex); @@ -250,11 +271,11 @@ static int nf_log_proc_dostring(ctl_table *table, int write, mutex_unlock(&nf_log_mutex); return -ENOENT; } - rcu_assign_pointer(nf_loggers[tindex], logger); + rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { mutex_lock(&nf_log_mutex); - logger = rcu_dereference_protected(nf_loggers[tindex], + logger = rcu_dereference_protected(net->nf.nf_loggers[tindex], lockdep_is_held(&nf_log_mutex)); if (!logger) table->data = "NONE"; @@ -267,49 +288,112 @@ static int nf_log_proc_dostring(ctl_table *table, int write, return r; } -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net) { int i; - - for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { - snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); - nf_log_sysctl_table[i].procname = - nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; - nf_log_sysctl_table[i].data = NULL; - nf_log_sysctl_table[i].maxlen = - NFLOGGER_NAME_LEN * sizeof(char); - nf_log_sysctl_table[i].mode = 0644; - nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; - nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; + struct ctl_table *table; + + table = nf_log_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_log_sysctl_table, + sizeof(nf_log_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } else { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + snprintf(nf_log_sysctl_fnames[i], + 3, "%d", i); + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = + nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = + (void *)(unsigned long) i; + } } - nf_log_dir_header = 
register_sysctl_paths(nf_log_sysctl_path, - nf_log_sysctl_table); - if (!nf_log_dir_header) - return -ENOMEM; + net->nf.nf_log_dir_header = register_net_sysctl(net, + "net/netfilter/nf_log", + table); + if (!net->nf.nf_log_dir_header) + goto err_reg; return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void netfilter_log_sysctl_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->nf.nf_log_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_log_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); } #else -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net) { return 0; } + +static void netfilter_log_sysctl_exit(struct net *net) +{ +} #endif /* CONFIG_SYSCTL */ -int __init netfilter_log_init(void) +static int __net_init nf_log_net_init(struct net *net) { - int i, r; + int ret = -ENOMEM; + #ifdef CONFIG_PROC_FS if (!proc_create("nf_log", S_IRUGO, - proc_net_netfilter, &nflog_file_ops)) - return -1; + net->nf.proc_netfilter, &nflog_file_ops)) + return ret; #endif + ret = netfilter_log_sysctl_init(net); + if (ret < 0) + goto out_sysctl; - /* Errors will trigger panic, unroll on error is unnecessary. */ - r = netfilter_log_sysctl_init(); - if (r < 0) - return r; + return 0; + +out_sysctl: +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif + return ret; +} + +static void __net_exit nf_log_net_exit(struct net *net) +{ + netfilter_log_sysctl_exit(net); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nf_log_net_ops = { + .init = nf_log_net_init, + .exit = nf_log_net_exit, +}; + +int __init netfilter_log_init(void) +{ + int i, ret; + + ret = register_pernet_subsys(&nf_log_net_ops); + if (ret < 0) + return ret; for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) INIT_LIST_HEAD(&(nf_loggers_l[i])); diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c new file mode 100644 index 00000000000..eb772380a20 --- /dev/null +++ b/net/netfilter/nf_nat_amanda.c @@ -0,0 +1,90 @@ +/* Amanda extension for TCP NAT alteration. + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on a copy of HW's ip_nat_irc.c as well as other modules + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/udp.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_amanda.h> + +MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); +MODULE_DESCRIPTION("Amanda NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_amanda"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + char buffer[sizeof("65535")]; + u_int16_t port; + unsigned int ret; + + /* Connection comes from client. 
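+ * (Editor's note, illustration only; the ports below are made up.
+ * Amanda's UDP control message advertises the TCP data port in
+ * ASCII, e.g. "CONNECT DATA 50123". If 50123 is already taken on
+ * this host, the loop below settles on, say, 50124, and
+ * nf_nat_mangle_udp_packet() rewrites just those digits in the
+ * payload, while exp->saved_proto.tcp.port keeps the client's
+ * original 50123 for the reverse mapping.)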
*/ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_ORIGINAL; + + /* When you see the packet, we need to NAT it the same way as + * this one (ie. same IP: it will be TCP and master is UDP). */ + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int res; + + exp->tuple.dst.u.tcp.port = htons(port); + res = nf_ct_expect_related(exp); + if (res == 0) + break; + else if (res != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, exp->master, "all ports in use"); + return NF_DROP; + } + + sprintf(buffer, "%u", port); + ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, + protoff, matchoff, matchlen, + buffer, strlen(buffer)); + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, exp->master, "cannot mangle packet"); + nf_ct_unexpect_related(exp); + } + return ret; +} + +static void __exit nf_nat_amanda_fini(void) +{ + RCU_INIT_POINTER(nf_nat_amanda_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_amanda_init(void) +{ + BUG_ON(nf_nat_amanda_hook != NULL); + RCU_INIT_POINTER(nf_nat_amanda_hook, help); + return 0; +} + +module_init(nf_nat_amanda_init); +module_exit(nf_nat_amanda_fini); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c new file mode 100644 index 00000000000..a49907b1dab --- /dev/null +++ b/net/netfilter/nf_nat_core.c @@ -0,0 +1,898 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/gfp.h> +#include <net/xfrm.h> +#include <linux/jhash.h> +#include <linux/rtnetlink.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <linux/netfilter/nf_nat.h> + +static DEFINE_SPINLOCK(nf_nat_lock); + +static DEFINE_MUTEX(nf_nat_proto_mutex); +static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] + __read_mostly; +static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] + __read_mostly; + + +inline const struct nf_nat_l3proto * +__nf_nat_l3proto_find(u8 family) +{ + return rcu_dereference(nf_nat_l3protos[family]); +} + +inline const struct nf_nat_l4proto * +__nf_nat_l4proto_find(u8 family, u8 protonum) +{ + return rcu_dereference(nf_nat_l4protos[family][protonum]); +} +EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find); + +#ifdef CONFIG_XFRM +static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + unsigned long statusbit; + u8 family; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return; + + family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + rcu_read_lock(); + l3proto = __nf_nat_l3proto_find(family); + if (l3proto == NULL) + goto out; + + dir = CTINFO2DIR(ctinfo); + if (dir == IP_CT_DIR_ORIGINAL) + statusbit = IPS_DST_NAT; + else + statusbit = IPS_SRC_NAT; + + l3proto->decode_session(skb, ct, dir, statusbit, fl); +out: + rcu_read_unlock(); +} + +int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) +{ + struct flowi fl; + unsigned int hh_len; + struct dst_entry *dst; + int err; + + err = xfrm_decode_session(skb, &fl, family); + if (err < 0) + return err; + + dst = skb_dst(skb); + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + dst_hold(dst); + + dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(nf_xfrm_me_harder); +#endif /* CONFIG_XFRM */ + +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + unsigned int hash; + + /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); + return ((u64)hash * net->ct.nat_htable_size) >> 32; +} + +/* Is this tuple already taken? (not by us) */ +int +nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + /* Conntrack tracking doesn't keep track of outgoing tuples; only + * incoming ones. 
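+ * (Editor's note, a concrete sketch with made-up addresses: to test
+ * whether SNAT-ing ORIGINAL 10.0.0.1:1024 -> 198.51.100.7:53 onto
+ * source 203.0.113.9:1024 is safe, the candidate tuple is inverted
+ * and the conntrack table is probed for the would-be reply:
+ *
+ *	struct nf_conntrack_tuple reply;
+ *	int taken;
+ *
+ *	nf_ct_invert_tuplepr(&reply, tuple);
+ *	taken = nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+ *
+ * which is exactly what the function body below does.)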
NAT means they don't have a fixed mapping, + * so we invert the tuple and look for the incoming reply. + * + * We could keep a separate hash if this proves too slow. + */ + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuplepr(&reply, tuple); + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); +} +EXPORT_SYMBOL(nf_nat_used_tuple); + +/* If we source map this tuple so reply looks like reply_tuple, will + * that meet the constraints of range. + */ +static int in_range(const struct nf_nat_l3proto *l3proto, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range) +{ + /* If we are supposed to map IPs, then we must be in the + * range specified, otherwise let this drag us onto a new src IP. + */ + if (range->flags & NF_NAT_RANGE_MAP_IPS && + !l3proto->in_range(tuple, range)) + return 0; + + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) || + l4proto->in_range(tuple, NF_NAT_MANIP_SRC, + &range->min_proto, &range->max_proto)) + return 1; + + return 0; +} + +static inline int +same_src(const struct nf_conn *ct, + const struct nf_conntrack_tuple *tuple) +{ + const struct nf_conntrack_tuple *t; + + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + return (t->dst.protonum == tuple->dst.protonum && + nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && + t->src.u.all == tuple->src.u.all); +} + +/* Only called for SRC manip */ +static int +find_appropriate_src(struct net *net, u16 zone, + const struct nf_nat_l3proto *l3proto, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *result, + const struct nf_nat_range *range) +{ + unsigned int h = hash_by_src(net, zone, tuple); + const struct nf_conn_nat *nat; + const struct nf_conn *ct; + + hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + ct = nat->ct; + if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } + } + return 0; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus + * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + * 1-65535, we don't do pro-rata allocation based on ports; we choose + * the ip with the lowest src-ip/dst-ip/proto usage. + */ +static void +find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + const struct nf_conn *ct, + enum nf_nat_manip_type maniptype) +{ + union nf_inet_addr *var_ipp; + unsigned int i, max; + /* Host order */ + u32 minip, maxip, j, dist; + bool full_range; + + /* No IP mapping? Do nothing. */ + if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) + return; + + if (maniptype == NF_NAT_MANIP_SRC) + var_ipp = &tuple->src.u3; + else + var_ipp = &tuple->dst.u3; + + /* Fast path: only one choice. */ + if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { + *var_ipp = range->min_addr; + return; + } + + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + max = sizeof(var_ipp->ip) / sizeof(u32) - 1; + else + max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; + + /* Hashing source and destination IPs gives a fairly even + * spread in practice (if there are a small number of IPs + * involved, there usually aren't that many connections + * anyway). 
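+ * (Editor's note, a worked example on a made-up pool: for the range
+ * 10.0.0.1 - 10.0.0.14, dist = maxip - minip + 1 = 14, and the code
+ * below selects 10.0.0.(1 + (((u64)j * 14) >> 32)), i.e. the 32-bit
+ * hash j is scaled linearly into the pool, so a given client always
+ * maps to the same pool address.)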
The consistency means that servers see the same + * client coming from the same IP (some Internet Banking sites + * like this), even across reboots. + */ + j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), + range->flags & NF_NAT_RANGE_PERSISTENT ? + 0 : (__force u32)tuple->dst.u3.all[max] ^ zone); + + full_range = false; + for (i = 0; i <= max; i++) { + /* If first bytes of the address are at the maximum, use the + * distance. Otherwise use the full range. + */ + if (!full_range) { + minip = ntohl((__force __be32)range->min_addr.all[i]); + maxip = ntohl((__force __be32)range->max_addr.all[i]); + dist = maxip - minip + 1; + } else { + minip = 0; + dist = ~0; + } + + var_ipp->all[i] = (__force __u32) + htonl(minip + (((u64)j * dist) >> 32)); + if (var_ipp->all[i] != range->max_addr.all[i]) + full_range = true; + + if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) + j ^= (__force u32)tuple->dst.u3.all[i]; + } +} + +/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, + * we change the source to map into the range. For NF_INET_PRE_ROUTING + * and NF_INET_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void +get_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig_tuple, + const struct nf_nat_range *range, + struct nf_conn *ct, + enum nf_nat_manip_type maniptype) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_nat_l4proto *l4proto; + struct net *net = nf_ct_net(ct); + u16 zone = nf_ct_zone(ct); + + rcu_read_lock(); + l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); + l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, + orig_tuple->dst.protonum); + + /* 1) If this srcip/proto/src-proto-part is currently mapped, + * and that same mapping gives a unique tuple within the given + * range, use that. + * + * This is only required for source (ie. NAT/masq) mappings. + * So far, we don't do local source mappings, so multiple + * manips not an issue. + */ + if (maniptype == NF_NAT_MANIP_SRC && + !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { + /* try the original tuple first */ + if (in_range(l3proto, l4proto, orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + goto out; + } + } else if (find_appropriate_src(net, zone, l3proto, l4proto, + orig_tuple, tuple, range)) { + pr_debug("get_unique_tuple: Found current src map\n"); + if (!nf_nat_used_tuple(tuple, ct)) + goto out; + } + } + + /* 2) Select the least-used IP/proto combination in the given range */ + *tuple = *orig_tuple; + find_best_ips_proto(zone, tuple, range, ct, maniptype); + + /* 3) The per-protocol part of the manip is made to map into + * the range to make a unique tuple. + */ + + /* Only bother mapping if it's not already in range and unique */ + if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + if (l4proto->in_range(tuple, maniptype, + &range->min_proto, + &range->max_proto) && + (range->min_proto.all == range->max_proto.all || + !nf_nat_used_tuple(tuple, ct))) + goto out; + } else if (!nf_nat_used_tuple(tuple, ct)) { + goto out; + } + } + + /* Last change: get protocol to try to obtain unique tuple. 
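+ * (Editor's note: "last change" is best read as "last chance". Steps
+ * 1 and 2 above only fixed up the addresses; if the tuple is still
+ * not unique, the l4proto callback below, e.g.
+ * nf_nat_l4proto_unique_tuple() for TCP and UDP, walks candidate
+ * ports until nf_nat_used_tuple() reports the tuple free.)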
*/ + l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); +out: + rcu_read_unlock(); +} + +struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + if (nat) + return nat; + + if (!nf_ct_is_confirmed(ct)) + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + + return nat; +} +EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); + +unsigned int +nf_nat_setup_info(struct nf_conn *ct, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_tuple curr_tuple, new_tuple; + struct nf_conn_nat *nat; + + /* nat helper or nfctnetlink also setup binding */ + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || + maniptype == NF_NAT_MANIP_DST); + BUG_ON(nf_nat_initialized(ct, maniptype)); + + /* What we've got will look like inverse of reply. Normally + * this is what is in the conntrack, except for prior + * manipulations (future optimization: if num_manips == 0, + * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) + */ + nf_ct_invert_tuplepr(&curr_tuple, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); + + if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { + struct nf_conntrack_tuple reply; + + /* Alter conntrack table so will recognize replies. */ + nf_ct_invert_tuplepr(&reply, &new_tuple); + nf_conntrack_alter_reply(ct, &reply); + + /* Non-atomic: we own this at the moment. */ + if (maniptype == NF_NAT_MANIP_SRC) + ct->status |= IPS_SRC_NAT; + else + ct->status |= IPS_DST_NAT; + + if (nfct_help(ct)) + nfct_seqadj_ext_add(ct); + } + + if (maniptype == NF_NAT_MANIP_SRC) { + unsigned int srchash; + + srchash = hash_by_src(net, nf_ct_zone(ct), + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + /* nf_conntrack_alter_reply might re-allocate extension area */ + nat = nfct_nat(ct); + nat->ct = ct; + hlist_add_head_rcu(&nat->bysource, + &net->ct.nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); + } + + /* It's done. */ + if (maniptype == NF_NAT_MANIP_DST) + ct->status |= IPS_DST_NAT_DONE; + else + ct->status |= IPS_SRC_NAT_DONE; + + return NF_ACCEPT; +} +EXPORT_SYMBOL(nf_nat_setup_info); + +static unsigned int +__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) +{ + /* Force range to this IP; let proto decide mapping for + * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + * Use reply in case it's already been mangled (eg local packet). + */ + union nf_inet_addr ip = + (manip == NF_NAT_MANIP_SRC ? + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); + struct nf_nat_range range = { + .flags = NF_NAT_RANGE_MAP_IPS, + .min_addr = ip, + .max_addr = ip, + }; + return nf_nat_setup_info(ct, &range, manip); +} + +unsigned int +nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); +} +EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); + +/* Do packet manipulations according to nf_nat_setup_info. 
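+ * (Editor's note, a sketch of the direction trick used below: take a
+ * SNAT-ed connection, i.e. IPS_SRC_NAT set in ct->status. For a
+ * reply packet at PREROUTING, mtype is NF_NAT_MANIP_DST, so
+ * statusbit starts as IPS_DST_NAT; the statement
+ *
+ *	statusbit ^= IPS_NAT_MASK;
+ *
+ * flips it to IPS_SRC_NAT, which is set, so the reply's destination
+ * is rewritten back toward the client. One status bit thus covers
+ * both directions of the mapping.)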
*/ +unsigned int nf_nat_packet(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff *skb) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_nat_l4proto *l4proto; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned long statusbit; + enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); + + if (mtype == NF_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + /* Non-atomic: these bits don't change. */ + if (ct->status & statusbit) { + struct nf_conntrack_tuple target; + + /* We are aiming to look like inverse of other direction. */ + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + + l3proto = __nf_nat_l3proto_find(target.src.l3num); + l4proto = __nf_nat_l4proto_find(target.src.l3num, + target.dst.protonum); + if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) + return NF_DROP; + } + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_nat_packet); + +struct nf_nat_proto_clean { + u8 l3proto; + u8 l4proto; +}; + +/* kill conntracks with affected NAT section */ +static int nf_nat_proto_remove(struct nf_conn *i, void *data) +{ + const struct nf_nat_proto_clean *clean = data; + struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + + if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || + (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) + return 0; + + return i->status & IPS_NAT_MASK ? 1 : 0; +} + +static int nf_nat_proto_clean(struct nf_conn *ct, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + + if (nf_nat_proto_remove(ct, data)) + return 1; + + if (!nat || !nat->ct) + return 0; + + /* This netns is being destroyed, and conntrack has nat null binding. + * Remove it from bysource hash, as the table will be freed soon. + * + * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack() + * will delete entry from already-freed table. + */ + if (!del_timer(&ct->timeout)) + return 1; + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + ct->status &= ~IPS_NAT_DONE_MASK; + nat->ct = NULL; + spin_unlock_bh(&nf_nat_lock); + + add_timer(&ct->timeout); + + /* don't delete conntrack. Although that would make things a lot + * simpler, we'd end up flushing all conntracks on nat rmmod. + */ + return 0; +} + +static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) +{ + struct nf_nat_proto_clean clean = { + .l3proto = l3proto, + .l4proto = l4proto, + }; + struct net *net; + + rtnl_lock(); + for_each_net(net) + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); + rtnl_unlock(); +} + +static void nf_nat_l3proto_clean(u8 l3proto) +{ + struct nf_nat_proto_clean clean = { + .l3proto = l3proto, + }; + struct net *net; + + rtnl_lock(); + + for_each_net(net) + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); + rtnl_unlock(); +} + +/* Protocol registration. */ +int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto) +{ + const struct nf_nat_l4proto **l4protos; + unsigned int i; + int ret = 0; + + mutex_lock(&nf_nat_proto_mutex); + if (nf_nat_l4protos[l3proto] == NULL) { + l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *), + GFP_KERNEL); + if (l4protos == NULL) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < IPPROTO_MAX; i++) + RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown); + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. 
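+ * (Editor's note: the smp_wmb() below pairs with the dependency
+ * ordering of rcu_dereference() in __nf_nat_l4proto_find(); a reader
+ * that observes the new l4protos pointer is then guaranteed to see
+ * the initialised slots as well. This is the usual RCU
+ * initialise-then-publish idiom.)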
+ */ + smp_wmb(); + + nf_nat_l4protos[l3proto] = l4protos; + } + + if (rcu_dereference_protected( + nf_nat_l4protos[l3proto][l4proto->l4proto], + lockdep_is_held(&nf_nat_proto_mutex) + ) != &nf_nat_l4proto_unknown) { + ret = -EBUSY; + goto out; + } + RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto); + out: + mutex_unlock(&nf_nat_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_register); + +/* No one stores the protocol anywhere; simply delete it. */ +void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto) +{ + mutex_lock(&nf_nat_proto_mutex); + RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], + &nf_nat_l4proto_unknown); + mutex_unlock(&nf_nat_proto_mutex); + synchronize_rcu(); + + nf_nat_l4proto_clean(l3proto, l4proto->l4proto); +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister); + +int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) +{ + int err; + + err = nf_ct_l3proto_try_module_get(l3proto->l3proto); + if (err < 0) + return err; + + mutex_lock(&nf_nat_proto_mutex); + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], + &nf_nat_l4proto_tcp); + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP], + &nf_nat_l4proto_udp); + mutex_unlock(&nf_nat_proto_mutex); + + RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto); + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_register); + +void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto) +{ + mutex_lock(&nf_nat_proto_mutex); + RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL); + mutex_unlock(&nf_nat_proto_mutex); + synchronize_rcu(); + + nf_nat_l3proto_clean(l3proto->l3proto); + nf_ct_l3proto_module_put(l3proto->l3proto); +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); + +/* No one using conntrack by the time this is called. 
*/ +static void nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); + + if (nat == NULL || nat->ct == NULL) + return; + + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + spin_unlock_bh(&nf_nat_lock); +} + +static void nf_nat_move_storage(void *new, void *old) +{ + struct nf_conn_nat *new_nat = new; + struct nf_conn_nat *old_nat = old; + struct nf_conn *ct = old_nat->ct; + + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) + return; + + spin_lock_bh(&nf_nat_lock); + hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); + spin_unlock_bh(&nf_nat_lock); +} + +static struct nf_ct_ext_type nat_extend __read_mostly = { + .len = sizeof(struct nf_conn_nat), + .align = __alignof__(struct nf_conn_nat), + .destroy = nf_nat_cleanup_conntrack, + .move = nf_nat_move_storage, + .id = NF_CT_EXT_NAT, + .flags = NF_CT_EXT_F_PREALLOC, +}; + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { + [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, + [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int nfnetlink_parse_nat_proto(struct nlattr *attr, + const struct nf_conn *ct, + struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_PROTONAT_MAX+1]; + const struct nf_nat_l4proto *l4proto; + int err; + + err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); + if (err < 0) + return err; + + l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->nlattr_to_range) + err = l4proto->nlattr_to_range(tb, range); + + return err; +} + +static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { + [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, + [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, + [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, + [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, + [CTA_NAT_PROTO] = { .type = NLA_NESTED }, +}; + +static int +nfnetlink_parse_nat(const struct nlattr *nat, + const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_nat_l3proto *l3proto) +{ + struct nlattr *tb[CTA_NAT_MAX+1]; + int err; + + memset(range, 0, sizeof(*range)); + + err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); + if (err < 0) + return err; + + err = l3proto->nlattr_to_range(tb, range); + if (err < 0) + return err; + + if (!tb[CTA_NAT_PROTO]) + return 0; + + return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); +} + +/* This function is called under rcu_read_lock() */ +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + struct nf_nat_range range; + const struct nf_nat_l3proto *l3proto; + int err; + + /* Should not happen, restricted to creating new conntracks + * via ctnetlink. + */ + if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) + return -EEXIST; + + /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to + * attach the null binding, otherwise this may oops. 
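+ * (Editor's note: the -EAGAIN below is deliberate; it lets the
+ * ctnetlink caller try to load the missing L3 NAT module and retry
+ * the request instead of dereferencing a NULL l3proto.)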
+ */ + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + if (l3proto == NULL) + return -EAGAIN; + + /* No NAT information has been passed, allocate the null-binding */ + if (attr == NULL) + return __nf_nat_alloc_null_binding(ct, manip); + + err = nfnetlink_parse_nat(attr, ct, &range, l3proto); + if (err < 0) + return err; + + return nf_nat_setup_info(ct, &range, manip); +} +#else +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +static int __net_init nf_nat_net_init(struct net *net) +{ + /* Leave them the same for the moment. */ + net->ct.nat_htable_size = net->ct.htable_size; + net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); + if (!net->ct.nat_bysource) + return -ENOMEM; + return 0; +} + +static void __net_exit nf_nat_net_exit(struct net *net) +{ + struct nf_nat_proto_clean clean = {}; + + nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); + synchronize_rcu(); + nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); +} + +static struct pernet_operations nf_nat_net_ops = { + .init = nf_nat_net_init, + .exit = nf_nat_net_exit, +}; + +static struct nf_ct_helper_expectfn follow_master_nat = { + .name = "nat-follow-master", + .expectfn = nf_nat_follow_master, +}; + +static int __init nf_nat_init(void) +{ + int ret; + + ret = nf_ct_extend_register(&nat_extend); + if (ret < 0) { + printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + return ret; + } + + ret = register_pernet_subsys(&nf_nat_net_ops); + if (ret < 0) + goto cleanup_extend; + + nf_ct_helper_expectfn_register(&follow_master_nat); + + /* Initialize fake conntrack so that NAT will skip it */ + nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); + + BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); + RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, + nfnetlink_parse_nat_setup); +#ifdef CONFIG_XFRM + BUG_ON(nf_nat_decode_session_hook != NULL); + RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); +#endif + return 0; + + cleanup_extend: + nf_ct_extend_unregister(&nat_extend); + return ret; +} + +static void __exit nf_nat_cleanup(void) +{ + unsigned int i; + + unregister_pernet_subsys(&nf_nat_net_ops); + nf_ct_extend_unregister(&nat_extend); + nf_ct_helper_expectfn_unregister(&follow_master_nat); + RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); +#ifdef CONFIG_XFRM + RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); +#endif + for (i = 0; i < NFPROTO_NUMPROTO; i++) + kfree(nf_nat_l4protos[i]); + synchronize_net(); +} + +MODULE_LICENSE("GPL"); + +module_init(nf_nat_init); +module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c new file mode 100644 index 00000000000..e84a578dbe3 --- /dev/null +++ b/net/netfilter/nf_nat_ftp.c @@ -0,0 +1,146 @@ +/* FTP extension for TCP NAT alteration. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/inet.h> +#include <linux/tcp.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_ftp.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp NAT helper"); +MODULE_ALIAS("ip_nat_ftp"); + +/* FIXME: Time out? --RR */ + +static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type, + char *buffer, size_t buflen, + union nf_inet_addr *addr, u16 port) +{ + switch (type) { + case NF_CT_FTP_PORT: + case NF_CT_FTP_PASV: + return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&addr->ip)[0], + ((unsigned char *)&addr->ip)[1], + ((unsigned char *)&addr->ip)[2], + ((unsigned char *)&addr->ip)[3], + port >> 8, + port & 0xFF); + case NF_CT_FTP_EPRT: + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + return snprintf(buffer, buflen, "|1|%pI4|%u|", + &addr->ip, port); + else + return snprintf(buffer, buflen, "|2|%pI6|%u|", + &addr->ip6, port); + case NF_CT_FTP_EPSV: + return snprintf(buffer, buflen, "|||%u|", port); + } + + return 0; +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int nf_nat_ftp(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + union nf_inet_addr newaddr; + u_int16_t port; + int dir = CTINFO2DIR(ctinfo); + struct nf_conn *ct = exp->master; + char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN]; + unsigned int buflen; + + pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + + /* Connection will come from wherever this packet goes, hence !dir */ + newaddr = ct->tuplehash[!dir].tuple.dst.u3; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same way as + * this one. */ + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use"); + return NF_DROP; + } + + buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer), + &newaddr, port); + if (!buflen) + goto out; + + pr_debug("calling nf_nat_mangle_tcp_packet\n"); + + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, + matchlen, buffer, buflen)) + goto out; + + return NF_ACCEPT; + +out: + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + nf_ct_unexpect_related(exp); + return NF_DROP; +} + +static void __exit nf_nat_ftp_fini(void) +{ + RCU_INIT_POINTER(nf_nat_ftp_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_ftp_init(void) +{ + BUG_ON(nf_nat_ftp_hook != NULL); + RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp); + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. 
*/ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_ftp_init); +module_exit(nf_nat_ftp_fini); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c new file mode 100644 index 00000000000..2840abb5bb9 --- /dev/null +++ b/net/netfilter/nf_nat_helper.c @@ -0,0 +1,212 @@ +/* nf_nat_helper.c - generic support functions for NAT helpers + * + * (C) 2000-2002 Harald Welte <laforge@netfilter.org> + * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007-2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> + +/* Frobs data inside this packet, which is linear. */ +static void mangle_contents(struct sk_buff *skb, + unsigned int dataoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + unsigned char *data; + + BUG_ON(skb_is_nonlinear(skb)); + data = skb_network_header(skb) + dataoff; + + /* move post-replacement */ + memmove(data + match_offset + rep_len, + data + match_offset + match_len, + skb_tail_pointer(skb) - (skb_network_header(skb) + dataoff + + match_offset + match_len)); + + /* insert data from buffer */ + memcpy(data + match_offset, rep_buffer, rep_len); + + /* update skb info */ + if (rep_len > match_len) { + pr_debug("nf_nat_mangle_packet: Extending packet by " + "%u from %u bytes\n", rep_len - match_len, skb->len); + skb_put(skb, rep_len - match_len); + } else { + pr_debug("nf_nat_mangle_packet: Shrinking packet by " + "%u from %u bytes\n", match_len - rep_len, skb->len); + __skb_trim(skb, skb->len + rep_len - match_len); + } + + if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) { + /* fix IP hdr checksum information */ + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + } else + ipv6_hdr(skb)->payload_len = + htons(skb->len - sizeof(struct ipv6hdr)); +} + +/* Unusual, but possible case. */ +static int enlarge_skb(struct sk_buff *skb, unsigned int extra) +{ + if (skb->len + extra > 65535) + return 0; + + if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) + return 0; + + return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX + * command in FTP). + * + * Takes care of all the nasty sequence number changes, checksumming, + * skb enlargement, ... 
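+ * (Editor's note, with a made-up command: the "sequence number
+ * changes" are handled by the seqadj extension. If rep_len differs
+ * from match_len, e.g. rewriting "PORT 10,0,0,1,196,211" to a longer
+ * string grows the segment by a few bytes, nf_ct_seqadj_set() below
+ * records that offset so the seq/ack numbers of later packets in the
+ * flow can be shifted to match.)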
+ * + * */ +int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len, bool adjust) +{ + const struct nf_nat_l3proto *l3proto; + struct tcphdr *tcph; + int oldlen, datalen; + + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (rep_len > match_len && + rep_len - match_len > skb_tailroom(skb) && + !enlarge_skb(skb, rep_len - match_len)) + return 0; + + SKB_LINEAR_ASSERT(skb); + + tcph = (void *)skb->data + protoff; + + oldlen = skb->len - protoff; + mangle_contents(skb, protoff + tcph->doff*4, + match_offset, match_len, rep_buffer, rep_len); + + datalen = skb->len - protoff; + + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check, + datalen, oldlen); + + if (adjust && rep_len != match_len) + nf_ct_seqadj_set(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); + + return 1; +} +EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); + +/* Generic function for mangling variable-length address changes inside + * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX + * command in the Amanda protocol) + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * XXX - This function could be merged with nf_nat_mangle_tcp_packet which + * should be fairly easy to do. + */ +int +nf_nat_mangle_udp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + const struct nf_nat_l3proto *l3proto; + struct udphdr *udph; + int datalen, oldlen; + + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (rep_len > match_len && + rep_len - match_len > skb_tailroom(skb) && + !enlarge_skb(skb, rep_len - match_len)) + return 0; + + udph = (void *)skb->data + protoff; + + oldlen = skb->len - protoff; + mangle_contents(skb, protoff + sizeof(*udph), + match_offset, match_len, rep_buffer, rep_len); + + /* update the length of the UDP packet */ + datalen = skb->len - protoff; + udph->len = htons(datalen); + + /* fix udp checksum if udp checksum was previously calculated */ + if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) + return 1; + + l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); + l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check, + datalen, oldlen); + + return 1; +} +EXPORT_SYMBOL(nf_nat_mangle_udp_packet); + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void nf_nat_follow_master(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.dst.u3; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); + + /* For DST manip, map port here to where it's expected. 
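+ * (Editor's note, with made-up roles: for an expected FTP data
+ * connection, the SRC manip above rewrites the new flow's source to
+ * the address the master connection talks to, and the DST manip
+ * below redirects it to the client's real address, using the port
+ * the client originally advertised, stashed in exp->saved_proto.)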
*/ + range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); + range.min_proto = range.max_proto = exp->saved_proto; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.src.u3; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} +EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c new file mode 100644 index 00000000000..1fb2258c353 --- /dev/null +++ b/net/netfilter/nf_nat_irc.c @@ -0,0 +1,119 @@ +/* IRC extension for TCP NAT alteration. + * + * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> + * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * based on a copy of RR's ip_nat_ftp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/tcp.h> +#include <linux/kernel.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_irc.h> + +MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); +MODULE_DESCRIPTION("IRC (DCC) NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_irc"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + char buffer[sizeof("4294967295 65535")]; + struct nf_conn *ct = exp->master; + union nf_inet_addr newaddr; + u_int16_t port; + unsigned int ret; + + /* Reply comes from server. */ + newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3; + + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use"); + return NF_DROP; + } + + /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, + * 255.255.255.255==4294967295, 10 digits) + * P: bound port (min 1 d, max 5d (65535)) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + /* AAA = "us", ie. where server normally talks to. 
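+ * (Editor's note: DCC encodes the IPv4 address as a single decimal
+ * integer in host byte order, e.g. a made-up 203.0.113.9 becomes
+ * ntohl(newaddr.ip) == 3405803785, which is why the snprintf() below
+ * uses a plain "%u %u" instead of dotted-quad notation.)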
*/ + snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port); + pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", + buffer, &newaddr.ip, port); + + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, + matchlen, buffer, strlen(buffer)); + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + nf_ct_unexpect_related(exp); + } + + return ret; +} + +static void __exit nf_nat_irc_fini(void) +{ + RCU_INIT_POINTER(nf_nat_irc_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_irc_init(void) +{ + BUG_ON(nf_nat_irc_hook != NULL); + RCU_INIT_POINTER(nf_nat_irc_hook, help); + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_irc_init); +module_exit(nf_nat_irc_fini); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c new file mode 100644 index 00000000000..83a72a235ca --- /dev/null +++ b/net/netfilter/nf_nat_proto_common.c @@ -0,0 +1,114 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/random.h> +#include <linux/netfilter.h> +#include <linux/export.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + __be16 port; + + if (maniptype == NF_NAT_MANIP_SRC) + port = tuple->src.u.all; + else + port = tuple->dst.u.all; + + return ntohs(port) >= ntohs(min->all) && + ntohs(port) <= ntohs(max->all); +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); + +void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct, + u16 *rover) +{ + unsigned int range_size, min, i; + __be16 *portptr; + u_int16_t off; + + if (maniptype == NF_NAT_MANIP_SRC) + portptr = &tuple->src.u.all; + else + portptr = &tuple->dst.u.all; + + /* If no range specified... */ + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == NF_NAT_MANIP_DST) + return; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr) < 512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min_proto.all); + range_size = ntohs(range->max_proto.all) - min + 1; + } + + if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) { + off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC + ? 
tuple->dst.u.all + : tuple->src.u.all); + } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { + off = prandom_u32(); + } else { + off = *rover; + } + + for (i = 0; ; ++off) { + *portptr = htons(min + off % range_size); + if (++i != range_size && nf_nat_used_tuple(tuple, ct)) + continue; + if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) + *rover = off; + return; + } +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_PROTONAT_PORT_MIN]) { + range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); + range->max_proto.all = range->min_proto.all; + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + if (tb[CTA_PROTONAT_PORT_MAX]) { + range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range); +#endif diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c new file mode 100644 index 00000000000..c8be2cdac0b --- /dev/null +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -0,0 +1,116 @@ +/* + * DCCP NAT protocol helper + * + * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/dccp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u_int16_t dccp_port_rover; + +static void +dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &dccp_port_rover); +} + +static bool +dccp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct dccp_hdr *hdr; + __be16 *portptr, oldport, newport; + int hdrsize = 8; /* DCCP connection tracking guarantees this much */ + + if (skb->len >= hdroff + sizeof(struct dccp_hdr)) + hdrsize = sizeof(struct dccp_hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct dccp_hdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + newport = tuple->src.u.dccp.port; + portptr = &hdr->dccph_sport; + } else { + newport = tuple->dst.u.dccp.port; + portptr = &hdr->dccph_dport; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, + tuple, maniptype); + inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, + 0); + return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { + .l4proto = IPPROTO_DCCP, + .manip_pkt = dccp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = dccp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int 
__init nf_nat_proto_dccp_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp); + if (err < 0) + goto err1; + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp); + if (err < 0) + goto err2; + return 0; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); +err1: + return err; +} + +static void __exit nf_nat_proto_dccp_fini(void) +{ + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); + +} + +module_init(nf_nat_proto_dccp_init); +module_exit(nf_nat_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("DCCP NAT protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c new file mode 100644 index 00000000000..754536f2c67 --- /dev/null +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sctp.h> +#include <linux/module.h> +#include <net/sctp/checksum.h> + +#include <net/netfilter/nf_nat_l4proto.h> + +static u_int16_t nf_sctp_port_rover; + +static void +sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &nf_sctp_port_rover); +} + +static bool +sctp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + sctp_sctphdr_t *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct sctphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + hdr->source = tuple->src.u.sctp.port; + } else { + /* Get rid of dst port */ + hdr->dest = tuple->dst.u.sctp.port; + } + + hdr->checksum = sctp_compute_cksum(skb, hdroff); + + return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { + .l4proto = IPPROTO_SCTP, + .manip_pkt = sctp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = sctp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_sctp_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp); + if (err < 0) + goto err1; + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp); + if (err < 0) + goto err2; + return 0; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +err1: + return err; +} + +static void __exit nf_nat_proto_sctp_exit(void) +{ + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); +} + +module_init(nf_nat_proto_sctp_init); +module_exit(nf_nat_proto_sctp_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCTP NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c 
new file mode 100644 index 00000000000..83ec8a6e4c3 --- /dev/null +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -0,0 +1,85 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> +#include <net/netfilter/nf_nat_core.h> + +static u16 tcp_port_rover; + +static void +tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &tcp_port_rover); +} + +static bool +tcp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct tcphdr *hdr; + __be16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be an inner header returned in an icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if (skb->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct tcphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_tcp = { + .l4proto = IPPROTO_TCP, + .manip_pkt = tcp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = tcp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c new file mode 100644 index 00000000000..7df613fb34a --- /dev/null +++ b/net/netfilter/nf_nat_proto_udp.c @@ -0,0 +1,76 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u16 udp_port_rover; + +static void +udp_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &udp_port_rover); +} + +static bool +udp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { + l3proto->csum_update(skb, iphdroff, &hdr->check, + tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, + 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + } + *portptr = newport; + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_udp = { + .l4proto = IPPROTO_UDP, + .manip_pkt = udp_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = udp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c new file mode 100644 index 00000000000..776a0d1317b --- /dev/null +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -0,0 +1,106 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/module.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static u16 udplite_port_rover; + +static void +udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &udplite_port_rover); +} + +static bool +udplite_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of source port */ + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + + l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + + *portptr = newport; + return true; +} + +static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { + .l4proto = IPPROTO_UDPLITE, + .manip_pkt = udplite_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = udplite_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_udplite_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite); + if (err < 0) + goto err1; + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite); + if (err < 0) + goto err2; + return 0; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +err1: + return err; +} + +static void __exit nf_nat_proto_udplite_fini(void) +{ + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite); + nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); +} + +module_init(nf_nat_proto_udplite_init); +module_exit(nf_nat_proto_udplite_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c new file mode 100644 index 00000000000..6e494d58441 --- /dev/null +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -0,0 +1,54 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type manip_type, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + return true; +} + +static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + * anything. + */ + return; +} + +static bool +unknown_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_unknown = { + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, +}; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c new file mode 100644 index 00000000000..b4d691db955 --- /dev/null +++ b/net/netfilter/nf_nat_sip.c @@ -0,0 +1,653 @@ +/* SIP extension for NAT alteration. + * + * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> + * based on RR's ip_nat_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet.h> +#include <linux/udp.h> +#include <linux/tcp.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); +MODULE_DESCRIPTION("SIP NAT helper"); +MODULE_ALIAS("ip_nat_sip"); + + +static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + const char *buffer, unsigned int buflen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct tcphdr *th; + unsigned int baseoff; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) { + th = (struct tcphdr *)(skb->data + protoff); + baseoff = protoff + th->doff * 4; + matchoff += dataoff - baseoff; + + if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + protoff, matchoff, matchlen, + buffer, buflen, false)) + return 0; + } else { + baseoff = protoff + sizeof(struct udphdr); + matchoff += dataoff - baseoff; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, + protoff, matchoff, matchlen, + buffer, buflen)) + return 0; + } + + /* Reload data pointer and adjust datalen value */ + *dptr = skb->data + dataoff; + *datalen += buflen - matchlen; + return 1; +} + +static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer, + const union nf_inet_addr *addr, bool delim) +{ + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + return sprintf(buffer, "%pI4", &addr->ip); + else { + if (delim) + return sprintf(buffer, 
"[%pI6c]", &addr->ip6); + else + return sprintf(buffer, "%pI6c", &addr->ip6); + } +} + +static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer, + const union nf_inet_addr *addr, u16 port) +{ + if (nf_ct_l3num(ct) == NFPROTO_IPV4) + return sprintf(buffer, "%pI4:%u", &addr->ip, port); + else + return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port); +} + +static int map_addr(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + union nf_inet_addr *addr, __be16 port) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; + unsigned int buflen; + union nf_inet_addr newaddr; + __be16 newport; + + if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) && + ct->tuplehash[dir].tuple.src.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.dst.u3; + newport = ct->tuplehash[!dir].tuple.dst.u.udp.port; + } else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) && + ct->tuplehash[dir].tuple.dst.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.src.u3; + newport = ct_sip_info->forced_dport ? : + ct->tuplehash[!dir].tuple.src.u.udp.port; + } else + return 1; + + if (nf_inet_addr_cmp(&newaddr, addr) && newport == port) + return 1; + + buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport)); + return mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen); +} + +static int map_sip_addr(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + enum sip_header_types type) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + union nf_inet_addr addr; + __be16 port; + + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, + &matchoff, &matchlen, &addr, &port) <= 0) + return 1; + return map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, &addr, port); +} + +static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + unsigned int coff, matchoff, matchlen; + enum sip_header_types hdr; + union nf_inet_addr addr; + __be16 port; + int request, in_header; + + /* Basic rules: requests and responses. 
+ * A response always begins with "SIP/2.0"; anything else is a request
+ * whose request-URI address and port may need translating first.
 */ + if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) { + if (ct_sip_parse_request(ct, *dptr, *datalen, + &matchoff, &matchlen, + &addr, &port) > 0 && + !map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle SIP message"); + return NF_DROP; + } + request = 1; + } else + request = 0; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) + hdr = SIP_HDR_VIA_TCP; + else + hdr = SIP_HDR_VIA_UDP; + + /* Translate topmost Via header and parameters */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + hdr, NULL, &matchoff, &matchlen, + &addr, &port) > 0) { + unsigned int olen, matchend, poff, plen, buflen, n; + char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; + + /* We're only interested in headers related to this + * connection */ + if (request) { + if (!nf_inet_addr_cmp(&addr, + &ct->tuplehash[dir].tuple.src.u3) || + port != ct->tuplehash[dir].tuple.src.u.udp.port) + goto next; + } else { + if (!nf_inet_addr_cmp(&addr, + &ct->tuplehash[dir].tuple.dst.u3) || + port != ct->tuplehash[dir].tuple.dst.u.udp.port) + goto next; + } + + olen = *datalen; + if (!map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle Via header"); + return NF_DROP; + } + + matchend = matchoff + matchlen + *datalen - olen; + + /* The maddr= parameter (RFC 2543) specifies where to send + * the reply. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "maddr=", &poff, &plen, + &addr, true) > 0 && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) && + !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) { + buflen = sip_sprintf_addr(ct, buffer, + &ct->tuplehash[!dir].tuple.dst.u3, + true); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle maddr"); + return NF_DROP; + } + } + + /* The received= parameter (RFC 2543) contains the address + * from which the server received the request. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "received=", &poff, &plen, + &addr, false) > 0 && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) && + !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) { + buflen = sip_sprintf_addr(ct, buffer, + &ct->tuplehash[!dir].tuple.src.u3, + false); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle received"); + return NF_DROP; + } + } + + /* The rport= parameter (RFC 3581) contains the port number + * from which the server received the request.
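+ * Like the Via address above, it is rewritten when the NAT has
+ * remapped the signalling port.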
*/ + if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, + "rport=", &poff, &plen, + &n) > 0 && + htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && + htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { + __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; + buflen = sprintf(buffer, "%u", ntohs(p)); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle rport"); + return NF_DROP; + } + } + } + +next: + /* Translate Contact headers */ + coff = 0; + in_header = 0; + while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, + SIP_HDR_CONTACT, &in_header, + &matchoff, &matchlen, + &addr, &port) > 0) { + if (!map_addr(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, + &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle contact"); + return NF_DROP; + } + } + + if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) || + !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) { + nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to"); + return NF_DROP; + } + + /* Mangle destination port for Cisco phones, then fix up checksums */ + if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { + struct udphdr *uh; + + if (!skb_make_writable(skb, skb->len)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + + uh = (void *)skb->data + protoff; + uh->dest = ct_sip_info->forced_dport; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff, + 0, 0, NULL, 0)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + } + + return NF_ACCEPT; +} + +static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, + s16 off) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) + return; + + th = (struct tcphdr *)(skb->data + protoff); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} + +/* Handles expected signalling connections and media streams */ +static void nf_nat_sip_expected(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); + range.min_proto = range.max_proto = exp->saved_proto; + range.min_addr = range.max_addr = exp->saved_addr; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); + + /* Change src to where master sends to, but only if the connection + * actually came from the same source. 
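+ * Only then is the source rewritten; expected flows originating from
+ * some other host keep their source address untouched.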
*/ + if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + &ct->master->tuplehash[exp->dir].tuple.src.u3)) { + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = range.max_addr + = ct->master->tuplehash[!exp->dir].tuple.dst.u3; + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); + } +} + +static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *exp, + unsigned int matchoff, + unsigned int matchlen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + union nf_inet_addr newaddr; + u_int16_t port; + __be16 srcport; + char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; + unsigned int buflen; + + /* Connection will come from reply */ + if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3)) + newaddr = exp->tuple.dst.u3; + else + newaddr = ct->tuplehash[!dir].tuple.dst.u3; + + /* If the signalling port matches the connection's source port in the + * original direction, try to use the destination port in the opposite + * direction. */ + srcport = ct_sip_info->forced_dport ? : + ct->tuplehash[dir].tuple.src.u.udp.port; + if (exp->tuple.dst.u.udp.port == srcport) + port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); + else + port = ntohs(exp->tuple.dst.u.udp.port); + + exp->saved_addr = exp->tuple.dst.u3; + exp->tuple.dst.u3 = newaddr; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + exp->expectfn = nf_nat_sip_expected; + + for (; port != 0; port++) { + int ret; + + exp->tuple.dst.u.udp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use for SIP"); + return NF_DROP; + } + + if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) || + exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { + buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + goto err; + } + } + return NF_ACCEPT; + +err: + nf_ct_unexpect_related(exp); + return NF_DROP; +} + +static int mangle_content_len(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + char buffer[sizeof("65536")]; + int buflen, c_len; + + /* Get actual SDP length */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return 0; + c_len = *datalen - matchoff + strlen("v="); + + /* Now, update SDP length */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) + return 0; + + buflen = sprintf(buffer, "%u", c_len); + return mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen); +} + +static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + char *buffer, int buflen) +{ + enum ip_conntrack_info ctinfo; + struct 
nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + + if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, + &matchoff, &matchlen) <= 0) + return -ENOENT; + return mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL; +} + +static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + const union nf_inet_addr *addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + char buffer[INET6_ADDRSTRLEN]; + unsigned int buflen; + + buflen = sip_sprintf_addr(ct, buffer, addr, false); + if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, + sdpoff, type, term, buffer, buflen)) + return 0; + + return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, + unsigned int matchlen, + u_int16_t port) +{ + char buffer[sizeof("nnnnn")]; + unsigned int buflen; + + buflen = sprintf(buffer, "%u", port); + if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) + return 0; + + return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + const union nf_inet_addr *addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + char buffer[INET6_ADDRSTRLEN]; + unsigned int buflen; + + /* Mangle session description owner and contact addresses */ + buflen = sip_sprintf_addr(ct, buffer, addr, false); + if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, + SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen)) + return 0; + + switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, + SDP_HDR_CONNECTION, SDP_HDR_MEDIA, + buffer, buflen)) { + case 0: + /* + * RFC 2327: + * + * Session description + * + * c=* (connection information - not required if included in all media) + */ + case -ENOENT: + break; + default: + return 0; + } + + return mangle_content_len(skb, protoff, dataoff, dptr, datalen); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. 
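+ RTP and RTCP use a consecutive port pair (RTCP on RTP port + 1), so
+ both expectations are registered together before the media port in
+ the SDP payload is rewritten.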
*/ +static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff, + unsigned int dataoff, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp, + unsigned int mediaoff, + unsigned int medialen, + union nf_inet_addr *rtp_addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int16_t port; + + /* Connection will come from reply */ + if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3)) + *rtp_addr = rtp_exp->tuple.dst.u3; + else + *rtp_addr = ct->tuplehash[!dir].tuple.dst.u3; + + rtp_exp->saved_addr = rtp_exp->tuple.dst.u3; + rtp_exp->tuple.dst.u3 = *rtp_addr; + rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; + rtp_exp->dir = !dir; + rtp_exp->expectfn = nf_nat_sip_expected; + + rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3; + rtcp_exp->tuple.dst.u3 = *rtp_addr; + rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; + rtcp_exp->dir = !dir; + rtcp_exp->expectfn = nf_nat_sip_expected; + + /* Try to get same pair of ports: if not, try to change them. */ + for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); + port != 0; port += 2) { + int ret; + + rtp_exp->tuple.dst.u.udp.port = htons(port); + ret = nf_ct_expect_related(rtp_exp); + if (ret == -EBUSY) + continue; + else if (ret < 0) { + port = 0; + break; + } + rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { + nf_ct_unexpect_related(rtp_exp); + port = 0; + break; + } + } + + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use for SDP media"); + goto err1; + } + + /* Update media port. */ + if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && + !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen, + mediaoff, medialen, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle SDP message"); + goto err2; + } + + return NF_ACCEPT; + +err2: + nf_ct_unexpect_related(rtp_exp); + nf_ct_unexpect_related(rtcp_exp); +err1: + return NF_DROP; +} + +static struct nf_ct_helper_expectfn sip_nat = { + .name = "sip", + .expectfn = nf_nat_sip_expected, +}; + +static void __exit nf_nat_sip_fini(void) +{ + RCU_INIT_POINTER(nf_nat_sip_hooks, NULL); + + nf_ct_helper_expectfn_unregister(&sip_nat); + synchronize_rcu(); +} + +static const struct nf_nat_sip_hooks sip_hooks = { + .msg = nf_nat_sip, + .seq_adjust = nf_nat_sip_seq_adjust, + .expect = nf_nat_sip_expect, + .sdp_addr = nf_nat_sdp_addr, + .sdp_port = nf_nat_sdp_port, + .sdp_session = nf_nat_sdp_session, + .sdp_media = nf_nat_sdp_media, +}; + +static int __init nf_nat_sip_init(void) +{ + BUG_ON(nf_nat_sip_hooks != NULL); + RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks); + nf_ct_helper_expectfn_register(&sip_nat); + return 0; +} + +module_init(nf_nat_sip_init); +module_exit(nf_nat_sip_fini); diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c new file mode 100644 index 00000000000..7f67e1d5310 --- /dev/null +++ b/net/netfilter/nf_nat_tftp.c @@ -0,0 +1,52 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/udp.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_tftp.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("TFTP NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_tftp"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) +{ + const struct nf_conn *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = nf_nat_follow_master; + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, exp->master, "cannot add expectation"); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit nf_nat_tftp_fini(void) +{ + RCU_INIT_POINTER(nf_nat_tftp_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_tftp_init(void) +{ + BUG_ON(nf_nat_tftp_hook != NULL); + RCU_INIT_POINTER(nf_nat_tftp_hook, help); + return 0; +} + +module_init(nf_nat_tftp_init); +module_exit(nf_nat_tftp_fini); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index ce60cf0f6c1..5d24b1fdb59 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -1,3 +1,8 @@ +/* + * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 + */ + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> @@ -14,85 +19,33 @@ #include "nf_internals.h" /* - * A queue handler may be registered for each protocol. Each is protected by - * long term mutex. The handler must provide an an outfn() to accept packets - * for queueing and must reinject all packets it receives, no matter what. + * Hook for nfnetlink_queue to register its queue handler. + * We do this so that most of the NFQUEUE code can be modular. + * + * Once the queue is registered it must reinject all packets it + * receives, no matter what. */ -static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; - -static DEFINE_MUTEX(queue_handler_mutex); +static const struct nf_queue_handler __rcu *queue_handler __read_mostly; /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. 
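 * (With the single kernel-wide queueing backend these error returns are
 * gone: registration always succeeds and only WARNs if another handler
 * is already in place.)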
*/ -int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh) { - int ret; - const struct nf_queue_handler *old; - - if (pf >= ARRAY_SIZE(queue_handler)) - return -EINVAL; - - mutex_lock(&queue_handler_mutex); - old = rcu_dereference_protected(queue_handler[pf], - lockdep_is_held(&queue_handler_mutex)); - if (old == qh) - ret = -EEXIST; - else if (old) - ret = -EBUSY; - else { - rcu_assign_pointer(queue_handler[pf], qh); - ret = 0; - } - mutex_unlock(&queue_handler_mutex); - - return ret; + /* should never happen, we only have one queueing backend in kernel */ + WARN_ON(rcu_access_pointer(queue_handler)); + rcu_assign_pointer(queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +void nf_unregister_queue_handler(void) { - const struct nf_queue_handler *old; - - if (pf >= ARRAY_SIZE(queue_handler)) - return -EINVAL; - - mutex_lock(&queue_handler_mutex); - old = rcu_dereference_protected(queue_handler[pf], - lockdep_is_held(&queue_handler_mutex)); - if (old && old != qh) { - mutex_unlock(&queue_handler_mutex); - return -EINVAL; - } - - RCU_INIT_POINTER(queue_handler[pf], NULL); - mutex_unlock(&queue_handler_mutex); - + RCU_INIT_POINTER(queue_handler, NULL); synchronize_rcu(); - - return 0; } EXPORT_SYMBOL(nf_unregister_queue_handler); -void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) -{ - u_int8_t pf; - - mutex_lock(&queue_handler_mutex); - for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { - if (rcu_dereference_protected( - queue_handler[pf], - lockdep_is_held(&queue_handler_mutex) - ) == qh) - RCU_INIT_POINTER(queue_handler[pf], NULL); - } - mutex_unlock(&queue_handler_mutex); - - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); - -static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { /* Release those devices we held, or Alexey will kill me. */ if (entry->indev) @@ -112,13 +65,42 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) /* Drop reference to owner of hook which queued us. */ module_put(entry->elem->owner); } +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +/* Bump dev refs so they don't vanish while packet is out */ +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) +{ + if (!try_module_get(entry->elem->owner)) + return false; + + if (entry->indev) + dev_hold(entry->indev); + if (entry->outdev) + dev_hold(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + struct net_device *physdev; + + physdev = nf_bridge->physindev; + if (physdev) + dev_hold(physdev); + physdev = nf_bridge->physoutdev; + if (physdev) + dev_hold(physdev); + } +#endif + + return true; +} +EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); /* * Any packet that leaves via this function must come back * through nf_reinject(). 
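 * Until then the queued entry pins the hook owner module and the
 * devices via the references taken in nf_queue_entry_get_refs().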
*/ -static int __nf_queue(struct sk_buff *skb, - struct list_head *elem, +int nf_queue(struct sk_buff *skb, + struct nf_hook_ops *elem, u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, @@ -127,17 +109,13 @@ static int __nf_queue(struct sk_buff *skb, { int status = -ENOENT; struct nf_queue_entry *entry = NULL; -#ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev; - struct net_device *physoutdev; -#endif const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; /* QUEUE == DROP if no one is waiting, to be safe. */ rcu_read_lock(); - qh = rcu_dereference(queue_handler[pf]); + qh = rcu_dereference(queue_handler); if (!qh) { status = -ESRCH; goto err_unlock; @@ -155,34 +133,19 @@ static int __nf_queue(struct sk_buff *skb, *entry = (struct nf_queue_entry) { .skb = skb, - .elem = list_entry(elem, struct nf_hook_ops, list), + .elem = elem, .pf = pf, .hook = hook, .indev = indev, .outdev = outdev, .okfn = okfn, + .size = sizeof(*entry) + afinfo->route_key_size, }; - /* If it's going away, ignore hook. */ - if (!try_module_get(entry->elem->owner)) { + if (!nf_queue_entry_get_refs(entry)) { status = -ECANCELED; goto err_unlock; } - /* Bump dev refs so they don't vanish while packet is out */ - if (indev) - dev_hold(indev); - if (outdev) - dev_hold(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; - if (physindev) - dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev) - dev_hold(physoutdev); - } -#endif skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); @@ -203,91 +166,10 @@ err: return status; } -#ifdef CONFIG_BRIDGE_NETFILTER -/* When called from bridge netfilter, skb->data must point to MAC header - * before calling skb_gso_segment(). Else, original MAC header is lost - * and segmented skbs will be sent to wrong destination. - */ -static void nf_bridge_adjust_skb_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_push(skb, skb->network_header - skb->mac_header); -} - -static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_pull(skb, skb->network_header - skb->mac_header); -} -#else -#define nf_bridge_adjust_skb_data(s) do {} while (0) -#define nf_bridge_adjust_segmented_data(s) do {} while (0) -#endif - -int nf_queue(struct sk_buff *skb, - struct list_head *elem, - u_int8_t pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum) -{ - struct sk_buff *segs; - int err = -EINVAL; - unsigned int queued; - - if (!skb_is_gso(skb)) - return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - queuenum); - - switch (pf) { - case NFPROTO_IPV4: - skb->protocol = htons(ETH_P_IP); - break; - case NFPROTO_IPV6: - skb->protocol = htons(ETH_P_IPV6); - break; - } - - nf_bridge_adjust_skb_data(skb); - segs = skb_gso_segment(skb, 0); - /* Does not use PTR_ERR to limit the number of error codes that can be - * returned by nf_queue. For instance, callers rely on -ECANCELED to mean - * 'ignore this hook'. 
- */ - if (IS_ERR(segs)) - goto out_err; - queued = 0; - err = 0; - do { - struct sk_buff *nskb = segs->next; - - segs->next = NULL; - if (err == 0) { - nf_bridge_adjust_segmented_data(segs); - err = __nf_queue(segs, elem, pf, hook, indev, - outdev, okfn, queuenum); - } - if (err == 0) - queued++; - else - kfree_skb(segs); - segs = nskb; - } while (segs); - - if (queued) { - kfree_skb(skb); - return 0; - } - out_err: - nf_bridge_adjust_segmented_data(skb); - return err; -} - void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { struct sk_buff *skb = entry->skb; - struct list_head *elem = &entry->elem->list; + struct nf_hook_ops *elem = entry->elem; const struct nf_afinfo *afinfo; int err; @@ -297,7 +179,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) { - elem = elem->prev; + elem = list_entry(elem->list.prev, struct nf_hook_ops, list); verdict = NF_ACCEPT; } @@ -323,9 +205,9 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = __nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_QBITS); + err = nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ECANCELED) goto next_hook; @@ -344,77 +226,3 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) kfree(entry); } EXPORT_SYMBOL(nf_reinject); - -#ifdef CONFIG_PROC_FS -static void *seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos >= ARRAY_SIZE(queue_handler)) - return NULL; - - return pos; -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - - if (*pos >= ARRAY_SIZE(queue_handler)) - return NULL; - - return pos; -} - -static void seq_stop(struct seq_file *s, void *v) -{ - -} - -static int seq_show(struct seq_file *s, void *v) -{ - int ret; - loff_t *pos = v; - const struct nf_queue_handler *qh; - - rcu_read_lock(); - qh = rcu_dereference(queue_handler[*pos]); - if (!qh) - ret = seq_printf(s, "%2lld NONE\n", *pos); - else - ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); - rcu_read_unlock(); - - return ret; -} - -static const struct seq_operations nfqueue_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nfqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &nfqueue_seq_ops); -} - -static const struct file_operations nfqueue_file_ops = { - .owner = THIS_MODULE, - .open = nfqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif /* PROC_FS */ - - -int __init netfilter_queue_init(void) -{ -#ifdef CONFIG_PROC_FS - if (!proc_create("nf_queue", S_IRUGO, - proc_net_netfilter, &nfqueue_file_ops)) - return -1; -#endif - return 0; -} - diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c new file mode 100644 index 00000000000..52e20c9a46a --- /dev/null +++ b/net/netfilter/nf_synproxy_core.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> +#include <net/tcp.h> +#include <net/netns/generic.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +int synproxy_net_id; +EXPORT_SYMBOL_GPL(synproxy_net_id); + +bool +synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, struct synproxy_options *opts) +{ + int length = (th->doff * 4) - sizeof(*th); + u8 buf[40], *ptr; + + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); + if (ptr == NULL) + return false; + + opts->options = 0; + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return true; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return true; + if (opsize > length) + return true; + + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS) { + opts->mss = get_unaligned_be16(ptr); + opts->options |= XT_SYNPROXY_OPT_MSS; + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW) { + opts->wscale = *ptr; + if (opts->wscale > 14) + opts->wscale = 14; + opts->options |= XT_SYNPROXY_OPT_WSCALE; + } + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP) { + opts->tsval = get_unaligned_be32(ptr); + opts->tsecr = get_unaligned_be32(ptr + 4); + opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + break; + } + + ptr += opsize - 2; + length -= opsize; + } + } + return true; +} +EXPORT_SYMBOL_GPL(synproxy_parse_options); + +unsigned int synproxy_options_size(const struct synproxy_options *opts) +{ + unsigned int size = 0; + + if (opts->options & XT_SYNPROXY_OPT_MSS) + size += TCPOLEN_MSS_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + size += TCPOLEN_TSTAMP_ALIGNED; + else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + size += TCPOLEN_SACKPERM_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + size += TCPOLEN_WSCALE_ALIGNED; + + return size; +} +EXPORT_SYMBOL_GPL(synproxy_options_size); + +void +synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) +{ + __be32 *ptr = (__be32 *)(th + 1); + u8 options = opts->options; + + if (options & XT_SYNPROXY_OPT_MSS) + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + + if (options & XT_SYNPROXY_OPT_TIMESTAMP) { + if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + + if (options & XT_SYNPROXY_OPT_WSCALE) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->wscale); +} +EXPORT_SYMBOL_GPL(synproxy_build_options); + +void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options 
*opts) +{ + opts->tsecr = opts->tsval; + opts->tsval = tcp_time_stamp & ~0x3f; + + if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + opts->tsval |= opts->wscale; + opts->wscale = info->wscale; + } else + opts->tsval |= 0xf; + + if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + opts->tsval |= 1 << 4; + + if (opts->options & XT_SYNPROXY_OPT_ECN) + opts->tsval |= 1 << 5; +} +EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); + +void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +{ + opts->wscale = opts->tsecr & 0xf; + if (opts->wscale != 0xf) + opts->options |= XT_SYNPROXY_OPT_WSCALE; + + opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + + opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; +} +EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); + +unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) +{ + unsigned int optoff, optend; + u32 *ptr, old; + + if (synproxy->tsoff == 0) + return 1; + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + th->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + while (optoff < optend) { + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_TIMESTAMP && + op[1] == TCPOLEN_TIMESTAMP) { + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + ptr = (u32 *)&op[2]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) - + synproxy->tsoff); + } else { + ptr = (u32 *)&op[6]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) + + synproxy->tsoff); + } + inet_proto_csum_replace4(&th->check, skb, + old, *ptr, 0); + return 1; + } + optoff += op[1]; + } + } + return 1; +} +EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); + +static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { + .len = sizeof(struct nf_conn_synproxy), + .align = __alignof__(struct nf_conn_synproxy), + .id = NF_CT_EXT_SYNPROXY, +}; + +#ifdef CONFIG_PROC_FS +static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) +{ + return; +} + +static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct synproxy_stats *stats = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries\t\tsyn_received\t" + "cookie_invalid\tcookie_valid\t" + "cookie_retrans\tconn_reopened\n"); + return 0; + } + + seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, + stats->syn_received, + stats->cookie_invalid, + stats->cookie_valid, + stats->cookie_retrans, + stats->conn_reopened); + + return 0; +} + +static const struct seq_operations synproxy_cpu_seq_ops = { + .start = synproxy_cpu_seq_start, + .next = 
synproxy_cpu_seq_next, + .stop = synproxy_cpu_seq_stop, + .show = synproxy_cpu_seq_show, +}; + +static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &synproxy_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations synproxy_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = synproxy_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init synproxy_proc_init(struct net *net) +{ + if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + &synproxy_cpu_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + remove_proc_entry("synproxy", net->proc_net_stat); +} +#else +static int __net_init synproxy_proc_init(struct net *net) +{ + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init synproxy_net_init(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct nf_conntrack_tuple t; + struct nf_conn *ct; + int err = -ENOMEM; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto err1; + } + + if (!nfct_seqadj_ext_add(ct)) + goto err2; + if (!nfct_synproxy_ext_add(ct)) + goto err2; + + nf_conntrack_tmpl_insert(net, ct); + snet->tmpl = ct; + + snet->stats = alloc_percpu(struct synproxy_stats); + if (snet->stats == NULL) + goto err2; + + err = synproxy_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + free_percpu(snet->stats); +err2: + nf_conntrack_free(ct); +err1: + return err; +} + +static void __net_exit synproxy_net_exit(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + + nf_ct_put(snet->tmpl); + synproxy_proc_exit(net); + free_percpu(snet->stats); +} + +static struct pernet_operations synproxy_net_ops = { + .init = synproxy_net_init, + .exit = synproxy_net_exit, + .id = &synproxy_net_id, + .size = sizeof(struct synproxy_net), +}; + +static int __init synproxy_core_init(void) +{ + int err; + + err = nf_ct_extend_register(&nf_ct_synproxy_extend); + if (err < 0) + goto err1; + + err = register_pernet_subsys(&synproxy_net_ops); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +err1: + return err; +} + +static void __exit synproxy_core_exit(void) +{ + unregister_pernet_subsys(&synproxy_net_ops); + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +} + +module_init(synproxy_core_init); +module_exit(synproxy_core_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c new file mode 100644 index 00000000000..8746ff9a835 --- /dev/null +++ b/net/netfilter/nf_tables_api.c @@ -0,0 +1,4041 @@ +/* + * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +static LIST_HEAD(nf_tables_expressions); + +/** + * nft_register_afinfo - register nf_tables address family info + * + * @afi: address family info to register + * + * Register the address family for use with nf_tables. Returns zero on + * success or a negative errno code otherwise. + */ +int nft_register_afinfo(struct net *net, struct nft_af_info *afi) +{ + INIT_LIST_HEAD(&afi->tables); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_tail_rcu(&afi->list, &net->nft.af_info); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return 0; +} +EXPORT_SYMBOL_GPL(nft_register_afinfo); + +/** + * nft_unregister_afinfo - unregister nf_tables address family info + * + * @afi: address family info to unregister + * + * Unregister the address family for use with nf_tables. + */ +void nft_unregister_afinfo(struct nft_af_info *afi) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&afi->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_afinfo); + +static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family) +{ + struct nft_af_info *afi; + + list_for_each_entry(afi, &net->nft.af_info, list) { + if (afi->family == family) + return afi; + } + return NULL; +} + +static struct nft_af_info * +nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) +{ + struct nft_af_info *afi; + + afi = nft_afinfo_lookup(net, family); + if (afi != NULL) + return afi; +#ifdef CONFIG_MODULES + if (autoload) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-afinfo-%u", family); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + afi = nft_afinfo_lookup(net, family); + if (afi != NULL) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-EAFNOSUPPORT); +} + +static void nft_ctx_init(struct nft_ctx *ctx, + const struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nft_af_info *afi, + struct nft_table *table, + struct nft_chain *chain, + const struct nlattr * const *nla) +{ + ctx->net = sock_net(skb->sk); + ctx->afi = afi; + ctx->table = table; + ctx->chain = chain; + ctx->nla = nla; + ctx->portid = NETLINK_CB(skb).portid; + ctx->report = nlmsg_report(nlh); + ctx->seq = nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, + u32 size) +{ + struct nft_trans *trans; + + trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); + if (trans == NULL) + return NULL; + + trans->msg_type = msg_type; + trans->ctx = *ctx; + + return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + list_del(&trans->list); + kfree(trans); +} + +/* + * Tables + */ + +static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, + const struct nlattr *nla) +{ + struct nft_table *table; + + list_for_each_entry(table, &afi->tables, list) { + if (!nla_strcmp(nla, table->name)) + return table; + } + return NULL; +} + +static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, + const struct nlattr *nla) +{ + struct nft_table *table; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + table = nft_table_lookup(afi, nla); + if (table != NULL) + return table; + + 
return ERR_PTR(-ENOENT); +} + +static inline u64 nf_tables_alloc_handle(struct nft_table *table) +{ + return ++table->hgenerator; +} + +static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; + +static const struct nf_chain_type * +__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +{ + int i; + + for (i = 0; i < NFT_CHAIN_T_MAX; i++) { + if (chain_type[family][i] != NULL && + !nla_strcmp(nla, chain_type[family][i]->name)) + return chain_type[family][i]; + } + return NULL; +} + +static const struct nf_chain_type * +nf_tables_chain_type_lookup(const struct nft_af_info *afi, + const struct nlattr *nla, + bool autoload) +{ + const struct nf_chain_type *type; + + type = __nf_tables_chain_type_lookup(afi->family, nla); + if (type != NULL) + return type; +#ifdef CONFIG_MODULES + if (autoload) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-chain-%u-%.*s", afi->family, + nla_len(nla), (const char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + type = __nf_tables_chain_type_lookup(afi->family, nla); + if (type != NULL) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { + [NFTA_TABLE_NAME] = { .type = NLA_STRING }, + [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, +}; + +static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, u32 flags, int family, + const struct nft_table *table) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || + nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || + nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_dump_tables(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_table_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + 
NFT_MSG_NEWTABLE, + NLM_F_MULTI, + afi->family, table) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +done: + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + +static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + struct sk_buff *skb2; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_tables, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, + family, table); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: + kfree_skb(skb2); + return err; +} + +static int nf_tables_table_enable(const struct nft_af_info *afi, + struct nft_table *table) +{ + struct nft_chain *chain; + int err, i = 0; + + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + if (err < 0) + goto err; + + i++; + } + return 0; +err: + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + if (i-- <= 0) + break; + + nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); + } + return err; +} + +static void nf_tables_table_disable(const struct nft_af_info *afi, + struct nft_table *table) +{ + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) { + if (chain->flags & NFT_BASE_CHAIN) + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } +} + +static int nf_tables_updtable(struct nft_ctx *ctx) +{ + struct nft_trans *trans; + u32 flags; + int ret = 0; + + if (!ctx->nla[NFTA_TABLE_FLAGS]) + return 0; + + flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + + if (flags == ctx->table->flags) + return 0; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, + sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if ((flags & NFT_TABLE_F_DORMANT) && + !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { + nft_trans_table_enable(trans) = false; + } else if (!(flags & NFT_TABLE_F_DORMANT) && + ctx->table->flags & NFT_TABLE_F_DORMANT) { + ret = nf_tables_table_enable(ctx->afi, ctx->table); + if (ret >= 0) { + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + nft_trans_table_enable(trans) = true; + } + } + if (ret < 0) + goto err; + + nft_trans_table_update(trans) = true; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +err: + nft_trans_destroy(trans); + return ret; +} + +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + 
if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nlattr *name; + struct nft_af_info *afi; + struct nft_table *table; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + u32 flags = 0; + struct nft_ctx ctx; + int err; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + name = nla[NFTA_TABLE_NAME]; + table = nf_tables_table_lookup(afi, name); + if (IS_ERR(table)) { + if (PTR_ERR(table) != -ENOENT) + return PTR_ERR(table); + table = NULL; + } + + if (table != NULL) { + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + return nf_tables_updtable(&ctx); + } + + if (nla[NFTA_TABLE_FLAGS]) { + flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + } + + if (!try_module_get(afi->owner)) + return -EAFNOSUPPORT; + + table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); + if (table == NULL) { + module_put(afi->owner); + return -ENOMEM; + } + + nla_strlcpy(table->name, name, nla_len(name)); + INIT_LIST_HEAD(&table->chains); + INIT_LIST_HEAD(&table->sets); + table->flags = flags; + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); + if (err < 0) { + kfree(table); + module_put(afi->owner); + return err; + } + list_add_tail_rcu(&table->list, &afi->tables); + return 0; +} + +static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct nft_table *table; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family, err; + struct nft_ctx ctx; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + if (table->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + + list_del_rcu(&table->list); + return 0; +} + +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ + BUG_ON(ctx->table->use > 0); + + kfree(ctx->table); + module_put(ctx->afi->owner); +} + +int nft_register_chain_type(const struct nf_chain_type *ctype) +{ + int err = 0; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (chain_type[ctype->family][ctype->type] != NULL) { + err = -EBUSY; + goto out; + } + chain_type[ctype->family][ctype->type] = ctype; +out: + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return err; +} +EXPORT_SYMBOL_GPL(nft_register_chain_type); + +void nft_unregister_chain_type(const struct nf_chain_type *ctype) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + chain_type[ctype->family][ctype->type] = NULL; + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_chain_type); + +/* + * Chains + */ + +static struct nft_chain * +nf_tables_chain_lookup_byhandle(const struct 
nft_table *table, u64 handle) +{ + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) { + if (chain->handle == handle) + return chain; + } + + return ERR_PTR(-ENOENT); +} + +static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, + const struct nlattr *nla) +{ + struct nft_chain *chain; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + list_for_each_entry(chain, &table->chains, list) { + if (!nla_strcmp(nla, chain->name)) + return chain; + } + + return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { + [NFTA_CHAIN_TABLE] = { .type = NLA_STRING }, + [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 }, + [NFTA_CHAIN_NAME] = { .type = NLA_STRING, + .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, + [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, + [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { + [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, + [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, +}; + +static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +{ + struct nft_stats *cpu_stats, total; + struct nlattr *nest; + unsigned int seq; + u64 pkts, bytes; + int cpu; + + memset(&total, 0, sizeof(total)); + for_each_possible_cpu(cpu) { + cpu_stats = per_cpu_ptr(stats, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + pkts = cpu_stats->pkts; + bytes = cpu_stats->bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + total.pkts += pkts; + total.bytes += bytes; + } + nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) || + nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, u32 flags, int family, + const struct nft_table *table, + const struct nft_chain *chain) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle))) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) + goto nla_put_failure; + + if (chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = nft_base_chain(chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + struct nlattr *nest; + + nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + nla_nest_end(skb, nest); + + if (nla_put_be32(skb, NFTA_CHAIN_POLICY, + htonl(basechain->policy))) + goto nla_put_failure; + + if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) + goto nla_put_failure; + + if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) + goto nla_put_failure; + } + + 
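+ /* chain->use counts the rules currently in this chain; it is
+ * incremented in nf_tables_newrule() and decremented on rule
+ * deletion, and is exported to userspace as NFTA_CHAIN_USE.
+ */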
if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_dump_chains(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWCHAIN, + NLM_F_MULTI, + afi->family, table, chain) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } +done: + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + struct sk_buff *skb2; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_chains, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, + family, table, chain); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: + kfree_skb(skb2); + return err; +} + +static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { + [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, + [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, +}; + +static struct nft_stats __percpu *nft_stats_alloc(const struct 
nlattr *attr) +{ + struct nlattr *tb[NFTA_COUNTER_MAX+1]; + struct nft_stats __percpu *newstats; + struct nft_stats *stats; + int err; + + err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); + if (err < 0) + return ERR_PTR(err); + + if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) + return ERR_PTR(-EINVAL); + + newstats = netdev_alloc_pcpu_stats(struct nft_stats); + if (newstats == NULL) + return ERR_PTR(-ENOMEM); + + /* Restore old counters on this cpu, no problem. Per-cpu statistics + * are not exposed to userspace. + */ + stats = this_cpu_ptr(newstats); + stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + + return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, + struct nft_stats __percpu *newstats) +{ + if (chain->stats) { + struct nft_stats __percpu *oldstats = + nft_dereference(chain->stats); + + rcu_assign_pointer(chain->stats, newstats); + synchronize_rcu(); + free_percpu(oldstats); + } else + rcu_assign_pointer(chain->stats, newstats); +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ + BUG_ON(chain->use > 0); + + if (chain->flags & NFT_BASE_CHAIN) { + module_put(nft_base_chain(chain)->type->owner); + free_percpu(nft_base_chain(chain)->stats); + kfree(nft_base_chain(chain)); + } else { + kfree(chain); + } +} + +static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nlattr * uninitialized_var(name); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + struct nft_base_chain *basechain = NULL; + struct nlattr *ha[NFTA_HOOK_MAX + 1]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + u8 policy = NF_ACCEPT; + u64 handle = 0; + unsigned int i; + struct nft_stats __percpu *stats; + int err; + bool create; + struct nft_ctx ctx; + + create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + + chain = NULL; + name = nla[NFTA_CHAIN_NAME]; + + if (nla[NFTA_CHAIN_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); + chain = nf_tables_chain_lookup_byhandle(table, handle); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { + chain = nf_tables_chain_lookup(table, name); + if (IS_ERR(chain)) { + if (PTR_ERR(chain) != -ENOENT) + return PTR_ERR(chain); + chain = NULL; + } + } + + if (nla[NFTA_CHAIN_POLICY]) { + if ((chain != NULL && + !(chain->flags & NFT_BASE_CHAIN)) || + nla[NFTA_CHAIN_HOOK] == NULL) + return -EOPNOTSUPP; + + policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); + switch (policy) { + case NF_DROP: + case NF_ACCEPT: + break; + default: + return -EINVAL; + } + } + + if (chain != NULL) { + struct nft_stats *stats = NULL; + struct nft_trans *trans; + + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (nla[NFTA_CHAIN_HANDLE] && name && + !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) + return -EEXIST; + + if (nla[NFTA_CHAIN_COUNTERS]) { + if (!(chain->flags & NFT_BASE_CHAIN)) + return -EOPNOTSUPP; + + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) + return PTR_ERR(stats); + } + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, + sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + nft_trans_chain_stats(trans) = stats; + nft_trans_chain_update(trans) = true; + + if (nla[NFTA_CHAIN_POLICY]) + nft_trans_chain_policy(trans) = policy; + else + nft_trans_chain_policy(trans) = -1; + + if (nla[NFTA_CHAIN_HANDLE] && name) { + nla_strlcpy(nft_trans_chain_name(trans), name, + NFT_CHAIN_MAXNAMELEN); + } + list_add_tail(&trans->list, &net->nft.commit_list); + return 0; + } + + if (table->use == UINT_MAX) + return -EOVERFLOW; + + if (nla[NFTA_CHAIN_HOOK]) { + const struct nf_chain_type *type; + struct nf_hook_ops *ops; + nf_hookfn *hookfn; + u32 hooknum, priority; + + type = chain_type[family][NFT_CHAIN_T_DEFAULT]; + if (nla[NFTA_CHAIN_TYPE]) { + type = nf_tables_chain_type_lookup(afi, + nla[NFTA_CHAIN_TYPE], + create); + if (IS_ERR(type)) + return PTR_ERR(type); + } + + err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], + nft_hook_policy); + if (err < 0) + return err; + if (ha[NFTA_HOOK_HOOKNUM] == NULL || + ha[NFTA_HOOK_PRIORITY] == NULL) + return -EINVAL; + + hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + if (hooknum >= afi->nhooks) + return -EINVAL; + priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + + if (!(type->hook_mask & (1 << hooknum))) + return -EOPNOTSUPP; + if (!try_module_get(type->owner)) + return -ENOENT; + hookfn = type->hooks[hooknum]; + + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); + if (basechain == NULL) + return -ENOMEM; + + if (nla[NFTA_CHAIN_COUNTERS]) { + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) { + module_put(type->owner); + kfree(basechain); + return PTR_ERR(stats); + } + basechain->stats = stats; + } else { + stats = netdev_alloc_pcpu_stats(struct nft_stats); + if (IS_ERR(stats)) { + module_put(type->owner); + kfree(basechain); + return PTR_ERR(stats); + } + rcu_assign_pointer(basechain->stats, 
stats);
+ }
+
+ basechain->type = type;
+ chain = &basechain->chain;
+
+ for (i = 0; i < afi->nops; i++) {
+ ops = &basechain->ops[i];
+ ops->pf = family;
+ ops->owner = afi->owner;
+ ops->hooknum = hooknum;
+ ops->priority = priority;
+ ops->priv = chain;
+ ops->hook = afi->hooks[ops->hooknum];
+ if (hookfn)
+ ops->hook = hookfn;
+ if (afi->hook_ops_init)
+ afi->hook_ops_init(ops, i);
+ }
+
+ chain->flags |= NFT_BASE_CHAIN;
+ basechain->policy = policy;
+ } else {
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (chain == NULL)
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&chain->rules);
+ chain->handle = nf_tables_alloc_handle(table);
+ chain->net = net;
+ chain->table = table;
+ nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ chain->flags & NFT_BASE_CHAIN) {
+ err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ if (err < 0)
+ goto err1;
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
+ if (err < 0)
+ goto err2;
+
+ table->use++;
+ list_add_tail_rcu(&chain->list, &table->chains);
+ return 0;
+err2:
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ chain->flags & NFT_BASE_CHAIN) {
+ nf_unregister_hooks(nft_base_chain(chain)->ops,
+ afi->nops);
+ }
+err1:
+ nf_tables_chain_destroy(chain);
+ return err;
+}
+
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_ctx ctx;
+ int err;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+ if (chain->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN);
+ if (err < 0)
+ return err;
+
+ table->use--;
+ list_del_rcu(&chain->list);
+ return 0;
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ * nft_register_expr - register nf_tables expr type
+ * @type: expr type
+ *
+ * Registers the expr type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_type *type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (type->family == NFPROTO_UNSPEC)
+ list_add_tail_rcu(&type->list, &nf_tables_expressions);
+ else
+ list_add_rcu(&type->list, &nf_tables_expressions);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ * nft_unregister_expr - unregister nf_tables expr type
+ * @type: expr type
+ *
+ * Unregisters the expr type for use with nf_tables.
+ */ +void nft_unregister_expr(struct nft_expr_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&type->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_expr); + +static const struct nft_expr_type *__nft_expr_type_get(u8 family, + struct nlattr *nla) +{ + const struct nft_expr_type *type; + + list_for_each_entry(type, &nf_tables_expressions, list) { + if (!nla_strcmp(nla, type->name) && + (!type->family || type->family == family)) + return type; + } + return NULL; +} + +static const struct nft_expr_type *nft_expr_type_get(u8 family, + struct nlattr *nla) +{ + const struct nft_expr_type *type; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + type = __nft_expr_type_get(family, nla); + if (type != NULL && try_module_get(type->owner)) + return type; + +#ifdef CONFIG_MODULES + if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%.*s", + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { + [NFTA_EXPR_NAME] = { .type = NLA_STRING }, + [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, +}; + +static int nf_tables_fill_expr_info(struct sk_buff *skb, + const struct nft_expr *expr) +{ + if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) + goto nla_put_failure; + + if (expr->ops->dump) { + struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA); + if (data == NULL) + goto nla_put_failure; + if (expr->ops->dump(skb, expr) < 0) + goto nla_put_failure; + nla_nest_end(skb, data); + } + + return skb->len; + +nla_put_failure: + return -1; +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nf_tables_expr_parse(const struct nft_ctx *ctx, + const struct nlattr *nla, + struct nft_expr_info *info) +{ + const struct nft_expr_type *type; + const struct nft_expr_ops *ops; + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy); + if (err < 0) + return err; + + type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); + if (IS_ERR(type)) + return PTR_ERR(type); + + if (tb[NFTA_EXPR_DATA]) { + err = nla_parse_nested(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], type->policy); + if (err < 0) + goto err1; + } else + memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1)); + + if (type->select_ops != NULL) { + ops = type->select_ops(ctx, + (const struct nlattr * const *)info->tb); + if (IS_ERR(ops)) { + err = PTR_ERR(ops); + goto err1; + } + } else + ops = type->ops; + + info->ops = ops; + return 0; + +err1: + module_put(type->owner); + return err; +} + +static int nf_tables_newexpr(const struct nft_ctx *ctx, + const struct nft_expr_info *info, + struct nft_expr *expr) +{ + const struct nft_expr_ops *ops = info->ops; + int err; + + expr->ops = ops; + if (ops->init) { + err = ops->init(ctx, expr, (const struct nlattr **)info->tb); + if (err < 0) + goto err1; + } + + return 0; + +err1: + expr->ops = NULL; + return err; +} + +static void nf_tables_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) +{ + if (expr->ops->destroy) + expr->ops->destroy(ctx, 
expr); + module_put(expr->ops->type->owner); +} + +/* + * Rules + */ + +static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, + u64 handle) +{ + struct nft_rule *rule; + + // FIXME: this sucks + list_for_each_entry(rule, &chain->rules, list) { + if (handle == rule->handle) + return rule; + } + + return ERR_PTR(-ENOENT); +} + +static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, + const struct nlattr *nla) +{ + if (nla == NULL) + return ERR_PTR(-EINVAL); + + return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); +} + +static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { + [NFTA_RULE_TABLE] = { .type = NLA_STRING }, + [NFTA_RULE_CHAIN] = { .type = NLA_STRING, + .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, + [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, + [NFTA_RULE_POSITION] = { .type = NLA_U64 }, + [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, +}; + +static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, + int event, u32 flags, int family, + const struct nft_table *table, + const struct nft_chain *chain, + const struct nft_rule *rule) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + const struct nft_expr *expr, *next; + struct nlattr *list; + const struct nft_rule *prule; + int type = event | NFNL_SUBSYS_NFTABLES << 8; + + nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle))) + goto nla_put_failure; + + if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { + prule = list_entry(rule->list.prev, struct nft_rule, list); + if (nla_put_be64(skb, NFTA_RULE_POSITION, + cpu_to_be64(prule->handle))) + goto nla_put_failure; + } + + list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS); + if (list == NULL) + goto nla_put_failure; + nft_rule_for_each_expr(expr, next, rule) { + struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM); + if (elem == NULL) + goto nla_put_failure; + if (nf_tables_fill_expr_info(skb, expr) < 0) + goto nla_put_failure; + nla_nest_end(skb, elem); + } + nla_nest_end(skb, list); + + if (rule->ulen && + nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_rule_notify(const struct nft_ctx *ctx, + const struct nft_rule *rule, + int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain, rule); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static inline bool 
+nft_rule_is_active(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << net->nft.gencursor)) == 0; +} + +static inline int gencursor_next(struct net *net) +{ + return net->nft.gencursor+1 == 1 ? 1 : 0; +} + +static inline int +nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << gencursor_next(net))) == 0; +} + +static inline void +nft_rule_activate_next(struct net *net, struct nft_rule *rule) +{ + /* Now inactive, will be active in the future */ + rule->genmask = (1 << net->nft.gencursor); +} + +static inline void +nft_rule_disactivate_next(struct net *net, struct nft_rule *rule) +{ + rule->genmask = (1 << gencursor_next(net)); +} + +static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) +{ + rule->genmask = 0; +} + +static int nf_tables_dump_rules(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + const struct nft_rule *rule; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { + if (!nft_rule_is_active(net, rule)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWRULE, + NLM_F_MULTI | NLM_F_APPEND, + afi->family, table, chain, rule) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } + } +done: + rcu_read_unlock(); + + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_chain *chain; + const struct nft_rule *rule; + struct sk_buff *skb2; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_rules, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; + + rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) + return PTR_ERR(rule); + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, + family, table, chain, rule); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + +err: + 
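+ /* Building the unicast reply failed: release the preallocated skb
+ * before propagating the error.
+ */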
kfree_skb(skb2); + return err; +} + +static void nf_tables_rule_destroy(const struct nft_ctx *ctx, + struct nft_rule *rule) +{ + struct nft_expr *expr; + + /* + * Careful: some expressions might not be initialized in case this + * is called on error from nf_tables_newrule(). + */ + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + nf_tables_expr_destroy(ctx, expr); + expr = nft_expr_next(expr); + } + kfree(rule); +} + +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; + + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return trans; +} + +#define NFT_RULE_MAXEXPRS 128 + +static struct nft_expr_info *info; + +static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct net *net = sock_net(skb->sk); + struct nft_table *table; + struct nft_chain *chain; + struct nft_rule *rule, *old_rule = NULL; + struct nft_trans *trans = NULL; + struct nft_expr *expr; + struct nft_ctx ctx; + struct nlattr *tmp; + unsigned int size, i, n, ulen = 0; + int err, rem; + bool create; + u64 handle, pos_handle; + + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + + afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + + if (nla[NFTA_RULE_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); + rule = __nf_tables_rule_lookup(chain, handle); + if (IS_ERR(rule)) + return PTR_ERR(rule); + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + old_rule = rule; + else + return -EOPNOTSUPP; + } else { + if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) + return -EINVAL; + handle = nf_tables_alloc_handle(table); + + if (chain->use == UINT_MAX) + return -EOVERFLOW; + } + + if (nla[NFTA_RULE_POSITION]) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EOPNOTSUPP; + + pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); + old_rule = __nf_tables_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) + return PTR_ERR(old_rule); + } + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + + n = 0; + size = 0; + if (nla[NFTA_RULE_EXPRESSIONS]) { + nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { + err = -EINVAL; + if (nla_type(tmp) != NFTA_LIST_ELEM) + goto err1; + if (n == NFT_RULE_MAXEXPRS) + goto err1; + err = nf_tables_expr_parse(&ctx, tmp, &info[n]); + if (err < 0) + goto err1; + size += info[n].ops->size; + n++; + } + } + + if (nla[NFTA_RULE_USERDATA]) + ulen = nla_len(nla[NFTA_RULE_USERDATA]); + + err = -ENOMEM; + rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); + if (rule == NULL) + goto err1; + + nft_rule_activate_next(net, rule); + + rule->handle = handle; + rule->dlen = size; + rule->ulen = ulen; + + if (ulen) + nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + + expr = nft_expr_first(rule); + for (i = 0; i < n; i++) { + err = nf_tables_newexpr(&ctx, &info[i], expr); + if (err < 0) + goto err2; + 
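+ /* The rule now owns the module reference taken for this expression
+ * by nf_tables_expr_parse(); clear info[i].ops so the err1 unwind
+ * below does not put it a second time.
+ */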
info[i].ops = NULL; + expr = nft_expr_next(expr); + } + + if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (nft_rule_is_active_next(net, old_rule)) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, + old_rule); + if (trans == NULL) { + err = -ENOMEM; + goto err2; + } + nft_rule_disactivate_next(net, old_rule); + chain->use--; + list_add_tail_rcu(&rule->list, &old_rule->list); + } else { + err = -ENOENT; + goto err2; + } + } else if (nlh->nlmsg_flags & NLM_F_APPEND) + if (old_rule) + list_add_rcu(&rule->list, &old_rule->list); + else + list_add_tail_rcu(&rule->list, &chain->rules); + else { + if (old_rule) + list_add_tail_rcu(&rule->list, &old_rule->list); + else + list_add_rcu(&rule->list, &chain->rules); + } + + if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { + err = -ENOMEM; + goto err3; + } + chain->use++; + return 0; + +err3: + list_del_rcu(&rule->list); + if (trans) { + list_del_rcu(&nft_trans_rule(trans)->list); + nft_rule_clear(net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + chain->use++; + } +err2: + nf_tables_rule_destroy(&ctx, rule); +err1: + for (i = 0; i < n; i++) { + if (info[i].ops != NULL) + module_put(info[i].ops->type->owner); + } + return err; +} + +static int +nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) +{ + /* You cannot delete the same rule twice */ + if (nft_rule_is_active_next(ctx->net, rule)) { + if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) + return -ENOMEM; + nft_rule_disactivate_next(ctx->net, rule); + ctx->chain->use--; + return 0; + } + return -ENOENT; +} + +static int nf_table_delrule_by_chain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &ctx->chain->rules, list) { + err = nf_tables_delrule_one(ctx, rule); + if (err < 0) + return err; + } + return 0; +} + +static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_af_info *afi; + struct net *net = sock_net(skb->sk); + struct nft_table *table; + struct nft_chain *chain = NULL; + struct nft_rule *rule; + int family = nfmsg->nfgen_family, err = 0; + struct nft_ctx ctx; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; + + if (nla[NFTA_RULE_CHAIN]) { + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } + + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + + if (chain) { + if (nla[NFTA_RULE_HANDLE]) { + rule = nf_tables_rule_lookup(chain, + nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) + return PTR_ERR(rule); + + err = nf_tables_delrule_one(&ctx, rule); + } else { + err = nf_table_delrule_by_chain(&ctx); + } + } else { + list_for_each_entry(chain, &table->chains, list) { + ctx.chain = chain; + err = nf_table_delrule_by_chain(&ctx); + if (err < 0) + break; + } + } + + return err; +} + +/* + * Sets + */ + +static LIST_HEAD(nf_tables_set_ops); + +int nft_register_set(struct nft_set_ops *ops) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_tail_rcu(&ops->list, &nf_tables_set_ops); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return 0; +} +EXPORT_SYMBOL_GPL(nft_register_set); + +void nft_unregister_set(struct nft_set_ops *ops) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&ops->list); + 
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_set);
+
+/*
+ * Select a set implementation based on the data characteristics and the
+ * given policy. The total memory use might not be known if no size is
+ * given; in that case the amount of memory per element is used.
+ */
+static const struct nft_set_ops *
+nft_select_set_ops(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc,
+ enum nft_set_policies policy)
+{
+ const struct nft_set_ops *ops, *bops;
+ struct nft_set_estimate est, best;
+ u32 features;
+
+#ifdef CONFIG_MODULES
+ if (list_empty(&nf_tables_set_ops)) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-set");
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (!list_empty(&nf_tables_set_ops))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ features = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ features &= NFT_SET_INTERVAL | NFT_SET_MAP;
+ }
+
+ bops = NULL;
+ best.size = ~0;
+ best.class = ~0;
+
+ list_for_each_entry(ops, &nf_tables_set_ops, list) {
+ if ((ops->features & features) != features)
+ continue;
+ if (!ops->estimate(desc, features, &est))
+ continue;
+
+ switch (policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ if (est.class < best.class)
+ break;
+ if (est.class == best.class && est.size < best.size)
+ break;
+ continue;
+ case NFT_SET_POL_MEMORY:
+ if (est.size < best.size)
+ break;
+ if (est.size == best.size && est.class < best.class)
+ break;
+ continue;
+ default:
+ break;
+ }
+
+ if (!try_module_get(ops->owner))
+ continue;
+ if (bops != NULL)
+ module_put(bops->owner);
+
+ bops = ops;
+ best = est;
+ }
+
+ if (bops != NULL)
+ return bops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+ [NFTA_SET_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_NAME] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFTA_SET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_POLICY] = { .type = NLA_U32 },
+ [NFTA_SET_DESC] = { .type = NLA_NESTED },
+ [NFTA_SET_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
+ [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi = NULL;
+ struct nft_table *table = NULL;
+
+ if (nfmsg->nfgen_family != NFPROTO_UNSPEC) {
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+ }
+
+ if (nla[NFTA_SET_TABLE] != NULL) {
+ if (afi == NULL)
+ return -EAFNOSUPPORT;
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ }
+
+ nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ return 0;
+}
+
+struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+ const struct nlattr *nla)
+{
+ struct nft_set *set;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(set, &table->sets, list) {
+ if (!nla_strcmp(nla, set->name))
+ return set;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+struct nft_set *nf_tables_set_lookup_byid(const struct net
*net, + const struct nlattr *nla) +{ + struct nft_trans *trans; + u32 id = ntohl(nla_get_be32(nla)); + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET && + id == nft_trans_set_id(trans)) + return nft_trans_set(trans); + } + return ERR_PTR(-ENOENT); +} + +static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, + const char *name) +{ + const struct nft_set *i; + const char *p; + unsigned long *inuse; + unsigned int n = 0, min = 0; + + p = strnchr(name, IFNAMSIZ, '%'); + if (p != NULL) { + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (inuse == NULL) + return -ENOMEM; +cont: + list_for_each_entry(i, &ctx->table->sets, list) { + int tmp; + + if (!sscanf(i->name, name, &tmp)) + continue; + if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) + continue; + + set_bit(tmp - min, inuse); + } + + n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); + if (n >= BITS_PER_BYTE * PAGE_SIZE) { + min += BITS_PER_BYTE * PAGE_SIZE; + memset(inuse, 0, PAGE_SIZE); + goto cont; + } + free_page((unsigned long)inuse); + } + + snprintf(set->name, sizeof(set->name), name, min + n); + list_for_each_entry(i, &ctx->table->sets, list) { + if (!strcmp(set->name, i->name)) + return -ENFILE; + } + return 0; +} + +static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, + const struct nft_set *set, u16 event, u16 flags) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *desc; + u32 portid = ctx->portid; + u32 seq = ctx->seq; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + if (set->flags != 0) + if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) + goto nla_put_failure; + + if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen))) + goto nla_put_failure; + if (set->flags & NFT_SET_MAP) { + if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) + goto nla_put_failure; + } + + desc = nla_nest_start(skb, NFTA_SET_DESC); + if (desc == NULL) + goto nla_put_failure; + if (set->size && + nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + goto nla_put_failure; + nla_nest_end(skb, desc); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_set_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + int event, gfp_t gfp_flags) +{ + struct sk_buff *skb; + u32 portid = ctx->portid; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); + if (skb == NULL) + goto err; + + err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, + ctx->report, gfp_flags); +err: + if (err < 0) + nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static 
int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx = 0, s_idx = cb->args[0]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + cb->args[1] = 1; +done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx, s_idx = cb->args[0]; + struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(table, &ctx->afi->tables, list) { + if (cur_table) { + if (cur_table != table) + continue; + + cur_table = NULL; + } + ctx->table = table; + idx = 0; + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + cb->args[1] = 1; +done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nft_set *set; + unsigned int idx, s_idx = cb->args[0]; + struct nft_af_info *afi; + struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; + struct net *net = sock_net(skb->sk); + int cur_family = cb->args[3]; + + if (cb->args[1]) + return skb->len; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (cur_family) { + if (afi->family != cur_family) + continue; + + cur_family = 0; + } + + list_for_each_entry_rcu(table, &afi->tables, list) { + if (cur_table) { + if (cur_table != table) + continue; + + cur_table = NULL; + } + + ctx->table = table; + ctx->afi = afi; + idx = 0; + list_for_each_entry_rcu(set, &ctx->table->sets, list) { + if (idx < s_idx) + goto cont; + if (nf_tables_fill_set(skb, ctx, set, + NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + cb->args[3] = afi->family; + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + if (s_idx) + s_idx = 0; + } + } + cb->args[1] = 1; +done: + rcu_read_unlock(); + return skb->len; +} + +static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nlattr *nla[NFTA_SET_MAX + 1]; + struct nft_ctx ctx; + int err, ret; + + err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX, + nft_set_policy); + if (err < 0) + return err; + + err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla); + if (err < 0) + return err; + + if (ctx.table == NULL) { + if (ctx.afi == NULL) + ret = nf_tables_dump_sets_all(&ctx, skb, cb); + else + ret = nf_tables_dump_sets_family(&ctx, skb, cb); + } else + ret = nf_tables_dump_sets_table(&ctx, skb, cb); + + return ret; +} + +#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ + +static int nf_tables_getset(struct 
sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+ struct sk_buff *skb2;
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ int err;
+
+ /* Verify existence before starting dump */
+ err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ if (err < 0)
+ return err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_sets,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ /* Only accept unspec with dump */
+ if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ return -EAFNOSUPPORT;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
+ struct nft_set_desc *desc,
+ const struct nlattr *nla)
+{
+ struct nlattr *da[NFTA_SET_DESC_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
+ if (err < 0)
+ return err;
+
+ if (da[NFTA_SET_DESC_SIZE] != NULL)
+ desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+
+ return 0;
+}
+
+static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
+ nft_trans_set_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
+ set->flags |= NFT_SET_INACTIVE;
+ }
+ nft_trans_set(trans) = set;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+}
+
+static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_set_ops *ops;
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ char name[IFNAMSIZ];
+ unsigned int size;
+ bool create;
+ u32 ktype, dtype, flags, policy;
+ struct nft_set_desc desc;
+ int err;
+
+ if (nla[NFTA_SET_TABLE] == NULL ||
+ nla[NFTA_SET_NAME] == NULL ||
+ nla[NFTA_SET_KEY_LEN] == NULL ||
+ nla[NFTA_SET_ID] == NULL)
+ return -EINVAL;
+
+ memset(&desc, 0, sizeof(desc));
+
+ ktype = NFT_DATA_VALUE;
+ if (nla[NFTA_SET_KEY_TYPE] != NULL) {
+ ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+ if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+ return -EINVAL;
+ }
+
+ desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+ if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data))
+ return -EINVAL;
+
+ flags = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
+ NFT_SET_INTERVAL | NFT_SET_MAP))
+ return -EINVAL;
+ }
+
+ dtype = 0;
+ if (nla[NFTA_SET_DATA_TYPE] != NULL) {
+ if (!(flags & NFT_SET_MAP))
+ return -EINVAL;
+
+ dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+ if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+ dtype != NFT_DATA_VERDICT)
+ return
-EINVAL; + + if (dtype != NFT_DATA_VERDICT) { + if (nla[NFTA_SET_DATA_LEN] == NULL) + return -EINVAL; + desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); + if (desc.dlen == 0 || + desc.dlen > FIELD_SIZEOF(struct nft_data, data)) + return -EINVAL; + } else + desc.dlen = sizeof(struct nft_data); + } else if (flags & NFT_SET_MAP) + return -EINVAL; + + policy = NFT_SET_POL_PERFORMANCE; + if (nla[NFTA_SET_POLICY] != NULL) + policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + + if (nla[NFTA_SET_DESC] != NULL) { + err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); + if (err < 0) + return err; + } + + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + + afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + if (IS_ERR(table)) + return PTR_ERR(table); + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + + set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); + if (IS_ERR(set)) { + if (PTR_ERR(set) != -ENOENT) + return PTR_ERR(set); + set = NULL; + } + + if (set != NULL) { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + return 0; + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -ENOENT; + + ops = nft_select_set_ops(nla, &desc, policy); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + size = 0; + if (ops->privsize != NULL) + size = ops->privsize(nla); + + err = -ENOMEM; + set = kzalloc(sizeof(*set) + size, GFP_KERNEL); + if (set == NULL) + goto err1; + + nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); + err = nf_tables_set_alloc_name(&ctx, set, name); + if (err < 0) + goto err2; + + INIT_LIST_HEAD(&set->bindings); + set->ops = ops; + set->ktype = ktype; + set->klen = desc.klen; + set->dtype = dtype; + set->dlen = desc.dlen; + set->flags = flags; + set->size = desc.size; + + err = ops->init(set, &desc, nla); + if (err < 0) + goto err2; + + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); + if (err < 0) + goto err2; + + list_add_tail_rcu(&set->list, &table->sets); + table->use++; + return 0; + +err2: + kfree(set); +err1: + module_put(ops->owner); + return err; +} + +static void nft_set_destroy(struct nft_set *set) +{ + set->ops->destroy(set); + module_put(set->ops->owner); + kfree(set); +} + +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ + list_del_rcu(&set->list); + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + nft_set_destroy(set); +} + +static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_set *set; + struct nft_ctx ctx; + int err; + + if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + return -EAFNOSUPPORT; + if (nla[NFTA_SET_TABLE] == NULL) + return -EINVAL; + + err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; + if (!list_empty(&set->bindings)) + return -EBUSY; + + err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del_rcu(&set->list); + ctx.table->use--; + return 0; +} + +static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_iter *iter, + const struct nft_set_elem *elem) +{ 
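+ /* Per-element walk callback used when binding a map to a chain:
+ * replay each stored datum through nft_validate_data_load() so that
+ * verdicts (e.g. jump targets) are validated against the chain the
+ * set is being bound to.
+ */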
+ enum nft_registers dreg;
+
+ dreg = nft_type_to_reg(set->dtype);
+ return nft_validate_data_load(ctx, dreg, &elem->data,
+ set->dtype == NFT_DATA_VERDICT ?
+ NFT_DATA_VERDICT : NFT_DATA_VALUE);
+}
+
+int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ struct nft_set_binding *i;
+ struct nft_set_iter iter;
+
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+ return -EBUSY;
+
+ if (set->flags & NFT_SET_MAP) {
+ /* If the set is already bound to the same chain, all
+ * jumps are already validated for that chain.
+ */
+ list_for_each_entry(i, &set->bindings, list) {
+ if (i->chain == binding->chain)
+ goto bind;
+ }
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_bind_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0) {
+ /* Destroy anonymous sets if binding fails */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ nf_tables_set_destroy(ctx, set);
+
+ return iter.err;
+ }
+ }
+bind:
+ binding->chain = ctx->chain;
+ list_add_tail_rcu(&binding->list, &set->bindings);
+ return 0;
+}
+
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ list_del_rcu(&binding->list);
+
+ if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS &&
+ !(set->flags & NFT_SET_INACTIVE))
+ nf_tables_set_destroy(ctx, set);
+}
+
+/*
+ * Set elements
+ */
+
+static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
+ [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
+ [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[],
+ bool trans)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (!trans && (table->flags & NFT_TABLE_INACTIVE))
+ return -ENOENT;
+
+ nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ return 0;
+}
+
+static int nf_tables_fill_setelem(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE,
+ set->klen) < 0)
+ goto nla_put_failure;
+
+ if (set->flags & NFT_SET_MAP &&
+ !(elem->flags & NFT_SET_ELEM_INTERVAL_END) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data,
+ set->dtype == NFT_DATA_VERDICT ?
NFT_DATA_VERDICT : NFT_DATA_VALUE, + set->dlen) < 0) + goto nla_put_failure; + + if (elem->flags != 0) + if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags))) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +struct nft_set_dump_args { + const struct netlink_callback *cb; + struct nft_set_iter iter; + struct sk_buff *skb; +}; + +static int nf_tables_dump_setelem(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_iter *iter, + const struct nft_set_elem *elem) +{ + struct nft_set_dump_args *args; + + args = container_of(iter, struct nft_set_dump_args, iter); + return nf_tables_fill_setelem(args->skb, set, elem); +} + +static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nft_set *set; + struct nft_set_dump_args args; + struct nft_ctx ctx; + struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *nest; + u32 portid, seq; + int event, err; + + err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, + NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy); + if (err < 0) + return err; + + err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, + false); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; + + event = NFT_MSG_NEWSETELEM; + event |= NFNL_SUBSYS_NFTABLES << 8; + portid = NETLINK_CB(cb->skb).portid; + seq = cb->nlh->nlmsg_seq; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + NLM_F_MULTI); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx.afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + args.cb = cb; + args.skb = skb; + args.iter.skip = cb->args[0]; + args.iter.count = 0; + args.iter.err = 0; + args.iter.fn = nf_tables_dump_setelem; + set->ops->walk(&ctx, set, &args.iter); + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + + if (args.iter.err && args.iter.err != -EMSGSIZE) + return args.iter.err; + if (args.iter.count == cb->args[0]) + return 0; + + cb->args[0] = args.iter.count; + return skb->len; + +nla_put_failure: + return -ENOSPC; +} + +static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nft_set *set; + struct nft_ctx ctx; + int err; + + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_set, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + return -EOPNOTSUPP; +} + +static int nf_tables_fill_setelem_info(struct sk_buff *skb, + const struct nft_ctx *ctx, u32 seq, + u32 portid, int event, u16 flags, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr 
*nlh; + struct nlattr *nest; + int err; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + err = nf_tables_fill_setelem(skb, set, elem); + if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) +{ + struct net *net = ctx->net; + u32 portid = ctx->portid; + struct sk_buff *skb; + int err; + + if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, + set, elem); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); +err: + if (err < 0) + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, + int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + if (trans == NULL) + return NULL; + + nft_trans_elem_set(trans) = set; + return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_data_desc d1, d2; + struct nft_set_elem elem; + struct nft_set_binding *binding; + enum nft_registers dreg; + struct nft_trans *trans; + int err; + + if (set->size && set->nelems == set->size) + return -ENFILE; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy); + if (err < 0) + return err; + + if (nla[NFTA_SET_ELEM_KEY] == NULL) + return -EINVAL; + + elem.flags = 0; + if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { + elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); + if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + } + + if (set->flags & NFT_SET_MAP) { + if (nla[NFTA_SET_ELEM_DATA] == NULL && + !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) + return -EINVAL; + if (nla[NFTA_SET_ELEM_DATA] != NULL && + elem.flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + } else { + if (nla[NFTA_SET_ELEM_DATA] != NULL) + return -EINVAL; + } + + err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err1; + err = -EINVAL; + if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) + goto err2; + + err = -EEXIST; + if (set->ops->get(set, &elem) == 0) + goto err2; + + if (nla[NFTA_SET_ELEM_DATA] != NULL) { + err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]); + if (err < 0) + goto err2; + + err = -EINVAL; + if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) + goto err3; + + dreg = nft_type_to_reg(set->dtype); + list_for_each_entry(binding, &set->bindings, list) { + struct nft_ctx bind_ctx = { + .afi = ctx->afi, + .table 
= ctx->table, + .chain = (struct nft_chain *)binding->chain, + }; + + err = nft_validate_data_load(&bind_ctx, dreg, + &elem.data, d2.type); + if (err < 0) + goto err3; + } + } + + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); + if (trans == NULL) + goto err3; + + err = set->ops->insert(set, &elem); + if (err < 0) + goto err4; + + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; + +err4: + kfree(trans); +err3: + if (nla[NFTA_SET_ELEM_DATA] != NULL) + nft_data_uninit(&elem.data, d2.type); +err2: + nft_data_uninit(&elem.key, d1.type); +err1: + return err; +} + +static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + const struct nlattr *attr; + struct nft_set *set; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) { + if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { + set = nf_tables_set_lookup_byid(net, + nla[NFTA_SET_ELEM_LIST_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_add_set_elem(&ctx, set, attr); + if (err < 0) + break; + + set->nelems++; + } + return err; +} + +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_data_desc desc; + struct nft_set_elem elem; + struct nft_trans *trans; + int err; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy); + if (err < 0) + goto err1; + + err = -EINVAL; + if (nla[NFTA_SET_ELEM_KEY] == NULL) + goto err1; + + err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err1; + + err = -EINVAL; + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) + goto err2; + + err = set->ops->get(set, &elem); + if (err < 0) + goto err2; + + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); + if (trans == NULL) + goto err2; + + nft_trans_elem(trans) = elem; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + nft_data_uninit(&elem.key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP) + nft_data_uninit(&elem.data, set->dtype); + +err2: + nft_data_uninit(&elem.key, desc.type); +err1: + return err; +} + +static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nlattr *attr; + struct nft_set *set; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + if (IS_ERR(set)) + return PTR_ERR(set); + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_del_setelem(&ctx, set, attr); + if (err < 0) + break; + + set->nelems--; + } + return err; +} + +static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { + [NFT_MSG_NEWTABLE] = { + .call_batch = nf_tables_newtable, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, + [NFT_MSG_GETTABLE] = { + .call = nf_tables_gettable, + 
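/* [Editor's note] The GET handlers in this table use .call and are
 * served one message at a time, while the NEW and DEL handlers use
 * .call_batch and only take effect through the commit/abort
 * transaction paths implemented below. */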
.attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, + [NFT_MSG_DELTABLE] = { + .call_batch = nf_tables_deltable, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, + [NFT_MSG_NEWCHAIN] = { + .call_batch = nf_tables_newchain, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, + [NFT_MSG_GETCHAIN] = { + .call = nf_tables_getchain, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, + [NFT_MSG_DELCHAIN] = { + .call_batch = nf_tables_delchain, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, + [NFT_MSG_NEWRULE] = { + .call_batch = nf_tables_newrule, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, + [NFT_MSG_GETRULE] = { + .call = nf_tables_getrule, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, + [NFT_MSG_DELRULE] = { + .call_batch = nf_tables_delrule, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, + [NFT_MSG_NEWSET] = { + .call_batch = nf_tables_newset, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, + [NFT_MSG_GETSET] = { + .call = nf_tables_getset, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, + [NFT_MSG_DELSET] = { + .call_batch = nf_tables_delset, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, + [NFT_MSG_NEWSETELEM] = { + .call_batch = nf_tables_newsetelem, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, + [NFT_MSG_GETSETELEM] = { + .call = nf_tables_getsetelem, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, + [NFT_MSG_DELSETELEM] = { + .call_batch = nf_tables_delsetelem, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, +}; + +static void nft_chain_commit_update(struct nft_trans *trans) +{ + struct nft_base_chain *basechain; + + if (nft_trans_chain_name(trans)[0]) + strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); + + if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN)) + return; + + basechain = nft_base_chain(trans->ctx.chain); + nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans)); + + switch (nft_trans_chain_policy(trans)) { + case NF_DROP: + case NF_ACCEPT: + basechain->policy = nft_trans_chain_policy(trans); + break; + } +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * removed rules. + */ +static void nf_tables_commit_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_DELTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_DELRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_DELSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_commit(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + /* Bump generation counter, invalidate any dump in progress */ + while (++net->nft.base_seq == 0); + + /* A new generation has just started */ + net->nft.gencursor = gencursor_next(net); + + /* Make sure all packets have left the previous generation before + * purging old rules. 
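 *
 * [Editor's note] Each rule carries a genmask and the packet path
 * samples net->nft.gencursor once per traversal (see nft_do_chain()
 * in nf_tables_core.c below), skipping rules not active in the
 * sampled generation. Flipping the cursor above moves new traversals
 * to the new generation; the synchronize_rcu() that follows waits
 * out the ones still walking the old one.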
+ */ + synchronize_rcu(); + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (!nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + } else { + trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + } + nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELTABLE: + nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) + nft_chain_commit_update(trans); + else + trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + + nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + break; + case NFT_MSG_NEWRULE: + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_NEWRULE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELRULE: + list_del_rcu(&nft_trans_rule(trans)->list); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_DELRULE); + break; + case NFT_MSG_NEWSET: + nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS && + !list_empty(&nft_trans_set(trans)->bindings)) + trans->ctx.table->use--; + + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_NEWSET, GFP_KERNEL); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSET: + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_DELSET, GFP_KERNEL); + break; + case NFT_MSG_NEWSETELEM: + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_NEWSETELEM, 0); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_DELSETELEM, 0); + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); + } + + return 0; +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * aborted rules. 
+ */ +static void nf_tables_abort_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_NEWCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_NEWRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_NEWSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_abort(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + nft_trans_destroy(trans); + } else { + list_del_rcu(&trans->ctx.table->list); + } + break; + case NFT_MSG_DELTABLE: + list_add_tail_rcu(&trans->ctx.table->list, + &trans->ctx.afi->tables); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) { + if (nft_trans_chain_stats(trans)) + free_percpu(nft_trans_chain_stats(trans)); + + nft_trans_destroy(trans); + } else { + trans->ctx.table->use--; + list_del_rcu(&trans->ctx.chain->list); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + } + break; + case NFT_MSG_DELCHAIN: + trans->ctx.table->use++; + list_add_tail_rcu(&trans->ctx.chain->list, + &trans->ctx.table->chains); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWRULE: + trans->ctx.chain->use--; + list_del_rcu(&nft_trans_rule(trans)->list); + break; + case NFT_MSG_DELRULE: + trans->ctx.chain->use++; + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSET: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_set(trans)->list); + break; + case NFT_MSG_DELSET: + trans->ctx.table->use++; + list_add_tail_rcu(&nft_trans_set(trans)->list, + &trans->ctx.table->sets); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSETELEM: + nft_trans_elem_set(trans)->nelems--; + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nft_trans_elem_set(trans)->nelems++; + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe_reverse(trans, next, + &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); + } + + return 0; +} + +static const struct nfnetlink_subsystem nf_tables_subsys = { + .name = "nf_tables", + .subsys_id = NFNL_SUBSYS_NFTABLES, + .cb_count = NFT_MSG_MAX, + .cb = nf_tables_cb, + .commit = nf_tables_commit, + .abort = nf_tables_abort, +}; + +/* + * Loop detection - walk through the ruleset beginning at the destination chain + * of a new jump until either the source chain is reached (loop) or all + * reachable chains have been traversed. + * + * The loop check is performed whenever a new jump verdict is added to an + * expression or verdict map or a verdict map is bound to a new chain. 
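 *
 * [Editor's note] Example: chain A already jumps to chain B; adding a
 * jump from B back to A starts the walk at the destination chain A,
 * follows A's existing jump into B, hits the source chain and fails
 * the add with -ELOOP.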
+ */ + +static int nf_tables_check_loops(const struct nft_ctx *ctx, + const struct nft_chain *chain); + +static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_iter *iter, + const struct nft_set_elem *elem) +{ + if (elem->flags & NFT_SET_ELEM_INTERVAL_END) + return 0; + + switch (elem->data.verdict) { + case NFT_JUMP: + case NFT_GOTO: + return nf_tables_check_loops(ctx, elem->data.chain); + default: + return 0; + } +} + +static int nf_tables_check_loops(const struct nft_ctx *ctx, + const struct nft_chain *chain) +{ + const struct nft_rule *rule; + const struct nft_expr *expr, *last; + const struct nft_set *set; + struct nft_set_binding *binding; + struct nft_set_iter iter; + + if (ctx->chain == chain) + return -ELOOP; + + list_for_each_entry(rule, &chain->rules, list) { + nft_rule_for_each_expr(expr, last, rule) { + const struct nft_data *data = NULL; + int err; + + if (!expr->ops->validate) + continue; + + err = expr->ops->validate(ctx, expr, &data); + if (err < 0) + return err; + + if (data == NULL) + continue; + + switch (data->verdict) { + case NFT_JUMP: + case NFT_GOTO: + err = nf_tables_check_loops(ctx, data->chain); + if (err < 0) + return err; + default: + break; + } + } + } + + list_for_each_entry(set, &ctx->table->sets, list) { + if (!(set->flags & NFT_SET_MAP) || + set->dtype != NFT_DATA_VERDICT) + continue; + + list_for_each_entry(binding, &set->bindings, list) { + if (binding->chain != chain) + continue; + + iter.skip = 0; + iter.count = 0; + iter.err = 0; + iter.fn = nf_tables_loop_check_setelem; + + set->ops->walk(ctx, set, &iter); + if (iter.err < 0) + return iter.err; + } + } + + return 0; +} + +/** + * nft_validate_input_register - validate an expressions' input register + * + * @reg: the register number + * + * Validate that the input register is one of the general purpose + * registers. + */ +int nft_validate_input_register(enum nft_registers reg) +{ + if (reg <= NFT_REG_VERDICT) + return -EINVAL; + if (reg > NFT_REG_MAX) + return -ERANGE; + return 0; +} +EXPORT_SYMBOL_GPL(nft_validate_input_register); + +/** + * nft_validate_output_register - validate an expressions' output register + * + * @reg: the register number + * + * Validate that the output register is one of the general purpose + * registers or the verdict register. + */ +int nft_validate_output_register(enum nft_registers reg) +{ + if (reg < NFT_REG_VERDICT) + return -EINVAL; + if (reg > NFT_REG_MAX) + return -ERANGE; + return 0; +} +EXPORT_SYMBOL_GPL(nft_validate_output_register); + +/** + * nft_validate_data_load - validate an expressions' data load + * + * @ctx: context of the expression performing the load + * @reg: the destination register number + * @data: the data to load + * @type: the data type + * + * Validate that a data load uses the appropriate data type for + * the destination register. A value of NULL for the data means + * that its runtime gathered data, which is always of type + * NFT_DATA_VALUE. 
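 *
 * [Editor's note] Loading a jump or goto into the verdict register
 * additionally runs the loop check above and maintains chain levels:
 * the target chain's level is raised to one above the source's, and
 * reaching NFT_JUMP_STACK_SIZE fails with -EMLINK, which keeps the
 * fixed jumpstack in nft_do_chain() from overflowing.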
+ */ +int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type) +{ + int err; + + switch (reg) { + case NFT_REG_VERDICT: + if (data == NULL || type != NFT_DATA_VERDICT) + return -EINVAL; + + if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) { + err = nf_tables_check_loops(ctx, data->chain); + if (err < 0) + return err; + + if (ctx->chain->level + 1 > data->chain->level) { + if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE) + return -EMLINK; + data->chain->level = ctx->chain->level + 1; + } + } + + return 0; + default: + if (data != NULL && type != NFT_DATA_VALUE) + return -EINVAL; + return 0; + } +} +EXPORT_SYMBOL_GPL(nft_validate_data_load); + +static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { + [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, + [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, + .len = NFT_CHAIN_MAXNAMELEN - 1 }, +}; + +static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, + struct nft_data_desc *desc, const struct nlattr *nla) +{ + struct nlattr *tb[NFTA_VERDICT_MAX + 1]; + struct nft_chain *chain; + int err; + + err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy); + if (err < 0) + return err; + + if (!tb[NFTA_VERDICT_CODE]) + return -EINVAL; + data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); + + switch (data->verdict) { + default: + switch (data->verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; + default: + return -EINVAL; + } + /* fall through */ + case NFT_CONTINUE: + case NFT_BREAK: + case NFT_RETURN: + desc->len = sizeof(data->verdict); + break; + case NFT_JUMP: + case NFT_GOTO: + if (!tb[NFTA_VERDICT_CHAIN]) + return -EINVAL; + chain = nf_tables_chain_lookup(ctx->table, + tb[NFTA_VERDICT_CHAIN]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + if (chain->flags & NFT_BASE_CHAIN) + return -EOPNOTSUPP; + + chain->use++; + data->chain = chain; + desc->len = sizeof(data); + break; + } + + desc->type = NFT_DATA_VERDICT; + return 0; +} + +static void nft_verdict_uninit(const struct nft_data *data) +{ + switch (data->verdict) { + case NFT_JUMP: + case NFT_GOTO: + data->chain->use--; + break; + } +} + +static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, NFTA_DATA_VERDICT); + if (!nest) + goto nla_put_failure; + + if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict))) + goto nla_put_failure; + + switch (data->verdict) { + case NFT_JUMP: + case NFT_GOTO: + if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -1; +} + +static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, + struct nft_data_desc *desc, const struct nlattr *nla) +{ + unsigned int len; + + len = nla_len(nla); + if (len == 0) + return -EINVAL; + if (len > sizeof(data->data)) + return -EOVERFLOW; + + nla_memcpy(data->data, nla, sizeof(data->data)); + desc->type = NFT_DATA_VALUE; + desc->len = len; + return 0; +} + +static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data, + unsigned int len) +{ + return nla_put(skb, NFTA_DATA_VALUE, len, data->data); +} + +static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { + [NFTA_DATA_VALUE] = { .type = NLA_BINARY, + .len = FIELD_SIZEOF(struct nft_data, data) }, + [NFTA_DATA_VERDICT] = { .type = NLA_NESTED }, +}; + +/** + * 
nft_data_init - parse nf_tables data netlink attributes + * + * @ctx: context of the expression using the data + * @data: destination struct nft_data + * @desc: data description + * @nla: netlink attribute containing data + * + * Parse the netlink data attributes and initialize a struct nft_data. + * The type and length of data are returned in the data description. + * + * The caller can indicate that it only wants to accept data of type + * NFT_DATA_VALUE by passing NULL for the ctx argument. + */ +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, + struct nft_data_desc *desc, const struct nlattr *nla) +{ + struct nlattr *tb[NFTA_DATA_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy); + if (err < 0) + return err; + + if (tb[NFTA_DATA_VALUE]) + return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + if (tb[NFTA_DATA_VERDICT] && ctx != NULL) + return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(nft_data_init); + +/** + * nft_data_uninit - release a nft_data item + * + * @data: struct nft_data to release + * @type: type of data + * + * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, + * all others need to be released by calling this function. + */ +void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +{ + switch (type) { + case NFT_DATA_VALUE: + return; + case NFT_DATA_VERDICT: + return nft_verdict_uninit(data); + default: + WARN_ON(1); + } +} +EXPORT_SYMBOL_GPL(nft_data_uninit); + +int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, + enum nft_data_types type, unsigned int len) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start(skb, attr); + if (nest == NULL) + return -1; + + switch (type) { + case NFT_DATA_VALUE: + err = nft_value_dump(skb, data, len); + break; + case NFT_DATA_VERDICT: + err = nft_verdict_dump(skb, data); + break; + default: + err = -EINVAL; + WARN_ON(1); + } + + nla_nest_end(skb, nest); + return err; +} +EXPORT_SYMBOL_GPL(nft_data_dump); + +static int nf_tables_init_net(struct net *net) +{ + INIT_LIST_HEAD(&net->nft.af_info); + INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; + return 0; +} + +static struct pernet_operations nf_tables_net_ops = { + .init = nf_tables_init_net, +}; + +static int __init nf_tables_module_init(void) +{ + int err; + + info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, + GFP_KERNEL); + if (info == NULL) { + err = -ENOMEM; + goto err1; + } + + err = nf_tables_core_module_init(); + if (err < 0) + goto err2; + + err = nfnetlink_subsys_register(&nf_tables_subsys); + if (err < 0) + goto err3; + + pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n"); + return register_pernet_subsys(&nf_tables_net_ops); +err3: + nf_tables_core_module_exit(); +err2: + kfree(info); +err1: + return err; +} + +static void __exit nf_tables_module_exit(void) +{ + unregister_pernet_subsys(&nf_tables_net_ops); + nfnetlink_subsys_unregister(&nf_tables_subsys); + nf_tables_core_module_exit(); + kfree(info); +} + +module_init(nf_tables_module_init); +module_exit(nf_tables_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c new file mode 100644 index 00000000000..3b90eb2b2c5 --- /dev/null +++ b/net/netfilter/nf_tables_core.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008 
Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> + +static void nft_cmp_fast_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1]) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + u32 mask = nft_cmp_fast_mask(priv->len); + + if ((data[priv->sreg].data[0] & mask) == priv->data) + return; + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static bool nft_payload_fast_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + struct nft_data *dest = &data[priv->dreg]; + unsigned char *ptr; + + if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) + ptr = skb_network_header(skb); + else + ptr = skb_network_header(skb) + pkt->xt.thoff; + + ptr += priv->offset; + + if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) + return false; + + if (priv->len == 2) + *(u16 *)dest->data = *(u16 *)ptr; + else if (priv->len == 4) + *(u32 *)dest->data = *(u32 *)ptr; + else + *(u8 *)dest->data = *(u8 *)ptr; + return true; +} + +struct nft_jumpstack { + const struct nft_chain *chain; + const struct nft_rule *rule; + int rulenum; +}; + +enum nft_trace { + NFT_TRACE_RULE, + NFT_TRACE_RETURN, + NFT_TRACE_POLICY, +}; + +static const char *const comments[] = { + [NFT_TRACE_RULE] = "rule", + [NFT_TRACE_RETURN] = "return", + [NFT_TRACE_POLICY] = "policy", +}; + +static struct nf_loginfo trace_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 4, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static void nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) +{ + struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); + + nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", + chain->table->name, chain->name, comments[type], + rulenum); +} + +unsigned int +nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +{ + const struct nft_chain *chain = ops->priv, *basechain = chain; + const struct nft_rule *rule; + const struct nft_expr *expr, *last; + struct nft_data data[NFT_REG_MAX + 1]; + unsigned int stackptr = 0; + struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; + struct nft_stats *stats; + int rulenum; + /* + * Cache cursor to avoid problems in case that the cursor is updated + * while traversing the ruleset. + */ + unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); + +do_chain: + rulenum = 0; + rule = list_entry(&chain->rules, struct nft_rule, list); +next_rule: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + list_for_each_entry_continue_rcu(rule, &chain->rules, list) { + + /* This rule is not active, skip. 
*/ + if (unlikely(rule->genmask & (1 << gencursor))) + continue; + + rulenum++; + + nft_rule_for_each_expr(expr, last, rule) { + if (expr->ops == &nft_cmp_fast_ops) + nft_cmp_fast_eval(expr, data); + else if (expr->ops != &nft_payload_fast_ops || + !nft_payload_fast_eval(expr, data, pkt)) + expr->ops->eval(expr, data, pkt); + + if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE) + break; + } + + switch (data[NFT_REG_VERDICT].verdict) { + case NFT_BREAK: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + continue; + case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + continue; + } + break; + } + + switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + + return data[NFT_REG_VERDICT].verdict; + } + + switch (data[NFT_REG_VERDICT].verdict) { + case NFT_JUMP: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + + BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); + jumpstack[stackptr].chain = chain; + jumpstack[stackptr].rule = rule; + jumpstack[stackptr].rulenum = rulenum; + stackptr++; + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; + case NFT_GOTO: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; + case NFT_RETURN: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); + break; + case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) + nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); + break; + default: + WARN_ON(1); + } + + if (stackptr > 0) { + stackptr--; + chain = jumpstack[stackptr].chain; + rule = jumpstack[stackptr].rule; + rulenum = jumpstack[stackptr].rulenum; + goto next_rule; + } + + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + + rcu_read_lock_bh(); + stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); + rcu_read_unlock_bh(); + + return nft_base_chain(basechain)->policy; +} +EXPORT_SYMBOL_GPL(nft_do_chain); + +int __init nf_tables_core_module_init(void) +{ + int err; + + err = nft_immediate_module_init(); + if (err < 0) + goto err1; + + err = nft_cmp_module_init(); + if (err < 0) + goto err2; + + err = nft_lookup_module_init(); + if (err < 0) + goto err3; + + err = nft_bitwise_module_init(); + if (err < 0) + goto err4; + + err = nft_byteorder_module_init(); + if (err < 0) + goto err5; + + err = nft_payload_module_init(); + if (err < 0) + goto err6; + + return 0; + +err6: + nft_byteorder_module_exit(); +err5: + nft_bitwise_module_exit(); +err4: + nft_lookup_module_exit(); +err3: + nft_cmp_module_exit(); +err2: + nft_immediate_module_exit(); +err1: + return err; +} + +void nf_tables_core_module_exit(void) +{ + nft_payload_module_exit(); + nft_byteorder_module_exit(); + nft_bitwise_module_exit(); + nft_lookup_module_exit(); + nft_cmp_module_exit(); + nft_immediate_module_exit(); +} diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c new file mode 100644 index 00000000000..9dd2d216cfc --- /dev/null +++ b/net/netfilter/nf_tables_inet.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net> + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/ip.h> + +static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n) +{ + struct nft_af_info *afi; + + if (n == 1) + afi = &nft_af_ipv4; + else + afi = &nft_af_ipv6; + + ops->pf = afi->family; + if (afi->hooks[ops->hooknum]) + ops->hook = afi->hooks[ops->hooknum]; +} + +static struct nft_af_info nft_af_inet __read_mostly = { + .family = NFPROTO_INET, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 2, + .hook_ops_init = nft_inet_hook_ops_init, +}; + +static int __net_init nf_tables_inet_init_net(struct net *net) +{ + net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.inet == NULL) + return -ENOMEM; + memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); + + if (nft_register_afinfo(net, net->nft.inet) < 0) + goto err; + + return 0; + +err: + kfree(net->nft.inet); + return -ENOMEM; +} + +static void __net_exit nf_tables_inet_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.inet); + kfree(net->nft.inet); +} + +static struct pernet_operations nf_tables_inet_net_ops = { + .init = nf_tables_inet_init_net, + .exit = nf_tables_inet_exit_net, +}; + +static const struct nf_chain_type filter_inet = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_INET, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_inet_init(void) +{ + int ret; + + nft_register_chain_type(&filter_inet); + ret = register_pernet_subsys(&nf_tables_inet_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_inet); + + return ret; +} + +static void __exit nf_tables_inet_exit(void) +{ + unregister_pernet_subsys(&nf_tables_inet_net_ops); + nft_unregister_chain_type(&filter_inet); +} + +module_init(nf_tables_inet_init); +module_exit(nf_tables_inet_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(1); diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c deleted file mode 100644 index 474d621cbc2..00000000000 --- a/net/netfilter/nf_tproxy_core.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Transparent proxy support for Linux/iptables - * - * Copyright (c) 2006-2007 BalaBit IT Ltd. - * Author: Balazs Scheidler, Krisztian Kovacs - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/module.h> - -#include <linux/net.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <net/udp.h> -#include <net/netfilter/nf_tproxy_core.h> - - -static void -nf_tproxy_destructor(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - skb->sk = NULL; - skb->destructor = NULL; - - if (sk) - sock_put(sk); -} - -/* consumes sk */ -void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ - /* assigning tw sockets complicates things; most - * skb->sk->X checks would have to test sk->sk_state first */ - if (sk->sk_state == TCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return; - } - - skb_orphan(skb); - skb->sk = sk; - skb->destructor = nf_tproxy_destructor; -} -EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); - -static int __init nf_tproxy_init(void) -{ - pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); - pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); - return 0; -} - -module_init(nf_tproxy_init); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Krisztian Kovacs"); -MODULE_DESCRIPTION("Transparent proxy support core routines"); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4d70785b953..c138b8fbe28 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -23,12 +23,10 @@ #include <linux/net.h> #include <linux/skbuff.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <net/sock.h> -#include <net/netlink.h> #include <linux/init.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); @@ -37,30 +35,49 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; -static const struct nfnetlink_subsystem __rcu *subsys_table[NFNL_SUBSYS_COUNT]; -static DEFINE_MUTEX(nfnl_mutex); +static struct { + struct mutex mutex; + const struct nfnetlink_subsystem __rcu *subsys; +} table[NFNL_SUBSYS_COUNT]; + +static const int nfnl_group2type[NFNLGRP_MAX+1] = { + [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK, + [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK, + [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK, + [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP, + [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP, + [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, +}; -void nfnl_lock(void) +void nfnl_lock(__u8 subsys_id) { - mutex_lock(&nfnl_mutex); + mutex_lock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_lock); -void nfnl_unlock(void) +void nfnl_unlock(__u8 subsys_id) { - mutex_unlock(&nfnl_mutex); + mutex_unlock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_unlock); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(u8 subsys_id) +{ + return lockdep_is_held(&table[subsys_id].mutex); +} +EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); +#endif + int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { - nfnl_lock(); - if (subsys_table[n->subsys_id]) { - nfnl_unlock(); + nfnl_lock(n->subsys_id); + if (table[n->subsys_id].subsys) { + nfnl_unlock(n->subsys_id); return -EBUSY; } - rcu_assign_pointer(subsys_table[n->subsys_id], n); - nfnl_unlock(); + rcu_assign_pointer(table[n->subsys_id].subsys, n); + nfnl_unlock(n->subsys_id); return 0; } @@ -68,9 +85,9 @@ EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) { - nfnl_lock(); - subsys_table[n->subsys_id] = NULL; - nfnl_unlock(); + nfnl_lock(n->subsys_id); + table[n->subsys_id].subsys = NULL; + 
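/*
 * [Editor's sketch, not part of this patch] With the single nfnl_mutex
 * split into one mutex per subsystem, a subsystem now serializes only
 * against itself. Registration from another module would look roughly
 * like this (subsystem id, names and the no-op handler are placeholders):
 */
static int my_get(struct sock *nfnl, struct sk_buff *skb,
		  const struct nlmsghdr *nlh, const struct nlattr * const cda[])
{
	return 0;	/* no-op handler, for illustration only */
}

static const struct nfnl_callback my_callbacks[] = {
	[0] = { .call = my_get },
};

static const struct nfnetlink_subsystem my_subsys = {
	.name		= "my_subsys",			/* hypothetical */
	.subsys_id	= NFNL_SUBSYS_COUNT - 1,	/* placeholder id */
	.cb_count	= ARRAY_SIZE(my_callbacks),
	.cb		= my_callbacks,
};

static int __init my_init(void)
{
	/* serializes on table[subsys_id].mutex; -EBUSY if already taken */
	return nfnetlink_subsys_register(&my_subsys);
}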
nfnl_unlock(n->subsys_id); synchronize_rcu(); return 0; } @@ -83,7 +100,7 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; - return rcu_dereference(subsys_table[subsys_id]); + return rcu_dereference(table[subsys_id].subsys); } static inline const struct nfnl_callback * @@ -103,22 +120,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, - unsigned group, int echo, gfp_t flags) +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ + return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); +} +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, + unsigned int group, int echo, gfp_t flags) { - return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); + return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return netlink_set_err(net->nfnl, pid, group, error); + return netlink_set_err(net->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, + int flags) { - return netlink_unicast(net->nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, portid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); @@ -130,11 +155,8 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) const struct nfnetlink_subsystem *ss; int type, err; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg))) + if (nlmsg_len(nlh) < sizeof(struct nfgenmsg)) return 0; type = nlh->nlmsg_type; @@ -162,16 +184,19 @@ replay: } { - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; + __u8 subsys_id = NFNL_SUBSYS_ID(type); err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy); - if (err < 0) + if (err < 0) { + rcu_read_unlock(); return err; + } if (nc->call_rcu) { err = nc->call_rcu(net->nfnl, skb, nlh, @@ -179,16 +204,17 @@ replay: rcu_read_unlock(); } else { rcu_read_unlock(); - nfnl_lock(); - if (rcu_dereference_protected( - subsys_table[NFNL_SUBSYS_ID(type)], - lockdep_is_held(&nfnl_mutex)) != ss || + nfnl_lock(subsys_id); + if (rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)) != ss || nfnetlink_find_client(type, ss) != nc) err = -EAGAIN; - else + else if (nc->call) err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda); - nfnl_unlock(); + else + err = -EINVAL; + nfnl_unlock(subsys_id); } if (err == -EAGAIN) goto replay; @@ -196,17 +222,209 @@ replay: } } +static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, + u_int16_t subsys_id) +{ + struct sk_buff *nskb, *oskb = skb; + struct net *net = sock_net(skb->sk); + const struct nfnetlink_subsystem *ss; + 
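/*
 * [Editor's note] The batch handled here is a single netlink stream
 * framed by two control messages; nfnetlink_rcv() below strips the
 * BEGIN message and passes the remainder in, with the target subsystem
 * taken from that message's nfgenmsg->res_id:
 *
 *   NFNL_MSG_BATCH_BEGIN   (res_id = subsystem id)
 *       request 1          (NLM_F_REQUEST, same subsystem)
 *       ...
 *       request N
 *   NFNL_MSG_BATCH_END
 *
 * All requests in between must carry the same subsystem id, and the
 * whole run takes effect atomically via ss->commit() or is rolled
 * back via ss->abort().
 */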
const struct nfnl_callback *nc; + bool success = true, done = false; + int err; + + if (subsys_id >= NFNL_SUBSYS_COUNT) + return netlink_ack(skb, nlh, -EINVAL); +replay: + nskb = netlink_skb_clone(oskb, GFP_KERNEL); + if (!nskb) + return netlink_ack(oskb, nlh, -ENOMEM); + + nskb->sk = oskb->sk; + skb = nskb; + + nfnl_lock(subsys_id); + ss = rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)); + if (!ss) { +#ifdef CONFIG_MODULES + nfnl_unlock(subsys_id); + request_module("nfnetlink-subsys-%d", subsys_id); + nfnl_lock(subsys_id); + ss = rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)); + if (!ss) +#endif + { + nfnl_unlock(subsys_id); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(nskb); + } + } + + if (!ss->commit || !ss->abort) { + nfnl_unlock(subsys_id); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(skb); + } + + while (skb->len >= nlmsg_total_size(0)) { + int msglen, type; + + nlh = nlmsg_hdr(skb); + err = 0; + + if (nlh->nlmsg_len < NLMSG_HDRLEN) { + err = -EINVAL; + goto ack; + } + + /* Only requests are handled by the kernel */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { + err = -EINVAL; + goto ack; + } + + type = nlh->nlmsg_type; + if (type == NFNL_MSG_BATCH_BEGIN) { + /* Malformed: Batch begin twice */ + success = false; + goto done; + } else if (type == NFNL_MSG_BATCH_END) { + done = true; + goto done; + } else if (type < NLMSG_MIN_TYPE) { + err = -EINVAL; + goto ack; + } + + /* We only accept a batch with messages for the same + * subsystem. + */ + if (NFNL_SUBSYS_ID(type) != subsys_id) { + err = -EINVAL; + goto ack; + } + + nc = nfnetlink_find_client(type, ss); + if (!nc) { + err = -EINVAL; + goto ack; + } + + { + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *attr = (void *)nlh + min_len; + int attrlen = nlh->nlmsg_len - min_len; + + err = nla_parse(cda, ss->cb[cb_id].attr_count, + attr, attrlen, ss->cb[cb_id].policy); + if (err < 0) + goto ack; + + if (nc->call_batch) { + err = nc->call_batch(net->nfnl, skb, nlh, + (const struct nlattr **)cda); + } + + /* The lock was released to autoload some module, we + * have to abort and start from scratch using the + * original skb. + */ + if (err == -EAGAIN) { + ss->abort(skb); + nfnl_unlock(subsys_id); + kfree_skb(nskb); + goto replay; + } + } +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) { + /* We don't stop processing the batch on errors, thus, + * userspace gets all the errors that the batch + * triggers. 
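 *
 * [Editor's note] Only the commit/abort decision at the end is
 * all-or-nothing; acking each failed message here lets userspace
 * map every errno back to the request that caused it.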
+ */ + netlink_ack(skb, nlh, err); + if (err) + success = false; + } + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } +done: + if (success && done) + ss->commit(skb); + else + ss->abort(skb); + + nfnl_unlock(subsys_id); + kfree_skb(nskb); +} + static void nfnetlink_rcv(struct sk_buff *skb) { - netlink_rcv_skb(skb, &nfnetlink_rcv_msg); + struct nlmsghdr *nlh = nlmsg_hdr(skb); + int msglen; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len) + return; + + if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { + netlink_ack(skb, nlh, -EPERM); + return; + } + + if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { + struct nfgenmsg *nfgenmsg; + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) + return; + + nfgenmsg = nlmsg_data(nlh); + skb_pull(skb, msglen); + nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + } else { + netlink_rcv_skb(skb, &nfnetlink_rcv_msg); + } +} + +#ifdef CONFIG_MODULES +static int nfnetlink_bind(int group) +{ + const struct nfnetlink_subsystem *ss; + int type = nfnl_group2type[group]; + + rcu_read_lock(); + ss = nfnetlink_get_subsys(type); + rcu_read_unlock(); + if (!ss) + request_module("nfnetlink-subsys-%d", type); + return 0; } +#endif static int __net_init nfnetlink_net_init(struct net *net) { struct sock *nfnl; + struct netlink_kernel_cfg cfg = { + .groups = NFNLGRP_MAX, + .input = nfnetlink_rcv, +#ifdef CONFIG_MODULES + .bind = nfnetlink_bind, +#endif + }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); if (!nfnl) return -ENOMEM; net->nfnl_stash = nfnl; @@ -232,6 +450,11 @@ static struct pernet_operations nfnetlink_net_ops = { static int __init nfnetlink_init(void) { + int i; + + for (i=0; i<NFNL_SUBSYS_COUNT; i++) + mutex_init(&table[i].mutex); + pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); return register_pernet_subsys(&nfnetlink_net_ops); } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 11ba013e47f..2baa125c2e8 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/atomic.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/slab.h> @@ -17,7 +18,6 @@ #include <linux/errno.h> #include <net/netlink.h> #include <net/sock.h> -#include <asm/atomic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> @@ -32,23 +32,31 @@ static LIST_HEAD(nfnl_acct_list); struct nf_acct { atomic64_t pkts; atomic64_t bytes; + unsigned long flags; struct list_head head; atomic_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; + char data[0]; }; +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) + static int nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; char *acct_name; + unsigned int size = 0; + u32 flags = 0; if (!tb[NFACCT_NAME]) return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); + if (strlen(acct_name) == 0) + return -EINVAL; list_for_each_entry(nfacct, &nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) @@ -66,24 +74,47 @@ nfnl_acct_new(struct sock *nfnl, struct 
sk_buff *skb, /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); + smp_mb__before_atomic(); + /* reset overquota flag if quota is enabled. */ + if ((matching->flags & NFACCT_F_QUOTA)) + clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); return 0; } return -EBUSY; } - nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); + if (tb[NFACCT_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); + if (flags & ~NFACCT_F_QUOTA) + return -EOPNOTSUPP; + if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) + return -EINVAL; + if (flags & NFACCT_F_OVERQUOTA) + return -EINVAL; + + size += sizeof(u64); + } + + nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); if (nfacct == NULL) return -ENOMEM; + if (flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)nfacct->data; + + *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); + nfacct->flags = flags; + } + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, - be64_to_cpu(nla_get_u64(tb[NFACCT_BYTES]))); + be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES]))); } if (tb[NFACCT_PKTS]) { atomic64_set(&nfacct->pkts, - be64_to_cpu(nla_get_u64(tb[NFACCT_PKTS]))); + be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } atomic_set(&nfacct->refcnt, 1); list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); @@ -91,16 +122,16 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, } static int -nfnl_acct_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, +nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_acct *acct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - unsigned int flags = pid ? NLM_F_MULTI : 0; + unsigned int flags = portid ? NLM_F_MULTI : 0; u64 pkts, bytes; event |= NFNL_SUBSYS_ACCT << 8; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -109,19 +140,30 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - NLA_PUT_STRING(skb, NFACCT_NAME, acct->name); + if (nla_put_string(skb, NFACCT_NAME, acct->name)) + goto nla_put_failure; if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); + smp_mb__before_atomic(); + if (acct->flags & NFACCT_F_QUOTA) + clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); } - NLA_PUT_BE64(skb, NFACCT_PKTS, cpu_to_be64(pkts)); - NLA_PUT_BE64(skb, NFACCT_BYTES, cpu_to_be64(bytes)); - NLA_PUT_BE32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))); - + if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) || + nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || + nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) + goto nla_put_failure; + if (acct->flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)acct->data; + + if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return skb->len; @@ -145,10 +187,13 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { - if (last && cur != last) - continue; + if (last) { + if (cur != last) + continue; - if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).pid, + last = NULL; + } + if 
(nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), NFNL_MSG_ACCT_NEW, cur) < 0) { @@ -171,8 +216,10 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, char *acct_name; if (nlh->nlmsg_flags & NLM_F_DUMP) { - return netlink_dump_start(nfnl, skb, nlh, nfnl_acct_dump, - NULL, 0); + struct netlink_dump_control c = { + .dump = nfnl_acct_dump, + }; + return netlink_dump_start(nfnl, skb, nlh, &c); } if (!tb[NFACCT_NAME]) @@ -191,7 +238,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, break; } - ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).pid, + ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), NFNL_MSG_ACCT_NEW, cur); @@ -199,7 +246,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).pid, + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -261,6 +308,8 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, [NFACCT_BYTES] = { .type = NLA_U64 }, [NFACCT_PKTS] = { .type = NLA_U64 }, + [NFACCT_FLAGS] = { .type = NLA_U32 }, + [NFACCT_QUOTA] = { .type = NLA_U64 }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { @@ -327,6 +376,50 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_update); +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ + int ret; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + return; + + ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, + nfacct); + if (ret <= 0) { + kfree_skb(skb); + return; + } + netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, + GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + u64 now; + u64 *quota; + int ret = NFACCT_UNDERQUOTA; + + /* no place here if we don't have a quota */ + if (!(nfacct->flags & NFACCT_F_QUOTA)) + return NFACCT_NO_QUOTA; + + quota = (u64 *)nfacct->data; + now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? + atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + + ret = now > *quota; + + if (now >= *quota && + !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { + nfnl_overquota_report(nfacct); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); + static int __init nfnl_acct_init(void) { int ret; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c new file mode 100644 index 00000000000..9e287cb56a0 --- /dev/null +++ b/net/netfilter/nfnetlink_cthelper.c @@ -0,0 +1,680 @@ +/* + * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + * + * This software has been sponsored by Vyatta Inc. 
<http://www.vyatta.com> */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/errno.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <linux/netfilter/nfnetlink_cthelper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); + +static int +nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + const struct nf_conn_help *help; + struct nf_conntrack_helper *helper; + + help = nfct_help(ct); + if (help == NULL) + return NF_DROP; + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + if (helper == NULL) + return NF_DROP; + + /* This is a user-space helper not yet configured, skip. */ + if ((helper->flags & + (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == + NF_CT_HELPER_F_USERSPACE) + return NF_ACCEPT; + + /* If the user-space helper is not available, don't block traffic. */ + return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS; +} + +static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = { + [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, }, + [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, }, +}; + +static int +nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, + const struct nlattr *attr) +{ + int err; + struct nlattr *tb[NFCTH_TUPLE_MAX+1]; + + err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol); + if (err < 0) + return err; + + if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) + return -EINVAL; + + tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); + tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); + + return 0; +} + +static int +nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) +{ + const struct nf_conn_help *help = nfct_help(ct); + + if (attr == NULL) + return -EINVAL; + + if (help->helper->data_len == 0) + return -EINVAL; + + memcpy(&help->data, nla_data(attr), help->helper->data_len); + return 0; +} + +static int +nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) +{ + const struct nf_conn_help *help = nfct_help(ct); + + if (help->helper->data_len && + nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = { + [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN-1 }, + [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, }, + [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, }, +}; + +static int +nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, + const struct nlattr *attr) +{ + int err; + struct nlattr *tb[NFCTH_POLICY_MAX+1]; + + err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol); + if (err < 0) + return err; + + if (!tb[NFCTH_POLICY_NAME] || + !tb[NFCTH_POLICY_EXPECT_MAX] || + !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) + return -EINVAL;
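+ /* Illustrative sketch, not part of the original commit: userspace
+ * supplies each expectation policy as one nested NFCTH_POLICY_SET
+ * attribute. With libmnl (attribute names from the cthelper UAPI
+ * header, values purely hypothetical) that construction could be:
+ *
+ *	nest = mnl_attr_nest_start(nlh, NFCTH_POLICY_SET + i);
+ *	mnl_attr_put_strz(nlh, NFCTH_POLICY_NAME, "ftp");
+ *	mnl_attr_put_u32(nlh, NFCTH_POLICY_EXPECT_MAX, htonl(1));
+ *	mnl_attr_put_u32(nlh, NFCTH_POLICY_EXPECT_TIMEOUT, htonl(300));
+ *	mnl_attr_nest_end(nlh, nest);
+ */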
+ + strncpy(expect_policy->name, + nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); + expect_policy->max_expected = + ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); + expect_policy->timeout = + ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); + + return 0; +} + +static const struct nla_policy +nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = { + [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, }, +}; + +static int +nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, + const struct nlattr *attr) +{ + int i, ret; + struct nf_conntrack_expect_policy *expect_policy; + struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; + + ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, + nfnl_cthelper_expect_policy_set); + if (ret < 0) + return ret; + + if (!tb[NFCTH_POLICY_SET_NUM]) + return -EINVAL; + + helper->expect_class_max = + ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); + + if (helper->expect_class_max != 0 && + helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES) + return -EOVERFLOW; + + expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * + helper->expect_class_max, GFP_KERNEL); + if (expect_policy == NULL) + return -ENOMEM; + + for (i=0; i<helper->expect_class_max; i++) { + if (!tb[NFCTH_POLICY_SET+i]) + goto err; + + ret = nfnl_cthelper_expect_policy(&expect_policy[i], + tb[NFCTH_POLICY_SET+i]); + if (ret < 0) + goto err; + } + helper->expect_policy = expect_policy; + return 0; +err: + kfree(expect_policy); + return -EINVAL; +} + +static int +nfnl_cthelper_create(const struct nlattr * const tb[], + struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_helper *helper; + int ret; + + if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) + return -EINVAL; + + helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL); + if (helper == NULL) + return -ENOMEM; + + ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); + if (ret < 0) + goto err; + + strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); + helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + helper->flags |= NF_CT_HELPER_F_USERSPACE; + memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); + + helper->me = THIS_MODULE; + helper->help = nfnl_userspace_cthelper; + helper->from_nlattr = nfnl_cthelper_from_nlattr; + helper->to_nlattr = nfnl_cthelper_to_nlattr; + + /* Default to queue number zero, this can be updated at any time. 
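+ * (nfnl_cthelper_update() below handles later NFCTH_QUEUE_NUM changes, so the helper can be pointed at a different queue without re-registering it.)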
*/ + if (tb[NFCTH_QUEUE_NUM]) + helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); + + if (tb[NFCTH_STATUS]) { + int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); + + switch(status) { + case NFCT_HELPER_STATUS_ENABLED: + helper->flags |= NF_CT_HELPER_F_CONFIGURED; + break; + case NFCT_HELPER_STATUS_DISABLED: + helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; + break; + } + } + + ret = nf_conntrack_helper_register(helper); + if (ret < 0) + goto err; + + return 0; +err: + kfree(helper); + return ret; +} + +static int +nfnl_cthelper_update(const struct nlattr * const tb[], + struct nf_conntrack_helper *helper) +{ + int ret; + + if (tb[NFCTH_PRIV_DATA_LEN]) + return -EBUSY; + + if (tb[NFCTH_POLICY]) { + ret = nfnl_cthelper_parse_expect_policy(helper, + tb[NFCTH_POLICY]); + if (ret < 0) + return ret; + } + if (tb[NFCTH_QUEUE_NUM]) + helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); + + if (tb[NFCTH_STATUS]) { + int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); + + switch(status) { + case NFCT_HELPER_STATUS_ENABLED: + helper->flags |= NF_CT_HELPER_F_CONFIGURED; + break; + case NFCT_HELPER_STATUS_DISABLED: + helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; + break; + } + } + return 0; +} + +static int +nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + const char *helper_name; + struct nf_conntrack_helper *cur, *helper = NULL; + struct nf_conntrack_tuple tuple; + int ret = 0, i; + + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) + return -EINVAL; + + helper_name = nla_data(tb[NFCTH_NAME]); + + ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); + if (ret < 0) + return ret; + + rcu_read_lock(); + for (i = 0; i < nf_ct_helper_hsize && !helper; i++) { + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { + + /* skip non-userspace conntrack helpers. 
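+ * In-kernel helpers are registered by their own modules and are not managed through this interface.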
*/ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + if (strncmp(cur->name, helper_name, + NF_CT_HELPER_NAME_LEN) != 0) + continue; + + if ((tuple.src.l3num != cur->tuple.src.l3num || + tuple.dst.protonum != cur->tuple.dst.protonum)) + continue; + + if (nlh->nlmsg_flags & NLM_F_EXCL) { + ret = -EEXIST; + goto err; + } + helper = cur; + break; + } + } + rcu_read_unlock(); + + if (helper == NULL) + ret = nfnl_cthelper_create(tb, &tuple); + else + ret = nfnl_cthelper_update(tb, helper); + + return ret; +err: + rcu_read_unlock(); + return ret; +} + +static int +nfnl_cthelper_dump_tuple(struct sk_buff *skb, + struct nf_conntrack_helper *helper) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED); + if (nest_parms == NULL) + goto nla_put_failure; + + if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM, + htons(helper->tuple.src.l3num))) + goto nla_put_failure; + + if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum)) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + return 0; + +nla_put_failure: + return -1; +} + +static int +nfnl_cthelper_dump_policy(struct sk_buff *skb, + struct nf_conntrack_helper *helper) +{ + int i; + struct nlattr *nest_parms1, *nest_parms2; + + nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED); + if (nest_parms1 == NULL) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, + htonl(helper->expect_class_max))) + goto nla_put_failure; + + for (i=0; i<helper->expect_class_max; i++) { + nest_parms2 = nla_nest_start(skb, + (NFCTH_POLICY_SET+i) | NLA_F_NESTED); + if (nest_parms2 == NULL) + goto nla_put_failure; + + if (nla_put_string(skb, NFCTH_POLICY_NAME, + helper->expect_policy[i].name)) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX, + htonl(helper->expect_policy[i].max_expected))) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT, + htonl(helper->expect_policy[i].timeout))) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms2); + } + nla_nest_end(skb, nest_parms1); + return 0; + +nla_put_failure: + return -1; +} + +static int +nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, struct nf_conntrack_helper *helper) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0; + int status; + + event |= NFNL_SUBSYS_CTHELPER << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFCTH_NAME, helper->name)) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num))) + goto nla_put_failure; + + if (nfnl_cthelper_dump_tuple(skb, helper) < 0) + goto nla_put_failure; + + if (nfnl_cthelper_dump_policy(skb, helper) < 0) + goto nla_put_failure; + + if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len))) + goto nla_put_failure; + + if (helper->flags & NF_CT_HELPER_F_CONFIGURED) + status = NFCT_HELPER_STATUS_ENABLED; + else + status = NFCT_HELPER_STATUS_DISABLED; + + if (nla_put_be32(skb, NFCTH_STATUS, htonl(status))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_helper *cur, *last; + + rcu_read_lock(); + last = (struct nf_conntrack_helper *)cb->args[1]; + for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { +restart: + hlist_for_each_entry_rcu(cur, + &nf_ct_helper_hash[cb->args[0]], hnode) { + + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + if (cb->args[1]) { + if (cur != last) + continue; + cb->args[1] = 0; + } + if (nfnl_cthelper_fill_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + NFNL_MSG_CTHELPER_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + goto out; + } + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } +out: + rcu_read_unlock(); + return skb->len; +} + +static int +nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + int ret = -ENOENT, i; + struct nf_conntrack_helper *cur; + struct sk_buff *skb2; + char *helper_name = NULL; + struct nf_conntrack_tuple tuple; + bool tuple_set = false; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nfnl_cthelper_dump_table, + }; + return netlink_dump_start(nfnl, skb, nlh, &c); + } + + if (tb[NFCTH_NAME]) + helper_name = nla_data(tb[NFCTH_NAME]); + + if (tb[NFCTH_TUPLE]) { + ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); + if (ret < 0) + return ret; + + tuple_set = true; + } + + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { + + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + if (helper_name && strncmp(cur->name, helper_name, + NF_CT_HELPER_NAME_LEN) != 0) { + continue; + } + if (tuple_set && + (tuple.src.l3num != cur->tuple.src.l3num || + tuple.dst.protonum != cur->tuple.dst.protonum)) + continue; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + NFNL_MSG_CTHELPER_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? 
-ENOBUFS : ret; + } + } + return ret; +} + +static int +nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + char *helper_name = NULL; + struct nf_conntrack_helper *cur; + struct hlist_node *tmp; + struct nf_conntrack_tuple tuple; + bool tuple_set = false, found = false; + int i, j = 0, ret; + + if (tb[NFCTH_NAME]) + helper_name = nla_data(tb[NFCTH_NAME]); + + if (tb[NFCTH_TUPLE]) { + ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); + if (ret < 0) + return ret; + + tuple_set = true; + } + + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], + hnode) { + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + j++; + + if (helper_name && strncmp(cur->name, helper_name, + NF_CT_HELPER_NAME_LEN) != 0) { + continue; + } + if (tuple_set && + (tuple.src.l3num != cur->tuple.src.l3num || + tuple.dst.protonum != cur->tuple.dst.protonum)) + continue; + + found = true; + nf_conntrack_helper_unregister(cur); + } + } + /* Make sure we return success if we flush and there are no helpers */ + return (found || j == 0) ? 0 : -ENOENT; +} + +static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { + [NFCTH_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN-1 }, + [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, +}; + +static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { + [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy }, +}; + +static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { + .name = "cthelper", + .subsys_id = NFNL_SUBSYS_CTHELPER, + .cb_count = NFNL_MSG_CTHELPER_MAX, + .cb = nfnl_cthelper_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER); + +static int __init nfnl_cthelper_init(void) +{ + int ret; + + ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys); + if (ret < 0) { + pr_err("nfnl_cthelper: cannot register with nfnetlink.\n"); + goto err_out; + } + return 0; +err_out: + return ret; +} + +static void __exit nfnl_cthelper_exit(void) +{ + struct nf_conntrack_helper *cur; + struct hlist_node *tmp; + int i; + + nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); + + for (i=0; i<nf_ct_helper_hsize; i++) { + hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], + hnode) { + /* skip non-userspace conntrack helpers. */ + if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) + continue; + + nf_conntrack_helper_unregister(cur); + } + } +} + +module_init(nfnl_cthelper_init); +module_exit(nfnl_cthelper_exit); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c new file mode 100644 index 00000000000..476accd1714 --- /dev/null +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -0,0 +1,585 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option).
+ */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/rculist.h> +#include <linux/rculist_nulls.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/security.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); + +static LIST_HEAD(cttimeout_list); + +static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { + [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, + .len = CTNL_TIMEOUT_NAME_MAX - 1}, + [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 }, + [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 }, + [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, +}; + +static int +ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, + struct net *net, const struct nlattr *attr) +{ + int ret = 0; + + if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) { + struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1]; + + ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, + attr, l4proto->ctnl_timeout.nla_policy); + if (ret < 0) + return ret; + + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + } + return ret; +} + +static int +cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + __u16 l3num; + __u8 l4num; + struct nf_conntrack_l4proto *l4proto; + struct ctnl_timeout *timeout, *matching = NULL; + struct net *net = sock_net(skb->sk); + char *name; + int ret; + + if (!cda[CTA_TIMEOUT_NAME] || + !cda[CTA_TIMEOUT_L3PROTO] || + !cda[CTA_TIMEOUT_L4PROTO] || + !cda[CTA_TIMEOUT_DATA]) + return -EINVAL; + + name = nla_data(cda[CTA_TIMEOUT_NAME]); + l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + + list_for_each_entry(timeout, &cttimeout_list, head) { + if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + matching = timeout; + break; + } + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supported, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; + goto err_proto_put; + } + + if (matching) { + if (nlh->nlmsg_flags & NLM_F_REPLACE) { + /* You cannot replace one timeout policy by another of + * different kind, sorry.
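+ * Delete the old policy and create a new one instead.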
+ */ + if (matching->l3num != l3num || + matching->l4proto->l4proto != l4num) { + ret = -EINVAL; + goto err_proto_put; + } + + ret = ctnl_timeout_parse_policy(&matching->data, + l4proto, net, + cda[CTA_TIMEOUT_DATA]); + return ret; + } + ret = -EBUSY; + goto err_proto_put; + } + + timeout = kzalloc(sizeof(struct ctnl_timeout) + + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); + if (timeout == NULL) { + ret = -ENOMEM; + goto err_proto_put; + } + + ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net, + cda[CTA_TIMEOUT_DATA]); + if (ret < 0) + goto err; + + strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); + timeout->l3num = l3num; + timeout->l4proto = l4proto; + atomic_set(&timeout->refcnt, 1); + list_add_tail_rcu(&timeout->head, &cttimeout_list); + + return 0; +err: + kfree(timeout); +err_proto_put: + nf_ct_l4proto_put(l4proto); + return ret; +} + +static int +ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, struct ctnl_timeout *timeout) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0; + struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + + event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || + nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || + nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || + nla_put_be32(skb, CTA_TIMEOUT_USE, + htonl(atomic_read(&timeout->refcnt)))) + goto nla_put_failure; + + if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { + struct nlattr *nest_parms; + int ret; + + nest_parms = nla_nest_start(skb, + CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); + if (ret < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + } + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ctnl_timeout *cur, *last; + + if (cb->args[2]) + return 0; + + last = (struct ctnl_timeout *)cb->args[1]; + if (cb->args[1]) + cb->args[1] = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &cttimeout_list, head) { + if (last) { + if (cur != last) + continue; + + last = NULL; + } + if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + break; + } + } + if (!cb->args[1]) + cb->args[2] = 1; + rcu_read_unlock(); + return skb->len; +} + +static int +cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int ret = -ENOENT; + char *name; + struct ctnl_timeout *cur; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnl_timeout_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + if (!cda[CTA_TIMEOUT_NAME]) + return -EINVAL; + name = nla_data(cda[CTA_TIMEOUT_NAME]); + + list_for_each_entry(cur, &cttimeout_list, head) { + struct sk_buff *skb2; + + if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + skb2 = 
nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? -ENOBUFS : ret; + } + return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) +{ + int ret = 0; + + /* we want to avoid races with nf_ct_timeout_find_get. */ + if (atomic_dec_and_test(&timeout->refcnt)) { + /* We are protected by nfnl mutex. */ + list_del_rcu(&timeout->head); + nf_ct_l4proto_put(timeout->l4proto); + kfree_rcu(timeout, rcu_head); + } else { + /* still in use, restore reference counter. */ + atomic_inc(&timeout->refcnt); + ret = -EBUSY; + } + return ret; +} + +static int +cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + char *name; + struct ctnl_timeout *cur; + int ret = -ENOENT; + + if (!cda[CTA_TIMEOUT_NAME]) { + list_for_each_entry(cur, &cttimeout_list, head) + ctnl_timeout_try_del(cur); + + return 0; + } + name = nla_data(cda[CTA_TIMEOUT_NAME]); + + list_for_each_entry(cur, &cttimeout_list, head) { + if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + ret = ctnl_timeout_try_del(cur); + if (ret < 0) + return ret; + + break; + } + return ret; +} + +static int +cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + __u16 l3num; + __u8 l4num; + struct nf_conntrack_l4proto *l4proto; + struct net *net = sock_net(skb->sk); + unsigned int *timeouts; + int ret; + + if (!cda[CTA_TIMEOUT_L3PROTO] || + !cda[CTA_TIMEOUT_L4PROTO] || + !cda[CTA_TIMEOUT_DATA]) + return -EINVAL; + + l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supported, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; + goto err; + } + + timeouts = l4proto->get_timeouts(net); + + ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, + cda[CTA_TIMEOUT_DATA]); + if (ret < 0) + goto err; + + nf_ct_l4proto_put(l4proto); + return 0; +err: + nf_ct_l4proto_put(l4proto); + return ret; +} + +static int +cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, + u32 seq, u32 type, int event, + struct nf_conntrack_l4proto *l4proto) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? 
NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || + nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) + goto nla_put_failure; + + if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { + struct nlattr *nest_parms; + unsigned int *timeouts = l4proto->get_timeouts(net); + int ret; + + nest_parms = nla_nest_start(skb, + CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); + if (ret < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + } + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + __u16 l3num; + __u8 l4num; + struct nf_conntrack_l4proto *l4proto; + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int ret, err; + + if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) + return -EINVAL; + + l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supported, skip. */ + if (l4proto->l4proto != l4num) { + err = -EOPNOTSUPP; + goto err; + } + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + err = -ENOMEM; + goto err; + } + + ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_DEFAULT_SET, + l4proto); + if (ret <= 0) { + kfree_skb(skb2); + err = -ENOMEM; + goto err; + } + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? 
-ENOBUFS : ret; +err: + nf_ct_l4proto_put(l4proto); + return err; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static struct ctnl_timeout *ctnl_timeout_find_get(const char *name) +{ + struct ctnl_timeout *timeout, *matching = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(timeout, &cttimeout_list, head) { + if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + if (!try_module_get(THIS_MODULE)) + goto err; + + if (!atomic_inc_not_zero(&timeout->refcnt)) { + module_put(THIS_MODULE); + goto err; + } + matching = timeout; + break; + } +err: + rcu_read_unlock(); + return matching; +} + +static void ctnl_timeout_put(struct ctnl_timeout *timeout) +{ + atomic_dec(&timeout->refcnt); + module_put(THIS_MODULE); +} +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + +static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { + [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, +}; + +static const struct nfnetlink_subsystem cttimeout_subsys = { + .name = "conntrack_timeout", + .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, + .cb_count = IPCTNL_MSG_TIMEOUT_MAX, + .cb = cttimeout_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); + +static int __init cttimeout_init(void) +{ + int ret; + + ret = nfnetlink_subsys_register(&cttimeout_subsys); + if (ret < 0) { + pr_err("cttimeout_init: cannot register cttimeout with " + "nfnetlink.\n"); + goto err_out; + } +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); + RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + return 0; + +err_out: + return ret; +} + +static void __exit cttimeout_exit(void) +{ + struct ctnl_timeout *cur, *tmp; + + pr_info("cttimeout: unregistering from nfnetlink.\n"); + + nfnetlink_subsys_unregister(&cttimeout_subsys); + list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) { + list_del_rcu(&cur->head); + /* We are sure that our objects have no clients at this point, + * it's safe to release them all without checking refcnt. + */ + nf_ct_l4proto_put(cur->l4proto); + kfree_rcu(cur, rcu_head); + } +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); + RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ +} + +module_init(cttimeout_init); +module_exit(cttimeout_exit); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 66b2c54c544..d292c8d286e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -3,6 +3,7 @@ * nfnetlink.
* * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> @@ -13,12 +14,13 @@ */ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/spinlock.h> @@ -26,11 +28,10 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> -#include <linux/jhash.h> -#include <linux/random.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> #include <net/netfilter/nfnetlink_log.h> #include <linux/atomic.h> @@ -55,7 +56,9 @@ struct nfulnl_instance { unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocated skb */ struct timer_list timer; - int peer_pid; /* PID of the peer process */ + struct net *net; + struct user_namespace *peer_user_ns; /* User namespace of the peer process */ + int peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ @@ -69,12 +72,20 @@ struct nfulnl_instance { struct rcu_head rcu; }; -static DEFINE_SPINLOCK(instances_lock); -static atomic_t global_seq; - #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; -static unsigned int hash_init; + +static int nfnl_log_net_id __read_mostly; + +struct nfnl_log_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; + atomic_t global_seq; +}; + +static struct nfnl_log_net *nfnl_log_pernet(struct net *net) +{ + return net_generic(net, nfnl_log_net_id); +} static inline u_int8_t instance_hashfn(u_int16_t group_num) { @@ -82,14 +93,13 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num) } static struct nfulnl_instance * -__instance_lookup(u_int16_t group_num) +__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) { struct hlist_head *head; - struct hlist_node *pos; struct nfulnl_instance *inst; - head = &instance_table[instance_hashfn(group_num)]; - hlist_for_each_entry_rcu(inst, pos, head, hlist) { + head = &log->instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; } @@ -103,12 +113,12 @@ instance_get(struct nfulnl_instance *inst) } static struct nfulnl_instance * -instance_lookup_get(u_int16_t group_num) +instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) { struct nfulnl_instance *inst; rcu_read_lock_bh(); - inst = __instance_lookup(group_num); + inst = __instance_lookup(log, group_num); if (inst && !atomic_inc_not_zero(&inst->use)) inst = NULL; rcu_read_unlock_bh(); @@ -118,7 +128,11 @@ instance_lookup_get(u_int16_t group_num) static void nfulnl_instance_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct nfulnl_instance, rcu)); + struct nfulnl_instance *inst = + container_of(head, struct nfulnl_instance, rcu); + + put_net(inst->net); + kfree(inst); module_put(THIS_MODULE); } @@ -132,13 +146,15 @@ instance_put(struct nfulnl_instance *inst) static void nfulnl_timer(unsigned long data); static struct nfulnl_instance * -instance_create(u_int16_t group_num, int pid) +instance_create(struct net *net, u_int16_t
group_num, + int portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; + struct nfnl_log_net *log = nfnl_log_pernet(net); int err; - spin_lock_bh(&instances_lock); - if (__instance_lookup(group_num)) { + spin_lock_bh(&log->instances_lock); + if (__instance_lookup(log, group_num)) { err = -EEXIST; goto out_unlock; } @@ -162,7 +178,9 @@ instance_create(u_int16_t group_num, int pid) setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); - inst->peer_pid = pid; + inst->net = get_net(net); + inst->peer_user_ns = user_ns; + inst->peer_portid = portid; inst->group_num = group_num; inst->qthreshold = NFULNL_QTHRESH_DEFAULT; @@ -172,14 +190,15 @@ instance_create(u_int16_t group_num, int pid) inst->copy_range = NFULNL_COPY_RANGE_MAX; hlist_add_head_rcu(&inst->hlist, - &instance_table[instance_hashfn(group_num)]); + &log->instance_table[instance_hashfn(group_num)]); + - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return inst; out_unlock: - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return ERR_PTR(err); } @@ -208,11 +227,12 @@ __instance_destroy(struct nfulnl_instance *inst) } static inline void -instance_destroy(struct nfulnl_instance *inst) +instance_destroy(struct nfnl_log_net *log, + struct nfulnl_instance *inst) { - spin_lock_bh(&instances_lock); + spin_lock_bh(&log->instances_lock); __instance_destroy(inst); - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } static int @@ -296,7 +316,8 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) } static struct sk_buff * -nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, + unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; @@ -305,13 +326,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) * message. 
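(The allocation below asks for the larger of the instance's configured nlbufsiz and the current packet's size; if that fails, it retries with just the packet size.)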
WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = alloc_skb(n, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = alloc_skb(pkt_size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(net, pkt_size, + peer_portid, GFP_ATOMIC); if (!skb) pr_err("nfnetlink_log: can't even alloc %u bytes\n", pkt_size); @@ -326,18 +348,20 @@ __nfulnl_send(struct nfulnl_instance *inst) { int status = -1; - if (inst->qlen > 1) - NLMSG_PUT(inst->skb, 0, 0, - NLMSG_DONE, - sizeof(struct nfgenmsg)); - - status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid, + if (inst->qlen > 1) { + struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, + NLMSG_DONE, + sizeof(struct nfgenmsg), + 0); + if (!nlh) + goto out; + } + status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, MSG_DONTWAIT); inst->qlen = 0; inst->skb = NULL; - -nlmsg_failure: +out: return status; } @@ -366,7 +390,8 @@ nfulnl_timer(unsigned long data) /* This is an inline function, we don't really care about a long * list of arguments */ static inline int -__build_packet_message(struct nfulnl_instance *inst, +__build_packet_message(struct nfnl_log_net *log, + struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, u_int8_t pf, @@ -379,96 +404,125 @@ __build_packet_message(struct nfulnl_instance *inst, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; + struct sock *sk; + const unsigned char *hwhdrp; - nlh = NLMSG_PUT(inst->skb, 0, 0, + nlh = nlmsg_put(inst->skb, 0, 0, NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, - sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + sizeof(struct nfgenmsg), 0); + if (!nlh) + return -1; + nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = pf; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(inst->group_num); + memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; - NLA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg)) + goto nla_put_failure; - if (prefix) - NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix); + if (prefix && + nla_put(inst->skb, NFULA_PREFIX, plen, prefix)) + goto nla_put_failure; if (indev) { #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(indev->ifindex)); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(indev->ifindex)); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(br_port_get_rcu(indev)->br->dev->ifindex)); + nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex))) + goto nla_put_failure; } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(indev->ifindex)); - if (skb->nf_bridge && skb->nf_bridge->physindev) - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(skb->nf_bridge->physindev->ifindex)); + if (nla_put_be32(inst->skb, 
NFULA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; + if (skb->nf_bridge && skb->nf_bridge->physindev && + nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(skb->nf_bridge->physindev->ifindex))) + goto nla_put_failure; } #endif } if (outdev) { #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(outdev->ifindex)); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - htonl(outdev->ifindex)); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); + nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) + goto nla_put_failure; } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(outdev->ifindex)); - if (skb->nf_bridge && skb->nf_bridge->physoutdev) - NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - htonl(skb->nf_bridge->physoutdev->ifindex)); + if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; + if (skb->nf_bridge && skb->nf_bridge->physoutdev && + nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(skb->nf_bridge->physoutdev->ifindex))) + goto nla_put_failure; } #endif } - if (skb->mark) - NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark)); + if (skb->mark && + nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark))) + goto nla_put_failure; if (indev && skb->dev && skb->mac_header != skb->network_header) { struct nfulnl_msg_packet_hw phw; - int len = dev_parse_header(skb, phw.hw_addr); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { phw.hw_addrlen = htons(len); - NLA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) + goto nla_put_failure; } } if (indev && skb_mac_header_was_set(skb)) { - NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)); - NLA_PUT_BE16(inst->skb, NFULA_HWLEN, - htons(skb->dev->hard_header_len)); - NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, - skb_mac_header(skb)); + if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || + nla_put_be16(inst->skb, NFULA_HWLEN, + htons(skb->dev->hard_header_len))) + goto nla_put_failure; + + hwhdrp = skb_mac_header(skb); + + if (skb->dev->type == ARPHRD_SIT) + hwhdrp -= ETH_HLEN; + + if (hwhdrp >= skb->head && + nla_put(inst->skb, NFULA_HWHEADER, + skb->dev->hard_header_len, hwhdrp)) + goto nla_put_failure; } if (skb->tstamp.tv64) { @@ -477,32 +531,38 @@ __build_packet_message(struct nfulnl_instance *inst, ts.sec = cpu_to_be64(tv.tv_sec); ts.usec = cpu_to_be64(tv.tv_usec); - NLA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; } /* UID */ - if (skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) { - struct file *file = skb->sk->sk_socket->file; - __be32 uid = htonl(file->f_cred->fsuid); - __be32 gid = 
htonl(file->f_cred->fsgid); - /* need to unlock here since NLA_PUT may goto */ - read_unlock_bh(&skb->sk->sk_callback_lock); - NLA_PUT_BE32(inst->skb, NFULA_UID, uid); - NLA_PUT_BE32(inst->skb, NFULA_GID, gid); + sk = skb->sk; + if (sk && sk->sk_state != TCP_TIME_WAIT) { + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + struct file *file = sk->sk_socket->file; + const struct cred *cred = file->f_cred; + struct user_namespace *user_ns = inst->peer_user_ns; + __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); + __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); + read_unlock_bh(&sk->sk_callback_lock); + if (nla_put_be32(inst->skb, NFULA_UID, uid) || + nla_put_be32(inst->skb, NFULA_GID, gid)) + goto nla_put_failure; } else - read_unlock_bh(&skb->sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /* local sequence number */ - if (inst->flags & NFULNL_CFG_F_SEQ) - NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++)); + if ((inst->flags & NFULNL_CFG_F_SEQ) && + nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++))) + goto nla_put_failure; /* global sequence number */ - if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) - NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL, - htonl(atomic_inc_return(&global_seq))); + if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && + nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, + htonl(atomic_inc_return(&log->global_seq)))) + goto nla_put_failure; if (data_len) { struct nlattr *nla; @@ -510,7 +570,7 @@ __build_packet_message(struct nfulnl_instance *inst, if (skb_tailroom(inst->skb) < nla_total_size(data_len)) { printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); - goto nlmsg_failure; + return -1; } nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len)); @@ -524,7 +584,6 @@ __build_packet_message(struct nfulnl_instance *inst, nlh->nlmsg_len = inst->skb->tail - old_tail; return 0; -nlmsg_failure: nla_put_failure: PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); return -1; @@ -545,7 +604,8 @@ static struct nf_loginfo default_loginfo = { /* log handler for internal netfilter logging api */ void -nfulnl_log_packet(u_int8_t pf, +nfulnl_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -558,13 +618,14 @@ nfulnl_log_packet(u_int8_t pf, const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; + struct nfnl_log_net *log = nfnl_log_pernet(net); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; - inst = instance_lookup_get(li->u.ulog.group); + inst = instance_lookup_get(log, li->u.ulog.group); if (!inst) return; @@ -575,7 +636,7 @@ nfulnl_log_packet(u_int8_t pf, /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... 
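* For now we reserve space for every optional attribute unconditionally.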
*/ - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -639,14 +700,15 @@ nfulnl_log_packet(u_int8_t pf, } if (!inst->skb) { - inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, + inst->nlbufsiz, size); if (!inst->skb) goto alloc_failure; } inst->qlen++; - __build_packet_message(inst, skb, data_len, pf, + __build_packet_message(log, inst, skb, data_len, pf, hooknum, in, out, prefix, plen); if (inst->qlen >= qthreshold) @@ -675,24 +737,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_log_net *log = nfnl_log_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; - /* destroy all instances for this pid */ - spin_lock_bh(&instances_lock); + /* destroy all instances for this portid */ + spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp, *t2; + struct hlist_node *t2; struct nfulnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &log->instance_table[i]; - hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((net_eq(n->net, &init_net)) && - (n->pid == inst->peer_pid)) + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } @@ -729,10 +791,12 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nfula[]) { - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t group_num = ntohs(nfmsg->res_id); struct nfulnl_instance *inst; struct nfulnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; if (nfula[NFULA_CFG_CMD]) { @@ -742,15 +806,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_bind_pf(pf, &nfulnl_logger); + return nf_log_bind_pf(net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unbind_pf(pf); + nf_log_unbind_pf(net, pf); return 0; } } - inst = instance_lookup_get(group_num); - if (inst && inst->peer_pid != NETLINK_CB(skb).pid) { + inst = instance_lookup_get(log, group_num); + if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto out_put; } @@ -763,8 +827,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; } - inst = instance_create(group_num, - NETLINK_CB(skb).pid); + inst = instance_create(net, group_num, + NETLINK_CB(skb).portid, + sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; @@ -776,7 +841,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out; } - instance_destroy(inst); + instance_destroy(log, inst); goto out_put; default: ret = -ENOTSUPP; @@ -859,55 +924,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; -static struct hlist_node *get_first(struct iter_state *st) +static struct hlist_node *get_first(struct net *net, struct 
iter_state *st) { + struct nfnl_log_net *log; if (!st) return NULL; + log = nfnl_log_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + struct hlist_head *head = &log->instance_table[st->bucket]; + + if (!hlist_empty(head)) + return rcu_dereference_bh(hlist_first_rcu(head)); } return NULL; } -static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +static struct hlist_node *get_next(struct net *net, struct iter_state *st, + struct hlist_node *h) { h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { + struct nfnl_log_net *log; + struct hlist_head *head; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + log = nfnl_log_pernet(net); + head = &log->instance_table[st->bucket]; + h = rcu_dereference_bh(hlist_first_rcu(head)); } return h; } -static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +static struct hlist_node *get_idx(struct net *net, struct iter_state *st, + loff_t pos) { struct hlist_node *head; - head = get_first(st); + head = get_first(net, st); if (head) - while (pos && (head = get_next(st, head))) + while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) +static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(rcu_bh) { rcu_read_lock_bh(); - return get_idx(seq->private, *pos); + return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - return get_next(s->private, v); + return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) @@ -922,7 +1000,7 @@ static int seq_show(struct seq_file *s, void *v) return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", inst->group_num, - inst->peer_pid, inst->qlen, + inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, inst->flushtimeout, atomic_read(&inst->use)); } @@ -936,8 +1014,8 @@ static const struct seq_operations nful_seq_ops = { static int nful_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &nful_seq_ops, - sizeof(struct iter_state)); + return seq_open_net(inode, file, &nful_seq_ops, + sizeof(struct iter_state)); } static const struct file_operations nful_file_ops = { @@ -945,47 +1023,69 @@ static const struct file_operations nful_file_ops = { .open = nful_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_log_init(void) +static int __net_init nfnl_log_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_log_net *log = nfnl_log_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); + INIT_HLIST_HEAD(&log->instance_table[i]); + spin_lock_init(&log->instances_lock); - /* it's not really all that important to have a random value, so - * we can do this from the init function, even if there hasn't - * been that much entropy yet */ - get_random_bytes(&hash_init, sizeof(hash_init)); +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_log", 0440, + net->nf.proc_netfilter, &nful_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_log_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + 
remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +#endif + nf_log_unset(net, &nfulnl_logger); +} + +static struct pernet_operations nfnl_log_net_ops = { + .init = nfnl_log_net_init, + .exit = nfnl_log_net_exit, + .id = &nfnl_log_net_id, + .size = sizeof(struct nfnl_log_net), +}; + +static int __init nfnetlink_log_init(void) +{ + int status = -ENOMEM; netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { - printk(KERN_ERR "log: failed to create netlink socket\n"); + pr_err("log: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); if (status < 0) { - printk(KERN_ERR "log: failed to register logger\n"); + pr_err("log: failed to register logger\n"); goto cleanup_subsys; } -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_log", 0440, - proc_net_netfilter, &nful_file_ops)) + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("log: failed to register pernet ops\n"); goto cleanup_logger; -#endif + } return status; -#ifdef CONFIG_PROC_FS cleanup_logger: nf_log_unregister(&nfulnl_logger); -#endif cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: @@ -995,10 +1095,8 @@ cleanup_netlink_notifier: static void __exit nfnetlink_log_fini(void) { + unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_log", proc_net_netfilter); -#endif nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue_core.c index a80b0cb03f1..108120f216b 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -29,7 +29,10 @@ #include <linux/netfilter/nfnetlink_queue.h> #include <linux/list.h> #include <net/sock.h> +#include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> +#include <net/netns/generic.h> +#include <net/netfilter/nfnetlink_queue.h> #include <linux/atomic.h> @@ -39,11 +42,19 @@ #define NFQNL_QMAX_DEFAULT 1024 +/* We're using struct nlattr which has 16bit nla_len. Note that nla_len + * includes the header length. Thus, the maximum packet length that we + * support is 65531 bytes. We send truncated packets if the specified length + * is larger than that. Userspace can check for presence of NFQA_CAP_LEN + * attribute to detect truncation. + */ +#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) + struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; - int peer_pid; + int peer_portid; unsigned int queue_maxlen; unsigned int copy_range; unsigned int queue_dropped; @@ -52,6 +63,7 @@ struct nfqnl_instance { u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; + u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ /* * Following fields are dirtied for each queued packet, * keep them in same cache line if possible. 
@@ -64,25 +76,32 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); -static DEFINE_SPINLOCK(instances_lock); +static int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; +struct nfnl_queue_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ + return net_generic(net, nfnl_queue_net_id); +} static inline u_int8_t instance_hashfn(u_int16_t queue_num) { - return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; + return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * -instance_lookup(u_int16_t queue_num) +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; - struct hlist_node *pos; struct nfqnl_instance *inst; - head = &instance_table[instance_hashfn(queue_num)]; - hlist_for_each_entry_rcu(inst, pos, head, hlist) { + head = &q->instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; } @@ -90,14 +109,15 @@ instance_lookup(u_int16_t queue_num) } static struct nfqnl_instance * -instance_create(u_int16_t queue_num, int pid) +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, + int portid) { struct nfqnl_instance *inst; unsigned int h; int err; - spin_lock(&instances_lock); - if (instance_lookup(queue_num)) { + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } @@ -109,9 +129,9 @@ instance_create(u_int16_t queue_num, int pid) } inst->queue_num = queue_num; - inst->peer_pid = pid; + inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = 0xfffff; + inst->copy_range = NFQNL_MAX_COPY_RANGE; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); @@ -122,16 +142,16 @@ instance_create(u_int16_t queue_num, int pid) } h = instance_hashfn(queue_num); - hlist_add_head_rcu(&inst->hlist, &instance_table[h]); + hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); return ERR_PTR(err); } @@ -157,11 +177,11 @@ __instance_destroy(struct nfqnl_instance *inst) } static void -instance_destroy(struct nfqnl_instance *inst) +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { - spin_lock(&instances_lock); + spin_lock(&q->instances_lock); __instance_destroy(inst); - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); } static inline void @@ -216,14 +236,56 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_unlock_bh(&queue->lock); } +static int +nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, + bool csum_verify) +{ + __u32 flags = 0; + + if (packet->ip_summed == CHECKSUM_PARTIAL) + flags = NFQA_SKB_CSUMNOTREADY; + else if (csum_verify) + flags = NFQA_SKB_CSUM_NOTVERIFIED; + + if (skb_is_gso(packet)) + flags |= NFQA_SKB_GSO; + + return flags ? 
nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ + const struct cred *cred; + + if (sk->sk_state == TCP_TIME_WAIT) + return 0; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + cred = sk->sk_socket->file->f_cred; + if (nla_put_be32(skb, NFQA_UID, + htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) + goto nla_put_failure; + if (nla_put_be32(skb, NFQA_GID, + htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) + goto nla_put_failure; + } + read_unlock_bh(&sk->sk_callback_lock); + return 0; + +nla_put_failure: + read_unlock_bh(&sk->sk_callback_lock); + return -1; +} + static struct sk_buff * -nfqnl_build_packet_message(struct nfqnl_instance *queue, +nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { - sk_buff_data_t old_tail; size_t size; - size_t data_len = 0; + size_t data_len = 0, cap_len = 0; + unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; @@ -232,8 +294,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; + struct nf_conn *ct = NULL; + enum ip_conntrack_info uninitialized_var(ctinfo); + bool csum_verify; - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -243,7 +308,17 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) - + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + + nla_total_size(sizeof(u_int32_t)); /* cap_len */ + + if (entskb->tstamp.tv64) + size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + if (entry->hook <= NF_INET_FORWARD || + (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) + csum_verify = !skb_csum_unnecessary(entskb); + else + csum_verify = false; outdev = entry->outdev; @@ -253,28 +328,46 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, break; case NFQNL_COPY_PACKET: - if (entskb->ip_summed == CHECKSUM_PARTIAL && + if (!(queue->flags & NFQA_CFG_F_GSO) && + entskb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(entskb)) return NULL; data_len = ACCESS_ONCE(queue->copy_range); - if (data_len == 0 || data_len > entskb->len) + if (data_len > entskb->len) data_len = entskb->len; - size += nla_total_size(data_len); + hlen = skb_zerocopy_headlen(entskb); + hlen = min_t(unsigned int, hlen, data_len); + size += sizeof(struct nlattr) + hlen; + cap_len = entskb->len; break; } + if (queue->flags & NFQA_CFG_F_CONNTRACK) + ct = nfqnl_ct_get(entskb, &size, &ctinfo); - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - goto nlmsg_failure; + if (queue->flags & NFQA_CFG_F_UID_GID) { + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t))); /* gid */ + } + + skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, + GFP_ATOMIC); + if (!skb) { + skb_tx_error(entskb); + return NULL; + } - old_tail = skb->tail; - nlh = NLMSG_PUT(skb, 0, 0, + nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, - sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); + sizeof(struct 
nfgenmsg), 0); + if (!nlh) { + skb_tx_error(entskb); + kfree_skb(skb); + return NULL; + } + nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = entry->pf; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(queue->queue_num); @@ -288,66 +381,79 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, indev = entry->indev; if (indev) { #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)); + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) + goto nla_put_failure; #else if (entry->pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(indev->ifindex)); + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ - NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, - htonl(br_port_get_rcu(indev)->br->dev->ifindex)); + nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex))) + goto nla_put_failure; } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ - NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, - htonl(indev->ifindex)); - if (entskb->nf_bridge && entskb->nf_bridge->physindev) - NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(entskb->nf_bridge->physindev->ifindex)); + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; + if (entskb->nf_bridge && entskb->nf_bridge->physindev && + nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(entskb->nf_bridge->physindev->ifindex))) + goto nla_put_failure; } #endif } if (outdev) { #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)); + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) + goto nla_put_failure; #else if (entry->pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(outdev->ifindex)); + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ - NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, - htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); + nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) + goto nla_put_failure; } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ - NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, - htonl(outdev->ifindex)); - if (entskb->nf_bridge && entskb->nf_bridge->physoutdev) - NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(entskb->nf_bridge->physoutdev->ifindex)); + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; + if (entskb->nf_bridge && entskb->nf_bridge->physoutdev && + nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(entskb->nf_bridge->physoutdev->ifindex))) + goto nla_put_failure; } #endif } - if (entskb->mark) - NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark)); + if (entskb->mark && + nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) + goto nla_put_failure; if (indev && entskb->dev && entskb->mac_header != entskb->network_header) { struct nfqnl_msg_packet_hw phw; - int len = dev_parse_header(entskb, phw.hw_addr); + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(entskb, 
phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); - NLA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); + if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) + goto nla_put_failure; } } @@ -357,82 +463,80 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, ts.sec = cpu_to_be64(tv.tv_sec); ts.usec = cpu_to_be64(tv.tv_usec); - NLA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); + if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; } + if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && + nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) + goto nla_put_failure; + + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) + goto nla_put_failure; + + if (cap_len > data_len && + nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + goto nla_put_failure; + + if (nfqnl_put_packet_info(skb, entskb, csum_verify)) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; - int sz = nla_attr_size(data_len); - if (skb_tailroom(skb) < nla_total_size(data_len)) { - printk(KERN_WARNING "nf_queue: no tailroom!\n"); - goto nlmsg_failure; - } + if (skb_tailroom(skb) < sizeof(*nla) + hlen) + goto nla_put_failure; - nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len)); + nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; - nla->nla_len = sz; + nla->nla_len = nla_attr_size(data_len); - if (skb_copy_bits(entskb, 0, nla_data(nla), data_len)) - BUG(); + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; } - nlh->nlmsg_len = skb->tail - old_tail; + nlh->nlmsg_len = skb->len; return skb; -nlmsg_failure: nla_put_failure: - if (skb) - kfree_skb(skb); - if (net_ratelimit()) - printk(KERN_ERR "nf_queue: error creating packet message\n"); + skb_tx_error(entskb); + kfree_skb(skb); + net_err_ratelimited("nf_queue: error creating packet message\n"); return NULL; } static int -nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry) { struct sk_buff *nskb; - struct nfqnl_instance *queue; int err = -ENOBUFS; __be32 *packet_id_ptr; + int failopen = 0; - /* rcu_read_lock()ed by nf_hook_slow() */ - queue = instance_lookup(queuenum); - if (!queue) { - err = -ESRCH; - goto err_out; - } - - if (queue->copy_mode == NFQNL_COPY_NONE) { - err = -EINVAL; - goto err_out; - } - - nskb = nfqnl_build_packet_message(queue, entry, &packet_id_ptr); + nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; goto err_out; } spin_lock_bh(&queue->lock); - if (!queue->peer_pid) { - err = -EINVAL; - goto err_out_free_nskb; - } if (queue->queue_total >= queue->queue_maxlen) { - queue->queue_dropped++; - if (net_ratelimit()) - printk(KERN_WARNING "nf_queue: full at %d entries, " - "dropping packets(s).\n", - queue->queue_total); + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_dropped++; + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + } goto err_out_free_nskb; } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); if (err < 0) { queue->queue_user_dropped++; goto err_out_unlock; @@ -447,17 +551,152 @@ err_out_free_nskb: kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); + if 
(failopen) + nf_reinject(entry, NF_ACCEPT); err_out: return err; } +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ + struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); + if (entry) { + if (nf_queue_entry_get_refs(entry)) + return entry; + kfree(entry); + } + return NULL; +} + +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. + */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, + struct sk_buff *skb, struct nf_queue_entry *entry) +{ + int ret = -ENOMEM; + struct nf_queue_entry *entry_seg; + + nf_bridge_adjust_segmented_data(skb); + + if (skb->next == NULL) { /* last packet, no need to copy entry */ + struct sk_buff *gso_skb = entry->skb; + entry->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry); + if (ret) + entry->skb = gso_skb; + return ret; + } + + skb->next = NULL; + + entry_seg = nf_queue_entry_dup(entry); + if (entry_seg) { + entry_seg->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry_seg); + if (ret) + free_entry(entry_seg); + } + return ret; +} + static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e) +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + unsigned int queued; + struct nfqnl_instance *queue; + struct sk_buff *skb, *segs; + int err = -ENOBUFS; + struct net *net = dev_net(entry->indev ? + entry->indev : entry->outdev); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(q, queuenum); + if (!queue) + return -ESRCH; + + if (queue->copy_mode == NFQNL_COPY_NONE) + return -EINVAL; + + skb = entry->skb; + + switch (entry->pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + return __nfqnl_enqueue_packet(net, queue, entry); + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to + * mean 'ignore this hook'. 
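The NFQNL_MAX_COPY_RANGE comment near the top of this file explains the 65531-byte ceiling: nla_len is a 16-bit field that also counts the attribute header, so the largest payload is 0xffff minus NLA_HDRLEN, and NFQA_CAP_LEN lets userspace detect truncation. A few lines of standalone arithmetic confirming the constant, with the attribute header laid out as in the uapi netlink header:

#include <stdio.h>
#include <stdint.h>

struct nlattr {			/* same layout as <linux/netlink.h> */
	uint16_t nla_len;	/* header + payload, 16 bits total */
	uint16_t nla_type;
};

#define NLA_ALIGNTO	4
#define NLA_ALIGN(len)	(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
#define NLA_HDRLEN	((int)NLA_ALIGN(sizeof(struct nlattr)))

int main(void)
{
	unsigned int max_copy = 0xffff - NLA_HDRLEN;

	printf("NLA_HDRLEN  = %d\n", NLA_HDRLEN);	/* 4 */
	printf("max payload = %u\n", max_copy);		/* 65531 */

	/* A longer packet is truncated; NFQA_CAP_LEN carries the
	 * original length so userspace can notice, e.g.: */
	unsigned int pkt_len = 70000, data_len = pkt_len;

	if (data_len > max_copy)
		data_len = max_copy;
	if (pkt_len > data_len)
		printf("truncated: sent %u of %u bytes\n", data_len, pkt_len);
	return 0;
}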
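__nfqnl_enqueue_packet() above also gains a fail-open mode: when the queue is full and userspace requested NFQA_CFG_F_FAIL_OPEN, the packet is reinjected with NF_ACCEPT instead of being counted as a drop. A minimal sketch of that decision path, with made-up limits in place of the instance's tunables:

#include <stdio.h>

#define QUEUE_MAXLEN	1024
#define F_FAIL_OPEN	(1 << 0)	/* stands in for NFQA_CFG_F_FAIL_OPEN */

enum verdict { DROP, ACCEPT, QUEUED };

static enum verdict enqueue_sketch(unsigned int queue_total,
				   unsigned int flags)
{
	if (queue_total >= QUEUE_MAXLEN) {
		if (flags & F_FAIL_OPEN)
			return ACCEPT;	/* reinject, let traffic flow */
		return DROP;		/* classic behaviour: count a drop */
	}
	return QUEUED;			/* hand the packet to userspace */
}

int main(void)
{
	printf("full, fail-open: %d\n",
	       enqueue_sketch(QUEUE_MAXLEN, F_FAIL_OPEN));	/* ACCEPT */
	printf("full, default:   %d\n",
	       enqueue_sketch(QUEUE_MAXLEN, 0));		/* DROP */
	printf("room available:  %d\n", enqueue_sketch(10, 0));	/* QUEUED */
	return 0;
}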
+ */ + if (IS_ERR(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + if (err == 0) + err = __nfqnl_enqueue_packet_gso(net, queue, + segs, entry); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + if (err) /* some segments are already queued */ + free_entry(entry); + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; - int diff; - diff = data_len - e->skb->len; if (diff < 0) { if (pskb_trim(e->skb, data_len)) return -ENOMEM; @@ -500,9 +739,8 @@ nfqnl_set_mode(struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: queue->copy_mode = mode; - /* we're using struct nlattr which has 16bit nla_len */ - if (range > 0xffff) - queue->copy_range = 0xffff; + if (range == 0 || range > NFQNL_MAX_COPY_RANGE) + queue->copy_range = NFQNL_MAX_COPY_RANGE; else queue->copy_range = range; break; @@ -541,18 +779,18 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void -nfqnl_dev_drop(int ifindex) +nfqnl_dev_drop(struct net *net, int ifindex) { int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp; struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &q->instance_table[i]; - hlist_for_each_entry_rcu(inst, tmp, head, hlist) + hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); } @@ -565,14 +803,11 @@ static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) - nfqnl_dev_drop(dev->ifindex); + nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } @@ -585,24 +820,24 @@ nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; - /* destroy all instances for this pid */ - spin_lock(&instances_lock); + /* destroy all instances for this portid */ + spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp, *t2; + struct hlist_node *t2; struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &q->instance_table[i]; - hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((n->net == &init_net) && - (n->pid == inst->peer_pid)) + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } @@ -615,6 +850,8 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, + [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { @@ -622,15 +859,16 
@@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_MARK] = { .type = NLA_U32 }, }; -static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlpid) +static struct nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) { struct nfqnl_instance *queue; - queue = instance_lookup(queue_num); + queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); - if (queue->peer_pid != nlpid) + if (queue->peer_portid != nlportid) return ERR_PTR(-EPERM); return queue; @@ -662,7 +900,7 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_queue_entry *entry, *tmp; unsigned int verdict, maxid; struct nfqnl_msg_verdict_hdr *vhdr; @@ -670,7 +908,11 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, LIST_HEAD(batch_list); u16 queue_num = ntohs(nfmsg->res_id); - queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).pid); + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); @@ -708,18 +950,23 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict; struct nf_queue_entry *entry; + enum ip_conntrack_info uninitialized_var(ctinfo); + struct nf_conn *ct = NULL; - queue = instance_lookup(queue_num); - if (!queue) + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); - queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).pid); + queue = instance_lookup(q, queue_num); + if (!queue) + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); @@ -733,10 +980,25 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if (entry == NULL) return -ENOENT; + if (nfqa[NFQA_CT]) { + ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); + if (ct && nfqa[NFQA_EXP]) { + nfqnl_attach_expect(ct, nfqa[NFQA_EXP], + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } + } + if (nfqa[NFQA_PAYLOAD]) { + u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); + int diff = payload_len - entry->skb->len; + if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), - nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0) + payload_len, entry, diff) < 0) verdict = NF_DROP; + + if (ct) + nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff); } if (nfqa[NFQA_MARK]) @@ -760,7 +1022,6 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { }; static const struct nf_queue_handler nfqh = { - .name = "nf_queue", .outfn = &nfqnl_enqueue_packet, }; @@ -769,29 +1030,27 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; struct nfqnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); int ret = 0; if (nfqa[NFQA_CFG_CMD]) { cmd = 
nla_data(nfqa[NFQA_CFG_CMD]); - /* Commands without queue context - might sleep */ + /* Obsolete commands without queue context */ switch (cmd->command) { - case NFQNL_CFG_CMD_PF_BIND: - return nf_register_queue_handler(ntohs(cmd->pf), - &nfqh); - case NFQNL_CFG_CMD_PF_UNBIND: - return nf_unregister_queue_handler(ntohs(cmd->pf), - &nfqh); + case NFQNL_CFG_CMD_PF_BIND: return 0; + case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } rcu_read_lock(); - queue = instance_lookup(queue_num); - if (queue && queue->peer_pid != NETLINK_CB(skb).pid) { + queue = instance_lookup(q, queue_num); + if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; } @@ -803,7 +1062,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -EBUSY; goto err_out_unlock; } - queue = instance_create(queue_num, NETLINK_CB(skb).pid); + queue = instance_create(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; @@ -814,7 +1074,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -ENODEV; goto err_out_unlock; } - instance_destroy(queue); + instance_destroy(q, queue); break; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: @@ -850,6 +1110,36 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, spin_unlock_bh(&queue->lock); } + if (nfqa[NFQA_CFG_FLAGS]) { + __u32 flags, mask; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + + if (!nfqa[NFQA_CFG_MASK]) { + /* A mask is needed to specify which flags are being + * changed. + */ + ret = -EINVAL; + goto err_out_unlock; + } + + flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); + mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); + + if (flags >= NFQA_CFG_F_MAX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } + + spin_lock_bh(&queue->lock); + queue->flags &= ~mask; + queue->flags |= flags & mask; + spin_unlock_bh(&queue->lock); + } + err_out_unlock: rcu_read_unlock(); return ret; @@ -878,19 +1168,24 @@ static const struct nfnetlink_subsystem nfqnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; + struct net *net; + struct nfnl_queue_net *q; if (!st) return NULL; + net = seq_file_net(seq); + q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return instance_table[st->bucket].first; + if (!hlist_empty(&q->instance_table[st->bucket])) + return q->instance_table[st->bucket].first; } return NULL; } @@ -898,13 +1193,17 @@ static struct hlist_node *get_first(struct seq_file *seq) static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; + struct net *net = seq_file_net(seq); h = h->next; while (!h) { + struct nfnl_queue_net *q; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = instance_table[st->bucket].first; + q = nfnl_queue_pernet(net); + h = q->instance_table[st->bucket].first; } return h; } @@ -920,11 +1219,11 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) return pos ? 
NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) - __acquires(instances_lock) +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { - spin_lock(&instances_lock); - return get_idx(seq, *pos); + spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); + return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) @@ -934,9 +1233,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) } static void seq_stop(struct seq_file *s, void *v) - __releases(instances_lock) + __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { - spin_unlock(&instances_lock); + spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) @@ -945,7 +1244,7 @@ static int seq_show(struct seq_file *s, void *v) return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", inst->queue_num, - inst->peer_pid, inst->queue_total, + inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); @@ -960,7 +1259,7 @@ static const struct seq_operations nfqnl_seq_ops = { static int nfqnl_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &nfqnl_seq_ops, + return seq_open_net(inode, file, &nfqnl_seq_ops, sizeof(struct iter_state)); } @@ -969,38 +1268,65 @@ static const struct file_operations nfqnl_file_ops = { .open = nfqnl_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_queue_init(void) +static int __net_init nfnl_queue_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); + INIT_HLIST_HEAD(&q->instance_table[i]); + + spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + net->nf.proc_netfilter, &nfqnl_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nfnl_queue_net_ops = { + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ + int status = -ENOMEM; netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { - printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + pr_err("nf_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_queue", 0440, - proc_net_netfilter, &nfqnl_file_ops)) + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); goto cleanup_subsys; -#endif - + } register_netdevice_notifier(&nfqnl_dev_notifier); + nf_register_queue_handler(&nfqh); return status; -#ifdef CONFIG_PROC_FS cleanup_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); -#endif cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); return status; @@ -1008,11 +1334,9 @@ cleanup_netlink_notifier: static void __exit nfnetlink_queue_fini(void) { - 
nf_unregister_queue_handlers(&nfqh); + nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_queue", proc_net_netfilter); -#endif + unregister_pernet_subsys(&nfnl_queue_net_ops); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c new file mode 100644 index 00000000000..96cac50e0d1 --- /dev/null +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -0,0 +1,113 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nfnetlink_queue.h> + +struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size, + enum ip_conntrack_info *ctinfo) +{ + struct nfq_ct_hook *nfq_ct; + struct nf_conn *ct; + + /* rcu_read_lock()ed by __nf_queue already. */ + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return NULL; + + ct = nf_ct_get(entskb, ctinfo); + if (ct) { + if (!nf_ct_is_untracked(ct)) + *size += nfq_ct->build_size(ct); + else + ct = NULL; + } + return ct; +} + +struct nf_conn * +nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr, + enum ip_conntrack_info *ctinfo) +{ + struct nfq_ct_hook *nfq_ct; + struct nf_conn *ct; + + /* rcu_read_lock()ed by __nf_queue already. */ + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return NULL; + + ct = nf_ct_get(skb, ctinfo); + if (ct && !nf_ct_is_untracked(ct)) + nfq_ct->parse(attr, ct); + + return ct; +} + +int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct nfq_ct_hook *nfq_ct; + struct nlattr *nest_parms; + u_int32_t tmp; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return 0; + + nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nfq_ct->build(skb, ct) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + tmp = ctinfo; + if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int diff) +{ + struct nfq_ct_hook *nfq_ct; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return; + + if ((ct->status & IPS_NAT_MASK) && diff) + nfq_ct->seq_adjust(skb, ct, ctinfo, diff); +} + +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + struct nfq_ct_hook *nfq_ct; + + if (nf_ct_is_untracked(ct)) + return 0; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return -EOPNOTSUPP; + + return nfq_ct->attach_expect(attr, ct, portid, report); +} diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c new file mode 100644 index 00000000000..4fb6ee2c110 --- /dev/null +++ b/net/netfilter/nft_bitwise.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 
2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_bitwise { + enum nft_registers sreg:8; + enum nft_registers dreg:8; + u8 len; + struct nft_data mask; + struct nft_data xor; +}; + +static void nft_bitwise_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + const struct nft_data *src = &data[priv->sreg]; + struct nft_data *dst = &data[priv->dreg]; + unsigned int i; + + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) { + dst->data[i] = (src->data[i] & priv->mask.data[i]) ^ + priv->xor.data[i]; + } +} + +static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { + [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, + [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, + [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, +}; + +static int nft_bitwise_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_bitwise *priv = nft_expr_priv(expr); + struct nft_data_desc d1, d2; + int err; + + if (tb[NFTA_BITWISE_SREG] == NULL || + tb[NFTA_BITWISE_DREG] == NULL || + tb[NFTA_BITWISE_LEN] == NULL || + tb[NFTA_BITWISE_MASK] == NULL || + tb[NFTA_BITWISE_XOR] == NULL) + return -EINVAL; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + + err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]); + if (err < 0) + return err; + if (d1.len != priv->len) + return -EINVAL; + + err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]); + if (err < 0) + return err; + if (d2.len != priv->len) + return -EINVAL; + + return 0; +} + +static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_bitwise_type; +static const struct nft_expr_ops nft_bitwise_ops = { + .type = &nft_bitwise_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), + .eval = nft_bitwise_eval, + .init = nft_bitwise_init, + .dump = nft_bitwise_dump, +}; + +static struct nft_expr_type nft_bitwise_type __read_mostly = { + .name = "bitwise", + .ops = &nft_bitwise_ops, + .policy = 
nft_bitwise_policy, + .maxattr = NFTA_BITWISE_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_bitwise_module_init(void) +{ + return nft_register_expr(&nft_bitwise_type); +} + +void nft_bitwise_module_exit(void) +{ + nft_unregister_expr(&nft_bitwise_type); +} diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c new file mode 100644 index 00000000000..c39ed8d29df --- /dev/null +++ b/net/netfilter/nft_byteorder.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_byteorder { + enum nft_registers sreg:8; + enum nft_registers dreg:8; + enum nft_byteorder_ops op:8; + u8 len; + u8 size; +}; + +static void nft_byteorder_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_byteorder *priv = nft_expr_priv(expr); + struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg]; + union { u32 u32; u16 u16; } *s, *d; + unsigned int i; + + s = (void *)src->data; + d = (void *)dst->data; + + switch (priv->size) { + case 4: + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + for (i = 0; i < priv->len / 4; i++) + d[i].u32 = ntohl((__force __be32)s[i].u32); + break; + case NFT_BYTEORDER_HTON: + for (i = 0; i < priv->len / 4; i++) + d[i].u32 = (__force __u32)htonl(s[i].u32); + break; + } + break; + case 2: + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + for (i = 0; i < priv->len / 2; i++) + d[i].u16 = ntohs((__force __be16)s[i].u16); + break; + case NFT_BYTEORDER_HTON: + for (i = 0; i < priv->len / 2; i++) + d[i].u16 = (__force __u16)htons(s[i].u16); + break; + } + break; + } +} + +static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { + [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_OP] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 }, +}; + +static int nft_byteorder_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_byteorder *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_BYTEORDER_SREG] == NULL || + tb[NFTA_BYTEORDER_DREG] == NULL || + tb[NFTA_BYTEORDER_LEN] == NULL || + tb[NFTA_BYTEORDER_SIZE] == NULL || + tb[NFTA_BYTEORDER_OP] == NULL) + return -EINVAL; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + case NFT_BYTEORDER_HTON: + break; + default: + return -EINVAL; + } + + priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); + if (priv->len == 0 || priv->len > 
FIELD_SIZEOF(struct nft_data, data)) + return -EINVAL; + + priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); + switch (priv->size) { + case 2: + case 4: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_byteorder *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_byteorder_type; +static const struct nft_expr_ops nft_byteorder_ops = { + .type = &nft_byteorder_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), + .eval = nft_byteorder_eval, + .init = nft_byteorder_init, + .dump = nft_byteorder_dump, +}; + +static struct nft_expr_type nft_byteorder_type __read_mostly = { + .name = "byteorder", + .ops = &nft_byteorder_ops, + .policy = nft_byteorder_policy, + .maxattr = NFTA_BYTEORDER_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_byteorder_module_init(void) +{ + return nft_register_expr(&nft_byteorder_type); +} + +void nft_byteorder_module_exit(void) +{ + nft_unregister_expr(&nft_byteorder_type); +} diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c new file mode 100644 index 00000000000..e2b3f51c81f --- /dev/null +++ b/net/netfilter/nft_cmp.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
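Back in nfnetlink_queue's NFQA_CFG_FLAGS handling, the comment notes that NFQA_CFG_MASK is mandatory so userspace names exactly which flag bits it is changing; the update is then a masked read-modify-write. The same two statements in isolation, as a standalone sketch:

#include <stdio.h>
#include <stdint.h>

/* Only bits named in @mask change, to the values given in @flags. */
static uint32_t apply_masked(uint32_t cur, uint32_t flags, uint32_t mask)
{
	cur &= ~mask;
	cur |= flags & mask;
	return cur;
}

int main(void)
{
	uint32_t q = 0x5;		/* bits 0 and 2 set */

	/* Set bit 1, clear bit 2, leave bit 0 untouched. */
	q = apply_masked(q, 0x2, 0x6);
	printf("0x%x\n", (unsigned)q);	/* 0x3 */
	return 0;
}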
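The two expression modules above are small enough to restate: nft_bitwise_eval computes dst = (src & mask) ^ xor over the register words, and nft_byteorder_eval byte-swaps 2- or 4-byte lanes in place. A userspace sketch of both evaluations over one 16-byte register, using ntohl() from <arpa/inet.h>:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define REG_WORDS 4	/* one 16-byte register, viewed as u32 words */

/* dst = (src & mask) ^ xor, word by word — what nft_bitwise_eval does. */
static void bitwise_eval(uint32_t *dst, const uint32_t *src,
			 const uint32_t *mask, const uint32_t *xor)
{
	for (int i = 0; i < REG_WORDS; i++)
		dst[i] = (src[i] & mask[i]) ^ xor[i];
}

/* The size==4, NFT_BYTEORDER_NTOH case: swap each 4-byte lane. */
static void byteorder_ntoh32(uint32_t *dst, const uint32_t *src, int len)
{
	for (int i = 0; i < len / 4; i++)
		dst[i] = ntohl(src[i]);
}

int main(void)
{
	uint32_t src[REG_WORDS]  = { 0xdeadbeef, 0, 0, 0 };
	uint32_t mask[REG_WORDS] = { 0x00ffff00, 0, 0, 0 };
	uint32_t xorv[REG_WORDS] = { 0x11000011, 0, 0, 0 };
	uint32_t dst[REG_WORDS];

	bitwise_eval(dst, src, mask, xorv);
	printf("bitwise: 0x%08x\n", (unsigned)dst[0]);	/* (src & mask) ^ xor */

	byteorder_ntoh32(dst, src, 4);
	printf("ntohl:   0x%08x\n", (unsigned)dst[0]);
	return 0;
}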
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_cmp_expr { + struct nft_data data; + enum nft_registers sreg:8; + u8 len; + enum nft_cmp_ops op:8; +}; + +static void nft_cmp_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + int d; + + d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len); + switch (priv->op) { + case NFT_CMP_EQ: + if (d != 0) + goto mismatch; + break; + case NFT_CMP_NEQ: + if (d == 0) + goto mismatch; + break; + case NFT_CMP_LT: + if (d == 0) + goto mismatch; + case NFT_CMP_LTE: + if (d > 0) + goto mismatch; + break; + case NFT_CMP_GT: + if (d == 0) + goto mismatch; + case NFT_CMP_GTE: + if (d < 0) + goto mismatch; + break; + } + return; + +mismatch: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = { + [NFTA_CMP_SREG] = { .type = NLA_U32 }, + [NFTA_CMP_OP] = { .type = NLA_U32 }, + [NFTA_CMP_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_cmp_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc; + int err; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); + + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); + BUG_ON(err < 0); + + priv->len = desc.len; + return 0; +} + +static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_cmp_type; +static const struct nft_expr_ops nft_cmp_ops = { + .type = &nft_cmp_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), + .eval = nft_cmp_eval, + .init = nft_cmp_init, + .dump = nft_cmp_dump, +}; + +static int nft_cmp_fast_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc; + struct nft_data data; + u32 mask; + int err; + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); + BUG_ON(err < 0); + desc.len *= BITS_PER_BYTE; + + mask = nft_cmp_fast_mask(desc.len); + priv->data = data.data[0] & mask; + priv->len = desc.len; + return 0; +} + +static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_data data; + + if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ))) + goto nla_put_failure; + + data.data[0] = priv->data; + if (nft_data_dump(skb, NFTA_CMP_DATA, &data, + NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0) + goto nla_put_failure; 
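The switch in nft_cmp_eval above leans on deliberate fallthrough: NFT_CMP_LT first rejects equality and then falls into the NFT_CMP_LTE test, and NFT_CMP_GT/GTE pair up the same way, so each strict operator reuses the non-strict operator's bound check. The same logic written as a standalone predicate over a three-way compare result d:

#include <stdio.h>
#include <stdbool.h>

enum cmp_op { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LTE, CMP_GT, CMP_GTE };

/* Mirrors nft_cmp_eval's fallthrough switch: true means "match". */
static bool cmp_match(int d, enum cmp_op op)
{
	switch (op) {
	case CMP_EQ:
		return d == 0;
	case CMP_NEQ:
		return d != 0;
	case CMP_LT:
		if (d == 0)
			return false;	/* strict: equality is a mismatch */
		/* fallthrough */
	case CMP_LTE:
		return d <= 0;
	case CMP_GT:
		if (d == 0)
			return false;	/* strict: equality is a mismatch */
		/* fallthrough */
	case CMP_GTE:
		return d >= 0;
	}
	return false;
}

int main(void)
{
	const int ds[] = { -1, 0, 1 };	/* below, equal, above */

	for (int op = CMP_EQ; op <= CMP_GTE; op++)
		for (int i = 0; i < 3; i++)
			printf("op %d d %2d -> %d\n", op, ds[i],
			       cmp_match(ds[i], op));
	return 0;
}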
+ return 0; + +nla_put_failure: + return -1; +} + +const struct nft_expr_ops nft_cmp_fast_ops = { + .type = &nft_cmp_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)), + .eval = NULL, /* inlined */ + .init = nft_cmp_fast_init, + .dump = nft_cmp_fast_dump, +}; + +static const struct nft_expr_ops * +nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) +{ + struct nft_data_desc desc; + struct nft_data data; + enum nft_registers sreg; + enum nft_cmp_ops op; + int err; + + if (tb[NFTA_CMP_SREG] == NULL || + tb[NFTA_CMP_OP] == NULL || + tb[NFTA_CMP_DATA] == NULL) + return ERR_PTR(-EINVAL); + + sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + err = nft_validate_input_register(sreg); + if (err < 0) + return ERR_PTR(err); + + op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); + switch (op) { + case NFT_CMP_EQ: + case NFT_CMP_NEQ: + case NFT_CMP_LT: + case NFT_CMP_LTE: + case NFT_CMP_GT: + case NFT_CMP_GTE: + break; + default: + return ERR_PTR(-EINVAL); + } + + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); + if (err < 0) + return ERR_PTR(err); + + if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) + return &nft_cmp_fast_ops; + else + return &nft_cmp_ops; +} + +static struct nft_expr_type nft_cmp_type __read_mostly = { + .name = "cmp", + .select_ops = nft_cmp_select_ops, + .policy = nft_cmp_policy, + .maxattr = NFTA_CMP_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_cmp_module_init(void) +{ + return nft_register_expr(&nft_cmp_type); +} + +void nft_cmp_module_exit(void) +{ + nft_unregister_expr(&nft_cmp_type); +} diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c new file mode 100644 index 00000000000..1840989092e --- /dev/null +++ b/net/netfilter/nft_compat.c @@ -0,0 +1,793 @@ +/* + * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
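nft_compat, which begins below, lets nftables reuse existing xtables matches and targets by translating between the two calling conventions; in nft_target_eval the target's integer verdict lands in the nft verdict register, with XT_CONTINUE becoming NFT_CONTINUE and hotdrop forcing NF_DROP. A minimal sketch of that verdict translation (constants reproduced from the uapi headers so the sketch builds standalone):

#include <stdio.h>

#define XT_CONTINUE	0xFFFFFFFF	/* from x_tables.h */
#define NF_DROP		0
#define NF_ACCEPT	1
#define NFT_CONTINUE	(-1)

/* Map an xtables target verdict into the nft verdict register,
 * the way nft_target_eval does. */
static int xt_to_nft_verdict(unsigned int ret, int hotdrop)
{
	if (hotdrop)
		ret = NF_DROP;

	switch (ret) {
	case XT_CONTINUE:
		return NFT_CONTINUE;	/* keep evaluating this rule */
	default:
		return (int)ret;	/* NF_ACCEPT, NF_DROP, ... pass through */
	}
}

int main(void)
{
	printf("%d\n", xt_to_nft_verdict(XT_CONTINUE, 0));	/* NFT_CONTINUE */
	printf("%d\n", xt_to_nft_verdict(NF_ACCEPT, 0));	/* NF_ACCEPT */
	printf("%d\n", xt_to_nft_verdict(NF_ACCEPT, 1));	/* hotdrop wins */
	return 0;
}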
+ * + * This software has been sponsored by Sophos Astaro <http://www.sophos.com> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nf_tables_compat.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <asm/uaccess.h> /* for set_fs */ +#include <net/netfilter/nf_tables.h> + +union nft_entry { + struct ipt_entry e4; + struct ip6t_entry e6; +}; + +static inline void +nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +{ + par->target = xt; + par->targinfo = xt_info; + par->hotdrop = false; +} + +static void nft_target_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + void *info = nft_expr_priv(expr); + struct xt_target *target = expr->ops->data; + struct sk_buff *skb = pkt->skb; + int ret; + + nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + + ret = target->target(skb, &pkt->xt); + + if (pkt->xt.hotdrop) + ret = NF_DROP; + + switch(ret) { + case XT_CONTINUE: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + break; + default: + data[NFT_REG_VERDICT].verdict = ret; + break; + } + return; +} + +static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { + [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_TARGET_REV] = { .type = NLA_U32 }, + [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, +}; + +static void +nft_target_set_tgchk_param(struct xt_tgchk_param *par, + const struct nft_ctx *ctx, + struct xt_target *target, void *info, + union nft_entry *entry, u8 proto, bool inv) +{ + par->net = &init_net; + par->table = ctx->table->name; + switch (ctx->afi->family) { + case AF_INET: + entry->e4.ip.proto = proto; + entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; + break; + case AF_INET6: + entry->e6.ipv6.proto = proto; + entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; + break; + } + par->entryinfo = entry; + par->target = target; + par->targinfo = info; + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + par->hook_mask = 1 << ops->hooknum; + } + par->family = ctx->afi->family; +} + +static void target_compat_from_user(struct xt_target *t, void *in, void *out) +{ +#ifdef CONFIG_COMPAT + if (t->compat_from_user) { + int pad; + + t->compat_from_user(out, in); + pad = XT_ALIGN(t->targetsize) - t->targetsize; + if (pad > 0) + memset(out + t->targetsize, 0, pad); + } else +#endif + memcpy(out, in, XT_ALIGN(t->targetsize)); +} + +static inline int nft_compat_target_offset(struct xt_target *target) +{ +#ifdef CONFIG_COMPAT + return xt_compat_target_offset(target); +#else + return 0; +#endif +} + +static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { + [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 }, + [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, +}; + +static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +{ + struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 flags; + int err; + + err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr, + nft_rule_compat_policy); + if (err < 0) + return err; + + if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); + if (flags & ~NFT_RULE_COMPAT_F_MASK) + return -EINVAL; + if (flags & NFT_RULE_COMPAT_F_INV) + *inv = true; + + *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + return 0; +} + +static int +nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + void *info = nft_expr_priv(expr); + struct xt_target *target = expr->ops->data; + struct xt_tgchk_param par; + size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); + u8 proto = 0; + bool inv = false; + union nft_entry e = {}; + int ret; + + target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); + + if (ctx->nla[NFTA_RULE_COMPAT]) { + ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); + if (ret < 0) + goto err; + } + + nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + + ret = xt_check_target(&par, size, proto, inv); + if (ret < 0) + goto err; + + /* The standard target cannot be used */ + if (target->target == NULL) { + ret = -EINVAL; + goto err; + } + + return 0; +err: + module_put(target->me); + return ret; +} + +static void +nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgdtor_param par; + + par.net = ctx->net; + par.target = target; + par.targinfo = info; + par.family = ctx->afi->family; + if (par.target->destroy != NULL) + par.target->destroy(&par); + + module_put(target->me); +} + +static int +target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in) +{ + int ret; + +#ifdef CONFIG_COMPAT + if (t->compat_to_user) { + mm_segment_t old_fs; + void *out; + + out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC); + if (out == NULL) + return -ENOMEM; + + /* We want to reuse existing compat_to_user */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + t->compat_to_user(out, in); + set_fs(old_fs); + ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out); + kfree(out); + } else +#endif + ret = nla_put(skb, NFTA_TARGET_INFO, 
XT_ALIGN(t->targetsize), in); + + return ret; +} + +static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + + if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || + nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || + target_dump_info(skb, target, info)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_target_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + struct xt_target *target = expr->ops->data; + unsigned int hook_mask = 0; + + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + hook_mask = 1 << ops->hooknum; + if (hook_mask & target->hooks) + return 0; + + /* This target is being called from an invalid chain */ + return -EINVAL; + } + return 0; +} + +static void nft_match_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + void *info = nft_expr_priv(expr); + struct xt_match *match = expr->ops->data; + struct sk_buff *skb = pkt->skb; + bool ret; + + nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + + ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + + if (pkt->xt.hotdrop) { + data[NFT_REG_VERDICT].verdict = NF_DROP; + return; + } + + switch(ret) { + case true: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + break; + case false: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; + break; + } +} + +static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { + [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_MATCH_REV] = { .type = NLA_U32 }, + [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, +}; + +/* struct xt_mtchk_param and xt_tgchk_param look very similar */ +static void +nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, + struct xt_match *match, void *info, + union nft_entry *entry, u8 proto, bool inv) +{ + par->net = &init_net; + par->table = ctx->table->name; + switch (ctx->afi->family) { + case AF_INET: + entry->e4.ip.proto = proto; + entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; + break; + case AF_INET6: + entry->e6.ipv6.proto = proto; + entry->e6.ipv6.invflags = inv ? 
IP6T_INV_PROTO : 0; + break; + } + par->entryinfo = entry; + par->match = match; + par->matchinfo = info; + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + par->hook_mask = 1 << ops->hooknum; + } + par->family = ctx->afi->family; +} + +static void match_compat_from_user(struct xt_match *m, void *in, void *out) +{ +#ifdef CONFIG_COMPAT + if (m->compat_from_user) { + int pad; + + m->compat_from_user(out, in); + pad = XT_ALIGN(m->matchsize) - m->matchsize; + if (pad > 0) + memset(out + m->matchsize, 0, pad); + } else +#endif + memcpy(out, in, XT_ALIGN(m->matchsize)); +} + +static int +nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + void *info = nft_expr_priv(expr); + struct xt_match *match = expr->ops->data; + struct xt_mtchk_param par; + size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); + u8 proto = 0; + bool inv = false; + union nft_entry e = {}; + int ret; + + match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); + + if (ctx->nla[NFTA_RULE_COMPAT]) { + ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); + if (ret < 0) + goto err; + } + + nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + + ret = xt_check_match(&par, size, proto, inv); + if (ret < 0) + goto err; + + return 0; +err: + module_put(match->me); + return ret; +} + +static void +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + struct xt_match *match = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_mtdtor_param par; + + par.net = ctx->net; + par.match = match; + par.matchinfo = info; + par.family = ctx->afi->family; + if (par.match->destroy != NULL) + par.match->destroy(&par); + + module_put(match->me); +} + +static int +match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in) +{ + int ret; + +#ifdef CONFIG_COMPAT + if (m->compat_to_user) { + mm_segment_t old_fs; + void *out; + + out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC); + if (out == NULL) + return -ENOMEM; + + /* We want to reuse existing compat_to_user */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + m->compat_to_user(out, in); + set_fs(old_fs); + ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out); + kfree(out); + } else +#endif + ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in); + + return ret; +} + +static inline int nft_compat_match_offset(struct xt_match *match) +{ +#ifdef CONFIG_COMPAT + return xt_compat_match_offset(match); +#else + return 0; +#endif +} + +static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + void *info = nft_expr_priv(expr); + struct xt_match *match = expr->ops->data; + + if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || + nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || + match_dump_info(skb, match, info)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_match_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + struct xt_match *match = expr->ops->data; + unsigned int hook_mask = 0; + + if (ctx->chain->flags & NFT_BASE_CHAIN) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); + const struct nf_hook_ops *ops = &basechain->ops[0]; + + hook_mask = 1 << ops->hooknum; + if (hook_mask & match->hooks) + return 0; + + /* This match is being called from an invalid 
chain */ + return -EINVAL; + } + return 0; +} + +static int +nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, + int event, u16 family, const char *name, + int rev, int target) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = portid ? NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_NFT_COMPAT << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || + nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || + nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + int ret = 0, target; + struct nfgenmsg *nfmsg; + const char *fmt; + const char *name; + u32 rev; + struct sk_buff *skb2; + + if (tb[NFTA_COMPAT_NAME] == NULL || + tb[NFTA_COMPAT_REV] == NULL || + tb[NFTA_COMPAT_TYPE] == NULL) + return -EINVAL; + + name = nla_data(tb[NFTA_COMPAT_NAME]); + rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); + target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); + + nfmsg = nlmsg_data(nlh); + + switch(nfmsg->nfgen_family) { + case AF_INET: + fmt = "ipt_%s"; + break; + case AF_INET6: + fmt = "ip6t_%s"; + break; + default: + pr_err("nft_compat: unsupported protocol %d\n", + nfmsg->nfgen_family); + return -EINVAL; + } + + try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, + rev, target, &ret), + fmt, name); + + if (ret < 0) + return ret; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + /* include the best revision for this extension in the message */ + if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + NFNL_MSG_COMPAT_GET, + nfmsg->nfgen_family, + name, ret, target) <= 0) { + kfree_skb(skb2); + return -ENOSPC; + } + + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + return ret == -EAGAIN ? 
-ENOBUFS : ret; +} + +static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { + [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, + .len = NFT_COMPAT_NAME_MAX-1 }, + [NFTA_COMPAT_REV] = { .type = NLA_U32 }, + [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, +}; + +static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { + [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get, + .attr_count = NFTA_COMPAT_MAX, + .policy = nfnl_compat_policy_get }, +}; + +static const struct nfnetlink_subsystem nfnl_compat_subsys = { + .name = "nft-compat", + .subsys_id = NFNL_SUBSYS_NFT_COMPAT, + .cb_count = NFNL_MSG_COMPAT_MAX, + .cb = nfnl_nft_compat_cb, +}; + +static LIST_HEAD(nft_match_list); + +struct nft_xt { + struct list_head head; + struct nft_expr_ops ops; +}; + +static struct nft_expr_type nft_match_type; + +static const struct nft_expr_ops * +nft_match_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + struct nft_xt *nft_match; + struct xt_match *match; + char *mt_name; + __u32 rev, family; + + if (tb[NFTA_MATCH_NAME] == NULL || + tb[NFTA_MATCH_REV] == NULL || + tb[NFTA_MATCH_INFO] == NULL) + return ERR_PTR(-EINVAL); + + mt_name = nla_data(tb[NFTA_MATCH_NAME]); + rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); + family = ctx->afi->family; + + /* Re-use the existing match if it's already loaded. */ + list_for_each_entry(nft_match, &nft_match_list, head) { + struct xt_match *match = nft_match->ops.data; + + if (strcmp(match->name, mt_name) == 0 && + match->revision == rev && match->family == family) + return &nft_match->ops; + } + + match = xt_request_find_match(family, mt_name, rev); + if (IS_ERR(match)) + return ERR_PTR(-ENOENT); + + /* This is the first time we use this match, allocate operations */ + nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); + if (nft_match == NULL) + return ERR_PTR(-ENOMEM); + + nft_match->ops.type = &nft_match_type; + nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) + + nft_compat_match_offset(match)); + nft_match->ops.eval = nft_match_eval; + nft_match->ops.init = nft_match_init; + nft_match->ops.destroy = nft_match_destroy; + nft_match->ops.dump = nft_match_dump; + nft_match->ops.validate = nft_match_validate; + nft_match->ops.data = match; + + list_add(&nft_match->head, &nft_match_list); + + return &nft_match->ops; +} + +static void nft_match_release(void) +{ + struct nft_xt *nft_match, *tmp; + + list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head) + kfree(nft_match); +} + +static struct nft_expr_type nft_match_type __read_mostly = { + .name = "match", + .select_ops = nft_match_select_ops, + .policy = nft_match_policy, + .maxattr = NFTA_MATCH_MAX, + .owner = THIS_MODULE, +}; + +static LIST_HEAD(nft_target_list); + +static struct nft_expr_type nft_target_type; + +static const struct nft_expr_ops * +nft_target_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + struct nft_xt *nft_target; + struct xt_target *target; + char *tg_name; + __u32 rev, family; + + if (tb[NFTA_TARGET_NAME] == NULL || + tb[NFTA_TARGET_REV] == NULL || + tb[NFTA_TARGET_INFO] == NULL) + return ERR_PTR(-EINVAL); + + tg_name = nla_data(tb[NFTA_TARGET_NAME]); + rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); + family = ctx->afi->family; + + /* Re-use the existing target if it's already loaded. 
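Entries are keyed by name, revision and family, so rules that use the same target share a single nft_xt ops structure.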
*/ + list_for_each_entry(nft_target, &nft_target_list, head) { + struct xt_target *target = nft_target->ops.data; + + if (strcmp(target->name, tg_name) == 0 && + target->revision == rev && target->family == family) + return &nft_target->ops; + } + + target = xt_request_find_target(family, tg_name, rev); + if (IS_ERR(target)) + return ERR_PTR(-ENOENT); + + /* This is the first time we use this target, allocate operations */ + nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); + if (nft_target == NULL) + return ERR_PTR(-ENOMEM); + + nft_target->ops.type = &nft_target_type; + nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) + + nft_compat_target_offset(target)); + nft_target->ops.eval = nft_target_eval; + nft_target->ops.init = nft_target_init; + nft_target->ops.destroy = nft_target_destroy; + nft_target->ops.dump = nft_target_dump; + nft_target->ops.validate = nft_target_validate; + nft_target->ops.data = target; + + list_add(&nft_target->head, &nft_target_list); + + return &nft_target->ops; +} + +static void nft_target_release(void) +{ + struct nft_xt *nft_target, *tmp; + + list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head) + kfree(nft_target); +} + +static struct nft_expr_type nft_target_type __read_mostly = { + .name = "target", + .select_ops = nft_target_select_ops, + .policy = nft_target_policy, + .maxattr = NFTA_TARGET_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_compat_module_init(void) +{ + int ret; + + ret = nft_register_expr(&nft_match_type); + if (ret < 0) + return ret; + + ret = nft_register_expr(&nft_target_type); + if (ret < 0) + goto err_match; + + ret = nfnetlink_subsys_register(&nfnl_compat_subsys); + if (ret < 0) { + pr_err("nft_compat: cannot register with nfnetlink.\n"); + goto err_target; + } + + pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n"); + + return ret; + +err_target: + nft_unregister_expr(&nft_target_type); +err_match: + nft_unregister_expr(&nft_match_type); + return ret; +} + +static void __exit nft_compat_module_exit(void) +{ + nfnetlink_subsys_unregister(&nfnl_compat_subsys); + nft_unregister_expr(&nft_target_type); + nft_unregister_expr(&nft_match_type); + nft_match_release(); + nft_target_release(); +} + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); + +module_init(nft_compat_module_init); +module_exit(nft_compat_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_EXPR("match"); +MODULE_ALIAS_NFT_EXPR("target"); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c new file mode 100644 index 00000000000..c89ee486ce5 --- /dev/null +++ b/net/netfilter/nft_counter.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
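+ * + * The expression below keeps a per-rule byte/packet pair under a + * seqlock: eval takes the write side for every packet, while dump + * retries on the read side until it sees a consistent snapshot. A + * minimal reader sketch, mirroring nft_counter_dump(): + * + *	do { + *		seq = read_seqbegin(&priv->lock); + *		bytes = priv->bytes; + *		packets = priv->packets; + *	} while (read_seqretry(&priv->lock, seq));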
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/seqlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +struct nft_counter { + seqlock_t lock; + u64 bytes; + u64 packets; +}; + +static void nft_counter_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_counter *priv = nft_expr_priv(expr); + + write_seqlock_bh(&priv->lock); + priv->bytes += pkt->skb->len; + priv->packets++; + write_sequnlock_bh(&priv->lock); +} + +static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_counter *priv = nft_expr_priv(expr); + unsigned int seq; + u64 bytes; + u64 packets; + + do { + seq = read_seqbegin(&priv->lock); + bytes = priv->bytes; + packets = priv->packets; + } while (read_seqretry(&priv->lock, seq)); + + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes))) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { + [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, + [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, +}; + +static int nft_counter_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_counter *priv = nft_expr_priv(expr); + + if (tb[NFTA_COUNTER_PACKETS]) + priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + if (tb[NFTA_COUNTER_BYTES]) + priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + + seqlock_init(&priv->lock); + return 0; +} + +static struct nft_expr_type nft_counter_type; +static const struct nft_expr_ops nft_counter_ops = { + .type = &nft_counter_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)), + .eval = nft_counter_eval, + .init = nft_counter_init, + .dump = nft_counter_dump, +}; + +static struct nft_expr_type nft_counter_type __read_mostly = { + .name = "counter", + .ops = &nft_counter_ops, + .policy = nft_counter_policy, + .maxattr = NFTA_COUNTER_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_counter_module_init(void) +{ + return nft_register_expr(&nft_counter_type); +} + +static void __exit nft_counter_module_exit(void) +{ + nft_unregister_expr(&nft_counter_type); +} + +module_init(nft_counter_module_init); +module_exit(nft_counter_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("counter"); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c new file mode 100644 index 00000000000..cc560301624 --- /dev/null +++ b/net/netfilter/nft_ct.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
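+ * + * A single "ct" expression type covers both directions: select_ops() + * returns the get ops when userspace passes NFTA_CT_DREG (copy a + * conntrack key such as the mark or flow state into a destination + * register) and the set ops when it passes NFTA_CT_SREG (store a + * source register back into the conntrack entry, currently only for + * NFT_CT_MARK).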
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +struct nft_ct { + enum nft_ct_keys key:8; + enum ip_conntrack_dir dir:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + +static void nft_ct_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + struct nft_data *dest = &data[priv->dreg]; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_tuple *tuple; + const struct nf_conntrack_helper *helper; + long diff; + unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + + switch (priv->key) { + case NFT_CT_STATE: + if (ct == NULL) + state = NF_CT_STATE_INVALID_BIT; + else if (nf_ct_is_untracked(ct)) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_BIT(ctinfo); + dest->data[0] = state; + return; + } + + if (ct == NULL) + goto err; + + switch (priv->key) { + case NFT_CT_DIRECTION: + dest->data[0] = CTINFO2DIR(ctinfo); + return; + case NFT_CT_STATUS: + dest->data[0] = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + dest->data[0] = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + dest->data[0] = ct->secmark; + return; +#endif + case NFT_CT_EXPIRATION: + diff = (long)jiffies - (long)ct->timeout.expires; + if (diff < 0) + diff = 0; + dest->data[0] = jiffies_to_msecs(diff); + return; + case NFT_CT_HELPER: + if (ct->master == NULL) + goto err; + help = nfct_help(ct->master); + if (help == NULL) + goto err; + helper = rcu_dereference(help->helper); + if (helper == NULL) + goto err; + if (strlen(helper->name) >= sizeof(dest->data)) + goto err; + strncpy((char *)dest->data, helper->name, sizeof(dest->data)); + return; +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: { + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int size; + + if (!labels) { + memset(dest->data, 0, sizeof(dest->data)); + return; + } + + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); + size = labels->words * sizeof(long); + + memcpy(dest->data, labels->bits, size); + if (size < sizeof(dest->data)) + memset(((char *) dest->data) + size, 0, + sizeof(dest->data) - size); + return; + } +#endif + } + + tuple = &ct->tuplehash[priv->dir].tuple; + switch (priv->key) { + case NFT_CT_L3PROTOCOL: + dest->data[0] = nf_ct_l3num(ct); + return; + case NFT_CT_SRC: + memcpy(dest->data, tuple->src.u3.all, + nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); + return; + case NFT_CT_DST: + memcpy(dest->data, tuple->dst.u3.all, + nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
4 : 16); + return; + case NFT_CT_PROTOCOL: + dest->data[0] = nf_ct_protonum(ct); + return; + case NFT_CT_PROTO_SRC: + dest->data[0] = (__force __u16)tuple->src.u.all; + return; + case NFT_CT_PROTO_DST: + dest->data[0] = (__force __u16)tuple->dst.u.all; + return; + } + return; +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static void nft_ct_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; +#ifdef CONFIG_NF_CONNTRACK_MARK + u32 value = data[priv->sreg].data[0]; +#endif + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return; + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + if (ct->mark != value) { + ct->mark = value; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; +#endif + } +} + +static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { + [NFTA_CT_DREG] = { .type = NLA_U32 }, + [NFTA_CT_KEY] = { .type = NLA_U32 }, + [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, + [NFTA_CT_SREG] = { .type = NLA_U32 }, +}; + +static int nft_ct_l3proto_try_module_get(uint8_t family) +{ + int err; + + if (family == NFPROTO_INET) { + err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_l3proto_try_module_get(family); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_l3proto_module_put(NFPROTO_IPV4); +err1: + return err; +} + +static void nft_ct_l3proto_module_put(uint8_t family) +{ + if (family == NFPROTO_INET) { + nf_ct_l3proto_module_put(NFPROTO_IPV4); + nf_ct_l3proto_module_put(NFPROTO_IPV6); + } else + nf_ct_l3proto_module_put(family); +} + +static int nft_ct_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ct *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + switch (priv->key) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: +#endif + case NFT_CT_EXPIRATION: + case NFT_CT_HELPER: + if (tb[NFTA_CT_DIRECTION] != NULL) + return -EINVAL; + break; + case NFT_CT_L3PROTOCOL: + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + break; + default: + return -EOPNOTSUPP; + } + + if (tb[NFTA_CT_DIRECTION] != NULL) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } + } + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + + return 0; +} + +static int nft_ct_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ct *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + switch (priv->key) { +#ifdef 
CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + break; +#endif + default: + return -EOPNOTSUPP; + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + + return 0; +} + +static void nft_ct_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_ct_l3proto_module_put(ctx->afi->family); +} + +static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) + goto nla_put_failure; + + switch (priv->key) { + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + default: + break; + } + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_ct_type; +static const struct nft_expr_ops nft_ct_get_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_destroy, + .dump = nft_ct_get_dump, +}; + +static const struct nft_expr_ops nft_ct_set_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_set_eval, + .init = nft_ct_set_init, + .destroy = nft_ct_destroy, + .dump = nft_ct_set_dump, +}; + +static const struct nft_expr_ops * +nft_ct_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_CT_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_CT_DREG]) + return &nft_ct_get_ops; + + if (tb[NFTA_CT_SREG]) + return &nft_ct_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_ct_type __read_mostly = { + .name = "ct", + .select_ops = &nft_ct_select_ops, + .policy = nft_ct_policy, + .maxattr = NFTA_CT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_ct_module_init(void) +{ + return nft_register_expr(&nft_ct_type); +} + +static void __exit nft_ct_module_exit(void) +{ + nft_unregister_expr(&nft_ct_type); +} + +module_init(nft_ct_module_init); +module_exit(nft_ct_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("ct"); diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c new file mode 100644 index 00000000000..b6eed4d5a09 --- /dev/null +++ b/net/netfilter/nft_expr_template.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
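+ * + * This file is a skeleton for new expression types: copy it, rename + * "template" and NFTA_TEMPLATE_ATTR, and fill in the eval/init/ + * destroy/dump callbacks; priv->field is a placeholder for whatever + * state the new expression needs and is not declared here.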
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +struct nft_template { + +}; + +static void nft_template_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_template *priv = nft_expr_priv(expr); + +} + +static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = { + [NFTA_TEMPLATE_ATTR] = { .type = NLA_U32 }, +}; + +static int nft_template_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_template *priv = nft_expr_priv(expr); + + return 0; +} + +static void nft_template_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_template *priv = nft_expr_priv(expr); + +} + +static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_template *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_TEMPLATE_ATTR, htonl(priv->field))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_template_type; +static const struct nft_expr_ops nft_template_ops = { + .type = &nft_template_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_template)), + .eval = nft_template_eval, + .init = nft_template_init, + .destroy = nft_template_destroy, + .dump = nft_template_dump, +}; + +static struct nft_expr_type nft_template_type __read_mostly = { + .name = "template", + .ops = &nft_template_ops, + .policy = nft_template_policy, + .maxattr = NFTA_TEMPLATE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_template_module_init(void) +{ + return nft_register_expr(&nft_template_type); +} + +static void __exit nft_template_module_exit(void) +{ + nft_unregister_expr(&nft_template_type); +} + +module_init(nft_template_module_init); +module_exit(nft_template_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("template"); diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c new file mode 100644 index 00000000000..55c939f5371 --- /dev/null +++ b/net/netfilter/nft_exthdr.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
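+ * + * A worked example, assuming a hypothetical rule with type = + * NEXTHDR_FRAGMENT, offset = 2 and len = 2: eval locates the IPv6 + * fragment header via ipv6_find_hdr(), copies the two-byte frag_off + * field at offset 2 within that header into the destination register, + * and emits NFT_BREAK when no such extension header is present.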
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +// FIXME: +#include <net/ipv6.h> + +struct nft_exthdr { + u8 type; + u8 offset; + u8 len; + enum nft_registers dreg:8; +}; + +static void nft_exthdr_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + struct nft_data *dest = &data[priv->dreg]; + unsigned int offset = 0; + int err; + + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); + if (err < 0) + goto err; + offset += priv->offset; + + if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0) + goto err; + return; +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { + [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, + [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, + [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, + [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, +}; + +static int nft_exthdr_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_EXTHDR_DREG] == NULL || + tb[NFTA_EXTHDR_TYPE] == NULL || + tb[NFTA_EXTHDR_OFFSET] == NULL || + tb[NFTA_EXTHDR_LEN] == NULL) + return -EINVAL; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + if (priv->len == 0 || + priv->len > FIELD_SIZEOF(struct nft_data, data)) + return -EINVAL; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg))) + goto nla_put_failure; + if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_exthdr_type; +static const struct nft_expr_ops nft_exthdr_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + +static struct nft_expr_type nft_exthdr_type __read_mostly = { + .name = "exthdr", + .ops = &nft_exthdr_ops, + .policy = nft_exthdr_policy, + .maxattr = NFTA_EXTHDR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_exthdr_module_init(void) +{ + return nft_register_expr(&nft_exthdr_type); +} + +static void __exit nft_exthdr_module_exit(void) +{ + nft_unregister_expr(&nft_exthdr_type); +} + +module_init(nft_exthdr_module_init); +module_exit(nft_exthdr_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("exthdr"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c new file mode 100644 index 00000000000..4080ed6a072 --- /dev/null +++ 
b/net/netfilter/nft_hash.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/log2.h> +#include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/vmalloc.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +#define NFT_HASH_MIN_SIZE 4UL + +struct nft_hash { + struct nft_hash_table __rcu *tbl; +}; + +struct nft_hash_table { + unsigned int size; + struct nft_hash_elem __rcu *buckets[]; +}; + +struct nft_hash_elem { + struct nft_hash_elem __rcu *next; + struct nft_data key; + struct nft_data data[]; +}; + +#define nft_hash_for_each_entry(i, head) \ + for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) +#define nft_hash_for_each_entry_rcu(i, head) \ + for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) + +static u32 nft_hash_rnd __read_mostly; +static bool nft_hash_rnd_initted __read_mostly; + +static unsigned int nft_hash_data(const struct nft_data *data, + unsigned int hsize, unsigned int len) +{ + unsigned int h; + + h = jhash(data->data, len, nft_hash_rnd); + return h & (hsize - 1); +} + +static bool nft_hash_lookup(const struct nft_set *set, + const struct nft_data *key, + struct nft_data *data) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); + const struct nft_hash_elem *he; + unsigned int h; + + h = nft_hash_data(key, tbl->size, set->klen); + nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { + if (nft_data_cmp(&he->key, key, set->klen)) + continue; + if (set->flags & NFT_SET_MAP) + nft_data_copy(data, he->data); + return true; + } + return false; +} + +static void nft_hash_tbl_free(const struct nft_hash_table *tbl) +{ + kvfree(tbl); +} + +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ + return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + +static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) +{ + struct nft_hash_table *tbl; + size_t size; + + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); + if (tbl == NULL) + tbl = vzalloc(size); + if (tbl == NULL) + return NULL; + tbl->size = nbuckets; + + return tbl; +} + +static void nft_hash_chain_unzip(const struct nft_set *set, + const struct nft_hash_table *ntbl, + struct nft_hash_table *tbl, unsigned int n) +{ + struct nft_hash_elem *he, *last, *next; + unsigned int h; + + he = nft_dereference(tbl->buckets[n]); + if (he == NULL) + return; + h = nft_hash_data(&he->key, ntbl->size, set->klen); + + /* Find last element of first chain hashing to bucket h */ + last = he; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) + break; + last = he; + } + + /* Unlink first chain from the old table */ + RCU_INIT_POINTER(tbl->buckets[n], last->next); + + /* If end of chain reached, done */ + if (he == NULL) + return; + + /* Find first element of second chain hashing to bucket h */ + next = NULL; + nft_hash_for_each_entry(he, he->next) { + if (nft_hash_data(&he->key, ntbl->size, 
set->klen) != h) + continue; + next = he; + break; + } + + /* Link the two chains */ + RCU_INIT_POINTER(last->next, next); +} + +static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem *he; + unsigned int i, h; + bool complete; + + ntbl = nft_hash_tbl_alloc(tbl->size * 2); + if (ntbl == NULL) + return -ENOMEM; + + /* Link new table's buckets to first element in the old table + * hashing to the new bucket. + */ + for (i = 0; i < ntbl->size; i++) { + h = i < tbl->size ? i : i - tbl->size; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) + continue; + RCU_INIT_POINTER(ntbl->buckets[i], he); + break; + } + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + + /* Unzip interleaved hash chains */ + do { + /* Wait for readers to use new table/unzipped chains */ + synchronize_rcu(); + + complete = true; + for (i = 0; i < tbl->size; i++) { + nft_hash_chain_unzip(set, ntbl, tbl, i); + if (tbl->buckets[i] != NULL) + complete = false; + } + } while (!complete); + + nft_hash_tbl_free(tbl); + return 0; +} + +static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) +{ + struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; + struct nft_hash_elem __rcu **pprev; + unsigned int i; + + ntbl = nft_hash_tbl_alloc(tbl->size / 2); + if (ntbl == NULL) + return -ENOMEM; + + for (i = 0; i < ntbl->size; i++) { + ntbl->buckets[i] = tbl->buckets[i]; + + for (pprev = &ntbl->buckets[i]; *pprev != NULL; + pprev = &nft_dereference(*pprev)->next) + ; + RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); + } + + /* Publish new table */ + rcu_assign_pointer(priv->tbl, ntbl); + synchronize_rcu(); + + nft_hash_tbl_free(tbl); + return 0; +} + +static int nft_hash_insert(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he; + unsigned int size, h; + + if (elem->flags != 0) + return -EINVAL; + + size = sizeof(*he); + if (set->flags & NFT_SET_MAP) + size += sizeof(he->data[0]); + + he = kzalloc(size, GFP_KERNEL); + if (he == NULL) + return -ENOMEM; + + nft_data_copy(&he->key, &elem->key); + if (set->flags & NFT_SET_MAP) + nft_data_copy(he->data, &elem->data); + + h = nft_hash_data(&he->key, tbl->size, set->klen); + RCU_INIT_POINTER(he->next, tbl->buckets[h]); + rcu_assign_pointer(tbl->buckets[h], he); + + /* Expand table when exceeding 75% load */ + if (set->nelems + 1 > tbl->size / 4 * 3) + nft_hash_tbl_expand(set, priv); + + return 0; +} + +static void nft_hash_elem_destroy(const struct nft_set *set, + struct nft_hash_elem *he) +{ + nft_data_uninit(&he->key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP) + nft_data_uninit(he->data, set->dtype); + kfree(he); +} + +static void nft_hash_remove(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, __rcu **pprev; + + pprev = elem->cookie; + he = nft_dereference((*pprev)); + + RCU_INIT_POINTER(*pprev, he->next); + synchronize_rcu(); + kfree(he); + + /* Shrink table beneath 30% load */ + if (set->nelems - 1 < tbl->size * 3 / 10 && + tbl->size > NFT_HASH_MIN_SIZE) + nft_hash_tbl_shrink(set, priv); +} + +static int nft_hash_get(const struct nft_set *set, struct nft_set_elem 
*elem) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem __rcu * const *pprev; + struct nft_hash_elem *he; + unsigned int h; + + h = nft_hash_data(&elem->key, tbl->size, set->klen); + pprev = &tbl->buckets[h]; + nft_hash_for_each_entry(he, tbl->buckets[h]) { + if (nft_data_cmp(&he->key, &elem->key, set->klen)) { + pprev = &he->next; + continue; + } + + elem->cookie = (void *)pprev; + elem->flags = 0; + if (set->flags & NFT_SET_MAP) + nft_data_copy(&elem->data, he->data); + return 0; + } + return -ENOENT; +} + +static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, + struct nft_set_iter *iter) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + const struct nft_hash_elem *he; + struct nft_set_elem elem; + unsigned int i; + + for (i = 0; i < tbl->size; i++) { + nft_hash_for_each_entry(he, tbl->buckets[i]) { + if (iter->count < iter->skip) + goto cont; + + memcpy(&elem.key, &he->key, sizeof(elem.key)); + if (set->flags & NFT_SET_MAP) + memcpy(&elem.data, he->data, sizeof(elem.data)); + elem.flags = 0; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + return; +cont: + iter->count++; + } + } +} + +static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +{ + return sizeof(struct nft_hash); +} + +static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const tb[]) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_table *tbl; + unsigned int size; + + if (unlikely(!nft_hash_rnd_initted)) { + get_random_bytes(&nft_hash_rnd, 4); + nft_hash_rnd_initted = true; + } + + size = NFT_HASH_MIN_SIZE; + if (desc->size) + size = nft_hash_tbl_size(desc->size); + + tbl = nft_hash_tbl_alloc(size); + if (tbl == NULL) + return -ENOMEM; + RCU_INIT_POINTER(priv->tbl, tbl); + return 0; +} + +static void nft_hash_destroy(const struct nft_set *set) +{ + const struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct nft_hash_elem *he, *next; + unsigned int i; + + for (i = 0; i < tbl->size; i++) { + for (he = nft_dereference(tbl->buckets[i]); he != NULL; + he = next) { + next = nft_dereference(he->next); + nft_hash_elem_destroy(set, he); + } + } + kfree(tbl); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int esize; + + esize = sizeof(struct nft_hash_elem); + if (features & NFT_SET_MAP) + esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + + if (desc->size) { + est->size = sizeof(struct nft_hash) + + nft_hash_tbl_size(desc->size) * + sizeof(struct nft_hash_elem *) + + desc->size * esize; + } else { + /* Resizing happens when the load drops below 30% or goes + * above 75%. The average of 52.5% load (approximated by 50%) + * is used for the size estimation of the hash buckets, + * meaning we calculate two buckets per element. 
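+ * + * A quick check of that arithmetic: the shrink/expand thresholds + * of 30% and 75% average to 52.5%, and at the approximated 50% + * load a table of 2n buckets holds n elements, which is why the + * estimate below charges two bucket pointers per element.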
+ */ + est->size = esize + 2 * sizeof(struct nft_hash_elem *); + } + + est->class = NFT_SET_CLASS_O_1; + + return true; +} + +static struct nft_set_ops nft_hash_ops __read_mostly = { + .privsize = nft_hash_privsize, + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .get = nft_hash_get, + .insert = nft_hash_insert, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .features = NFT_SET_MAP, + .owner = THIS_MODULE, +}; + +static int __init nft_hash_module_init(void) +{ + return nft_register_set(&nft_hash_ops); +} + +static void __exit nft_hash_module_exit(void) +{ + nft_unregister_set(&nft_hash_ops); +} + +module_init(nft_hash_module_init); +module_exit(nft_hash_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c new file mode 100644 index 00000000000..810385eb724 --- /dev/null +++ b/net/netfilter/nft_immediate.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_immediate_expr { + struct nft_data data; + enum nft_registers dreg:8; + u8 dlen; +}; + +static void nft_immediate_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + nft_data_copy(&data[priv->dreg], &priv->data); +} + +static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { + [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 }, + [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_immediate_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc; + int err; + + if (tb[NFTA_IMMEDIATE_DREG] == NULL || + tb[NFTA_IMMEDIATE_DATA] == NULL) + return -EINVAL; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); + if (err < 0) + return err; + priv->dlen = desc.len; + + err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type); + if (err < 0) + goto err1; + + return 0; + +err1: + nft_data_uninit(&priv->data, desc.type); + return err; +} + +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); +} + +static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg))) + goto nla_put_failure; + + return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data, + nft_dreg_to_type(priv->dreg), 
priv->dlen); + +nla_put_failure: + return -1; +} + +static int nft_immediate_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + *data = &priv->data; + + return 0; +} + +static struct nft_expr_type nft_imm_type; +static const struct nft_expr_ops nft_imm_ops = { + .type = &nft_imm_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), + .eval = nft_immediate_eval, + .init = nft_immediate_init, + .destroy = nft_immediate_destroy, + .dump = nft_immediate_dump, + .validate = nft_immediate_validate, +}; + +static struct nft_expr_type nft_imm_type __read_mostly = { + .name = "immediate", + .ops = &nft_imm_ops, + .policy = nft_immediate_policy, + .maxattr = NFTA_IMMEDIATE_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_immediate_module_init(void) +{ + return nft_register_expr(&nft_imm_type); +} + +void nft_immediate_module_exit(void) +{ + nft_unregister_expr(&nft_imm_type); +} diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c new file mode 100644 index 00000000000..85da5bd02f6 --- /dev/null +++ b/net/netfilter/nft_limit.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(limit_lock); + +struct nft_limit { + u64 tokens; + u64 rate; + u64 unit; + unsigned long stamp; +}; + +static void nft_limit_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + spin_lock_bh(&limit_lock); + if (time_after_eq(jiffies, priv->stamp)) { + priv->tokens = priv->rate; + priv->stamp = jiffies + priv->unit * HZ; + } + + if (priv->tokens >= 1) { + priv->tokens--; + spin_unlock_bh(&limit_lock); + return; + } + spin_unlock_bh(&limit_lock); + + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { + [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, + [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, +}; + +static int nft_limit_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + if (tb[NFTA_LIMIT_RATE] == NULL || + tb[NFTA_LIMIT_UNIT] == NULL) + return -EINVAL; + + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); + priv->stamp = jiffies + priv->unit * HZ; + priv->tokens = priv->rate; + return 0; +} + +static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_limit_type; +static const struct nft_expr_ops 
nft_limit_ops = { + .type = &nft_limit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), + .eval = nft_limit_eval, + .init = nft_limit_init, + .dump = nft_limit_dump, +}; + +static struct nft_expr_type nft_limit_type __read_mostly = { + .name = "limit", + .ops = &nft_limit_ops, + .policy = nft_limit_policy, + .maxattr = NFTA_LIMIT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_limit_module_init(void) +{ + return nft_register_expr(&nft_limit_type); +} + +static void __exit nft_limit_module_exit(void) +{ + nft_unregister_expr(&nft_limit_type); +} + +module_init(nft_limit_module_init); +module_exit(nft_limit_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("limit"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c new file mode 100644 index 00000000000..10cfb156cdf --- /dev/null +++ b/net/netfilter/nft_log.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netdevice.h> + +static const char *nft_log_null_prefix = ""; + +struct nft_log { + struct nf_loginfo loginfo; + char *prefix; +}; + +static void nft_log_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_log *priv = nft_expr_priv(expr); + struct net *net = dev_net(pkt->in ? 
pkt->in : pkt->out); + + nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &priv->loginfo, "%s", priv->prefix); +} + +static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { + [NFTA_LOG_GROUP] = { .type = NLA_U16 }, + [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, + [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, + [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, +}; + +static int nft_log_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_log *priv = nft_expr_priv(expr); + struct nf_loginfo *li = &priv->loginfo; + const struct nlattr *nla; + + nla = tb[NFTA_LOG_PREFIX]; + if (nla != NULL) { + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); + if (priv->prefix == NULL) + return -ENOMEM; + nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + } else + priv->prefix = (char *)nft_log_null_prefix; + + li->type = NF_LOG_TYPE_ULOG; + if (tb[NFTA_LOG_GROUP] != NULL) + li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); + + if (tb[NFTA_LOG_SNAPLEN] != NULL) + li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); + if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { + li->u.ulog.qthreshold = + ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + } + + return 0; +} + +static void nft_log_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_log *priv = nft_expr_priv(expr); + + if (priv->prefix != nft_log_null_prefix) + kfree(priv->prefix); +} + +static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_log *priv = nft_expr_priv(expr); + const struct nf_loginfo *li = &priv->loginfo; + + if (priv->prefix != nft_log_null_prefix) + if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) + goto nla_put_failure; + if (li->u.ulog.group) + if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) + goto nla_put_failure; + if (li->u.ulog.copy_len) + if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, + htonl(li->u.ulog.copy_len))) + goto nla_put_failure; + if (li->u.ulog.qthreshold) + if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, + htons(li->u.ulog.qthreshold))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_log_type; +static const struct nft_expr_ops nft_log_ops = { + .type = &nft_log_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_log)), + .eval = nft_log_eval, + .init = nft_log_init, + .destroy = nft_log_destroy, + .dump = nft_log_dump, +}; + +static struct nft_expr_type nft_log_type __read_mostly = { + .name = "log", + .ops = &nft_log_ops, + .policy = nft_log_policy, + .maxattr = NFTA_LOG_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_log_module_init(void) +{ + return nft_register_expr(&nft_log_type); +} + +static void __exit nft_log_module_exit(void) +{ + nft_unregister_expr(&nft_log_type); +} + +module_init(nft_log_module_init); +module_exit(nft_log_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("log"); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c new file mode 100644 index 00000000000..6404a726d17 --- /dev/null +++ b/net/netfilter/nft_lookup.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
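+ * + * The lookup expression either tests set membership (no + * NFTA_LOOKUP_DREG) or dereferences a map: with NFT_SET_MAP the bound + * set's element data is copied into the destination register on a + * hit, while a miss yields NFT_BREAK and skips the rest of the rule.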
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> + +struct nft_lookup { + struct nft_set *set; + enum nft_registers sreg:8; + enum nft_registers dreg:8; + struct nft_set_binding binding; +}; + +static void nft_lookup_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + const struct nft_set *set = priv->set; + + if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg])) + return; + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { + [NFTA_LOOKUP_SET] = { .type = NLA_STRING }, + [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, + [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, + [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, +}; + +static int nft_lookup_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + struct nft_set *set; + int err; + + if (tb[NFTA_LOOKUP_SET] == NULL || + tb[NFTA_LOOKUP_SREG] == NULL) + return -EINVAL; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); + if (IS_ERR(set)) { + if (tb[NFTA_LOOKUP_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_LOOKUP_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + if (tb[NFTA_LOOKUP_DREG] != NULL) { + if (!(set->flags & NFT_SET_MAP)) + return -EINVAL; + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + if (priv->dreg == NFT_REG_VERDICT) { + if (set->dtype != NFT_DATA_VERDICT) + return -EINVAL; + } else if (set->dtype == NFT_DATA_VERDICT) + return -EINVAL; + } else if (set->flags & NFT_SET_MAP) + return -EINVAL; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + return err; + + priv->set = set; + return 0; +} + +static void nft_lookup_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + + if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg))) + goto nla_put_failure; + if (priv->set->flags & NFT_SET_MAP) + if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_lookup_type; +static const struct nft_expr_ops nft_lookup_ops = { + .type = &nft_lookup_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), + .eval = nft_lookup_eval, + .init = nft_lookup_init, + .destroy = nft_lookup_destroy, + .dump = nft_lookup_dump, +}; + +static struct nft_expr_type nft_lookup_type __read_mostly = { + .name = "lookup", + .ops = &nft_lookup_ops, + .policy = nft_lookup_policy, + .maxattr = NFTA_LOOKUP_MAX, + .owner = THIS_MODULE, +}; + +int __init 
nft_lookup_module_init(void) +{ + return nft_register_expr(&nft_lookup_type); +} + +void nft_lookup_module_exit(void) +{ + nft_unregister_expr(&nft_lookup_type); +} diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c new file mode 100644 index 00000000000..852b178c6ae --- /dev/null +++ b/net/netfilter/nft_meta.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/tcp_states.h> /* for TCP_TIME_WAIT */ +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> + +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + const struct net_device *in = pkt->in, *out = pkt->out; + struct nft_data *dest = &data[priv->dreg]; + + switch (priv->key) { + case NFT_META_LEN: + dest->data[0] = skb->len; + break; + case NFT_META_PROTOCOL: + *(__be16 *)dest->data = skb->protocol; + break; + case NFT_META_NFPROTO: + dest->data[0] = pkt->ops->pf; + break; + case NFT_META_L4PROTO: + dest->data[0] = pkt->tprot; + break; + case NFT_META_PRIORITY: + dest->data[0] = skb->priority; + break; + case NFT_META_MARK: + dest->data[0] = skb->mark; + break; + case NFT_META_IIF: + if (in == NULL) + goto err; + dest->data[0] = in->ifindex; + break; + case NFT_META_OIF: + if (out == NULL) + goto err; + dest->data[0] = out->ifindex; + break; + case NFT_META_IIFNAME: + if (in == NULL) + goto err; + strncpy((char *)dest->data, in->name, sizeof(dest->data)); + break; + case NFT_META_OIFNAME: + if (out == NULL) + goto err; + strncpy((char *)dest->data, out->name, sizeof(dest->data)); + break; + case NFT_META_IIFTYPE: + if (in == NULL) + goto err; + *(u16 *)dest->data = in->type; + break; + case NFT_META_OIFTYPE: + if (out == NULL) + goto err; + *(u16 *)dest->data = out->type; + break; + case NFT_META_SKUID: + if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + goto err; + + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket == NULL || + skb->sk->sk_socket->file == NULL) { + read_unlock_bh(&skb->sk->sk_callback_lock); + goto err; + } + + dest->data[0] = + from_kuid_munged(&init_user_ns, + skb->sk->sk_socket->file->f_cred->fsuid); + read_unlock_bh(&skb->sk->sk_callback_lock); + break; + case NFT_META_SKGID: + if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + goto err; + + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket == NULL || + skb->sk->sk_socket->file == NULL) { + read_unlock_bh(&skb->sk->sk_callback_lock); + goto err; + } + dest->data[0] = + from_kgid_munged(&init_user_ns, + skb->sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&skb->sk->sk_callback_lock); + break; +#ifdef CONFIG_IP_ROUTE_CLASSID + case NFT_META_RTCLASSID: { + const struct dst_entry *dst = skb_dst(skb); + + if (dst == NULL) + goto err; + dest->data[0] = dst->tclassid; + break; + } +#endif +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: + dest->data[0] = 
skb->secmark; + break; +#endif + default: + WARN_ON(1); + goto err; + } + return; + +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_get_eval); + +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *meta = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + u32 value = data[meta->sreg].data[0]; + + switch (meta->key) { + case NFT_META_MARK: + skb->mark = value; + break; + case NFT_META_PRIORITY: + skb->priority = value; + break; + case NFT_META_NFTRACE: + skb->nf_trace = 1; + break; + default: + WARN_ON(1); + } +} +EXPORT_SYMBOL_GPL(nft_meta_set_eval); + +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { + [NFTA_META_DREG] = { .type = NLA_U32 }, + [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_SREG] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_meta_policy); + +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_LEN: + case NFT_META_PROTOCOL: + case NFT_META_NFPROTO: + case NFT_META_L4PROTO: + case NFT_META_PRIORITY: + case NFT_META_MARK: + case NFT_META_IIF: + case NFT_META_OIF: + case NFT_META_IIFNAME: + case NFT_META_OIFNAME: + case NFT_META_IIFTYPE: + case NFT_META_OIFTYPE: + case NFT_META_SKUID: + case NFT_META_SKGID: +#ifdef CONFIG_IP_ROUTE_CLASSID + case NFT_META_RTCLASSID: +#endif +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: +#endif + break; + default: + return -EOPNOTSUPP; + } + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_get_init); + +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_MARK: + case NFT_META_PRIORITY: + case NFT_META_NFTRACE: + break; + default: + return -EOPNOTSUPP; + } + + priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_meta_set_init); + +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_get_dump); + +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_meta_set_dump); + +static struct nft_expr_type nft_meta_type; +static const struct nft_expr_ops nft_meta_get_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_get_eval, + 
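/* "get" flavour: eval writes the selected key into the destination register chosen at init time */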
.init = nft_meta_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_set_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .dump = nft_meta_set_dump, +}; + +static const struct nft_expr_ops * +nft_meta_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_type __read_mostly = { + .name = "meta", + .select_ops = &nft_meta_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_module_init(void) +{ + return nft_register_expr(&nft_meta_type); +} + +static void __exit nft_meta_module_exit(void) +{ + nft_unregister_expr(&nft_meta_type); +} + +module_init(nft_meta_module_init); +module_exit(nft_meta_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c new file mode 100644 index 00000000000..79ff58cd36d --- /dev/null +++ b/net/netfilter/nft_nat.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
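+ *
+ * The "nat" expression sets up SNAT/DNAT bindings whose address and port
+ * ranges are read from registers at evaluation time.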
+ * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/string.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +struct nft_nat { + enum nft_registers sreg_addr_min:8; + enum nft_registers sreg_addr_max:8; + enum nft_registers sreg_proto_min:8; + enum nft_registers sreg_proto_max:8; + enum nf_nat_manip_type type:8; + u8 family; +}; + +static void nft_nat_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); + struct nf_nat_range range; + + memset(&range, 0, sizeof(range)); + if (priv->sreg_addr_min) { + if (priv->family == AF_INET) { + range.min_addr.ip = (__force __be32) + data[priv->sreg_addr_min].data[0]; + range.max_addr.ip = (__force __be32) + data[priv->sreg_addr_max].data[0]; + + } else { + memcpy(range.min_addr.ip6, + data[priv->sreg_addr_min].data, + sizeof(struct nft_data)); + memcpy(range.max_addr.ip6, + data[priv->sreg_addr_max].data, + sizeof(struct nft_data)); + } + range.flags |= NF_NAT_RANGE_MAP_IPS; + } + + if (priv->sreg_proto_min) { + range.min_proto.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + range.max_proto.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + data[NFT_REG_VERDICT].verdict = + nf_nat_setup_info(ct, &range, priv->type); +} + +static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { + [NFTA_NAT_TYPE] = { .type = NLA_U32 }, + [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, + [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, + [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, + [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, +}; + +static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_nat *priv = nft_expr_priv(expr); + u32 family; + int err; + + if (tb[NFTA_NAT_TYPE] == NULL) + return -EINVAL; + + switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { + case NFT_NAT_SNAT: + priv->type = NF_NAT_MANIP_SRC; + break; + case NFT_NAT_DNAT: + priv->type = NF_NAT_MANIP_DST; + break; + default: + return -EINVAL; + } + + if (tb[NFTA_NAT_FAMILY] == NULL) + return -EINVAL; + + family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + if (family != ctx->afi->family) + return -EOPNOTSUPP; + priv->family = family; + + if (tb[NFTA_NAT_REG_ADDR_MIN]) { + priv->sreg_addr_min = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_ADDR_MIN])); + err = nft_validate_input_register(priv->sreg_addr_min); + if (err < 0) + return err; + } + + if (tb[NFTA_NAT_REG_ADDR_MAX]) { + priv->sreg_addr_max = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_ADDR_MAX])); + err = nft_validate_input_register(priv->sreg_addr_max); + if (err < 0) + return err; + } else + priv->sreg_addr_max = priv->sreg_addr_min; + + if (tb[NFTA_NAT_REG_PROTO_MIN]) { + priv->sreg_proto_min = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_PROTO_MIN])); + err = 
nft_validate_input_register(priv->sreg_proto_min); + if (err < 0) + return err; + } + + if (tb[NFTA_NAT_REG_PROTO_MAX]) { + priv->sreg_proto_max = ntohl(nla_get_be32( + tb[NFTA_NAT_REG_PROTO_MAX])); + err = nft_validate_input_register(priv->sreg_proto_max); + if (err < 0) + return err; + } else + priv->sreg_proto_max = priv->sreg_proto_min; + + return 0; +} + +static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NF_NAT_MANIP_SRC: + if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) + goto nla_put_failure; + break; + case NF_NAT_MANIP_DST: + if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) + goto nla_put_failure; + break; + } + + if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) + goto nla_put_failure; + if (nla_put_be32(skb, + NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min))) + goto nla_put_failure; + if (nla_put_be32(skb, + NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) + goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_nat_type; +static const struct nft_expr_ops nft_nat_ops = { + .type = &nft_nat_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), + .eval = nft_nat_eval, + .init = nft_nat_init, + .dump = nft_nat_dump, +}; + +static struct nft_expr_type nft_nat_type __read_mostly = { + .name = "nat", + .ops = &nft_nat_ops, + .policy = nft_nat_policy, + .maxattr = NFTA_NAT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_nat_module_init(void) +{ + return nft_register_expr(&nft_nat_type); +} + +static void __exit nft_nat_module_exit(void) +{ + nft_unregister_expr(&nft_nat_type); +} + +module_init(nft_nat_module_init); +module_exit(nft_nat_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_EXPR("nat"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c new file mode 100644 index 00000000000..85daa84bfdf --- /dev/null +++ b/net/netfilter/nft_payload.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
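+ *
+ * The "payload" expression copies a run of bytes from the chosen header
+ * base (link, network or transport) plus an offset into a destination
+ * register.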
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +static void nft_payload_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + struct nft_data *dest = &data[priv->dreg]; + int offset; + + switch (priv->base) { + case NFT_PAYLOAD_LL_HEADER: + if (!skb_mac_header_was_set(skb)) + goto err; + offset = skb_mac_header(skb) - skb->data; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + offset = skb_network_offset(skb); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + offset = pkt->xt.thoff; + break; + default: + BUG(); + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0) + goto err; + return; +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { + [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, +}; + +static int nft_payload_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + int err; + + priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); +} + +static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) || + nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || + nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || + nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_payload_type; +static const struct nft_expr_ops nft_payload_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .eval = nft_payload_eval, + .init = nft_payload_init, + .dump = nft_payload_dump, +}; + +const struct nft_expr_ops nft_payload_fast_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .eval = nft_payload_eval, + .init = nft_payload_init, + .dump = nft_payload_dump, +}; + +static const struct nft_expr_ops * +nft_payload_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + enum nft_payload_bases base; + unsigned int offset, len; + + if (tb[NFTA_PAYLOAD_DREG] == NULL || + tb[NFTA_PAYLOAD_BASE] == NULL || + tb[NFTA_PAYLOAD_OFFSET] == NULL || + tb[NFTA_PAYLOAD_LEN] == NULL) + return ERR_PTR(-EINVAL); + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + offset = 
ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) + return ERR_PTR(-EINVAL); + + if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && + base != NFT_PAYLOAD_LL_HEADER) + return &nft_payload_fast_ops; + else + return &nft_payload_ops; +} + +static struct nft_expr_type nft_payload_type __read_mostly = { + .name = "payload", + .select_ops = nft_payload_select_ops, + .policy = nft_payload_policy, + .maxattr = NFTA_PAYLOAD_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_payload_module_init(void) +{ + return nft_register_expr(&nft_payload_type); +} + +void nft_payload_module_exit(void) +{ + nft_unregister_expr(&nft_payload_type); +} diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c new file mode 100644 index 00000000000..e8ae2f6bf23 --- /dev/null +++ b/net/netfilter/nft_queue.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code partly funded by OISF + * (http://www.openinfosecfoundation.org/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/jhash.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_queue.h> + +static u32 jhash_initval __read_mostly; + +struct nft_queue { + u16 queuenum; + u16 queues_total; + u16 flags; +}; + +static void nft_queue_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_queue *priv = nft_expr_priv(expr); + u32 queue = priv->queuenum; + u32 ret; + + if (priv->queues_total > 1) { + if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = priv->queuenum + cpu % priv->queues_total; + } else { + queue = nfqueue_hash(pkt->skb, queue, + priv->queues_total, pkt->ops->pf, + jhash_initval); + } + } + + ret = NF_QUEUE_NR(queue); + if (priv->flags & NFT_QUEUE_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + data[NFT_REG_VERDICT].verdict = ret; +} + +static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { + [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, + [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, + [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, +}; + +static int nft_queue_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_queue *priv = nft_expr_priv(expr); + + if (tb[NFTA_QUEUE_NUM] == NULL) + return -EINVAL; + + init_hashrandom(&jhash_initval); + priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); + + if (tb[NFTA_QUEUE_TOTAL] != NULL) + priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); + if (tb[NFTA_QUEUE_FLAGS] != NULL) { + priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); + if (priv->flags & ~NFT_QUEUE_FLAG_MASK) + return -EINVAL; + } + return 0; +} + +static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_queue *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || + nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || + nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) + goto nla_put_failure; + + return 0; + 
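+ /* nla_put_*() fails only when the skb runs out of tailroom;
+  * returning -1 signals the dump path that the message did not fit */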
+nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_queue_type; +static const struct nft_expr_ops nft_queue_ops = { + .type = &nft_queue_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), + .eval = nft_queue_eval, + .init = nft_queue_init, + .dump = nft_queue_dump, +}; + +static struct nft_expr_type nft_queue_type __read_mostly = { + .name = "queue", + .ops = &nft_queue_ops, + .policy = nft_queue_policy, + .maxattr = NFTA_QUEUE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_queue_module_init(void) +{ + return nft_register_expr(&nft_queue_type); +} + +static void __exit nft_queue_module_exit(void) +{ + nft_unregister_expr(&nft_queue_type); +} + +module_init(nft_queue_module_init); +module_exit(nft_queue_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); +MODULE_ALIAS_NFT_EXPR("queue"); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c new file mode 100644 index 00000000000..e1836ff8819 --- /dev/null +++ b/net/netfilter/nft_rbtree.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +static DEFINE_SPINLOCK(nft_rbtree_lock); + +struct nft_rbtree { + struct rb_root root; +}; + +struct nft_rbtree_elem { + struct rb_node node; + u16 flags; + struct nft_data key; + struct nft_data data[]; +}; + +static bool nft_rbtree_lookup(const struct nft_set *set, + const struct nft_data *key, + struct nft_data *data) +{ + const struct nft_rbtree *priv = nft_set_priv(set); + const struct nft_rbtree_elem *rbe, *interval = NULL; + const struct rb_node *parent = priv->root.rb_node; + int d; + + spin_lock_bh(&nft_rbtree_lock); + while (parent != NULL) { + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + d = nft_data_cmp(&rbe->key, key, set->klen); + if (d < 0) { + parent = parent->rb_left; + interval = rbe; + } else if (d > 0) + parent = parent->rb_right; + else { +found: + if (rbe->flags & NFT_SET_ELEM_INTERVAL_END) + goto out; + if (set->flags & NFT_SET_MAP) + nft_data_copy(data, rbe->data); + + spin_unlock_bh(&nft_rbtree_lock); + return true; + } + } + + if (set->flags & NFT_SET_INTERVAL && interval != NULL) { + rbe = interval; + goto found; + } +out: + spin_unlock_bh(&nft_rbtree_lock); + return false; +} + +static void nft_rbtree_elem_destroy(const struct nft_set *set, + struct nft_rbtree_elem *rbe) +{ + nft_data_uninit(&rbe->key, NFT_DATA_VALUE); + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(rbe->data, set->dtype); + + kfree(rbe); +} + +static int __nft_rbtree_insert(const struct nft_set *set, + struct nft_rbtree_elem *new) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *parent, **p; + int d; + + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_data_cmp(&rbe->key, &new->key, set->klen); + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p 
= &parent->rb_right; + else + return -EEXIST; + } + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &priv->root); + return 0; +} + +static int nft_rbtree_insert(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_rbtree_elem *rbe; + unsigned int size; + int err; + + size = sizeof(*rbe); + if (set->flags & NFT_SET_MAP && + !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) + size += sizeof(rbe->data[0]); + + rbe = kzalloc(size, GFP_KERNEL); + if (rbe == NULL) + return -ENOMEM; + + rbe->flags = elem->flags; + nft_data_copy(&rbe->key, &elem->key); + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_copy(rbe->data, &elem->data); + + spin_lock_bh(&nft_rbtree_lock); + err = __nft_rbtree_insert(set, rbe); + if (err < 0) + kfree(rbe); + + spin_unlock_bh(&nft_rbtree_lock); + return err; +} + +static void nft_rbtree_remove(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe = elem->cookie; + + spin_lock_bh(&nft_rbtree_lock); + rb_erase(&rbe->node, &priv->root); + spin_unlock_bh(&nft_rbtree_lock); + kfree(rbe); +} + +static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) +{ + const struct nft_rbtree *priv = nft_set_priv(set); + const struct rb_node *parent = priv->root.rb_node; + struct nft_rbtree_elem *rbe; + int d; + + spin_lock_bh(&nft_rbtree_lock); + while (parent != NULL) { + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + d = nft_data_cmp(&rbe->key, &elem->key, set->klen); + if (d < 0) + parent = parent->rb_left; + else if (d > 0) + parent = parent->rb_right; + else { + elem->cookie = rbe; + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_copy(&elem->data, rbe->data); + elem->flags = rbe->flags; + spin_unlock_bh(&nft_rbtree_lock); + return 0; + } + } + spin_unlock_bh(&nft_rbtree_lock); + return -ENOENT; +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + const struct nft_set *set, + struct nft_set_iter *iter) +{ + const struct nft_rbtree *priv = nft_set_priv(set); + const struct nft_rbtree_elem *rbe; + struct nft_set_elem elem; + struct rb_node *node; + + spin_lock_bh(&nft_rbtree_lock); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + if (iter->count < iter->skip) + goto cont; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + nft_data_copy(&elem.key, &rbe->key); + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_copy(&elem.data, rbe->data); + elem.flags = rbe->flags; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) { + spin_unlock_bh(&nft_rbtree_lock); + return; + } +cont: + iter->count++; + } + spin_unlock_bh(&nft_rbtree_lock); +} + +static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) +{ + return sizeof(struct nft_rbtree); +} + +static int nft_rbtree_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + priv->root = RB_ROOT; + return 0; +} + +static void nft_rbtree_destroy(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + + spin_lock_bh(&nft_rbtree_lock); + while ((node = priv->root.rb_node) != NULL) { + rb_erase(node, &priv->root); + rbe = rb_entry(node, struct nft_rbtree_elem, node); + nft_rbtree_elem_destroy(set, rbe); + } + 
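/* each pass above detaches the current root with rb_erase() before freeing it, so no iterator is needed during teardown */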
spin_unlock_bh(&nft_rbtree_lock); +} + +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int nsize; + + nsize = sizeof(struct nft_rbtree_elem); + if (features & NFT_SET_MAP) + nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + + if (desc->size) + est->size = sizeof(struct nft_rbtree) + desc->size * nsize; + else + est->size = nsize; + + est->class = NFT_SET_CLASS_O_LOG_N; + + return true; +} + +static struct nft_set_ops nft_rbtree_ops __read_mostly = { + .privsize = nft_rbtree_privsize, + .estimate = nft_rbtree_estimate, + .init = nft_rbtree_init, + .destroy = nft_rbtree_destroy, + .insert = nft_rbtree_insert, + .remove = nft_rbtree_remove, + .get = nft_rbtree_get, + .lookup = nft_rbtree_lookup, + .walk = nft_rbtree_walk, + .features = NFT_SET_INTERVAL | NFT_SET_MAP, + .owner = THIS_MODULE, +}; + +static int __init nft_rbtree_module_init(void) +{ + return nft_register_set(&nft_rbtree_ops); +} + +static void __exit nft_rbtree_module_exit(void) +{ + nft_unregister_set(&nft_rbtree_ops); +} + +module_init(nft_rbtree_module_init); +module_exit(nft_rbtree_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c new file mode 100644 index 00000000000..f3448c29644 --- /dev/null +++ b/net/netfilter/nft_reject.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { + [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, +}; +EXPORT_SYMBOL_GPL(nft_reject_policy); + +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_reject_init); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_reject_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 
00000000000..b718a52a465 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + return nft_reject_ipv4_eval(expr, data, pkt); + case NFPROTO_IPV6: + return nft_reject_ipv6_eval(expr, data, pkt); + } +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { + .type = &nft_reject_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_inet_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "reject", + .ops = &nft_reject_inet_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ + return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ + nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8d987c3573f..227aa11e840 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -2,6 +2,7 @@ * x_tables core - Backend for {ip,ip6,arp}_tables * * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on existing ip_tables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling @@ -345,19 +346,27 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, } EXPORT_SYMBOL_GPL(xt_find_revision); -static char *textify_hooks(char *buf, size_t size, unsigned int mask) +static char * +textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) { - static const char *const names[] = { + static const char *const inetbr_names[] = { "PREROUTING", "INPUT", "FORWARD", "OUTPUT", "POSTROUTING", "BROUTING", }; - unsigned int i; + static const char *const arp_names[] = { + "INPUT", "FORWARD", "OUTPUT", + }; + const char *const *names; + unsigned int i, max; char *p = buf; bool np = false; int res; + names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names; + max = (nfproto == NFPROTO_ARP) ? ARRAY_SIZE(arp_names) : + ARRAY_SIZE(inetbr_names); *p = '\0'; - for (i = 0; i < ARRAY_SIZE(names); ++i) { + for (i = 0; i < max; ++i) { if (!(mask & (1 << i))) continue; res = snprintf(p, size, "%s%s", np ? 
"/" : "", names[i]); @@ -402,8 +411,10 @@ int xt_check_match(struct xt_mtchk_param *par, pr_err("%s_tables: %s match: used from hooks %s, but only " "valid from %s\n", xt_prefix[par->family], par->match->name, - textify_hooks(used, sizeof(used), par->hook_mask), - textify_hooks(allow, sizeof(allow), par->match->hooks)); + textify_hooks(used, sizeof(used), par->hook_mask, + par->family), + textify_hooks(allow, sizeof(allow), par->match->hooks, + par->family)); return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { @@ -575,8 +586,10 @@ int xt_check_target(struct xt_tgchk_param *par, pr_err("%s_tables: %s target: used from hooks %s, but only " "usable from %s\n", xt_prefix[par->family], par->target->name, - textify_hooks(used, sizeof(used), par->hook_mask), - textify_hooks(allow, sizeof(allow), par->target->hooks)); + textify_hooks(used, sizeof(used), par->hook_mask, + par->family), + textify_hooks(allow, sizeof(allow), par->target->hooks, + par->family)); return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { @@ -832,8 +845,13 @@ xt_replace_table(struct xt_table *table, return NULL; } - table->private = newinfo; newinfo->initial_entries = private->initial_entries; + /* + * Ensure contents of newinfo are visible before assigning to + * private. + */ + smp_wmb(); + table->private = newinfo; /* * Even though table entries have now been swapped, other CPU's @@ -987,7 +1005,7 @@ static int xt_table_open(struct inode *inode, struct file *file) sizeof(struct xt_names_priv)); if (!ret) { priv = ((struct seq_file *)file->private_data)->private; - priv->af = (unsigned long)PDE(inode)->data; + priv->af = (unsigned long)PDE_DATA(inode); } return ret; } @@ -1135,7 +1153,7 @@ static int xt_match_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = trav; - trav->nfproto = (unsigned long)PDE(inode)->data; + trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1199,7 +1217,7 @@ static int xt_target_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = trav; - trav->nfproto = (unsigned long)PDE(inode)->data; + trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1311,12 +1329,12 @@ int xt_proto_init(struct net *net, u_int8_t af) out_remove_matches: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); out_remove_tables: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); out: return -1; #endif @@ -1330,15 +1348,15 @@ void xt_proto_fini(struct net *net, u_int8_t af) strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ } EXPORT_SYMBOL_GPL(xt_proto_fini); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index ba92824086f..4973cbddc44 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -124,6 +124,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_audit_info *info 
= par->targinfo; struct audit_buffer *ab; + if (audit_enabled == 0) + goto errout; + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (ab == NULL) goto errout; @@ -143,11 +146,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) if (par->family == NFPROTO_BRIDGE) { switch (eth_hdr(skb)->h_proto) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): audit_ip4(ab, skb); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): audit_ip6(ab, skb); break; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0221d10de75..75747aecdeb 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -14,20 +14,21 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CT.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int xt_ct_target(struct sk_buff *skb, - const struct xt_action_param *par) +static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) { - const struct xt_ct_target_info *info = par->targinfo; - struct nf_conn *ct = info->ct; - /* Previously seen (loopback)? Ignore. */ if (skb->nfct != NULL) return XT_CONTINUE; + /* special case the untracked ct : we want the percpu object */ + if (!ct) + ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); skb->nfct = &ct->ct_general; skb->nfctinfo = IP_CT_NEW; @@ -35,6 +36,24 @@ static unsigned int xt_ct_target(struct sk_buff *skb, return XT_CONTINUE; } +static unsigned int xt_ct_target_v0(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + + return xt_ct_target(skb, ct); +} + +static unsigned int xt_ct_target_v1(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct xt_ct_target_info_v1 *info = par->targinfo; + struct nf_conn *ct = info->ct; + + return xt_ct_target(skb, ct); +} + static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) { if (par->family == NFPROTO_IPV4) { @@ -53,21 +72,124 @@ static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) return 0; } -static int xt_ct_tg_check(const struct xt_tgchk_param *par) +static int +xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, + const struct xt_tgchk_param *par) { - struct xt_ct_target_info *info = par->targinfo; - struct nf_conntrack_tuple t; + struct nf_conntrack_helper *helper; struct nf_conn_help *help; - struct nf_conn *ct; + u8 proto; + + proto = xt_ct_find_proto(par); + if (!proto) { + pr_info("You must specify a L4 protocol, and not use " + "inversions on it.\n"); + return -ENOENT; + } + + helper = nf_conntrack_helper_try_module_get(helper_name, par->family, + proto); + if (helper == NULL) { + pr_info("No such helper \"%s\"\n", helper_name); + return -ENOENT; + } + + help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); + if (help == NULL) { + module_put(helper->me); + return -ENOMEM; + } + + help->helper = helper; + return 0; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout) +{ + typeof(nf_ct_timeout_put_hook) timeout_put; + + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + if (timeout_put) + timeout_put(timeout); +} +#endif + +static int +xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, + const char 
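/* name of a timeout policy installed via nfnetlink_cttimeout */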
*timeout_name) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + typeof(nf_ct_timeout_find_get_hook) timeout_find_get; + struct ctnl_timeout *timeout; + struct nf_conn_timeout *timeout_ext; + struct nf_conntrack_l4proto *l4proto; int ret = 0; u8 proto; - if (info->flags & ~XT_CT_NOTRACK) - return -EINVAL; + rcu_read_lock(); + timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); + if (timeout_find_get == NULL) { + ret = -ENOENT; + pr_info("Timeout policy base is empty\n"); + goto out; + } + + proto = xt_ct_find_proto(par); + if (!proto) { + ret = -EINVAL; + pr_info("You must specify a L4 protocol, and not use " + "inversions on it.\n"); + goto out; + } + + timeout = timeout_find_get(timeout_name); + if (timeout == NULL) { + ret = -ENOENT; + pr_info("No such timeout policy \"%s\"\n", timeout_name); + goto out; + } + + if (timeout->l3num != par->family) { + ret = -EINVAL; + pr_info("Timeout policy `%s' can only be used by L3 protocol " + "number %d\n", timeout_name, timeout->l3num); + goto err_put_timeout; + } + /* Make sure the timeout policy matches any existing protocol tracker, + * otherwise default to generic. + */ + l4proto = __nf_ct_l4proto_find(par->family, proto); + if (timeout->l4proto->l4proto != l4proto->l4proto) { + ret = -EINVAL; + pr_info("Timeout policy `%s' can only be used by L4 protocol " + "number %d\n", + timeout_name, timeout->l4proto->l4proto); + goto err_put_timeout; + } + timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); + if (timeout_ext == NULL) + ret = -ENOMEM; + +err_put_timeout: + __xt_ct_tg_timeout_put(timeout); +out: + rcu_read_unlock(); + return ret; +#else + return -EOPNOTSUPP; +#endif +} + +static int xt_ct_tg_check(const struct xt_tgchk_param *par, + struct xt_ct_target_info_v1 *info) +{ + struct nf_conntrack_tuple t; + struct nf_conn *ct; + int ret = -EOPNOTSUPP; if (info->flags & XT_CT_NOTRACK) { - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); + ct = NULL; goto out; } @@ -89,35 +211,24 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par) ret = 0; if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, - GFP_KERNEL)) + GFP_KERNEL)) { + ret = -EINVAL; goto err3; + } if (info->helper[0]) { - ret = -ENOENT; - proto = xt_ct_find_proto(par); - if (!proto) { - pr_info("You must specify a L4 protocol, " - "and not use inversions on it.\n"); - goto err3; - } - - ret = -ENOMEM; - help = nf_ct_helper_ext_add(ct, GFP_KERNEL); - if (help == NULL) + ret = xt_ct_set_helper(ct, info->helper, par); + if (ret < 0) goto err3; + } - ret = -ENOENT; - help->helper = nf_conntrack_helper_try_module_get(info->helper, - par->family, - proto); - if (help->helper == NULL) { - pr_info("No such helper \"%s\"\n", info->helper); + if (info->timeout[0]) { + ret = xt_ct_set_timeout(ct, par, info->timeout); + if (ret < 0) goto err3; - } } - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_tmpl_insert(par->net, ct); out: info->ct = ct; return 0; @@ -130,41 +241,196 @@ err1: return ret; } -static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) +static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par) { struct xt_ct_target_info *info = par->targinfo; + struct xt_ct_target_info_v1 info_v1 = { + .flags = info->flags, + .zone = info->zone, + .ct_events = info->ct_events, + .exp_events = info->exp_events, + }; + int ret; + + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + memcpy(info_v1.helper, info->helper, sizeof(info->helper)); 
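+ /* the v0 ABI predates the timeout[] field, so run the shared checker on a zero-filled v1 copy */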
+ + ret = xt_ct_tg_check(par, &info_v1); + if (ret < 0) + return ret; + + info->ct = info_v1.ct; + + return ret; +} + +static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + return xt_ct_tg_check(par, par->targinfo); +} + +static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + + if (info->flags & ~XT_CT_MASK) + return -EINVAL; + + return xt_ct_tg_check(par, par->targinfo); +} + +static void xt_ct_destroy_timeout(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct nf_conn_timeout *timeout_ext; + typeof(nf_ct_timeout_put_hook) timeout_put; + + rcu_read_lock(); + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + + if (timeout_put) { + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) + timeout_put(timeout_ext->timeout); + } + rcu_read_unlock(); +#endif +} + +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, + struct xt_ct_target_info_v1 *info) +{ struct nf_conn *ct = info->ct; struct nf_conn_help *help; - if (!nf_ct_is_untracked(ct)) { + if (ct && !nf_ct_is_untracked(ct)) { help = nfct_help(ct); if (help) module_put(help->helper->me); nf_ct_l3proto_module_put(par->family); + + xt_ct_destroy_timeout(ct); + nf_ct_put(info->ct); + } +} + +static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct xt_ct_target_info_v1 info_v1 = { + .flags = info->flags, + .zone = info->zone, + .ct_events = info->ct_events, + .exp_events = info->exp_events, + .ct = info->ct, + }; + memcpy(info_v1.helper, info->helper, sizeof(info->helper)); + + xt_ct_tg_destroy(par, &info_v1); +} + +static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ + xt_ct_tg_destroy(par, par->targinfo); +} + +static struct xt_target xt_ct_tg_reg[] __read_mostly = { + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .targetsize = sizeof(struct xt_ct_target_info), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +}; + +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. 
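A non-NULL skb->nfct means conntrack state was already attached.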
*/ + if (skb->nfct != NULL) + return XT_CONTINUE; + + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + + return XT_CONTINUE; +} + +static int notrack_chk(const struct xt_tgchk_param *par) +{ + if (!par->net->xt.notrack_deprecated_warning) { + pr_info("netfilter: NOTRACK target is deprecated, " + "use CT instead or upgrade iptables\n"); + par->net->xt.notrack_deprecated_warning = true; } - nf_ct_put(info->ct); + return 0; } -static struct xt_target xt_ct_tg __read_mostly = { - .name = "CT", +static struct xt_target notrack_tg_reg __read_mostly = { + .name = "NOTRACK", + .revision = 0, .family = NFPROTO_UNSPEC, - .targetsize = sizeof(struct xt_ct_target_info), - .checkentry = xt_ct_tg_check, - .destroy = xt_ct_tg_destroy, - .target = xt_ct_target, + .checkentry = notrack_chk, + .target = notrack_tg, .table = "raw", .me = THIS_MODULE, }; static int __init xt_ct_tg_init(void) { - return xt_register_target(&xt_ct_tg); + int ret; + + ret = xt_register_target(¬rack_tg_reg); + if (ret < 0) + return ret; + + ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); + if (ret < 0) { + xt_unregister_target(¬rack_tg_reg); + return ret; + } + return 0; } static void __exit xt_ct_tg_exit(void) { - xt_unregister_target(&xt_ct_tg); + xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); + xt_unregister_target(¬rack_tg_reg); } module_init(xt_ct_tg_init); @@ -174,3 +440,5 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: connection tracking target"); MODULE_ALIAS("ipt_CT"); MODULE_ALIAS("ip6t_CT"); +MODULE_ALIAS("ipt_NOTRACK"); +MODULE_ALIAS("ip6t_NOTRACK"); diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c new file mode 100644 index 00000000000..73b73f687c5 --- /dev/null +++ b/net/netfilter/xt_HMARK.c @@ -0,0 +1,372 @@ +/* + * xt_HMARK - Netfilter module to set mark by means of hashing + * + * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com> + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
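+ *
+ * The mark is a Jenkins hash over masked L3/L4 selectors, optionally
+ * taken from the conntrack tuple instead of the packet headers.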
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_HMARK.h> + +#include <net/ip.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <net/ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>"); +MODULE_DESCRIPTION("Xtables: packet marking using hash calculation"); +MODULE_ALIAS("ipt_HMARK"); +MODULE_ALIAS("ip6t_HMARK"); + +struct hmark_tuple { + __be32 src; + __be32 dst; + union hmark_ports uports; + u8 proto; +}; + +static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask) +{ + return (addr32[0] & mask[0]) ^ + (addr32[1] & mask[1]) ^ + (addr32[2] & mask[2]) ^ + (addr32[3] & mask[3]); +} + +static inline __be32 +hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask) +{ + switch (l3num) { + case AF_INET: + return *addr32 & *mask; + case AF_INET6: + return hmark_addr6_mask(addr32, mask); + } + return 0; +} + +static inline void hmark_swap_ports(union hmark_ports *uports, + const struct xt_hmark_info *info) +{ + union hmark_ports hp; + u16 src, dst; + + hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32; + src = ntohs(hp.b16.src); + dst = ntohs(hp.b16.dst); + + if (dst > src) + uports->v32 = (dst << 16) | src; + else + uports->v32 = (src << 16) | dst; +} + +static int +hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, + const struct xt_hmark_info *info) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple *otuple; + struct nf_conntrack_tuple *rtuple; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return -1; + + otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6, + info->src_mask.ip6); + t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6, + info->dst_mask.ip6); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) + return 0; + + t->proto = nf_ct_protonum(ct); + if (t->proto != IPPROTO_ICMP) { + t->uports.b16.src = otuple->src.u.all; + t->uports.b16.dst = rtuple->src.u.all; + hmark_swap_ports(&t->uports, info); + } + + return 0; +#else + return -1; +#endif +} + +/* This hash function is endian independent, to ensure consistent hashing if + * the cluster is composed of big and little endian systems. 
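Both addresses are converted with ntohl() and ordered before hashing, so either byte order feeds jhash identical words.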
*/ +static inline u32 +hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) +{ + u32 hash; + u32 src = ntohl(t->src); + u32 dst = ntohl(t->dst); + + if (dst < src) + swap(src, dst); + + hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); + hash = hash ^ (t->proto & info->proto_mask); + + return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; +} + +static void +hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff, + struct hmark_tuple *t, const struct xt_hmark_info *info) +{ + int protoff; + + protoff = proto_ports_offset(t->proto); + if (protoff < 0) + return; + + nhoff += protoff; + if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0) + return; + + hmark_swap_ports(&t->uports, info); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int get_inner6_hdr(const struct sk_buff *skb, int *offset) +{ + struct icmp6hdr *icmp6h, _ih6; + + icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6); + if (icmp6h == NULL) + return 0; + + if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { + *offset += sizeof(struct icmp6hdr); + return 1; + } + return 0; +} + +static int +hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t, + const struct xt_hmark_info *info) +{ + struct ipv6hdr *ip6, _ip6; + int flag = IP6_FH_F_AUTH; + unsigned int nhoff = 0; + u16 fragoff = 0; + int nexthdr; + + ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb)); + nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); + if (nexthdr < 0) + return 0; + /* No need to check for icmp errors on fragments */ + if ((flag & IP6_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6)) + goto noicmp; + /* Use inner header in case of ICMP errors */ + if (get_inner6_hdr(skb, &nhoff)) { + ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6); + if (ip6 == NULL) + return -1; + /* If AH present, use SPI like in ESP. */ + flag = IP6_FH_F_AUTH; + nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); + if (nexthdr < 0) + return -1; + } +noicmp: + t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6); + t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) + return 0; + + t->proto = nexthdr; + if (t->proto == IPPROTO_ICMPV6) + return 0; + + if (flag & IP6_FH_F_FRAG) + return 0; + + hmark_set_tuple_ports(skb, nhoff, t, info); + return 0; +} + +static unsigned int +hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_hmark_info *info = par->targinfo; + struct hmark_tuple t; + + memset(&t, 0, sizeof(struct hmark_tuple)); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { + if (hmark_ct_set_htuple(skb, &t, info) < 0) + return XT_CONTINUE; + } else { + if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0) + return XT_CONTINUE; + } + + skb->mark = hmark_hash(&t, info); + return XT_CONTINUE; +} +#endif + +static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) +{ + const struct icmphdr *icmph; + struct icmphdr _ih; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih); + if (icmph == NULL || icmph->type > NR_ICMP_TYPES) + return 0; + + /* Error message? 
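Only the types checked below embed the offending packet's header.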
*/ + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) + return 0; + + *nhoff += iphsz + sizeof(_ih); + return 1; +} + +static int +hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, + const struct xt_hmark_info *info) +{ + struct iphdr *ip, _ip; + int nhoff = skb_network_offset(skb); + + ip = (struct iphdr *) (skb->data + nhoff); + if (ip->protocol == IPPROTO_ICMP) { + /* Use inner header in case of ICMP errors */ + if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) { + ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip); + if (ip == NULL) + return -1; + } + } + + t->src = ip->saddr & info->src_mask.ip; + t->dst = ip->daddr & info->dst_mask.ip; + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) + return 0; + + t->proto = ip->protocol; + + /* ICMP has no ports, skip */ + if (t->proto == IPPROTO_ICMP) + return 0; + + /* follow-up fragments don't contain ports, skip all fragments */ + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + return 0; + + hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info); + + return 0; +} + +static unsigned int +hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_hmark_info *info = par->targinfo; + struct hmark_tuple t; + + memset(&t, 0, sizeof(struct hmark_tuple)); + + if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { + if (hmark_ct_set_htuple(skb, &t, info) < 0) + return XT_CONTINUE; + } else { + if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0) + return XT_CONTINUE; + } + + skb->mark = hmark_hash(&t, info); + return XT_CONTINUE; +} + +static int hmark_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_hmark_info *info = par->targinfo; + + if (!info->hmodulus) { + pr_info("xt_HMARK: hash modulus can't be zero\n"); + return -EINVAL; + } + if (info->proto_mask && + (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) { + pr_info("xt_HMARK: proto mask must be zero with L3 mode\n"); + return -EINVAL; + } + if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) && + (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) | + XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) { + pr_info("xt_HMARK: spi-mask and port-mask can't be combined\n"); + return -EINVAL; + } + if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) && + (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) | + XT_HMARK_FLAG(XT_HMARK_DPORT)))) { + pr_info("xt_HMARK: spi-set and port-set can't be combined\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target hmark_tg_reg[] __read_mostly = { + { + .name = "HMARK", + .family = NFPROTO_IPV4, + .target = hmark_tg_v4, + .targetsize = sizeof(struct xt_hmark_info), + .checkentry = hmark_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "HMARK", + .family = NFPROTO_IPV6, + .target = hmark_tg_v6, + .targetsize = sizeof(struct xt_hmark_info), + .checkentry = hmark_tg_check, + .me = THIS_MODULE, + }, +#endif +}; + +static int __init hmark_tg_init(void) +{ + return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +static void __exit hmark_tg_exit(void) +{ + xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +module_init(hmark_tg_init); +module_exit(hmark_tg_exit); diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c new file mode 100644 index 00000000000..5ab24843370 --- /dev/null +++ b/net/netfilter/xt_LOG.c @@ -0,0 +1,975 @@ +/* + * This is a module which is used for logging packets. 
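+ *
+ * Typical usage from userspace (an illustrative rule, not part of
+ * this module):
+ *
+ *	iptables -A INPUT -p tcp --dport 22 -j LOG \
+ *		--log-prefix "ssh: " --log-level info
+ *
+ * Matching packets are summarized into the kernel log through the
+ * nf_log backends registered at the bottom of this file.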
+ */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_LOG.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/xt_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + sb_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + sb_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + +out: + return 0; +} + +static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + sb_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & XT_LOG_TCPSEQ) + sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); + + /* Max length: 13 "WINDOW=65535 " */ + sb_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + sb_add(m, "CWR "); + if (th->ece) + sb_add(m, "ECE "); + if (th->urg) + sb_add(m, "URG "); + if (th->ack) + sb_add(m, "ACK "); + if (th->psh) + sb_add(m, "PSH "); + if (th->rst) + sb_add(m, "RST "); + if (th->syn) + sb_add(m, "SYN "); + if (th->fin) + sb_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + const u_int8_t *op; + unsigned int i; + unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + sb_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 
15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + + sb_add(m, ") "); + } + + return 0; +} + +static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) +{ + if (!sk || sk->sk_state == TCP_TIME_WAIT) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + sb_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + sb_add(m, "SRC=%pI4 DST=%pI4 ", + &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + sb_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + sb_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + sb_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & XT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4)) + return; + break; + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + sb_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + sb_add(m, 
"INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + sb_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + sb_add(m, "["); + dump_ipv4_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) + sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + sb_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + sb_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + sb_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && !iphoff) + dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + 
switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + sb_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + sb_add(m, ":%02x", *p); + } + sb_add(m, " "); +} + +static void +log_packet_common(struct sbuff *m, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + const struct net_device *physindev; + const struct net_device *physoutdev; + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) + sb_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) + sb_add(m, "PHYSOUT=%s ", physoutdev->name); + } +#endif +} + + +static void +ipt_log_packet(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + + if (in != NULL) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(m, loginfo, skb, 0); + + sb_close(m); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) 
" */ + if (logflags & XT_LOG_IPOPT) + sb_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + sb_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + sb_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + sb_add(m, "INCOMPLETE "); + + sb_add(m, "ID:%08x ", ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & XT_LOG_IPOPT) + sb_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & XT_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + sb_add(m, "AH "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & XT_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + sb_add(m, "ESP "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + sb_add(m, "SPI=0x%x )", ntohl(eh->spi)); + + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + sb_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & XT_LOG_IPOPT) + sb_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (dump_tcp_header(m, skb, currenthdr, fragment, ptr, + logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + sb_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + sb_add(m, "["); + dump_ipv6_packet(m, info, skb, + ptr + 
sizeof(_icmp6h), 0); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) + sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + sb_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && recurse) + dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p != NULL) { + sb_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + sb_add(m, ":%02x", *p++); + } + sb_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else + sb_add(m, " "); +} + +static void +ip6t_log_packet(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + + if (in != NULL) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + sb_close(m); +} +#endif + +static unsigned int +log_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_log_info *loginfo = par->targinfo; + struct nf_loginfo li; + struct net *net = dev_net(par->in ? 
par->in : par->out); + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; + + if (par->family == NFPROTO_IPV4) + ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, + par->out, &li, loginfo->prefix); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + else if (par->family == NFPROTO_IPV6) + ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, + par->out, &li, loginfo->prefix); +#endif + else + WARN_ON_ONCE(1); + + return XT_CONTINUE; +} + +static int log_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_log_info *loginfo = par->targinfo; + + if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) + return -EINVAL; + + if (loginfo->level >= 8) { + pr_debug("level %u >= 8\n", loginfo->level); + return -EINVAL; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + pr_debug("prefix is not null-terminated\n"); + return -EINVAL; + } + + return 0; +} + +static struct xt_target log_tg_regs[] __read_mostly = { + { + .name = "LOG", + .family = NFPROTO_IPV4, + .target = log_tg, + .targetsize = sizeof(struct xt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LOG", + .family = NFPROTO_IPV6, + .target = log_tg, + .targetsize = sizeof(struct xt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, + }, +#endif +}; + +static struct nf_logger ipt_log_logger __read_mostly = { + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static struct nf_logger ip6t_log_logger __read_mostly = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; +#endif + +static int __net_init log_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; +} + +static void __net_exit log_net_exit(struct net *net) +{ + nf_log_unset(net, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unset(net, &ip6t_log_logger); +#endif +} + +static struct pernet_operations log_net_ops = { + .init = log_net_init, + .exit = log_net_exit, +}; + +static int __init log_tg_init(void) +{ + int ret; + + ret = register_pernet_subsys(&log_net_ops); + if (ret < 0) + goto err_pernet; + + ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); + if (ret < 0) + goto err_target; + + nf_log_register(NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; + +err_target: + unregister_pernet_subsys(&log_net_ops); +err_pernet: + return ret; +} + +static void __exit log_tg_exit(void) +{ + unregister_pernet_subsys(&log_net_ops); + nf_log_unregister(&ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unregister(&ip6t_log_logger); +#endif + xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); +} + +module_init(log_tg_init); +module_exit(log_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); +MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); +MODULE_ALIAS("ipt_LOG"); +MODULE_ALIAS("ip6t_LOG"); diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c new file mode 100644 index 00000000000..b253e07cb1c --- /dev/null +++ b/net/netfilter/xt_NETMAP.c @@ -0,0 +1,165 @@ +/* + * (C) 2000-2001 Svenning Soerensen 
<svenning@post5.tele.dk> + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> + +static unsigned int +netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + struct nf_nat_range newrange; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + union nf_inet_addr new_addr, netmask; + unsigned int i; + + ct = nf_ct_get(skb, &ctinfo); + for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++) + netmask.ip6[i] = ~(range->min_addr.ip6[i] ^ + range->max_addr.ip6[i]); + + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) + new_addr.in6 = ipv6_hdr(skb)->daddr; + else + new_addr.in6 = ipv6_hdr(skb)->saddr; + + for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) { + new_addr.ip6[i] &= ~netmask.ip6[i]; + new_addr.ip6[i] |= range->min_addr.ip6[i] & + netmask.ip6[i]; + } + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr = new_addr; + newrange.max_addr = new_addr; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + + if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) + return -EINVAL; + return 0; +} + +static unsigned int +netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 new_ip, netmask; + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range newrange; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT || + par->hooknum == NF_INET_LOCAL_IN); + ct = nf_ct_get(skb, &ctinfo); + + netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); + + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) + new_ip = ip_hdr(skb)->daddr & ~netmask; + else + new_ip = ip_hdr(skb)->saddr & ~netmask; + new_ip |= mr->range[0].min_ip & netmask; + + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = new_ip; + newrange.max_addr.ip = new_ip; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + /* Hand modified range to generic setup. 
*/ + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static int netmap_tg4_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + + if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static struct xt_target netmap_tg_reg[] __read_mostly = { + { + .name = "NETMAP", + .family = NFPROTO_IPV6, + .revision = 0, + .target = netmap_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .checkentry = netmap_tg6_checkentry, + .me = THIS_MODULE, + }, + { + .name = "NETMAP", + .family = NFPROTO_IPV4, + .revision = 0, + .target = netmap_tg4, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .checkentry = netmap_tg4_check, + .me = THIS_MODULE, + }, +}; + +static int __init netmap_tg_init(void) +{ + return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); +} + +static void netmap_tg_exit(void) +{ + xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); +} + +module_init(netmap_tg_init); +module_exit(netmap_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ip6t_NETMAP"); +MODULE_ALIAS("ipt_NETMAP"); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a17dd0f589b..fb7497c928a 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? 
par->in : par->out); li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nfulnl_log_packet(par->family, par->hooknum, skb, par->in, + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, &li, info->prefix); return XT_CONTINUE; } diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 95237c89607..8f1779ff7e3 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -11,15 +11,13 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> - #include <linux/netfilter.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFQUEUE.h> +#include <net/netfilter/nf_queue.h> + MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); @@ -28,7 +26,6 @@ MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static u32 jhash_initval __read_mostly; -static bool rnd_inited __read_mostly; static unsigned int nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -38,32 +35,6 @@ nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) return NF_QUEUE_NR(tinfo->queuenum); } -static u32 hash_v4(const struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - __be32 ipaddr; - - /* packets in either direction go into same queue */ - ipaddr = iph->saddr ^ iph->daddr; - - return jhash_2words((__force u32)ipaddr, iph->protocol, jhash_initval); -} - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -static u32 hash_v6(const struct sk_buff *skb) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __be32 addr[4]; - - addr[0] = ip6h->saddr.s6_addr32[0] ^ ip6h->daddr.s6_addr32[0]; - addr[1] = ip6h->saddr.s6_addr32[1] ^ ip6h->daddr.s6_addr32[1]; - addr[2] = ip6h->saddr.s6_addr32[2] ^ ip6h->daddr.s6_addr32[2]; - addr[3] = ip6h->saddr.s6_addr32[3] ^ ip6h->daddr.s6_addr32[3]; - - return jhash2((__force u32 *)addr, ARRAY_SIZE(addr), jhash_initval); -} -#endif - static unsigned int nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) { @@ -71,14 +42,8 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) u32 queue = info->queuenum; if (info->queues_total > 1) { - if (par->family == NFPROTO_IPV4) - queue = (((u64) hash_v4(skb) * info->queues_total) >> - 32) + queue; -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - else if (par->family == NFPROTO_IPV6) - queue = (((u64) hash_v6(skb) * info->queues_total) >> - 32) + queue; -#endif + queue = nfqueue_hash(skb, queue, info->queues_total, + par->family, jhash_initval); } return NF_QUEUE_NR(queue); } @@ -96,13 +61,11 @@ nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) static int nfqueue_tg_check(const struct xt_tgchk_param *par) { - const struct xt_NFQ_info_v2 *info = par->targinfo; + const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; - if (unlikely(!rnd_inited)) { - get_random_bytes(&jhash_initval, sizeof(jhash_initval)); - rnd_inited = true; - } + init_hashrandom(&jhash_initval); + if (info->queues_total == 0) { pr_err("NFQUEUE: number of total queues is 0\n"); return -EINVAL; @@ -113,11 +76,39 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par) info->queues_total, maxid); return -ERANGE; } - if (par->target->revision == 2 && info->bypass > 1) + if (par->target->revision == 2 && info->flags > 1) + return -EINVAL; + if (par->target->revision == 3 && 
info->flags & ~NFQ_FLAG_MASK) return -EINVAL; + return 0; } +static unsigned int +nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v3 *info = par->targinfo; + u32 queue = info->queuenum; + int ret; + + if (info->queues_total > 1) { + if (info->flags & NFQ_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = info->queuenum + cpu % info->queues_total; + } else { + queue = nfqueue_hash(skb, queue, info->queues_total, + par->family, jhash_initval); + } + } + + ret = NF_QUEUE_NR(queue); + if (info->flags & NFQ_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + return ret; +} + static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", @@ -144,6 +135,15 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_NFQ_info_v2), .me = THIS_MODULE, }, + { + .name = "NFQUEUE", + .revision = 3, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v3, + .targetsize = sizeof(struct xt_NFQ_info_v3), + .me = THIS_MODULE, + }, }; static int __init nfqueue_tg_init(void) diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c deleted file mode 100644 index 9d782181b6c..00000000000 --- a/net/netfilter/xt_NOTRACK.c +++ /dev/null @@ -1,53 +0,0 @@ -/* This is a module which is used for setting up fake conntracks - * on packets so that they are not seen by the conntrack/NAT code. - */ -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_conntrack.h> - -MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ipt_NOTRACK"); -MODULE_ALIAS("ip6t_NOTRACK"); - -static unsigned int -notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - /* Previously seen (loopback)? Ignore. */ - if (skb->nfct != NULL) - return XT_CONTINUE; - - /* Attach fake conntrack entry. - If there is a real ct entry correspondig to this packet, - it'll hang aroun till timing out. We don't deal with it - for performance reasons. 
JK */ - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); - - return XT_CONTINUE; -} - -static struct xt_target notrack_tg_reg __read_mostly = { - .name = "NOTRACK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, -}; - -static int __init notrack_tg_init(void) -{ - return xt_register_target(¬rack_tg_reg); -} - -static void __exit notrack_tg_exit(void) -{ - xt_unregister_target(¬rack_tg_reg); -} - -module_init(notrack_tg_init); -module_exit(notrack_tg_exit); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index f264032b8c5..370adf622ce 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -43,12 +43,11 @@ static void xt_rateest_hash_insert(struct xt_rateest *est) struct xt_rateest *xt_rateest_lookup(const char *name) { struct xt_rateest *est; - struct hlist_node *n; unsigned int h; h = xt_rateest_hash(name); mutex_lock(&xt_rateest_mutex); - hlist_for_each_entry(est, n, &rateest_hash[h], list) { + hlist_for_each_entry(est, &rateest_hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; mutex_unlock(&xt_rateest_mutex); diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c new file mode 100644 index 00000000000..22a10309297 --- /dev/null +++ b/net/netfilter/xt_REDIRECT.c @@ -0,0 +1,190 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 + * NAT funded by Astaro. 
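+ *
+ * For IPv6, redirect_tg6() below picks ::1 for locally generated
+ * packets and otherwise the first address configured on the ingress
+ * interface; the packet is dropped if no address is available.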
+ */ + +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/types.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/addrconf.h> +#include <net/checksum.h> +#include <net/protocol.h> +#include <net/netfilter/nf_nat.h> + +static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; + +static unsigned int +redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + struct nf_nat_range newrange; + struct in6_addr newdst; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (par->hooknum == NF_INET_LOCAL_OUT) + newdst = loopback_addr; + else { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + bool addr = false; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + if (idev != NULL) { + list_for_each_entry(ifa, &idev->addr_list, if_list) { + newdst = ifa->addr; + addr = true; + break; + } + } + rcu_read_unlock(); + + if (!addr) + return NF_DROP; + } + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = newdst; + newrange.max_addr.in6 = newdst; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + +static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) + return -EINVAL; + return 0; +} + +/* FIXME: Take multiple ranges --RR */ +static int redirect_tg4_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + + if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static unsigned int +redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 newdst; + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range newrange; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (par->hooknum == NF_INET_LOCAL_OUT) + newdst = htonl(0x7F000001); + else { + struct in_device *indev; + struct in_ifaddr *ifa; + + newdst = 0; + + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + if (indev && (ifa = indev->ifa_list)) + newdst = ifa->ifa_local; + rcu_read_unlock(); + + if (!newdst) + return NF_DROP; + } + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newdst; + newrange.max_addr.ip = newdst; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + /* Hand modified range to generic setup. 
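+	 * An illustrative rule such as
+	 *	iptables -t nat -A PREROUTING -p tcp --dport 80 \
+	 *		-j REDIRECT --to-ports 3128
+	 * thus becomes a DNAT to the ingress interface's primary
+	 * address (or to 127.0.0.1 from LOCAL_OUT), with the proto
+	 * range taken from the rule.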
*/ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + +static struct xt_target redirect_tg_reg[] __read_mostly = { + { + .name = "REDIRECT", + .family = NFPROTO_IPV6, + .revision = 0, + .table = "nat", + .checkentry = redirect_tg6_checkentry, + .target = redirect_tg6, + .targetsize = sizeof(struct nf_nat_range), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, + { + .name = "REDIRECT", + .family = NFPROTO_IPV4, + .revision = 0, + .table = "nat", + .target = redirect_tg4, + .checkentry = redirect_tg4_check, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, +}; + +static int __init redirect_tg_init(void) +{ + return xt_register_targets(redirect_tg_reg, + ARRAY_SIZE(redirect_tg_reg)); +} + +static void __exit redirect_tg_exit(void) +{ + xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); +} + +module_init(redirect_tg_init); +module_exit(redirect_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); +MODULE_ALIAS("ip6t_REDIRECT"); +MODULE_ALIAS("ipt_REDIRECT"); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 190ad37c5cf..e762de5ee89 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -2,6 +2,7 @@ * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -42,40 +43,82 @@ optlen(const u_int8_t *opt, unsigned int offset) return opt[offset+1]; } +static u_int32_t tcpmss_reverse_mtu(struct net *net, + const struct sk_buff *skb, + unsigned int family) +{ + struct flowi fl; + const struct nf_afinfo *ai; + struct rtable *rt = NULL; + u_int32_t mtu = ~0U; + + if (family == PF_INET) { + struct flowi4 *fl4 = &fl.u.ip4; + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = ip_hdr(skb)->saddr; + } else { + struct flowi6 *fl6 = &fl.u.ip6; + + memset(fl6, 0, sizeof(*fl6)); + fl6->daddr = ipv6_hdr(skb)->saddr; + } + rcu_read_lock(); + ai = nf_get_afinfo(family); + if (ai != NULL) + ai->route(net, (struct dst_entry **)&rt, &fl, false); + rcu_read_unlock(); + + if (rt != NULL) { + mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); + } + return mtu; +} + static int tcpmss_mangle_packet(struct sk_buff *skb, - const struct xt_tcpmss_info *info, - unsigned int in_mtu, + const struct xt_action_param *par, + unsigned int family, unsigned int tcphoff, unsigned int minlen) { + const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; - unsigned int tcplen, i; + int len, tcp_hdrlen; + unsigned int i; __be16 oldval; u16 newmss; u8 *opt; + /* This is a fragment, no TCP header is available */ + if (par->fragoff != 0) + return 0; + if (!skb_make_writable(skb, skb->len)) return -1; - tcplen = skb->len - tcphoff; + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr)) + return -1; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + tcp_hdrlen = tcph->doff * 4; - /* Header cannot be larger than the packet */ - if (tcplen < tcph->doff*4) + if (len < tcp_hdrlen) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { + struct net *net = dev_net(par->in ? 
par->in : par->out); + unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); + if (dst_mtu(skb_dst(skb)) <= minlen) { - if (net_ratelimit()) - pr_err("unknown or invalid path-MTU (%u)\n", - dst_mtu(skb_dst(skb))); + net_err_ratelimited("unknown or invalid path-MTU (%u)\n", + dst_mtu(skb_dst(skb))); return -1; } if (in_mtu <= minlen) { - if (net_ratelimit()) - pr_err("unknown or invalid path-MTU (%u)\n", - in_mtu); + net_err_ratelimited("unknown or invalid path-MTU (%u)\n", + in_mtu); return -1; } newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; @@ -83,9 +126,8 @@ tcpmss_mangle_packet(struct sk_buff *skb, newmss = info->mss; opt = (u_int8_t *)tcph; - for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) { - if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS && - opt[i+1] == TCPOLEN_MSS) { + for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { + if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; @@ -108,9 +150,10 @@ tcpmss_mangle_packet(struct sk_buff *skb, } /* There is data after the header so the option can't be added - without moving it, and doing so may make the SYN packet - itself too large. Accept the packet unmodified instead. */ - if (tcplen > tcph->doff*4) + * without moving it, and doing so may make the SYN packet + * itself too large. Accept the packet unmodified instead. + */ + if (len > tcp_hdrlen) return 0; /* @@ -126,11 +169,23 @@ tcpmss_mangle_packet(struct sk_buff *skb, skb_put(skb, TCPOLEN_MSS); + /* + * IPv4: RFC 1122 states "If an MSS option is not received at + * connection setup, TCP MUST assume a default send MSS of 536". + * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum + * length IPv6 header of 60, ergo the default MSS value is 1220 + * Since no MSS was provided, we must use the default values + */ + if (par->family == NFPROTO_IPV4) + newmss = min(newmss, (u16)536); + else + newmss = min(newmss, (u16)1220); + opt = (u_int8_t *)tcph + sizeof(struct tcphdr); - memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, - htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); + htons(len), htons(len + TCPOLEN_MSS), 1); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; @@ -145,37 +200,6 @@ tcpmss_mangle_packet(struct sk_buff *skb, return TCPOLEN_MSS; } -static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, - unsigned int family) -{ - struct flowi fl; - const struct nf_afinfo *ai; - struct rtable *rt = NULL; - u_int32_t mtu = ~0U; - - if (family == PF_INET) { - struct flowi4 *fl4 = &fl.u.ip4; - memset(fl4, 0, sizeof(*fl4)); - fl4->daddr = ip_hdr(skb)->saddr; - } else { - struct flowi6 *fl6 = &fl.u.ip6; - - memset(fl6, 0, sizeof(*fl6)); - fl6->daddr = ipv6_hdr(skb)->saddr; - } - rcu_read_lock(); - ai = nf_get_afinfo(family); - if (ai != NULL) - ai->route(&init_net, (struct dst_entry **)&rt, &fl, false); - rcu_read_unlock(); - - if (rt != NULL) { - mtu = dst_mtu(&rt->dst); - dst_release(&rt->dst); - } - return mtu; -} - static unsigned int tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { @@ -183,8 +207,8 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) __be16 newlen; int ret; - ret = tcpmss_mangle_packet(skb, par->targinfo, - tcpmss_reverse_mtu(skb, PF_INET), + ret = tcpmss_mangle_packet(skb, par, + PF_INET, iph->ihl * 4, sizeof(*iph) + 
sizeof(struct tcphdr)); if (ret < 0) @@ -212,8 +236,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; - ret = tcpmss_mangle_packet(skb, par->targinfo, - tcpmss_reverse_mtu(skb, PF_INET6), + ret = tcpmss_mangle_packet(skb, par, + PF_INET6, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 25fd1c4e1ee..625fa1d636a 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -30,28 +30,43 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, - const struct xt_tcpoptstrip_target_info *info, + const struct xt_action_param *par, unsigned int tcphoff, unsigned int minlen) { + const struct xt_tcpoptstrip_target_info *info = par->targinfo; unsigned int optl, i, j; struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; + int len, tcp_hdrlen; + + /* This is a fragment, no TCP header is available */ + if (par->fragoff != 0) + return XT_CONTINUE; if (!skb_make_writable(skb, skb->len)) return NF_DROP; + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr)) + return NF_DROP; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + tcp_hdrlen = tcph->doff * 4; + + if (len < tcp_hdrlen) + return NF_DROP; + opt = (u_int8_t *)tcph; /* * Walk through all TCP options - if we find some option to remove, * set all octets to %TCPOPT_NOP and adjust checksum. */ - for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { + for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) { optl = optlen(opt, i); - if (i + optl > tcp_hdrlen(skb)) + if (i + optl > tcp_hdrlen) break; if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) @@ -76,7 +91,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), sizeof(struct iphdr) + sizeof(struct tcphdr)); } @@ -94,7 +109,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, + return tcpoptstrip_mangle_packet(skb, par, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); } #endif diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 4d505790283..292934d2348 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -70,6 +70,7 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) fl4.daddr = info->gw.ip; fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return false; @@ -87,7 +88,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_tee_tginfo *info = par->targinfo; struct iphdr *iph; - if (percpu_read(tee_active)) + if (__this_cpu_read(tee_active)) return XT_CONTINUE; /* * Copy the skb, and route the copy. 
Will later return %XT_CONTINUE for @@ -124,9 +125,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) ip_send_check(iph); if (tee_tg_route4(skb, info)) { - percpu_write(tee_active, true); + __this_cpu_write(tee_active, true); ip_local_out(skb); - percpu_write(tee_active, false); + __this_cpu_write(tee_active, false); } else { kfree_skb(skb); } @@ -168,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - if (percpu_read(tee_active)) + if (__this_cpu_read(tee_active)) return XT_CONTINUE; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -186,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) --iph->hop_limit; } if (tee_tg_route6(skb, info)) { - percpu_write(tee_active, true); + __this_cpu_write(tee_active, true); ip6_local_out(skb); - percpu_write(tee_active, false); + __this_cpu_write(tee_active, false); } else { kfree_skb(skb); } @@ -199,7 +200,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct xt_tee_priv *priv; priv = container_of(this, struct xt_tee_priv, notifier); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 35a959a096e..ef8a926752a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -15,7 +15,9 @@ #include <linux/ip.h> #include <net/checksum.h> #include <net/udp.h> +#include <net/tcp.h> #include <net/inet_sock.h> +#include <net/inet_hashtables.h> #include <linux/inetdevice.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -26,13 +28,18 @@ #define XT_TPROXY_HAVE_IPV6 1 #include <net/if_inet6.h> #include <net/addrconf.h> +#include <net/inet6_hashtables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif -#include <net/netfilter/nf_tproxy_core.h> #include <linux/netfilter/xt_TPROXY.h> +enum nf_tproxy_lookup_t { + NFT_LOOKUP_LISTENER, + NFT_LOOKUP_ESTABLISHED, +}; + static bool tproxy_sk_is_transparent(struct sock *sk) { if (sk->sk_state != TCP_TIME_WAIT) { @@ -68,8 +75,159 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) return laddr ? laddr : daddr; } +/* + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are assumed + * matching in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. on-port & on-ip of the connection), it is returned, + * regardless if it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that he wants redirection + * (since he added the rule). + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally if you have both rules the + * "socket" match will be the first one, effectively all packets + * belonging to established connections going through that one. 
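+ *
+ * An illustrative setup (see Documentation/networking/tproxy.txt):
+ *
+ *	iptables -t mangle -A PREROUTING -p tcp --dport 80 \
+ *		-j TPROXY --tproxy-mark 0x1/0x1 --on-port 8080
+ *
+ * together with an "ip rule"/local-route pair that delivers packets
+ * marked 0x1 to the local stack, where the proxy listens on port 8080.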
+ */ +static inline struct sock * +nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, dport, + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + + return sk; +} + +#ifdef XT_TPROXY_HAVE_IPV6 +static inline struct sock * +nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet6_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, ntohs(dport), + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = __inet6_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, ntohs(dport), + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", + protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + + return sk; +} +#endif + /** - * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. * @laddr: IPv4 address to redirect to or zero. * @lport: TCP port to redirect to or zero. 
@@ -117,6 +275,15 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, return sk; } +/* assign a socket to the skb -- consumes sk */ +static void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + static unsigned int tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) @@ -220,7 +387,7 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, } /** - * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. * @tproto: Transport protocol. * @thoff: Transport protocol header offset. @@ -282,10 +449,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) struct sock *sk; const struct in6_addr *laddr; __be16 lport; - int thoff; + int thoff = 0; int tproto; - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { pr_debug("unable to find transport header in IPv6 packet, dropping\n"); return NF_DROP; diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 49c5ff7f6dd..fab6eea1bf3 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -22,6 +22,7 @@ #include <net/ip6_fib.h> #endif +#include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_addrtype.h> #include <linux/netfilter/x_tables.h> @@ -33,12 +34,12 @@ MODULE_ALIAS("ip6t_addrtype"); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, - const struct in6_addr *addr) + const struct in6_addr *addr, u16 mask) { const struct nf_afinfo *afinfo; struct flowi6 flow; struct rt6_info *rt; - u32 ret; + u32 ret = 0; int route_err; memset(&flow, 0, sizeof(flow)); @@ -49,12 +50,19 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, rcu_read_lock(); afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (afinfo != NULL) + if (afinfo != NULL) { + const struct nf_ipv6_ops *v6ops; + + if (dev && (mask & XT_ADDRTYPE_LOCAL)) { + v6ops = nf_get_ipv6_ops(); + if (v6ops && v6ops->chk_addr(net, addr, dev, true)) + ret = XT_ADDRTYPE_LOCAL; + } route_err = afinfo->route(net, (struct dst_entry **)&rt, - flowi6_to_flowi(&flow), !!dev); - else + flowi6_to_flowi(&flow), false); + } else { route_err = 1; - + } rcu_read_unlock(); if (route_err) @@ -62,15 +70,12 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (rt->rt6i_flags & RTF_REJECT) ret = XT_ADDRTYPE_UNREACHABLE; - else - ret = 0; - if (rt->rt6i_flags & RTF_LOCAL) + if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; if (rt->rt6i_flags & RTF_ANYCAST) ret |= XT_ADDRTYPE_ANYCAST; - dst_release(&rt->dst); return ret; } @@ -90,7 +95,7 @@ static bool match_type6(struct net *net, const struct net_device *dev, if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | XT_ADDRTYPE_UNREACHABLE) & mask) - return !!(mask & match_lookup_rt6(net, dev, addr)); + return !!(mask & match_lookup_rt6(net, dev, addr, mask)); return true; } @@ -197,7 +202,7 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) return -EINVAL; } if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { - pr_err("ipv6 PROHIBT (THROW, NAT ..) matching not supported\n"); + pr_err("ipv6 PROHIBIT (THROW, NAT ..) 
matching not supported\n"); return -EINVAL; } if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c new file mode 100644 index 00000000000..bbffdbdaf60 --- /dev/null +++ b/net/netfilter/xt_bpf.c @@ -0,0 +1,74 @@ +/* Xtables module to match packets using a BPF filter. + * Copyright 2013 Google Inc. + * Written by Willem de Bruijn <willemb@google.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/filter.h> + +#include <linux/netfilter/xt_bpf.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Willem de Bruijn <willemb@google.com>"); +MODULE_DESCRIPTION("Xtables: BPF filter match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_bpf"); +MODULE_ALIAS("ip6t_bpf"); + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info *info = par->matchinfo; + struct sock_fprog_kern program; + + program.len = info->bpf_program_num_elem; + program.filter = info->bpf_program; + + if (sk_unattached_filter_create(&info->filter, &program)) { + pr_info("bpf: check failed: parse error\n"); + return -EINVAL; + } + + return 0; +} + +static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_bpf_info *info = par->matchinfo; + + return SK_RUN_FILTER(info->filter, skb); +} + +static void bpf_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_bpf_info *info = par->matchinfo; + sk_unattached_filter_destroy(info->filter); +} + +static struct xt_match bpf_mt_reg __read_mostly = { + .name = "bpf", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check, + .match = bpf_mt, + .destroy = bpf_mt_destroy, + .matchsize = sizeof(struct xt_bpf_info), + .me = THIS_MODULE, +}; + +static int __init bpf_mt_init(void) +{ + return xt_register_match(&bpf_mt_reg); +} + +static void __exit bpf_mt_exit(void) +{ + xt_unregister_match(&bpf_mt_reg); +} + +module_init(bpf_mt_init); +module_exit(bpf_mt_exit); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c new file mode 100644 index 00000000000..f4e83300532 --- /dev/null +++ b/net/netfilter/xt_cgroup.c @@ -0,0 +1,72 @@ +/* + * Xtables module to match the process control group. + * + * Might be used to implement individual "per-application" firewall + * policies in contrast to global policies based on control groups. + * Matching is based upon processes tagged to net_cls' classid marker. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_cgroup.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("Xtables: process control group matching"); +MODULE_ALIAS("ipt_cgroup"); +MODULE_ALIAS("ip6t_cgroup"); + +static int cgroup_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + + return info->id ? 
0 : -EINVAL; +} + +static bool +cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info *info = par->matchinfo; + + if (skb->sk == NULL) + return false; + + return (info->id == skb->sk->sk_classid) ^ info->invert; +} + +static struct xt_match cgroup_mt_reg __read_mostly = { + .name = "cgroup", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check, + .match = cgroup_mt, + .matchsize = sizeof(struct xt_cgroup_info), + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), +}; + +static int __init cgroup_mt_init(void) +{ + return xt_register_match(&cgroup_mt_reg); +} + +static void __exit cgroup_mt_exit(void) +{ + xt_unregister_match(&cgroup_mt_reg); +} + +module_init(cgroup_mt_init); +module_exit(cgroup_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index e595e07a759..1e634615ab9 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -26,16 +26,18 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; + const struct nf_conn_acct *acct; const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; - counters = nf_conn_acct_find(ct); - if (!counters) + acct = nf_conn_acct_find(ct); + if (!acct) return false; + counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c new file mode 100644 index 00000000000..9f8719df200 --- /dev/null +++ b/net/netfilter/xt_connlabel.c @@ -0,0 +1,99 @@ +/* + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("Xtables: add/match connection tracking labels"); +MODULE_ALIAS("ipt_connlabel"); +MODULE_ALIAS("ip6t_connlabel"); + +static bool +connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connlabel_mtinfo *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + bool invert = info->options & XT_CONNLABEL_OP_INVERT; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL || nf_ct_is_untracked(ct)) + return invert; + + if (info->options & XT_CONNLABEL_OP_SET) + return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; + + return nf_connlabel_match(ct, info->bit) ^ invert; +} + +static int connlabel_mt_check(const struct xt_mtchk_param *par) +{ + const int options = XT_CONNLABEL_OP_INVERT | + XT_CONNLABEL_OP_SET; + struct xt_connlabel_mtinfo *info = par->matchinfo; + int ret; + size_t words; + + if (info->bit > XT_CONNLABEL_MAXBIT) + return -ERANGE; + + if (info->options & ~options) { + pr_err("Unknown options in mask %x\n", info->options); + return -EINVAL; + } + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + + par->net->ct.labels_used++; + words = BITS_TO_LONGS(info->bit+1); + if (words > par->net->ct.label_words) + par->net->ct.label_words = words; + + return ret; +} + +static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) +{ + par->net->ct.labels_used--; + if (par->net->ct.labels_used == 0) + par->net->ct.label_words = 0; + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match connlabels_mt_reg __read_mostly = { + .name = "connlabel", + .family = NFPROTO_UNSPEC, + .checkentry = connlabel_mt_check, + .match = connlabel_mt, + .matchsize = sizeof(struct xt_connlabel_mtinfo), + .destroy = connlabel_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connlabel_mt_init(void) +{ + return xt_register_match(&connlabels_mt_reg); +} + +static void __exit connlabel_mt_exit(void) +{ + xt_unregister_match(&connlabels_mt_reg); +} + +module_init(connlabel_mt_init); +module_exit(connlabel_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index c6d5a83450c..fbc66bb250d 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -19,6 +19,7 @@ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/list.h> +#include <linux/rbtree.h> #include <linux/module.h> #include <linux/random.h> #include <linux/skbuff.h> @@ -31,6 +32,16 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> +#define CONNLIMIT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS 8U +#else +#define CONNLIMIT_LOCK_SLOTS 256U +#endif + +#define CONNLIMIT_GC_MAX_NODES 8 + /* we will save the tuples of all connections we care about */ struct xt_connlimit_conn { struct hlist_node node; @@ -38,16 +49,27 @@ struct xt_connlimit_conn { union nf_inet_addr addr; }; +struct xt_connlimit_rb { + struct rb_node node; + struct hlist_head hhead; /* connections/hosts in same subnet */ + union nf_inet_addr addr; /* search key */ +}; + +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + struct xt_connlimit_data { - struct
hlist_head iphash[256]; - spinlock_t lock; + struct rb_root climit_root4[CONNLIMIT_SLOTS]; + struct rb_root climit_root6[CONNLIMIT_SLOTS]; }; static u_int32_t connlimit_rnd __read_mostly; +static struct kmem_cache *connlimit_rb_cachep __read_mostly; +static struct kmem_cache *connlimit_conn_cachep __read_mostly; static inline unsigned int connlimit_iphash(__be32 addr) { - return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; + return jhash_1word((__force __u32)addr, + connlimit_rnd) % CONNLIMIT_SLOTS; } static inline unsigned int @@ -60,7 +82,8 @@ connlimit_iphash6(const union nf_inet_addr *addr, for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) res.ip6[i] = addr->ip6[i] & mask->ip6[i]; - return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; + return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), + connlimit_rnd) % CONNLIMIT_SLOTS; } static inline bool already_closed(const struct nf_conn *conn) @@ -72,13 +95,14 @@ static inline bool already_closed(const struct nf_conn *conn) return 0; } -static inline unsigned int +static int same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, const union nf_inet_addr *u3, u_int8_t family) { if (family == NFPROTO_IPV4) { - return (addr->ip & mask->ip) == (u3->ip & mask->ip); + return ntohl(addr->ip & mask->ip) - + ntohl(u3->ip & mask->ip); } else { union nf_inet_addr lh, rh; unsigned int i; @@ -88,89 +112,205 @@ same_source_net(const union nf_inet_addr *addr, rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; } - return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; + return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)); } } -static int count_them(struct net *net, - struct xt_connlimit_data *data, +static bool add_hlist(struct hlist_head *head, const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - const union nf_inet_addr *mask, - u_int8_t family) + const union nf_inet_addr *addr) +{ + struct xt_connlimit_conn *conn; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return false; + conn->tuple = *tuple; + conn->addr = *addr; + hlist_add_head(&conn->node, head); + return true; +} + +static unsigned int check_hlist(struct net *net, + struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + bool *addit) { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; - struct hlist_node *pos, *n; + struct hlist_node *n; struct nf_conn *found_ct; - struct hlist_head *hash; - bool addit = true; - int matches = 0; - - if (family == NFPROTO_IPV6) - hash = &data->iphash[connlimit_iphash6(addr, mask)]; - else - hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; + unsigned int length = 0; + *addit = true; rcu_read_lock(); /* check the saved connections */ - hlist_for_each_entry_safe(conn, pos, n, hash, node) { + hlist_for_each_entry_safe(conn, n, head, node) { found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &conn->tuple); - found_ct = NULL; + if (found == NULL) { + hlist_del(&conn->node); + kmem_cache_free(connlimit_conn_cachep, conn); + continue; + } - if (found != NULL) - found_ct = nf_ct_tuplehash_to_ctrack(found); + found_ct = nf_ct_tuplehash_to_ctrack(found); - if (found_ct != NULL && - nf_ct_tuple_equal(&conn->tuple, tuple) && - !already_closed(found_ct)) + if (nf_ct_tuple_equal(&conn->tuple, tuple)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". 
*/ - addit = false; - - if (found == NULL) { - /* this one is gone */ - hlist_del(&conn->node); - kfree(conn); - continue; - } - - if (already_closed(found_ct)) { + *addit = false; + } else if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); hlist_del(&conn->node); - kfree(conn); + kmem_cache_free(connlimit_conn_cachep, conn); continue; } - if (same_source_net(addr, mask, &conn->addr, family)) - /* same source network -> be counted! */ - ++matches; nf_ct_put(found_ct); + length++; } rcu_read_unlock(); - if (addit) { - /* save the new connection in our list */ - conn = kmalloc(sizeof(*conn), GFP_ATOMIC); - if (conn == NULL) - return -ENOMEM; - conn->tuple = *tuple; - conn->addr = *addr; - hlist_add_head(&conn->node, hash); - ++matches; + return length; +} + +static void tree_nodes_free(struct rb_root *root, + struct xt_connlimit_rb *gc_nodes[], + unsigned int gc_count) +{ + struct xt_connlimit_rb *rbconn; + + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + rb_erase(&rbconn->node, root); + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, const union nf_inet_addr *mask, + u8 family) +{ + struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct xt_connlimit_rb *rbconn; + struct xt_connlimit_conn *conn; + unsigned int gc_count; + bool no_gc = false; + + restart: + gc_count = 0; + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + bool addit; + + rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + + parent = *rbnode; + diff = same_source_net(addr, mask, &rbconn->addr, family); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* same source network -> be counted! */ + unsigned int count; + count = check_hlist(net, &rbconn->hhead, tuple, &addit); + + tree_nodes_free(root, gc_nodes, gc_count); + if (!addit) + return count; + + if (!add_hlist(&rbconn->hhead, tuple, addr)) + return 0; /* hotdrop */ + + return count + 1; + } + + if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + /* only used for GC on hhead, retval and 'addit' ignored */ + check_hlist(net, &rbconn->hhead, tuple, &addit); + if (hlist_empty(&rbconn->hhead)) + gc_nodes[gc_count++] = rbconn; } - return matches; + if (gc_count) { + no_gc = true; + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_nodes_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0).
+ */ + goto restart; + } + + /* no match, need to insert new node */ + rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + return 0; + + conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(connlimit_rb_cachep, rbconn); + return 0; + } + + conn->tuple = *tuple; + conn->addr = *addr; + rbconn->addr = *addr; + + INIT_HLIST_HEAD(&rbconn->hhead); + hlist_add_head(&conn->node, &rbconn->hhead); + + rb_link_node(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + return 1; +} + +static int count_them(struct net *net, + struct xt_connlimit_data *data, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + u_int8_t family) +{ + struct rb_root *root; + int count; + u32 hash; + + if (family == NFPROTO_IPV6) { + hash = connlimit_iphash6(addr, mask); + root = &data->climit_root6[hash]; + } else { + hash = connlimit_iphash(addr->ip & mask->ip); + root = &data->climit_root4[hash]; + } + + spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + count = count_tree(net, root, tuple, addr, mask, family); + + spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); + + return count; } static bool @@ -183,7 +323,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct nf_conntrack_tuple *tuple_ptr = &tuple; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; - int connections; + unsigned int connections; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) @@ -202,12 +342,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) iph->daddr : iph->saddr; } - spin_lock_bh(&info->data->lock); connections = count_them(net, info->data, tuple_ptr, &addr, &info->mask, par->family); - spin_unlock_bh(&info->data->lock); - - if (connections < 0) + if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; @@ -247,65 +384,95 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) return -ENOMEM; } - spin_lock_init(&info->data->lock); - for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) - INIT_HLIST_HEAD(&info->data->iphash[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + info->data->climit_root4[i] = RB_ROOT; + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + info->data->climit_root6[i] = RB_ROOT; return 0; } +static void destroy_tree(struct rb_root *r) +{ + struct xt_connlimit_conn *conn; + struct xt_connlimit_rb *rbconn; + struct hlist_node *n; + struct rb_node *node; + + while ((node = rb_first(r)) != NULL) { + rbconn = container_of(node, struct xt_connlimit_rb, node); + + rb_erase(node, r); + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) + kmem_cache_free(connlimit_conn_cachep, conn); + + kmem_cache_free(connlimit_rb_cachep, rbconn); + } +} + static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; - struct xt_connlimit_conn *conn; - struct hlist_node *pos, *n; - struct hlist_head *hash = info->data->iphash; unsigned int i; nf_ct_l3proto_module_put(par->family); - for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { - hlist_for_each_entry_safe(conn, pos, n, &hash[i], node) { - hlist_del(&conn->node); - kfree(conn); - } - } + for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) + destroy_tree(&info->data->climit_root4[i]); + for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) + destroy_tree(&info->data->climit_root6[i]); kfree(info->data); } 
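The reworked bookkeeping above is a two-level structure: per hash slot, a red-black tree keyed by the masked source network, where each tree node carries the hlist of connections seen from that network and empty nodes are garbage-collected opportunistically during lookups. A rough stand-alone sketch of the idea (an unbalanced binary search tree stands in for the kernel rb-tree; conntrack revalidation, locking and GC are elided, and all names are invented for illustration):

#include <stdlib.h>

struct conn {                          /* one tracked connection */
        struct conn *next;             /* same-subnet list, like hhead */
        unsigned int addr;             /* full source address */
};

struct subnet_node {                   /* like xt_connlimit_rb */
        struct subnet_node *left, *right;
        unsigned int key;              /* masked source address */
        struct conn *conns;            /* connections from this subnet */
};

/* Count connections from addr's subnet and record the new one; returns
 * the count including the new connection, or 0 on allocation failure
 * (which the match above maps to a hotdrop). */
static unsigned int count_and_add(struct subnet_node **root,
                                  unsigned int addr, unsigned int mask)
{
        unsigned int key = addr & mask;
        struct subnet_node *node;
        struct conn *c;

        while (*root) {
                node = *root;
                if (key < node->key) {
                        root = &node->left;
                } else if (key > node->key) {
                        root = &node->right;
                } else {               /* same source network: count it */
                        unsigned int n = 1;
                        const struct conn *i;

                        c = malloc(sizeof(*c));
                        if (c == NULL)
                                return 0;
                        for (i = node->conns; i != NULL; i = i->next)
                                n++;
                        c->addr = addr;
                        c->next = node->conns;
                        node->conns = c;
                        return n;
                }
        }

        /* first connection from this subnet: insert a fresh tree node */
        node = malloc(sizeof(*node));
        c = malloc(sizeof(*c));
        if (node == NULL || c == NULL) {
                free(node);
                free(c);
                return 0;
        }
        node->left = node->right = NULL;
        node->key = key;
        c->addr = addr;
        c->next = NULL;
        node->conns = c;
        *root = node;
        return 1;
}

A rule such as "-m connlimit --connlimit-above 10 --connlimit-mask 24" then boils down to asking whether this per-/24 count exceeds 10.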
-static struct xt_match connlimit_mt_reg[] __read_mostly = { - { - .name = "connlimit", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, - }, - { - .name = "connlimit", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, - }, +static struct xt_match connlimit_mt_reg __read_mostly = { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, }; static int __init connlimit_mt_init(void) { - return xt_register_matches(connlimit_mt_reg, - ARRAY_SIZE(connlimit_mt_reg)); + int ret, i; + + BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); + BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) + spin_lock_init(&xt_connlimit_locks[i]); + + connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", + sizeof(struct xt_connlimit_conn), + 0, 0, NULL); + if (!connlimit_conn_cachep) + return -ENOMEM; + + connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", + sizeof(struct xt_connlimit_rb), + 0, 0, NULL); + if (!connlimit_rb_cachep) { + kmem_cache_destroy(connlimit_conn_cachep); + return -ENOMEM; + } + ret = xt_register_match(&connlimit_mt_reg); + if (ret != 0) { + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); + } + return ret; } static void __exit connlimit_mt_exit(void) { - xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); + xt_unregister_match(&connlimit_mt_reg); + kmem_cache_destroy(connlimit_conn_cachep); + kmem_cache_destroy(connlimit_rb_cachep); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 7278145e6a6..69f78e96fdb 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -17,8 +17,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/module.h> diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 61805d7b38a..188404b9b00 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -3,6 +3,7 @@ * information. (Superset of Rusty's minimalistic state match.) * * (C) 2001 Marc Boucher (marc@mbsi.ca). 
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * Development of this code was funded by Astaro AG, http://www.astaro.com/ @@ -107,6 +108,7 @@ struct xt_hashlimit_htable { /* seq_file stuff */ struct proc_dir_entry *pde; + const char *name; struct net *net; struct hlist_head hash[0]; /* hashtable itself */ @@ -141,11 +143,10 @@ dsthash_find(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) { struct dsthash_ent *ent; - struct hlist_node *pos; u_int32_t hash = hash_dst(ht, dst); if (!hlist_empty(&ht->hash[hash])) { - hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node) + hlist_for_each_entry_rcu(ent, &ht->hash[hash], node) if (dst_cmp(ent, dst)) { spin_lock(&ent->lock); return ent; @@ -157,11 +158,22 @@ dsthash_find(const struct xt_hashlimit_htable *ht, /* allocate dsthash_ent, initialize dst, put in htable and lock it */ static struct dsthash_ent * dsthash_alloc_init(struct xt_hashlimit_htable *ht, - const struct dsthash_dst *dst) + const struct dsthash_dst *dst, bool *race) { struct dsthash_ent *ent; spin_lock(&ht->lock); + + /* Two or more packets may race to create the same entry in the + * hashtable, double check if this packet lost the race. + */ + ent = dsthash_find(ht, dst); + if (ent != NULL) { + spin_unlock(&ht->lock); + *race = true; + return ent; + } + /* initialize hash with random val at the time we allocate * the first hashtable entry */ if (unlikely(!ht->rnd_initialized)) { @@ -171,8 +183,7 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht, if (ht->cfg.max && ht->count >= ht->cfg.max) { /* FIXME: do something. question is what..
*/ - if (net_ratelimit()) - pr_err("max count of %u reached\n", ht->cfg.max); + net_err_ratelimited("max count of %u reached\n", ht->cfg.max); ent = NULL; } else ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); @@ -244,6 +255,11 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; + hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + if (!hinfo->name) { + vfree(hinfo); + return -ENOMEM; + } spin_lock_init(&hinfo->lock); hinfo->pde = proc_create_data(minfo->name, 0, @@ -251,6 +267,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, &dl_file_ops, hinfo); if (hinfo->pde == NULL) { + kfree(hinfo->name); vfree(hinfo); return -ENOMEM; } @@ -287,8 +304,8 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); for (i = 0; i < ht->cfg.size; i++) { struct dsthash_ent *dh; - struct hlist_node *pos, *n; - hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + struct hlist_node *n; + hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { if ((*select)(ht, dh)) dsthash_free(ht, dh); } @@ -308,19 +325,26 @@ static void htable_gc(unsigned long htlong) add_timer(&ht->timer); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) +static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); struct proc_dir_entry *parent; - del_timer_sync(&hinfo->timer); - if (hinfo->family == NFPROTO_IPV4) parent = hashlimit_net->ipt_hashlimit; else parent = hashlimit_net->ip6t_hashlimit; - remove_proc_entry(hinfo->pde->name, parent); + + if (parent != NULL) + remove_proc_entry(hinfo->name, parent); +} + +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ + del_timer_sync(&hinfo->timer); + htable_remove_proc_entry(hinfo); htable_selective_cleanup(hinfo, select_all); + kfree(hinfo->name); vfree(hinfo); } @@ -330,10 +354,9 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; - struct hlist_node *pos; - hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) { - if (!strcmp(name, hinfo->pde->name) && + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { + if (!strcmp(name, hinfo->name) && hinfo->family == family) { hinfo->use++; return hinfo; @@ -388,9 +411,20 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) +/* in byte mode, the lowest possible rate is one packet/second. + * credit_cap is used as a counter that tells us how many times we can + * refill the "credits available" counter when it becomes empty. + */ +#define MAX_CPJ_BYTES (0xFFFFFFFF / HZ) +#define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES) + +static u32 xt_hashlimit_len_to_chunks(u32 len) +{ + return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1; +} + /* Precision saver. */ -static inline u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user) { /* If multiplying would overflow... 
*/ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -400,12 +434,53 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; } -static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +static u32 user2credits_byte(u32 user) { - dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; - if (dh->rateinfo.credit > dh->rateinfo.credit_cap) - dh->rateinfo.credit = dh->rateinfo.credit_cap; + u64 us = user; + us *= HZ * CREDITS_PER_JIFFY_BYTES; + return (u32) (us >> 32); +} + +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) +{ + unsigned long delta = now - dh->rateinfo.prev; + u32 cap; + + if (delta == 0) + return; + dh->rateinfo.prev = now; + + if (mode & XT_HASHLIMIT_BYTES) { + u32 tmp = dh->rateinfo.credit; + dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; + cap = CREDITS_PER_JIFFY_BYTES * HZ; + if (tmp >= dh->rateinfo.credit) {/* overflow */ + dh->rateinfo.credit = cap; + return; + } + } else { + dh->rateinfo.credit += delta * CREDITS_PER_JIFFY; + cap = dh->rateinfo.credit_cap; + } + if (dh->rateinfo.credit > cap) + dh->rateinfo.credit = cap; +} + +static void rateinfo_init(struct dsthash_ent *dh, + struct xt_hashlimit_htable *hinfo) +{ + dh->rateinfo.prev = jiffies; + if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { + dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; + dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg); + dh->rateinfo.credit_cap = hinfo->cfg.burst; + } else { + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + dh->rateinfo.credit_cap = dh->rateinfo.credit; + } } static inline __be32 maskl(__be32 a, unsigned int l) @@ -511,6 +586,21 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; } +static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) +{ + u64 tmp = xt_hashlimit_len_to_chunks(len); + tmp = tmp * dh->rateinfo.cost; + + if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ)) + tmp = CREDITS_PER_JIFFY_BYTES * HZ; + + if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) { + dh->rateinfo.credit_cap--; + dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; + } + return (u32) tmp; +} + static bool hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { @@ -519,6 +609,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; + bool race = false; + u32 cost; if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; @@ -526,27 +618,32 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) rcu_read_lock_bh(); dh = dsthash_find(hinfo, &dst); if (dh == NULL) { - dh = dsthash_alloc_init(hinfo, &dst); + dh = dsthash_alloc_init(hinfo, &dst, &race); if (dh == NULL) { rcu_read_unlock_bh(); goto hotdrop; + } else if (race) { + /* Already got an entry, update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_recalc(dh, now, hinfo->cfg.mode); + } else { + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_init(dh, hinfo); } - dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); - dh->rateinfo.prev = jiffies; - dh->rateinfo.credit = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.cost = user2credits(hinfo->cfg.avg); } else { /* update expiration timeout */ 
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now); + rateinfo_recalc(dh, now, hinfo->cfg.mode); } - if (dh->rateinfo.credit >= dh->rateinfo.cost) { + if (info->cfg.mode & XT_HASHLIMIT_BYTES) + cost = hashlimit_byte_cost(skb->len, dh); + else + cost = dh->rateinfo.cost; + + if (dh->rateinfo.credit >= cost) { /* below the limit */ - dh->rateinfo.credit -= dh->rateinfo.cost; + dh->rateinfo.credit -= cost; spin_unlock(&dh->lock); rcu_read_unlock_bh(); return !(info->cfg.mode & XT_HASHLIMIT_INVERT); @@ -568,14 +665,6 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par) struct xt_hashlimit_mtinfo1 *info = par->matchinfo; int ret; - /* Check for overflow. */ - if (info->cfg.burst == 0 || - user2credits(info->cfg.avg * info->cfg.burst) < - user2credits(info->cfg.avg)) { - pr_info("overflow, try lower: %u/%u\n", - info->cfg.avg, info->cfg.burst); - return -ERANGE; - } if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) return -EINVAL; if (info->name[sizeof(info->name)-1] != '\0') @@ -588,6 +677,26 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { + pr_info("Unknown mode mask %X, kernel too old?\n", + info->cfg.mode); + return -EINVAL; + } + + /* Check for overflow. */ + if (info->cfg.mode & XT_HASHLIMIT_BYTES) { + if (user2credits_byte(info->cfg.avg) == 0) { + pr_info("overflow, rate too high: %u\n", info->cfg.avg); + return -EINVAL; + } + } else if (info->cfg.burst == 0 || + user2credits(info->cfg.avg * info->cfg.burst) < + user2credits(info->cfg.avg)) { + pr_info("overflow, try lower: %u/%u\n", + info->cfg.avg, info->cfg.burst); + return -ERANGE; + } + mutex_lock(&hashlimit_mutex); info->hinfo = htable_find_get(net, info->name, par->family); if (info->hinfo == NULL) { @@ -680,10 +789,11 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { int res; + const struct xt_hashlimit_htable *ht = s->private; spin_lock(&ent->lock); /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies); + rateinfo_recalc(ent, jiffies, ht->cfg.mode); switch (family) { case NFPROTO_IPV4: @@ -721,10 +831,9 @@ static int dl_seq_show(struct seq_file *s, void *v) struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; - struct hlist_node *pos; if (!hlist_empty(&htable->hash[*bucket])) { - hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) + hlist_for_each_entry(ent, &htable->hash[*bucket], node) if (dl_seq_real_show(ent, htable->family, s)) return -1; } @@ -744,7 +853,7 @@ static int dl_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode)->data; + sf->private = PDE_DATA(inode); } return ret; } @@ -767,7 +876,7 @@ static int __net_init hashlimit_proc_net_init(struct net *net) #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); if (!hashlimit_net->ip6t_hashlimit) { - proc_net_remove(net, "ipt_hashlimit"); + remove_proc_entry("ipt_hashlimit", net->proc_net); return -ENOMEM; } #endif @@ -776,9 +885,23 @@ static int __net_init hashlimit_proc_net_init(struct net *net) static void __net_exit hashlimit_proc_net_exit(struct net *net) { - proc_net_remove(net, "ipt_hashlimit"); + struct xt_hashlimit_htable *hinfo; + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + /* hashlimit_net_exit() is called before 
hashlimit_mt_destroy(). + * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc + * entries are empty before trying to remove them. + */ + mutex_lock(&hashlimit_mutex); + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) + htable_remove_proc_entry(hinfo); + hashlimit_net->ipt_hashlimit = NULL; + hashlimit_net->ip6t_hashlimit = NULL; + mutex_unlock(&hashlimit_mutex); + + remove_proc_entry("ipt_hashlimit", net->proc_net); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - proc_net_remove(net, "ip6t_hashlimit"); + remove_proc_entry("ip6t_hashlimit", net->proc_net); #endif } @@ -792,9 +915,6 @@ static int __net_init hashlimit_net_init(struct net *net) static void __net_exit hashlimit_net_exit(struct net *net) { - struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); - - BUG_ON(!hlist_empty(&hashlimit_net->htables)); hashlimit_proc_net_exit(net); } diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c new file mode 100644 index 00000000000..89d53104c6b --- /dev/null +++ b/net/netfilter/xt_ipcomp.c @@ -0,0 +1,111 @@ +/* Kernel module to match IPComp parameters for IPv4 and IPv6 + * + * Copyright (C) 2013 WindRiver + * + * Author: + * Fan Du <fan.du@windriver.com> + * + * Based on: + * net/netfilter/xt_esp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter/xt_ipcomp.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fan Du <fan.du@windriver.com>"); +MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match"); + +/* Returns true if the spi is matched by the range, false otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); + return r; +} + +static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip_comp_hdr _comphdr; + const struct ip_comp_hdr *chdr; + const struct xt_ipcomp *compinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr); + if (chdr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop.
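+ * (Setting par->hotdrop asks the x_tables core to drop the packet + * outright, rather than treating this as an ordinary non-match.)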
+ */ + pr_debug("Dropping evil IPComp tinygram.\n"); + par->hotdrop = true; + return false; + } + + return spi_match(compinfo->spis[0], compinfo->spis[1], + ntohs(chdr->cpi), + !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); +} + +static int comp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_ipcomp *compinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { + pr_err("unknown flags %X\n", compinfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match comp_mt_reg[] __read_mostly = { + { + .name = "ipcomp", + .family = NFPROTO_IPV4, + .match = comp_mt, + .matchsize = sizeof(struct xt_ipcomp), + .proto = IPPROTO_COMP, + .checkentry = comp_mt_check, + .me = THIS_MODULE, + }, + { + .name = "ipcomp", + .family = NFPROTO_IPV6, + .match = comp_mt, + .matchsize = sizeof(struct xt_ipcomp), + .proto = IPPROTO_COMP, + .checkentry = comp_mt_check, + .me = THIS_MODULE, + }, +}; + +static int __init comp_mt_init(void) +{ + return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +static void __exit comp_mt_exit(void) +{ + xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); +} + +module_init(comp_mt_init); +module_exit(comp_mt_exit); diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index bb10b0717f1..8d47c3780fd 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -67,7 +67,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) goto out; } - ip_vs_fill_iphdr(family, skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(family, skb, &iph); if (data->bitmask & XT_IPVS_PROTO) if ((iph.protocol == data->l4proto) ^ @@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */); + cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */); if (unlikely(cp == NULL)) { match = false; goto out; diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c new file mode 100644 index 00000000000..8aee572771f --- /dev/null +++ b/net/netfilter/xt_l2tp.c @@ -0,0 +1,354 @@ +/* Kernel module to match L2TP header parameters. */ + +/* (C) 2013 James Chapman <jchapman@katalix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <linux/l2tp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_l2tp.h> + +/* L2TP header masks */ +#define L2TP_HDR_T_BIT 0x8000 +#define L2TP_HDR_L_BIT 0x4000 +#define L2TP_HDR_VER 0x000f + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); +MODULE_DESCRIPTION("Xtables: L2TP header match"); +MODULE_ALIAS("ipt_l2tp"); +MODULE_ALIAS("ip6t_l2tp"); + +/* The L2TP fields that can be matched */ +struct l2tp_data { + u32 tid; + u32 sid; + u8 type; + u8 version; +}; + +union l2tp_val { + __be16 val16[2]; + __be32 val32; +}; + +static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) +{ + if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) + return false; + + if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) + return false; + + /* Check tid only for L2TPv3 control or any L2TPv2 packets */ + if ((info->flags & XT_L2TP_TID) && + ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && + (info->tid != data->tid)) + return false; + + /* Check sid only for L2TP data packets */ + if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && + (info->sid != data->sid)) + return false; + + return true; +} + +/* Parse L2TP header fields when UDP encapsulation is used. Handles + * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a + * different format. See + * RFC2661, Section 3.1, L2TPv2 Header Format + * RFC3931, Section 3.2.1, L2TPv3 Control Message Header + * RFC3931, Section 3.2.2, L2TPv3 Data Message Header + * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP + */ +static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ + const struct xt_l2tp_info *info = par->matchinfo; + int uhlen = sizeof(struct udphdr); + int offs = thoff + uhlen; + union l2tp_val *lh; + union l2tp_val lhbuf; + u16 flags; + struct l2tp_data data = { 0, }; + + if (par->fragoff != 0) + return false; + + /* Extract L2TP header fields. The flags in the first 16 bits + * tell us where the other fields are. + */ + lh = skb_header_pointer(skb, offs, 2, &lhbuf); + if (lh == NULL) + return false; + + flags = ntohs(lh->val16[0]); + if (flags & L2TP_HDR_T_BIT) + data.type = XT_L2TP_TYPE_CONTROL; + else + data.type = XT_L2TP_TYPE_DATA; + data.version = (u8) flags & L2TP_HDR_VER; + + /* Now extract the L2TP tid/sid. These are in different places + * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we + * must also check to see if the length field is present, + * since this affects the offsets into the packet of the + * tid/sid fields. 
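+ * + * A quick recap of the layouts parsed below (per the RFC sections + * cited above; field widths in bytes): + * v2: flags/ver(2), [length(2) if the L bit is set], tid(2), sid(2) + * v3 control: flags/ver(2), length(2), control connection id(4) + * v3 data over UDP: flags/ver(2), reserved(2), session id(4) + * hence v3 reads its 32-bit id at offs + 4, while v2 reads tid/sid at + * offs + 2 after optionally skipping the length field.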
+ */ + if (data.version == 3) { + lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); + if (lh == NULL) + return false; + if (data.type == XT_L2TP_TYPE_CONTROL) + data.tid = ntohl(lh->val32); + else + data.sid = ntohl(lh->val32); + } else if (data.version == 2) { + if (flags & L2TP_HDR_L_BIT) + offs += 2; + lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); + if (lh == NULL) + return false; + data.tid = (u32) ntohs(lh->val16[0]); + data.sid = (u32) ntohs(lh->val16[1]); + } else + return false; + + return l2tp_match(info, &data); +} + +/* Parse L2TP header fields for IP encapsulation (no UDP header). + * L2TPv3 data packets have a different form with IP encap. See + * RFC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. + * RFC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. + */ +static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) +{ + const struct xt_l2tp_info *info = par->matchinfo; + union l2tp_val *lh; + union l2tp_val lhbuf; + struct l2tp_data data = { 0, }; + + /* For IP encap, the L2TP sid is the first 32-bits. */ + lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); + if (lh == NULL) + return false; + if (lh->val32 == 0) { + /* Must be a control packet. The L2TP tid is further + * into the packet. + */ + data.type = XT_L2TP_TYPE_CONTROL; + lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), + &lhbuf); + if (lh == NULL) + return false; + data.tid = ntohl(lh->val32); + } else { + data.sid = ntohl(lh->val32); + data.type = XT_L2TP_TYPE_DATA; + } + + data.version = 3; + + return l2tp_match(info, &data); +} + +static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct iphdr *iph = ip_hdr(skb); + u8 ipproto = iph->protocol; + + /* l2tp_mt_check4 already restricts the transport protocol */ + switch (ipproto) { + case IPPROTO_UDP: + return l2tp_udp_mt(skb, par, par->thoff); + case IPPROTO_L2TP: + return l2tp_ip_mt(skb, par, par->thoff); + } + + return false; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + unsigned int thoff = 0; + unsigned short fragoff = 0; + int ipproto; + + ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (fragoff != 0) + return false; + + /* l2tp_mt_check6 already restricts the transport protocol */ + switch (ipproto) { + case IPPROTO_UDP: + return l2tp_udp_mt(skb, par, thoff); + case IPPROTO_L2TP: + return l2tp_ip_mt(skb, par, thoff); + } + + return false; +} +#endif + +static int l2tp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + + /* Check for invalid flags */ + if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | + XT_L2TP_TYPE)) { + pr_info("unknown flags: %x\n", info->flags); + return -EINVAL; + } + + /* At least one of tid, sid or type=control must be specified */ + if ((!(info->flags & XT_L2TP_TID)) && + (!(info->flags & XT_L2TP_SID)) && + ((!(info->flags & XT_L2TP_TYPE)) || + (info->type != XT_L2TP_TYPE_CONTROL))) { + pr_info("invalid flags combination: %x\n", info->flags); + return -EINVAL; + } + + /* If version 2 is specified, check that incompatible params + * are not supplied + */ + if (info->flags & XT_L2TP_VERSION) { + if ((info->version < 2) || (info->version > 3)) { + pr_info("wrong L2TP version: %u\n", info->version); + return -EINVAL; + } + + if (info->version == 2) { + if ((info->flags & XT_L2TP_TID) && + (info->tid > 0xffff)) { + pr_info("v2 tid > 0xffff: %u\n", info->tid); + return -EINVAL; + } +
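/* v2 tunnel and session ids are 16-bit on the wire (RFC2661), + * hence these tid/sid range checks. */ +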
if ((info->flags & XT_L2TP_SID) && + (info->sid > 0xffff)) { + pr_info("v2 sid > 0xffff: %u\n", info->sid); + return -EINVAL; + } + } + } + + return 0; +} + +static int l2tp_mt_check4(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + const struct ipt_entry *e = par->entryinfo; + const struct ipt_ip *ip = &e->ip; + int ret; + + ret = l2tp_mt_check(par); + if (ret != 0) + return ret; + + if ((ip->proto != IPPROTO_UDP) && + (ip->proto != IPPROTO_L2TP)) { + pr_info("missing protocol rule (udp|l2tpip)\n"); + return -EINVAL; + } + + if ((ip->proto == IPPROTO_L2TP) && + (info->version == 2)) { + pr_info("v2 doesn't support IP mode\n"); + return -EINVAL; + } + + return 0; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int l2tp_mt_check6(const struct xt_mtchk_param *par) +{ + const struct xt_l2tp_info *info = par->matchinfo; + const struct ip6t_entry *e = par->entryinfo; + const struct ip6t_ip6 *ip = &e->ipv6; + int ret; + + ret = l2tp_mt_check(par); + if (ret != 0) + return ret; + + if ((ip->proto != IPPROTO_UDP) && + (ip->proto != IPPROTO_L2TP)) { + pr_info("missing protocol rule (udp|l2tpip)\n"); + return -EINVAL; + } + + if ((ip->proto == IPPROTO_L2TP) && + (info->version == 2)) { + pr_info("v2 doesn't support IP mode\n"); + return -EINVAL; + } + + return 0; +} +#endif + +static struct xt_match l2tp_mt_reg[] __read_mostly = { + { + .name = "l2tp", + .revision = 0, + .family = NFPROTO_IPV4, + .match = l2tp_mt4, + .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), + .checkentry = l2tp_mt_check4, + .hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD)), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "l2tp", + .revision = 0, + .family = NFPROTO_IPV6, + .match = l2tp_mt6, + .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), + .checkentry = l2tp_mt_check6, + .hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD)), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init l2tp_mt_init(void) +{ + return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +static void __exit l2tp_mt_exit(void) +{ + xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); +} + +module_init(l2tp_mt_init); +module_exit(l2tp_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 32b7a579a03..bef85059655 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -1,5 +1,6 @@ /* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -88,8 +89,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) } /* Precision saver. */ -static u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user) { /* If multiplying would overflow... */ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -118,12 +118,12 @@ static int limit_mt_check(const struct xt_mtchk_param *par) /* For SMP, we only want to use one set of state. */ r->master = priv; + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * + 128. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. 
*/ if (r->cost == 0) { - /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * - 128. */ - priv->prev = jiffies; - priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ - r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } return 0; diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 8160f6b1435..d5b4fd4f91e 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -36,7 +36,7 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (skb_mac_header(skb) + ETH_HLEN > skb->data) return false; - ret = compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr) == 0; + ret = ether_addr_equal(eth_hdr(skb)->h_source, info->srcaddr); ret ^= info->invert; return ret; } diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c new file mode 100644 index 00000000000..bea7464cc43 --- /dev/null +++ b/net/netfilter/xt_nat.c @@ -0,0 +1,170 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat_core.h> + +static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + + if (mr->rangesize != 1) { + pr_info("%s: multiple ranges no longer supported\n", + par->target->name); + return -EINVAL; + } + return 0; +} + +static void xt_nat_convert_range(struct nf_nat_range *dst, + const struct nf_nat_ipv4_range *src) +{ + memset(&dst->min_addr, 0, sizeof(dst->min_addr)); + memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + + dst->flags = src->flags; + dst->min_addr.ip = src->min_ip; + dst->max_addr.ip = src->max_ip; + dst->min_proto = src->min; + dst->max_proto = src->max; +} + +static unsigned int +xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + xt_nat_convert_range(&range, &mr->range[0]); + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + xt_nat_convert_range(&range, &mr->range[0]); + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + 
return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); +} + +static struct xt_target xt_nat_target_reg[] __read_mostly = { + { + .name = "SNAT", + .revision = 0, + .checkentry = xt_nat_checkentry_v0, + .target = xt_snat_target_v0, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .family = NFPROTO_IPV4, + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 0, + .checkentry = xt_nat_checkentry_v0, + .target = xt_dnat_target_v0, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .family = NFPROTO_IPV4, + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, + { + .name = "SNAT", + .revision = 1, + .target = xt_snat_target_v1, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 1, + .target = xt_dnat_target_v1, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, +}; + +static int __init xt_nat_init(void) +{ + return xt_register_targets(xt_nat_target_reg, + ARRAY_SIZE(xt_nat_target_reg)); +} + +static void __exit xt_nat_exit(void) +{ + xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); +} + +module_init(xt_nat_init); +module_exit(xt_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ipt_SNAT"); +MODULE_ALIAS("ipt_DNAT"); +MODULE_ALIAS("ip6t_SNAT"); +MODULE_ALIAS("ip6t_DNAT"); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index b3be0ef21f1..8c646ed9c92 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -21,11 +21,14 @@ MODULE_ALIAS("ip6t_nfacct"); static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) { + int overquota; const struct xt_nfacct_match_info *info = par->targinfo; nfnl_acct_update(skb, info->nfacct); - return true; + overquota = nfnl_acct_overquota(skb, info->nfacct); + + return overquota == NFACCT_UNDERQUOTA ? false : true; } static int diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 846f895cb65..c529161cdbf 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -13,8 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> @@ -201,6 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; + struct net *net = dev_net(p->in ? 
p->in : p->out); if (!info) return false; @@ -269,7 +269,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) mss <<= 8; mss |= optp[2]; - mss = ntohs(mss); + mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: loop_cont = 1; @@ -325,7 +325,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) fcount++; if (info->flags & XT_OSF_LOG) - nf_log_packet(p->family, p->hooknum, skb, + nf_log_packet(net, p->family, p->hooknum, skb, p->in, p->out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, @@ -341,7 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) rcu_read_unlock(); if (!fcount && (info->flags & XT_OSF_LOG)) - nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL, + nf_log_packet(net, p->family, p->hooknum, skb, p->in, + p->out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); @@ -421,4 +422,6 @@ module_exit(xt_osf_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 772d7389b33..ca2e577ed8a 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -17,6 +17,17 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_owner.h> +static int owner_check(const struct xt_mtchk_param *par) +{ + struct xt_owner_match_info *info = par->matchinfo; + + /* For now only allow adding matches from the initial user namespace */ + if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && + (current_user_ns() != &init_user_ns)) + return -EINVAL; + return 0; +} + static bool owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { @@ -37,17 +48,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; - if (info->match & XT_OWNER_UID) - if ((filp->f_cred->fsuid >= info->uid_min && - filp->f_cred->fsuid <= info->uid_max) ^ + if (info->match & XT_OWNER_UID) { + kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); + kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); + if ((uid_gte(filp->f_cred->fsuid, uid_min) && + uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_OWNER_UID)) return false; + } - if (info->match & XT_OWNER_GID) - if ((filp->f_cred->fsgid >= info->gid_min && - filp->f_cred->fsgid <= info->gid_max) ^ + if (info->match & XT_OWNER_GID) { + kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); + kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); + if ((gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_OWNER_GID)) return false; + } return true; } @@ -56,6 +73,7 @@ static struct xt_match owner_mt_reg __read_mostly = { .name = "owner", .revision = 1, .family = NFPROTO_UNSPEC, + .checkentry = owner_check, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ed0db15ab00..7720b036d76 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -18,7 +18,7 @@ static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; - struct 
gnet_stats_rate_est *r; + struct gnet_stats_rate_est64 *r; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index d2ff15a2412..a9faae89f95 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -29,6 +29,7 @@ #include <linux/skbuff.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -75,6 +76,7 @@ struct recent_entry { struct recent_table { struct list_head list; char name[XT_RECENT_NAME_LEN]; + union nf_inet_addr mask; unsigned int refcnt; unsigned int entries; struct list_head lru_list; @@ -228,10 +230,10 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = dev_net(par->in ? par->in : par->out); struct recent_net *recent_net = recent_pernet(net); - const struct xt_recent_mtinfo *info = par->matchinfo; + const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; struct recent_entry *e; - union nf_inet_addr addr = {}; + union nf_inet_addr addr = {}, addr_mask; u_int8_t ttl; bool ret = info->invert; @@ -261,12 +263,15 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) spin_lock_bh(&recent_lock); t = recent_table_lookup(recent_net, info->name); - e = recent_entry_lookup(t, &addr, par->family, + + nf_inet_addr_mask(&addr, &addr_mask, &t->mask); + + e = recent_entry_lookup(t, &addr_mask, par->family, (info->check_set & XT_RECENT_TTL) ? ttl : 0); if (e == NULL) { if (!(info->check_set & XT_RECENT_SET)) goto out; - e = recent_entry_init(t, &addr, par->family, ttl); + e = recent_entry_init(t, &addr_mask, par->family, ttl); if (e == NULL) par->hotdrop = true; ret = !ret; @@ -306,16 +311,24 @@ out: return ret; } -static int recent_mt_check(const struct xt_mtchk_param *par) +static void recent_table_free(void *addr) +{ + kvfree(addr); +} + +static int recent_mt_check(const struct xt_mtchk_param *par, + const struct xt_recent_mtinfo_v1 *info) { struct recent_net *recent_net = recent_pernet(par->net); - const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; #ifdef CONFIG_PROC_FS struct proc_dir_entry *pde; + kuid_t uid; + kgid_t gid; #endif - unsigned i; + unsigned int i; int ret = -EINVAL; + size_t sz; if (unlikely(!hash_rnd_inited)) { get_random_bytes(&hash_rnd, sizeof(hash_rnd)); @@ -354,27 +367,38 @@ static int recent_mt_check(const struct xt_mtchk_param *par) goto out; } - t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, - GFP_KERNEL); + sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; + if (sz <= PAGE_SIZE) + t = kzalloc(sz, GFP_KERNEL); + else + t = vzalloc(sz); if (t == NULL) { ret = -ENOMEM; goto out; } t->refcnt = 1; + + memcpy(&t->mask, &info->mask, sizeof(t->mask)); strcpy(t->name, info->name); INIT_LIST_HEAD(&t->lru_list); for (i = 0; i < ip_list_hash_size; i++) INIT_LIST_HEAD(&t->iphash[i]); #ifdef CONFIG_PROC_FS + uid = make_kuid(&init_user_ns, ip_list_uid); + gid = make_kgid(&init_user_ns, ip_list_gid); + if (!uid_valid(uid) || !gid_valid(gid)) { + recent_table_free(t); + ret = -EINVAL; + goto out; + } pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, &recent_mt_fops, t); if (pde == NULL) { - kfree(t); + recent_table_free(t); ret = -ENOMEM; goto out; } - pde->uid = ip_list_uid; - pde->gid = ip_list_gid; + proc_set_user(pde, uid, gid); #endif spin_lock_bh(&recent_lock); list_add_tail(&t->list, &recent_net->tables); @@ -385,10 +409,28 @@ out: return ret; } +static 
int recent_mt_check_v0(const struct xt_mtchk_param *par) +{ + const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo; + struct xt_recent_mtinfo_v1 info_v1; + + /* Copy revision 0 structure to revision 1 */ + memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo)); + /* Set default mask to ensure backward compatible behaviour */ + memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all)); + + return recent_mt_check(par, &info_v1); +} + +static int recent_mt_check_v1(const struct xt_mtchk_param *par) +{ + return recent_mt_check(par, par->matchinfo); +} + static void recent_mt_destroy(const struct xt_mtdtor_param *par) { struct recent_net *recent_net = recent_pernet(par->net); - const struct xt_recent_mtinfo *info = par->matchinfo; + const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; mutex_lock(&recent_mutex); @@ -398,10 +440,11 @@ static void recent_mt_destroy(const struct xt_mtdtor_param *par) list_del(&t->list); spin_unlock_bh(&recent_lock); #ifdef CONFIG_PROC_FS - remove_proc_entry(t->name, recent_net->xt_recent); + if (recent_net->xt_recent != NULL) + remove_proc_entry(t->name, recent_net->xt_recent); #endif recent_table_flush(t); - kfree(t); + recent_table_free(t); } mutex_unlock(&recent_mutex); } @@ -478,14 +521,13 @@ static const struct seq_operations recent_seq_ops = { static int recent_seq_open(struct inode *inode, struct file *file) { - struct proc_dir_entry *pde = PDE(inode); struct recent_iter_state *st; st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); if (st == NULL) return -ENOMEM; - st->table = pde->data; + st->table = PDE_DATA(inode); return 0; } @@ -493,8 +535,7 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - struct recent_table *t = pde->data; + struct recent_table *t = PDE_DATA(file_inode(file)); struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; @@ -582,7 +623,21 @@ static int __net_init recent_proc_net_init(struct net *net) static void __net_exit recent_proc_net_exit(struct net *net) { - proc_net_remove(net, "xt_recent"); + struct recent_net *recent_net = recent_pernet(net); + struct recent_table *t; + + /* recent_net_exit() is called before recent_mt_destroy(). Make sure + * that the parent xt_recent proc entry is empty before trying to + * remove it. 
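The recent_mt_check_v0() shim at the start of this hunk is the standard xtables revision-compatibility pattern: copy the old matchinfo into the new layout, then give the field the old revision lacked a backward-compatible default, here an all-ones mask so existing rules keep matching on the full address. A compilable sketch of the pattern, with reduced stand-in structs rather than the real xt_recent_mtinfo layouts:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct mtinfo_v0 {		/* stand-in for xt_recent_mtinfo */
	uint32_t seconds;
	uint32_t hit_count;
	uint8_t  check_set;
};

struct mtinfo_v1 {		/* stand-in for xt_recent_mtinfo_v1 */
	uint32_t seconds;
	uint32_t hit_count;
	uint8_t  check_set;
	uint8_t  mask[16];	/* new in revision 1 */
};

static void upgrade_v0(struct mtinfo_v1 *v1, const struct mtinfo_v0 *v0)
{
	memset(v1, 0, sizeof(*v1));
	memcpy(v1, v0, sizeof(*v0));		  /* layouts share a common prefix */
	memset(v1->mask, 0xFF, sizeof(v1->mask)); /* default: match whole address */
}

int main(void)
{
	struct mtinfo_v0 old = { .seconds = 60, .hit_count = 4 };
	struct mtinfo_v1 cur;

	upgrade_v0(&cur, &old);
	printf("seconds=%u first mask byte=0x%02x\n", cur.seconds, cur.mask[0]);
	return 0;
}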
+ */ + spin_lock_bh(&recent_lock); + list_for_each_entry(t, &recent_net->tables, list) + remove_proc_entry(t->name, recent_net->xt_recent); + + recent_net->xt_recent = NULL; + spin_unlock_bh(&recent_lock); + + remove_proc_entry("xt_recent", net->proc_net); } #else static inline int recent_proc_net_init(struct net *net) @@ -605,9 +660,6 @@ static int __net_init recent_net_init(struct net *net) static void __net_exit recent_net_exit(struct net *net) { - struct recent_net *recent_net = recent_pernet(net); - - BUG_ON(!list_empty(&recent_net->tables)); recent_proc_net_exit(net); } @@ -625,7 +677,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), - .checkentry = recent_mt_check, + .checkentry = recent_mt_check_v0, .destroy = recent_mt_destroy, .me = THIS_MODULE, }, @@ -635,10 +687,30 @@ static struct xt_match recent_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), - .checkentry = recent_mt_check, + .checkentry = recent_mt_check_v0, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 1, + .family = NFPROTO_IPV4, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo_v1), + .checkentry = recent_mt_check_v1, .destroy = recent_mt_destroy, .me = THIS_MODULE, }, + { + .name = "recent", + .revision = 1, + .family = NFPROTO_IPV6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo_v1), + .checkentry = recent_mt_check_v1, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + } }; static int __init recent_mt_init(void) diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 6efe4e5a81c..8fd324116e6 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -5,23 +5,35 @@ * they serve as the hanging-off data accessed through repl.data[]. 
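This xt_repldata.h change removes a variable-length array inside a struct (a GNU extension rejected by C99) in favour of a flexible array member plus a hand-computed, alignment-rounded offset for the trailing error terminator. A userspace sketch of that layout computation, using GNU C typeof/__alignof__ as the macro itself does; the three struct types are illustrative stand-ins for the xtables replace/standard/error structures:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct repl  { char name[16]; unsigned int num_entries; };
struct entry { unsigned int verdict; };
struct term  { unsigned long magic; };

int main(void)
{
	unsigned int nhooks = 3;
	struct {
		struct repl repl;
		struct entry entries[];	/* flexible array member */
	} *tbl;
	struct term *term;

	/* Round the end of entries[] up to the terminator's alignment. */
	size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) +
			      __alignof__(*term) - 1) & ~(__alignof__(*term) - 1);

	tbl = calloc(1, term_offset + sizeof(*term));
	if (tbl == NULL)
		return 1;
	term = (struct term *)((char *)tbl + term_offset);
	term->magic = 0xdeadbeef;	/* place the terminator by hand */
	tbl->repl.num_entries = nhooks + 1;

	printf("terminator lives at offset %zu\n", term_offset);
	free(tbl);
	return 0;
}

Rounding entries[nhooks] up to the terminator's alignment is what lets the two differently typed objects share a single allocation without undefined behaviour.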
*/ +/* tbl has the following structure equivalent, but is C99 compliant: + * struct { + * struct type##_replace repl; + * struct type##_standard entries[nhooks]; + * struct type##_error term; + * } *tbl; + */ + #define xt_alloc_initial_table(type, typ2) ({ \ unsigned int hook_mask = info->valid_hooks; \ unsigned int nhooks = hweight32(hook_mask); \ unsigned int bytes = 0, hooknum = 0, i = 0; \ struct { \ struct type##_replace repl; \ - struct type##_standard entries[nhooks]; \ - struct type##_error term; \ - } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ + struct type##_standard entries[]; \ + } *tbl; \ + struct type##_error *term; \ + size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ + __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ + tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \ if (tbl == NULL) \ return NULL; \ + term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ - tbl->term = (struct type##_error)typ2##_ERROR_INIT; \ + *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ - sizeof(struct type##_error); \ + sizeof(struct type##_error); \ for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ if (!(hook_mask & 1)) \ continue; \ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 0ec8138aa47..80c2e2d603e 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,6 +16,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -29,7 +30,7 @@ MODULE_ALIAS("ip6t_SET"); static inline int match_set(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, - const struct ip_set_adt_opt *opt, int inv) + struct ip_set_adt_opt *opt, int inv) { if (ip_set_test(index, skb, par, opt)) inv = !inv; @@ -37,12 +38,12 @@ match_set(ip_set_id_t index, const struct sk_buff *skb, } #define ADT_OPT(n, f, d, fs, cfs, t) \ -const struct ip_set_adt_opt n = { \ +struct ip_set_adt_opt n = { \ .family = f, \ .dim = d, \ .flags = fs, \ .cmdflags = cfs, \ - .timeout = t, \ + .ext.timeout = t, \ } /* Revision 0 interface: backward compatible with netfilter/iptables */ @@ -80,17 +81,17 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) struct xt_set_info_match_v0 *info = par->matchinfo; ip_set_id_t index; - index = ip_set_nfnl_get_byindex(info->match_set.index); + index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find set indentified by id %u to match\n", + pr_warning("Cannot find set identified by id %u to match\n", info->match_set.index); return -ENOENT; } if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { pr_warning("Protocol error: set match dimension " "is over the limit!\n"); - ip_set_nfnl_put(info->match_set.index); + ip_set_nfnl_put(par->net, 
info->match_set.index); return -ERANGE; } @@ -105,9 +106,104 @@ set_match_v0_destroy(const struct xt_mtdtor_param *par) { struct xt_set_info_match_v0 *info = par->matchinfo; - ip_set_nfnl_put(info->match_set.index); + ip_set_nfnl_put(par->net, info->match_set.index); } +/* Revision 1 match */ + +static bool +set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, 0, UINT_MAX); + + if (opt.flags & IPSET_RETURN_NOMATCH) + opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; + + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); +} + +static int +set_match_v1_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match_v1 *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set identified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(par->net, info->match_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_match_v1_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match_v1 *info = par->matchinfo; + + ip_set_nfnl_put(par->net, info->match_set.index); +} + +/* Revision 3 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ + switch (info->op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == info->value; + case IPSET_COUNTER_NE: + return counter != info->value; + case IPSET_COUNTER_LT: + return counter < info->value; + case IPSET_COUNTER_GT: + return counter > info->value; + } + return false; +} + +static bool +set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v3 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, info->flags, UINT_MAX); + int ret; + + if (info->packets.op != IPSET_COUNTER_NONE || + info->bytes.op != IPSET_COUNTER_NONE) + opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + + ret = match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); + + if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) + return ret; + + if (!match_counter(opt.ext.packets, &info->packets)) + return 0; + return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v3_checkentry set_match_v1_checkentry +#define set_match_v3_destroy set_match_v1_destroy + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + static unsigned int set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { @@ -132,7 +228,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) ip_set_id_t index; if (info->add_set.index != IPSET_INVALID_ID) { - index = ip_set_nfnl_get_byindex(info->add_set.index); + index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { pr_warning("Cannot find add_set index %u as target\n", info->add_set.index); @@ -141,12 +237,12 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) } if (info->del_set.index != IPSET_INVALID_ID) { - index = ip_set_nfnl_get_byindex(info->del_set.index); + index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == 
IPSET_INVALID_ID) { pr_warning("Cannot find del_set index %u as target\n", info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->add_set.index); + ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; } } @@ -155,9 +251,9 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) pr_warning("Protocol error: SET target dimension " "is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->add_set.index); + ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->del_set.index); + ip_set_nfnl_put(par->net, info->del_set.index); return -ERANGE; } @@ -174,54 +270,12 @@ set_target_v0_destroy(const struct xt_tgdtor_param *par) const struct xt_set_info_target_v0 *info = par->targinfo; if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->add_set.index); + ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->del_set.index); -} - -/* Revision 1 match and target */ - -static bool -set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) -{ - const struct xt_set_info_match_v1 *info = par->matchinfo; - ADT_OPT(opt, par->family, info->match_set.dim, - info->match_set.flags, 0, UINT_MAX); - - return match_set(info->match_set.index, skb, par, &opt, - info->match_set.flags & IPSET_INV_MATCH); -} - -static int -set_match_v1_checkentry(const struct xt_mtchk_param *par) -{ - struct xt_set_info_match_v1 *info = par->matchinfo; - ip_set_id_t index; - - index = ip_set_nfnl_get_byindex(info->match_set.index); - - if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find set indentified by id %u to match\n", - info->match_set.index); - return -ENOENT; - } - if (info->match_set.dim > IPSET_DIM_MAX) { - pr_warning("Protocol error: set match dimension " - "is over the limit!\n"); - ip_set_nfnl_put(info->match_set.index); - return -ERANGE; - } - - return 0; + ip_set_nfnl_put(par->net, info->del_set.index); } -static void -set_match_v1_destroy(const struct xt_mtdtor_param *par) -{ - struct xt_set_info_match_v1 *info = par->matchinfo; - - ip_set_nfnl_put(info->match_set.index); -} +/* Revision 1 target */ static unsigned int set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) @@ -247,7 +301,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) ip_set_id_t index; if (info->add_set.index != IPSET_INVALID_ID) { - index = ip_set_nfnl_get_byindex(info->add_set.index); + index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { pr_warning("Cannot find add_set index %u as target\n", info->add_set.index); @@ -256,12 +310,12 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) } if (info->del_set.index != IPSET_INVALID_ID) { - index = ip_set_nfnl_get_byindex(info->del_set.index); + index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { pr_warning("Cannot find del_set index %u as target\n", info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->add_set.index); + ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; } } @@ -270,9 +324,9 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) pr_warning("Protocol error: SET target dimension " "is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->add_set.index); + ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != 
IPSET_INVALID_ID) - ip_set_nfnl_put(info->del_set.index); + ip_set_nfnl_put(par->net, info->del_set.index); return -ERANGE; } @@ -285,9 +339,9 @@ set_target_v1_destroy(const struct xt_tgdtor_param *par) const struct xt_set_info_target_v1 *info = par->targinfo; if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->add_set.index); + ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(info->del_set.index); + ip_set_nfnl_put(par->net, info->del_set.index); } /* Revision 2 target */ @@ -301,6 +355,10 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) ADT_OPT(del_opt, par->family, info->del_set.dim, info->del_set.flags, 0, UINT_MAX); + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -343,6 +401,48 @@ static struct xt_match set_matches[] __read_mostly = { .destroy = set_match_v1_destroy, .me = THIS_MODULE }, + /* --return-nomatch flag support */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 2, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 2, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + /* counters support: update, match */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, }; static struct xt_target set_targets[] __read_mostly = { @@ -376,6 +476,7 @@ static struct xt_target set_targets[] __read_mostly = { .destroy = set_target_v1_destroy, .me = THIS_MODULE }, + /* --timeout and --exist flags support */ { .name = "SET", .revision = 2, diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 72bb07f57f9..1ba67931eb1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -19,12 +19,12 @@ #include <net/icmp.h> #include <net/sock.h> #include <net/inet_sock.h> -#include <net/netfilter/nf_tproxy_core.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #define XT_SOCKET_HAVE_IPV6 1 #include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/inet6_hashtables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif @@ -35,15 +35,6 @@ #include <net/netfilter/nf_conntrack.h> #endif -static void -xt_socket_put_sk(struct sock *sk) -{ - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put(inet_twsk(sk)); - else - sock_put(sk); -} - static int extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, @@ -101,16 +92,53 @@ extract_icmp4_fields(const struct sk_buff *skb, return 0; } +/* "socket" match based redirection (no specific rule) + * =================================================== + * + * There are 
connections with dynamic endpoints (e.g. FTP data + * connection) that the user is unable to add explicit rules + * for. These are taken care of by a generic "socket" rule. It is + * assumed that the proxy application is trusted to open such + * connections without explicit iptables rule (except of course the + * generic 'socket' rule). In this case the following sockets are + * matched in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple + * + * - match: if there's a non-zero bound listener (possibly with a + * non-local address) We don't accept zero-bound listeners, since + * then local services could intercept traffic going through the + * box. + */ +static struct sock * +xt_socket_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return __inet_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + return NULL; +} + static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) { const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp = NULL; - struct sock *sk; - __be32 daddr, saddr; - __be16 dport, sport; - u8 protocol; + struct sock *sk = skb->sk; + __be32 uninitialized_var(daddr), uninitialized_var(saddr); + __be16 uninitialized_var(dport), uninitialized_var(sport); + u8 uninitialized_var(protocol); #ifdef XT_SOCKET_HAVE_CONNTRACK struct nf_conn const *ct; enum ip_conntrack_info ctinfo; @@ -155,25 +183,31 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, } #endif - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); - if (sk != NULL) { + if (!sk) + sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, + saddr, daddr, sport, dport, + par->in); + if (sk) { bool wildcard; bool transparent = true; - /* Ignore sockets listening on INADDR_ANY */ - wildcard = (sk->sk_state != TCP_TIME_WAIT && + /* Ignore sockets listening on INADDR_ANY, + * unless XT_SOCKET_NOWILDCARD is set + */ + wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && + sk->sk_state != TCP_TIME_WAIT && inet_sk(sk)->inet_rcv_saddr == 0); /* Ignore non-transparent sockets, if XT_SOCKET_TRANSPARENT is used */ - if (info && info->flags & XT_SOCKET_TRANSPARENT) + if (info->flags & XT_SOCKET_TRANSPARENT) transparent = ((sk->sk_state != TCP_TIME_WAIT && inet_sk(sk)->transparent) || (sk->sk_state == TCP_TIME_WAIT && inet_twsk(sk)->tw_transparent)); - xt_socket_put_sk(sk); + if (sk != skb->sk) + sock_gen_put(sk); if (wildcard || !transparent) sk = NULL; @@ -190,11 +224,15 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, static bool socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) { - return socket_match(skb, par, NULL); + static struct xt_socket_mtinfo1 xt_info_v0 = { + .flags = 0, + }; + + return socket_match(skb, par, &xt_info_v0); } static bool -socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } @@ -255,18 +293,37 @@ extract_icmp6_fields(const struct sk_buff *skb, return 0; } +static struct sock * +xt_socket_get_sock_v6(struct net *net, const u8 protocol, + const struct 
in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return inet6_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + + return NULL; +} + static bool -socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6hdr *iph = ipv6_hdr(skb); struct udphdr _hdr, *hp = NULL; - struct sock *sk; - struct in6_addr *daddr, *saddr; - __be16 dport, sport; - int thoff, tproto; + struct sock *sk = skb->sk; + struct in6_addr *daddr = NULL, *saddr = NULL; + __be16 uninitialized_var(dport), uninitialized_var(sport); + int thoff = 0, uninitialized_var(tproto); const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { pr_debug("unable to find transport header in IPv6 packet, dropping\n"); return NF_DROP; @@ -291,25 +348,31 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) return false; } - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, - saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); - if (sk != NULL) { + if (!sk) + sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, + saddr, daddr, sport, dport, + par->in); + if (sk) { bool wildcard; bool transparent = true; - /* Ignore sockets listening on INADDR_ANY */ - wildcard = (sk->sk_state != TCP_TIME_WAIT && - ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); + /* Ignore sockets listening on INADDR_ANY + * unless XT_SOCKET_NOWILDCARD is set + */ + wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && + sk->sk_state != TCP_TIME_WAIT && + ipv6_addr_any(&sk->sk_v6_rcv_saddr)); /* Ignore non-transparent sockets, if XT_SOCKET_TRANSPARENT is used */ - if (info && info->flags & XT_SOCKET_TRANSPARENT) + if (info->flags & XT_SOCKET_TRANSPARENT) transparent = ((sk->sk_state != TCP_TIME_WAIT && inet_sk(sk)->transparent) || (sk->sk_state == TCP_TIME_WAIT && inet_twsk(sk)->tw_transparent)); - xt_socket_put_sk(sk); + if (sk != skb->sk) + sock_gen_put(sk); if (wildcard || !transparent) sk = NULL; @@ -325,6 +388,28 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) } #endif +static int socket_mt_v1_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V1) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); + return -EINVAL; + } + return 0; +} + +static int socket_mt_v2_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V2) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); + return -EINVAL; + } + return 0; +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -339,7 +424,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt4_v1, + .match = socket_mt4_v1_v2, + .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -350,7 +436,32 @@ static struct xt_match socket_mt_reg[] __read_mostly = { 
.name = "socket", .revision = 1, .family = NFPROTO_IPV6, - .match = socket_mt6_v1, + .match = socket_mt6_v1_v2, + .checkentry = socket_mt_v1_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif + { + .name = "socket", + .revision = 2, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2, + .checkentry = socket_mt_v2_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 2, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2, + .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 4fe4fb4276d..11de55e7a86 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -37,7 +37,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: - if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) + if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index c48975ff8ea..0ae55a36f49 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -42,6 +42,7 @@ static const u_int16_t days_since_leapyear[] = { */ enum { DSE_FIRST = 2039, + SECONDS_PER_DAY = 86400, }; static const u_int16_t days_since_epoch[] = { /* 2039 - 2030 */ @@ -78,7 +79,7 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time) unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ - v = time % 86400; + v = time % SECONDS_PER_DAY; r->second = v % 60; w = v / 60; r->minute = w % 60; @@ -199,6 +200,18 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) if (packet_time < info->daytime_start && packet_time > info->daytime_stop) return false; + + /** if user asked to ignore 'next day', then e.g. + * '1 PM Wed, August 1st' should be treated + * like 'Tue 1 PM July 31st'. + * + * This also causes + * 'Monday, "23:00 to 01:00", to match for 2 hours, starting + * Monday 23:00 to Tuesday 01:00. + */ + if ((info->flags & XT_TIME_CONTIGUOUS) && + packet_time <= info->daytime_stop) + stamp -= SECONDS_PER_DAY; } localtime_2(&current_time, stamp); @@ -227,6 +240,15 @@ static int time_mt_check(const struct xt_mtchk_param *par) return -EDOM; } + if (info->flags & ~XT_TIME_ALL_FLAGS) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS); + return -EINVAL; + } + + if ((info->flags & XT_TIME_CONTIGUOUS) && + info->daytime_start < info->daytime_stop) + return -EINVAL; + return 0; }
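The XT_TIME_CONTIGUOUS logic added above lets a start > stop daytime range such as 23:00-01:00 act as one interval crossing midnight: packets in the early-morning part are shifted back a day before the weekday and monthday checks run. The sketch below shows only the wrapped-range membership test that underlies this, on plain seconds-of-day values rather than kernel time stamps:

#include <stdbool.h>
#include <stdio.h>

#define SECONDS_PER_DAY 86400

/* True when 'now' (seconds since midnight) lies in [start, stop], where the
 * range may wrap past midnight (start > stop), as in "23:00 to 01:00". */
static bool in_daytime_range(unsigned int now, unsigned int start,
			     unsigned int stop)
{
	if (start <= stop)			/* ordinary same-day range */
		return now >= start && now <= stop;
	return now >= start || now <= stop;	/* wrapped range */
}

int main(void)
{
	unsigned int start = 23 * 3600, stop = 1 * 3600;

	printf("23:30 matches: %d\n", in_daytime_range(23 * 3600 + 1800, start, stop));
	printf("00:30 matches: %d\n", in_daytime_range(1800, start, stop));
	printf("12:00 matches: %d\n", in_daytime_range(12 * 3600, start, stop));
	return 0;
}

This is also why time_mt_check() now rejects XT_TIME_CONTIGUOUS when daytime_start < daytime_stop: the flag only makes sense for a range that actually wraps.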

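Finally, the xt_socket revisions above hinge on two policies once a socket is found: ignore listeners bound to INADDR_ANY unless the rule sets XT_SOCKET_NOWILDCARD, and require a transparent socket when it sets XT_SOCKET_TRANSPARENT. A userspace sketch of that decision logic; the socket struct is a stand-in and the TCP_TIME_WAIT special cases are omitted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define XT_SOCKET_TRANSPARENT 0x1	/* assumed to mirror the uapi flag bits */
#define XT_SOCKET_NOWILDCARD  0x2

struct fake_sock {
	uint32_t rcv_saddr;	/* 0 means bound to INADDR_ANY */
	bool transparent;
};

/* True when a found socket should count as a match for the given flags. */
static bool accept_sock(const struct fake_sock *sk, uint8_t flags)
{
	/* Ignore INADDR_ANY listeners unless the rule opted out of that. */
	bool wildcard = !(flags & XT_SOCKET_NOWILDCARD) && sk->rcv_saddr == 0;
	/* Demand a transparent socket only when the rule asks for one. */
	bool transparent = !(flags & XT_SOCKET_TRANSPARENT) || sk->transparent;

	return !wildcard && transparent;
}

int main(void)
{
	struct fake_sock any   = { .rcv_saddr = 0, .transparent = false };
	struct fake_sock bound = { .rcv_saddr = 0x7f000001, .transparent = false };

	printf("ANY listener, default flags: %d\n", accept_sock(&any, 0));
	printf("ANY listener, NOWILDCARD:    %d\n",
	       accept_sock(&any, XT_SOCKET_NOWILDCARD));
	printf("bound listener, default:     %d\n", accept_sock(&bound, 0));
	return 0;
}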