aboutsummaryrefslogtreecommitdiff
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/Kconfig485
-rw-r--r--net/netfilter/Makefile74
-rw-r--r--net/netfilter/core.c136
-rw-r--r--net/netfilter/ipset/Kconfig159
-rw-r--r--net/netfilter/ipset/Makefile28
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h289
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c377
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c414
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c311
-rw-r--r--net/netfilter/ipset/ip_set_core.c2011
-rw-r--r--net/netfilter/ipset/ip_set_getport.c174
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h1160
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c315
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c321
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c390
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c402
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c561
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c397
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c610
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c481
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c509
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c587
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c685
-rw-r--r--net/netfilter/ipset/pfxlen.c313
-rw-r--r--net/netfilter/ipvs/Kconfig26
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c148
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c636
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c1173
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c2071
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c158
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c139
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c291
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c357
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c26
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c19
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c16
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c72
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c36
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c152
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c54
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c1162
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c203
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c152
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c69
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c76
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c18
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c236
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c1647
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c36
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c184
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1228
-rw-r--r--net/netfilter/nf_conntrack_acct.c59
-rw-r--r--net/netfilter/nf_conntrack_amanda.c14
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c82
-rw-r--r--net/netfilter/nf_conntrack_core.c985
-rw-r--r--net/netfilter/nf_conntrack_ecache.c152
-rw-r--r--net/netfilter/nf_conntrack_expect.c245
-rw-r--r--net/netfilter/nf_conntrack_extend.c37
-rw-r--r--net/netfilter/nf_conntrack_ftp.c123
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c2
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c327
-rw-r--r--net/netfilter/nf_conntrack_helper.c298
-rw-r--r--net/netfilter/nf_conntrack_irc.c30
-rw-r--r--net/netfilter/nf_conntrack_labels.c108
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c74
-rw-r--r--net/netfilter/nf_conntrack_netlink.c1778
-rw-r--r--net/netfilter/nf_conntrack_pptp.c63
-rw-r--r--net/netfilter/nf_conntrack_proto.c335
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c274
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c153
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c179
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c299
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c485
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c201
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c247
-rw-r--r--net/netfilter/nf_conntrack_sane.c19
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c243
-rw-r--r--net/netfilter/nf_conntrack_sip.c460
-rw-r--r--net/netfilter/nf_conntrack_snmp.c78
-rw-r--r--net/netfilter/nf_conntrack_standalone.c149
-rw-r--r--net/netfilter/nf_conntrack_tftp.c18
-rw-r--r--net/netfilter/nf_conntrack_timeout.c51
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c114
-rw-r--r--net/netfilter/nf_internals.h28
-rw-r--r--net/netfilter/nf_log.c222
-rw-r--r--net/netfilter/nf_nat_amanda.c90
-rw-r--r--net/netfilter/nf_nat_core.c898
-rw-r--r--net/netfilter/nf_nat_ftp.c146
-rw-r--r--net/netfilter/nf_nat_helper.c212
-rw-r--r--net/netfilter/nf_nat_irc.c119
-rw-r--r--net/netfilter/nf_nat_proto_common.c114
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c116
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c96
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c85
-rw-r--r--net/netfilter/nf_nat_proto_udp.c76
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c106
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c54
-rw-r--r--net/netfilter/nf_nat_sip.c653
-rw-r--r--net/netfilter/nf_nat_tftp.c52
-rw-r--r--net/netfilter/nf_queue.c289
-rw-r--r--net/netfilter/nf_synproxy_core.c434
-rw-r--r--net/netfilter/nf_tables_api.c4041
-rw-r--r--net/netfilter/nf_tables_core.c271
-rw-r--r--net/netfilter/nf_tables_inet.c104
-rw-r--r--net/netfilter/nf_tproxy_core.c65
-rw-r--r--net/netfilter/nfnetlink.c329
-rw-r--r--net/netfilter/nfnetlink_acct.c454
-rw-r--r--net/netfilter/nfnetlink_cthelper.c680
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c585
-rw-r--r--net/netfilter/nfnetlink_log.c413
-rw-r--r--net/netfilter/nfnetlink_queue.c943
-rw-r--r--net/netfilter/nfnetlink_queue_core.c1352
-rw-r--r--net/netfilter/nfnetlink_queue_ct.c113
-rw-r--r--net/netfilter/nft_bitwise.c146
-rw-r--r--net/netfilter/nft_byteorder.c173
-rw-r--r--net/netfilter/nft_cmp.c223
-rw-r--r--net/netfilter/nft_compat.c793
-rw-r--r--net/netfilter/nft_counter.c113
-rw-r--r--net/netfilter/nft_ct.c421
-rw-r--r--net/netfilter/nft_expr_template.c94
-rw-r--r--net/netfilter/nft_exthdr.c133
-rw-r--r--net/netfilter/nft_hash.c433
-rw-r--r--net/netfilter/nft_immediate.c133
-rw-r--r--net/netfilter/nft_limit.c119
-rw-r--r--net/netfilter/nft_log.c144
-rw-r--r--net/netfilter/nft_lookup.c149
-rw-r--r--net/netfilter/nft_meta.c334
-rw-r--r--net/netfilter/nft_nat.c224
-rw-r--r--net/netfilter/nft_payload.c161
-rw-r--r--net/netfilter/nft_queue.c132
-rw-r--r--net/netfilter/nft_rbtree.c294
-rw-r--r--net/netfilter/nft_reject.c74
-rw-r--r--net/netfilter/nft_reject_inet.c63
-rw-r--r--net/netfilter/x_tables.c190
-rw-r--r--net/netfilter/xt_AUDIT.c231
-rw-r--r--net/netfilter/xt_CLASSIFY.c36
-rw-r--r--net/netfilter/xt_CT.c355
-rw-r--r--net/netfilter/xt_DSCP.c2
-rw-r--r--net/netfilter/xt_HL.c64
-rw-r--r--net/netfilter/xt_HMARK.c372
-rw-r--r--net/netfilter/xt_IDLETIMER.c4
-rw-r--r--net/netfilter/xt_LED.c2
-rw-r--r--net/netfilter/xt_LOG.c975
-rw-r--r--net/netfilter/xt_NETMAP.c165
-rw-r--r--net/netfilter/xt_NFLOG.c3
-rw-r--r--net/netfilter/xt_NFQUEUE.c112
-rw-r--r--net/netfilter/xt_NOTRACK.c53
-rw-r--r--net/netfilter/xt_RATEEST.c11
-rw-r--r--net/netfilter/xt_REDIRECT.c190
-rw-r--r--net/netfilter/xt_TCPMSS.c138
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c32
-rw-r--r--net/netfilter/xt_TEE.c58
-rw-r--r--net/netfilter/xt_TPROXY.c201
-rw-r--r--net/netfilter/xt_addrtype.c248
-rw-r--r--net/netfilter/xt_bpf.c74
-rw-r--r--net/netfilter/xt_cgroup.c72
-rw-r--r--net/netfilter/xt_connbytes.c44
-rw-r--r--net/netfilter/xt_connlabel.c99
-rw-r--r--net/netfilter/xt_connlimit.c354
-rw-r--r--net/netfilter/xt_connmark.c3
-rw-r--r--net/netfilter/xt_conntrack.c78
-rw-r--r--net/netfilter/xt_cpu.c2
-rw-r--r--net/netfilter/xt_devgroup.c82
-rw-r--r--net/netfilter/xt_ecn.c179
-rw-r--r--net/netfilter/xt_hashlimit.c245
-rw-r--r--net/netfilter/xt_hl.c32
-rw-r--r--net/netfilter/xt_ipcomp.c111
-rw-r--r--net/netfilter/xt_iprange.c34
-rw-r--r--net/netfilter/xt_ipvs.c4
-rw-r--r--net/netfilter/xt_l2tp.c354
-rw-r--r--net/netfilter/xt_limit.c14
-rw-r--r--net/netfilter/xt_mac.c2
-rw-r--r--net/netfilter/xt_nat.c170
-rw-r--r--net/netfilter/xt_nfacct.c79
-rw-r--r--net/netfilter/xt_osf.c24
-rw-r--r--net/netfilter/xt_owner.c30
-rw-r--r--net/netfilter/xt_quota.c1
-rw-r--r--net/netfilter/xt_rateest.c11
-rw-r--r--net/netfilter/xt_recent.c122
-rw-r--r--net/netfilter/xt_repldata.h22
-rw-r--r--net/netfilter/xt_set.c523
-rw-r--r--net/netfilter/xt_socket.c192
-rw-r--r--net/netfilter/xt_statistic.c3
-rw-r--r--net/netfilter/xt_time.c24
185 files changed, 44616 insertions, 8661 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1534f2b44ca..e9410d17619 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -4,6 +4,14 @@ menu "Core Netfilter Configuration"
config NETFILTER_NETLINK
tristate
+config NETFILTER_NETLINK_ACCT
+tristate "Netfilter NFACCT over NFNETLINK interface"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for extended accounting via NFNETLINK.
+
config NETFILTER_NETLINK_QUEUE
tristate "Netfilter NFQUEUE over NFNETLINK interface"
depends on NETFILTER_ADVANCED
@@ -75,6 +83,16 @@ config NF_CONNTRACK_ZONES
If unsure, say `N'.
+config NF_CONNTRACK_PROCFS
+ bool "Supply CT list in procfs (OBSOLETE)"
+ default y
+ depends on PROC_FS
+ ---help---
+ This option enables for the list of known conntrack entries
+ to be shown in procfs under net/netfilter/nf_conntrack. This
+ is considered obsolete in favor of using the conntrack(8)
+ tool which uses Netlink.
+
config NF_CONNTRACK_EVENTS
bool "Connection tracking events"
depends on NETFILTER_ADVANCED
@@ -85,9 +103,35 @@ config NF_CONNTRACK_EVENTS
If unsure, say `N'.
+config NF_CONNTRACK_TIMEOUT
+ bool 'Connection tracking timeout'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timeout
+ extension. This allows you to attach timeout policies to flow
+ via the CT target.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_TIMESTAMP
+ bool 'Connection tracking timestamping'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timestamping.
+ This allows you to store the flow start-time and to obtain
+ the flow-stop time (once it has been destroyed) via Connection
+ tracking events.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_LABELS
+ bool
+ help
+ This option enables support for assigning user-defined flag bits
+ to connection tracking entries. It selected by the connlabel match.
+
config NF_CT_PROTO_DCCP
- tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
- depends on EXPERIMENTAL
+ tristate 'DCCP protocol connection tracking support'
depends on NETFILTER_ADVANCED
default IP_DCCP
help
@@ -100,8 +144,7 @@ config NF_CT_PROTO_GRE
tristate
config NF_CT_PROTO_SCTP
- tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
- depends on EXPERIMENTAL
+ tristate 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
default IP_SCTP
help
@@ -185,9 +228,12 @@ config NF_CONNTRACK_IRC
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_BROADCAST
+ tristate
+
config NF_CONNTRACK_NETBIOS_NS
tristate "NetBIOS name service protocol support"
- depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
help
NetBIOS name service requests are sent as broadcast messages from an
unprivileged port and responded to with unicast messages to the
@@ -204,6 +250,21 @@ config NF_CONNTRACK_NETBIOS_NS
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_SNMP
+ tristate "SNMP service protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
+ help
+ SNMP service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating SNMP service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NF_CONNTRACK_PPTP
tristate "PPtP protocol support"
depends on NETFILTER_ADVANCED
@@ -224,8 +285,7 @@ config NF_CONNTRACK_PPTP
To compile it as a module, choose M here. If unsure, say N.
config NF_CONNTRACK_SANE
- tristate "SANE protocol support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ tristate "SANE protocol support"
depends on NETFILTER_ADVANCED
help
SANE is a protocol for remote access to scanners as implemented
@@ -267,22 +327,212 @@ config NF_CT_NETLINK
help
This option enables support for a netlink-based userspace interface
-endif # NF_CONNTRACK
+config NF_CT_NETLINK_TIMEOUT
+ tristate 'Connection tracking timeout tuning via Netlink'
+ select NETFILTER_NETLINK
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timeout
+ fine-grain tuning. This allows you to attach specific timeout
+ policies to flows, instead of using the global timeout policy.
-# transparent proxy support
-config NETFILTER_TPROXY
- tristate "Transparent proxying support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- depends on IP_NF_MANGLE
+ If unsure, say `N'.
+
+config NF_CT_NETLINK_HELPER
+ tristate 'Connection tracking helpers in user-space via Netlink'
+ select NETFILTER_NETLINK
+ depends on NF_CT_NETLINK
+ depends on NETFILTER_NETLINK_QUEUE
+ depends on NETFILTER_NETLINK_QUEUE_CT
depends on NETFILTER_ADVANCED
help
- This option enables transparent proxying support, that is,
- support for handling non-locally bound IPv4 TCP and UDP sockets.
- For it to work you will have to configure certain iptables rules
- and use policy routing. For more information on how to set it up
- see Documentation/networking/tproxy.txt.
+ This option enables the user-space connection tracking helpers
+ infrastructure.
- To compile it as a module, choose M here. If unsure, say N.
+ If unsure, say `N'.
+
+config NETFILTER_NETLINK_QUEUE_CT
+ bool "NFQUEUE integration with Connection Tracking"
+ default n
+ depends on NETFILTER_NETLINK_QUEUE
+ help
+ If this option is enabled, NFQUEUE can include Connection Tracking
+ information together with the packet is the enqueued via NFNETLINK.
+
+config NF_NAT
+ tristate
+
+config NF_NAT_NEEDED
+ bool
+ depends on NF_NAT
+ default y
+
+config NF_NAT_PROTO_DCCP
+ tristate
+ depends on NF_NAT && NF_CT_PROTO_DCCP
+ default NF_NAT && NF_CT_PROTO_DCCP
+
+config NF_NAT_PROTO_UDPLITE
+ tristate
+ depends on NF_NAT && NF_CT_PROTO_UDPLITE
+ default NF_NAT && NF_CT_PROTO_UDPLITE
+
+config NF_NAT_PROTO_SCTP
+ tristate
+ default NF_NAT && NF_CT_PROTO_SCTP
+ depends on NF_NAT && NF_CT_PROTO_SCTP
+ select LIBCRC32C
+
+config NF_NAT_AMANDA
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_AMANDA
+
+config NF_NAT_FTP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_FTP
+
+config NF_NAT_IRC
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_IRC
+
+config NF_NAT_SIP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_SIP
+
+config NF_NAT_TFTP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_TFTP
+
+config NETFILTER_SYNPROXY
+ tristate
+
+endif # NF_CONNTRACK
+
+config NF_TABLES
+ select NETFILTER_NETLINK
+ tristate "Netfilter nf_tables support"
+ help
+ nftables is the new packet classification framework that intends to
+ replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
+ provides a pseudo-state machine with an extensible instruction-set
+ (also known as expressions) that the userspace 'nft' utility
+ (http://www.netfilter.org/projects/nftables) uses to build the
+ rule-set. It also comes with the generic set infrastructure that
+ allows you to construct mappings between matchings and actions
+ for performance lookups.
+
+ To compile it as a module, choose M here.
+
+config NF_TABLES_INET
+ depends on NF_TABLES && IPV6
+ select NF_TABLES_IPV4
+ select NF_TABLES_IPV6
+ tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support"
+ help
+ This option enables support for a mixed IPv4/IPv6 "inet" table.
+
+config NFT_EXTHDR
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables IPv6 exthdr module"
+ help
+ This option adds the "exthdr" expression that you can use to match
+ IPv6 extension headers.
+
+config NFT_META
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables meta module"
+ help
+ This option adds the "meta" expression that you can use to match and
+ to set packet metainformation such as the packet mark.
+
+config NFT_CT
+ depends on NF_TABLES
+ depends on NF_CONNTRACK
+ tristate "Netfilter nf_tables conntrack module"
+ help
+ This option adds the "meta" expression that you can use to match
+ connection tracking information such as the flow state.
+
+config NFT_RBTREE
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables rbtree set module"
+ help
+ This option adds the "rbtree" set type (Red Black tree) that is used
+ to build interval-based sets.
+
+config NFT_HASH
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables hash set module"
+ help
+ This option adds the "hash" set type that is used to build one-way
+ mappings between matchings and actions.
+
+config NFT_COUNTER
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables counter module"
+ help
+ This option adds the "counter" expression that you can use to
+ include packet and byte counters in a rule.
+
+config NFT_LOG
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables log module"
+ help
+ This option adds the "log" expression that you can use to log
+ packets matching some criteria.
+
+config NFT_LIMIT
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables limit module"
+ help
+ This option adds the "limit" expression that you can use to
+ ratelimit rule matchings.
+
+config NFT_NAT
+ depends on NF_TABLES
+ depends on NF_CONNTRACK
+ depends on NF_NAT
+ tristate "Netfilter nf_tables nat module"
+ help
+ This option adds the "nat" expression that you can use to perform
+ typical Network Address Translation (NAT) packet transformations.
+
+config NFT_QUEUE
+ depends on NF_TABLES
+ depends on NETFILTER_XTABLES
+ depends on NETFILTER_NETLINK_QUEUE
+ tristate "Netfilter nf_tables queue module"
+ help
+ This is required if you intend to use the userspace queueing
+ infrastructure (also known as NFQUEUE) from nftables.
+
+config NFT_REJECT
+ depends on NF_TABLES
+ default m if NETFILTER_ADVANCED=n
+ tristate "Netfilter nf_tables reject support"
+ help
+ This option adds the "reject" expression that you can use to
+ explicitly deny and notify via TCP reset/ICMP informational errors
+ unallowed traffic.
+
+config NFT_REJECT_INET
+ depends on NF_TABLES_INET
+ default NFT_REJECT
+ tristate
+
+config NFT_COMPAT
+ depends on NF_TABLES
+ depends on NETFILTER_XTABLES
+ tristate "Netfilter x_tables over nf_tables module"
+ help
+ This is required if you intend to use any of existing
+ x_tables match/target extensions over the nf_tables
+ framework.
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
@@ -322,10 +572,32 @@ config NETFILTER_XT_CONNMARK
ctmark), similarly to the packet mark (nfmark). Using this
target and match, you can set and match on this mark.
+config NETFILTER_XT_SET
+ tristate 'set target and match support'
+ depends on IP_SET
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds the "SET" target and "set" match.
+
+ Using this target and match, you can add/delete and match
+ elements in the sets created by ipset(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# alphabetically ordered list of targets
comment "Xtables targets"
+config NETFILTER_XT_TARGET_AUDIT
+ tristate "AUDIT target support"
+ depends on AUDIT
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a 'AUDIT' target, which can be used to create
+ audit records for packets dropped/accepted.
+
+ To compileit as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
depends on IP_NF_MANGLE || IP6_NF_MANGLE
@@ -419,6 +691,21 @@ config NETFILTER_XT_TARGET_HL
since you can easily create immortal packets that loop
forever on the network.
+config NETFILTER_XT_TARGET_HMARK
+ tristate '"HMARK" target support'
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds the "HMARK" target.
+
+ The target allows you to create rules in the "raw" and "mangle" tables
+ which set the skbuff mark by means of hash calculation within a given
+ range. The nfmark can influence the routing method (see "Use netfilter
+ MARK value as routing key") and can also be used by other subsystems to
+ change their behaviour.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_IDLETIMER
tristate "IDLETIMER target support"
depends on NETFILTER_ADVANCED
@@ -453,7 +740,16 @@ config NETFILTER_XT_TARGET_LED
echo netfilter-ssh > /sys/class/leds/<ledname>/trigger
For more information on the LEDs available on your system, see
- Documentation/leds-class.txt
+ Documentation/leds/leds-class.txt
+
+config NETFILTER_XT_TARGET_LOG
+ tristate "LOG target support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option adds a `LOG' target, which allows you to create rules in
+ any iptables table which records the packet header to the syslog.
+
+ To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_MARK
tristate '"MARK" target support'
@@ -464,6 +760,16 @@ config NETFILTER_XT_TARGET_MARK
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+config NETFILTER_XT_TARGET_NETMAP
+ tristate '"NETMAP" target support'
+ depends on NF_NAT
+ ---help---
+ NETMAP is an implementation of static 1:1 NAT mapping of network
+ addresses. It maps the network address part, while keeping the host
+ address part intact.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_NFLOG
tristate '"NFLOG" target support'
default m if NETFILTER_ADVANCED=n
@@ -477,6 +783,7 @@ config NETFILTER_XT_TARGET_NFLOG
config NETFILTER_XT_TARGET_NFQUEUE
tristate '"NFQUEUE" target Support'
depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_QUEUE
help
This target replaced the old obsolete QUEUE target.
@@ -486,18 +793,11 @@ config NETFILTER_XT_TARGET_NFQUEUE
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_NOTRACK
- tristate '"NOTRACK" target support'
- depends on IP_NF_RAW || IP6_NF_RAW
+ tristate '"NOTRACK" target support (DEPRECATED)'
depends on NF_CONNTRACK
+ depends on IP_NF_RAW || IP6_NF_RAW
depends on NETFILTER_ADVANCED
- help
- The NOTRACK target allows a select rule to specify
- which packets *not* to enter the conntrack/NAT
- subsystem with all the consequences (no ICMP error tracking,
- no protocol helpers for the selected packets).
-
- If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ select NETFILTER_XT_TARGET_CT
config NETFILTER_XT_TARGET_RATEEST
tristate '"RATEEST" target support'
@@ -509,6 +809,17 @@ config NETFILTER_XT_TARGET_RATEEST
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on NF_NAT
+ ---help---
+ REDIRECT is a special case of NAT: all incoming connections are
+ mapped onto the incoming interface's address, causing the packets to
+ come to the local machine instead of passing through. This is
+ useful for transparent proxies.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_TEE
tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
@@ -519,11 +830,10 @@ config NETFILTER_XT_TARGET_TEE
this clone be rerouted to another nexthop.
config NETFILTER_XT_TARGET_TPROXY
- tristate '"TPROXY" target support (EXPERIMENTAL)'
- depends on EXPERIMENTAL
- depends on NETFILTER_TPROXY
+ tristate '"TPROXY" target transparent proxying support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
help
@@ -531,6 +841,9 @@ config NETFILTER_XT_TARGET_TPROXY
REDIRECT. It can only be used in the mangle table and is useful
to redirect traffic to a transparent proxy. It does _not_ depend
on Netfilter connection tracking and NAT, unlike REDIRECT.
+ For it to work you will have to configure certain iptables rules
+ and use policy routing. For more information on how to set it up
+ see Documentation/networking/tproxy.txt.
To compile it as a module, choose M here. If unsure, say N.
@@ -584,8 +897,7 @@ config NETFILTER_XT_TARGET_TCPMSS
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_TCPOPTSTRIP
- tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)'
- depends on EXPERIMENTAL
+ tristate '"TCPOPTSTRIP" target support'
depends on IP_NF_MANGLE || IP6_NF_MANGLE
depends on NETFILTER_ADVANCED
help
@@ -596,6 +908,35 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP
comment "Xtables matches"
+config NETFILTER_XT_MATCH_ADDRTYPE
+ tristate '"addrtype" address type match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option allows you to match what routing thinks of an address,
+ eg. UNICAST, LOCAL, BROADCAST, ...
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_BPF
+ tristate '"bpf" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ BPF matching applies a linux socket filter to each packet and
+ accepts those for which the filter returns non-zero.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_CGROUP
+ tristate '"control group" match support'
+ depends on NETFILTER_ADVANCED
+ depends on CGROUPS
+ select CGROUP_NET_CLASSID
+ ---help---
+ Socket/process control group matching allows you to match locally
+ generated packets based on which net_cls control group processes
+ belong to.
+
config NETFILTER_XT_MATCH_CLUSTER
tristate '"cluster" match support'
depends on NF_CONNTRACK
@@ -633,8 +974,21 @@ config NETFILTER_XT_MATCH_CONNBYTES
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_CONNLABEL
+ tristate '"connlabel" match support'
+ select NF_CONNTRACK_LABELS
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match allows you to test and assign userspace-defined labels names
+ to a connection. The kernel only stores bit values - mapping
+ names to bits is done by userspace.
+
+ Unlike connmark, more than 32 flag bits may be assigned to a
+ connection simultaneously.
+
config NETFILTER_XT_MATCH_CONNLIMIT
- tristate '"connlimit" match support"'
+ tristate '"connlimit" match support'
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
---help---
@@ -685,6 +1039,15 @@ config NETFILTER_XT_MATCH_DCCP
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_DEVGROUP
+ tristate '"devgroup" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This options adds a `devgroup' match, which allows to match on the
+ device group a network device is assigned to.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_DSCP
tristate '"dscp" and "tos" match support'
depends on NETFILTER_ADVANCED
@@ -700,6 +1063,15 @@ config NETFILTER_XT_MATCH_DSCP
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_ECN
+ tristate '"ecn" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds an "ECN" match, which allows you to match against
+ the IPv4 and TCP header ECN fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_ESP
tristate '"esp" match support'
depends on NETFILTER_ADVANCED
@@ -742,6 +1114,15 @@ config NETFILTER_XT_MATCH_HL
in the IPv6 header, or the time-to-live field in the IPv4
header of the packet.
+config NETFILTER_XT_MATCH_IPCOMP
+ tristate '"ipcomp" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of CPIs(16 bits)
+ inside IPComp header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_IPRANGE
tristate '"iprange" address range match support'
depends on NETFILTER_ADVANCED
@@ -762,6 +1143,16 @@ config NETFILTER_XT_MATCH_IPVS
If unsure, say N.
+config NETFILTER_XT_MATCH_L2TP
+ tristate '"l2tp" match support'
+ depends on NETFILTER_ADVANCED
+ default L2TP
+ ---help---
+ This option adds an "L2TP" match, which allows you to match against
+ L2TP protocol header fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_LENGTH
tristate '"length" match support'
depends on NETFILTER_ADVANCED
@@ -809,6 +1200,16 @@ config NETFILTER_XT_MATCH_MULTIPORT
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_NFACCT
+ tristate '"nfacct" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_ACCT
+ help
+ This option allows you to use the extended accounting through
+ nfnetlink_acct.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_OSF
tristate '"osf" Passive OS fingerprint match'
depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
@@ -886,7 +1287,7 @@ config NETFILTER_XT_MATCH_RATEEST
config NETFILTER_XT_MATCH_REALM
tristate '"realm" match support'
depends on NETFILTER_ADVANCED
- select NET_CLS_ROUTE
+ select IP_ROUTE_CLASSID
help
This option adds a `realm' match, which allows you to use the realm
key from the routing subsystem inside iptables.
@@ -908,8 +1309,7 @@ config NETFILTER_XT_MATCH_RECENT
Official Website: <http://snowman.net/projects/ipt_recent/>
config NETFILTER_XT_MATCH_SCTP
- tristate '"sctp" protocol match support (EXPERIMENTAL)'
- depends on EXPERIMENTAL
+ tristate '"sctp" protocol match support'
depends on NETFILTER_ADVANCED
default IP_SCTP
help
@@ -921,12 +1321,11 @@ config NETFILTER_XT_MATCH_SCTP
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
config NETFILTER_XT_MATCH_SOCKET
- tristate '"socket" match support (EXPERIMENTAL)'
- depends on EXPERIMENTAL
- depends on NETFILTER_TPROXY
+ tristate '"socket" match support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
depends on !NF_CONNTRACK || NF_CONNTRACK
+ depends on (IPV6 || IPV6=n)
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
help
@@ -1011,4 +1410,6 @@ endif # NETFILTER_XTABLES
endmenu
+source "net/netfilter/ipset/Kconfig"
+
source "net/netfilter/ipvs/Kconfig"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 441050f3111..bffdad774da 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,11 +1,17 @@
netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
-nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
obj-$(CONFIG_NETFILTER) = netfilter.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
+nfnetlink_queue-y := nfnetlink_queue_core.o
+nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
@@ -20,6 +26,8 @@ obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
# netlink interface for nf_conntrack
obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
+obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o
+obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o
# connection tracking helpers
nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o
@@ -28,14 +36,54 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
-# transparent proxy support
-obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o
+nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
+ nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
+
+obj-$(CONFIG_NF_NAT) += nf_nat.o
+
+# NAT protocols (nf_nat)
+obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
+obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+
+# NAT helpers
+obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
+obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
+obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
+obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
+obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
+
+# SYNPROXY
+obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
+
+# nf_tables
+nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
+nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+
+obj-$(CONFIG_NF_TABLES) += nf_tables.o
+obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
+obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
+obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
+obj-$(CONFIG_NFT_META) += nft_meta.o
+obj-$(CONFIG_NFT_CT) += nft_ct.o
+obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
+obj-$(CONFIG_NFT_NAT) += nft_nat.o
+obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
+obj-$(CONFIG_NFT_REJECT) += nft_reject.o
+obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o
+obj-$(CONFIG_NFT_HASH) += nft_hash.o
+obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
+obj-$(CONFIG_NFT_LOG) += nft_log.o
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
@@ -43,19 +91,25 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
# combos
obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
+obj-$(CONFIG_NF_NAT) += xt_nat.o
# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o
obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
-obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
@@ -65,26 +119,35 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
# matches
+obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ECN) += xt_ecn.o
obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o
obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o
obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
@@ -101,5 +164,8 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+# ipset
+obj-$(CONFIG_IP_SET) += ipset/
+
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 32fcbe290c0..1fbab0cdd30 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -5,6 +5,7 @@
* way.
*
* Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
*/
#include <linux/kernel.h>
#include <linux/netfilter.h>
@@ -29,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex);
const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
EXPORT_SYMBOL(nf_afinfo);
+const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ipv6_ops);
int nf_register_afinfo(const struct nf_afinfo *afinfo)
{
@@ -37,7 +40,7 @@ int nf_register_afinfo(const struct nf_afinfo *afinfo)
err = mutex_lock_interruptible(&afinfo_mutex);
if (err < 0)
return err;
- rcu_assign_pointer(nf_afinfo[afinfo->family], afinfo);
+ RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
mutex_unlock(&afinfo_mutex);
return 0;
}
@@ -46,7 +49,7 @@ EXPORT_SYMBOL_GPL(nf_register_afinfo);
void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
{
mutex_lock(&afinfo_mutex);
- rcu_assign_pointer(nf_afinfo[afinfo->family], NULL);
+ RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL);
mutex_unlock(&afinfo_mutex);
synchronize_rcu();
}
@@ -54,6 +57,12 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
EXPORT_SYMBOL(nf_hooks);
+
+#if defined(CONFIG_JUMP_LABEL)
+struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+EXPORT_SYMBOL(nf_hooks_needed);
+#endif
+
static DEFINE_MUTEX(nf_hook_mutex);
int nf_register_hook(struct nf_hook_ops *reg)
@@ -70,6 +79,9 @@ int nf_register_hook(struct nf_hook_ops *reg)
}
list_add_rcu(&reg->list, elem->list.prev);
mutex_unlock(&nf_hook_mutex);
+#if defined(CONFIG_JUMP_LABEL)
+ static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
+#endif
return 0;
}
EXPORT_SYMBOL(nf_register_hook);
@@ -79,7 +91,9 @@ void nf_unregister_hook(struct nf_hook_ops *reg)
mutex_lock(&nf_hook_mutex);
list_del_rcu(&reg->list);
mutex_unlock(&nf_hook_mutex);
-
+#if defined(CONFIG_JUMP_LABEL)
+ static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
+#endif
synchronize_net();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -115,7 +129,7 @@ unsigned int nf_iterate(struct list_head *head,
unsigned int hook,
const struct net_device *indev,
const struct net_device *outdev,
- struct list_head **i,
+ struct nf_hook_ops **elemp,
int (*okfn)(struct sk_buff *),
int hook_thresh)
{
@@ -125,27 +139,26 @@ unsigned int nf_iterate(struct list_head *head,
* The caller must not block between calls to this
* function because of risk of continuing from deleted element.
*/
- list_for_each_continue_rcu(*i, head) {
- struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
-
- if (hook_thresh > elem->priority)
+ list_for_each_entry_continue_rcu((*elemp), head, list) {
+ if (hook_thresh > (*elemp)->priority)
continue;
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
- verdict = elem->hook(hook, skb, indev, outdev, okfn);
+repeat:
+ verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
> NF_MAX_VERDICT)) {
NFDEBUG("Evil return from %p(%u).\n",
- elem->hook, hook);
+ (*elemp)->hook, hook);
continue;
}
#endif
if (verdict != NF_REPEAT)
return verdict;
- *i = (*i)->prev;
+ goto repeat;
}
}
return NF_ACCEPT;
@@ -160,14 +173,14 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
int (*okfn)(struct sk_buff *),
int hook_thresh)
{
- struct list_head *elem;
+ struct nf_hook_ops *elem;
unsigned int verdict;
int ret = 0;
/* We may already have this, but read-locks nest anyway */
rcu_read_lock();
- elem = &nf_hooks[pf][hook];
+ elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list);
next_hook:
verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
outdev, &elem, okfn, hook_thresh);
@@ -175,13 +188,20 @@ next_hook:
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
kfree_skb(skb);
- ret = -(verdict >> NF_VERDICT_BITS);
+ ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
}
rcu_read_unlock();
return ret;
@@ -210,16 +230,17 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
}
EXPORT_SYMBOL(skb_make_writable);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
+ __rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
-void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
- void (*attach)(struct sk_buff *, struct sk_buff *);
+ void (*attach)(struct sk_buff *, const struct sk_buff *);
if (skb->nfct) {
rcu_read_lock();
@@ -231,7 +252,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
}
EXPORT_SYMBOL(nf_ct_attach);
-void (*nf_ct_destroy)(struct nf_conntrack *);
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
EXPORT_SYMBOL(nf_ct_destroy);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
@@ -245,38 +266,65 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
rcu_read_unlock();
}
EXPORT_SYMBOL(nf_conntrack_destroy);
+
+struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_hook);
+
+struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_nat_hook);
+
#endif /* CONFIG_NF_CONNTRACK */
+#ifdef CONFIG_NF_NAT_NEEDED
+void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
+EXPORT_SYMBOL(nf_nat_decode_session_hook);
+#endif
+
+static int __net_init netfilter_net_init(struct net *net)
+{
#ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_net_netfilter;
-EXPORT_SYMBOL(proc_net_netfilter);
+ net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
+ net->proc_net);
+ if (!net->nf.proc_netfilter) {
+ if (!net_eq(net, &init_net))
+ pr_err("cannot create netfilter proc entry");
+
+ return -ENOMEM;
+ }
#endif
+ return 0;
+}
+
+static void __net_exit netfilter_net_exit(struct net *net)
+{
+ remove_proc_entry("netfilter", net->proc_net);
+}
+
+static struct pernet_operations netfilter_net_ops = {
+ .init = netfilter_net_init,
+ .exit = netfilter_net_exit,
+};
-void __init netfilter_init(void)
+int __init netfilter_init(void)
{
- int i, h;
+ int i, h, ret;
+
for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
for (h = 0; h < NF_MAX_HOOKS; h++)
INIT_LIST_HEAD(&nf_hooks[i][h]);
}
-#ifdef CONFIG_PROC_FS
- proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net);
- if (!proc_net_netfilter)
- panic("cannot create netfilter proc entry");
-#endif
+ ret = register_pernet_subsys(&netfilter_net_ops);
+ if (ret < 0)
+ goto err;
- if (netfilter_queue_init() < 0)
- panic("cannot initialize nf_queue");
- if (netfilter_log_init() < 0)
- panic("cannot initialize nf_log");
-}
+ ret = netfilter_log_init();
+ if (ret < 0)
+ goto err_pernet;
-#ifdef CONFIG_SYSCTL
-struct ctl_path nf_net_netfilter_sysctl_path[] = {
- { .procname = "net", },
- { .procname = "netfilter", },
- { }
-};
-EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path);
-#endif /* CONFIG_SYSCTL */
+ return 0;
+err_pernet:
+ unregister_pernet_subsys(&netfilter_net_ops);
+err:
+ return ret;
+}
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
new file mode 100644
index 00000000000..2f7f5c32c6f
--- /dev/null
+++ b/net/netfilter/ipset/Kconfig
@@ -0,0 +1,159 @@
+menuconfig IP_SET
+ tristate "IP set support"
+ depends on INET && NETFILTER
+ select NETFILTER_NETLINK
+ help
+ This option adds IP set support to the kernel.
+ In order to define and use the sets, you need the userspace utility
+ ipset(8). You can use the sets in netfilter via the "set" match
+ and "SET" target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_SET
+
+config IP_SET_MAX
+ int "Maximum number of IP sets"
+ default 256
+ range 2 65534
+ depends on IP_SET
+ help
+ You can define here default value of the maximum number
+ of IP sets for the kernel.
+
+ The value can be overridden by the 'max_sets' module
+ parameter of the 'ip_set' module.
+
+config IP_SET_BITMAP_IP
+ tristate "bitmap:ip set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip set type support, by which one
+ can store IPv4 addresses (or network addresse) from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_IPMAC
+ tristate "bitmap:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip,mac set type support, by which one
+ can store IPv4 address and (source) MAC address pairs from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_PORT
+ tristate "bitmap:port set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:port set type support, by which one
+ can store TCP/UDP port numbers from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IP
+ tristate "hash:ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip set type support, by which one
+ can store arbitrary IPv4 or IPv6 addresses (or network addresses)
+ in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPMARK
+ tristate "hash:ip,mark set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,mark set type support, by which one
+ can store IPv4/IPv6 address and mark pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORT
+ tristate "hash:ip,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port set type support, by which one
+ can store IPv4/IPv6 address and protocol/port pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTIP
+ tristate "hash:ip,port,ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,ip set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ address triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTNET
+ tristate "hash:ip,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,net set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ network address/prefix triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORTNET
+ tristate "hash:net,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port,net set type support, by which
+ one can store two IPv4/IPv6 subnets, and a protocol/port in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NET
+ tristate "hash:net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net set type support, by which
+ one can store IPv4/IPv6 network address/prefix elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETNET
+ tristate "hash:net,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,net set type support, by which
+ one can store IPv4/IPv6 network address/prefix pairs in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORT
+ tristate "hash:net,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ protocol/port pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETIFACE
+ tristate "hash:net,iface set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,iface set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ interface name pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_LIST_SET
+ tristate "list:set set support"
+ depends on IP_SET
+ help
+ This option adds the list:set set type support. In this
+ kind of set one can store the name of other sets and it forms
+ an ordered union of the member sets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
new file mode 100644
index 00000000000..231f10196cb
--- /dev/null
+++ b/net/netfilter/ipset/Makefile
@@ -0,0 +1,28 @@
+#
+# Makefile for the ipset modules
+#
+
+ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o
+
+# ipset core
+obj-$(CONFIG_IP_SET) += ip_set.o
+
+# bitmap types
+obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o
+obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o
+obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
+
+# hash types
+obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
+obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
+obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
+obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o
+obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o
+obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o
+
+# list types
+obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
new file mode 100644
index 00000000000..f2c7d83dc23
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -0,0 +1,289 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __IP_SET_BITMAP_IP_GEN_H
+#define __IP_SET_BITMAP_IP_GEN_H
+
+#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test)
+#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test)
+#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled)
+#define mtype_do_add IPSET_TOKEN(MTYPE, _do_add)
+#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
+#define mtype_do_del IPSET_TOKEN(MTYPE, _do_del)
+#define mtype_do_list IPSET_TOKEN(MTYPE, _do_list)
+#define mtype_do_head IPSET_TOKEN(MTYPE, _do_head)
+#define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem)
+#define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
+#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
+#define mtype_head IPSET_TOKEN(MTYPE, _head)
+#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_add IPSET_TOKEN(MTYPE, _add)
+#define mtype_del IPSET_TOKEN(MTYPE, _del)
+#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype MTYPE
+
+#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id))
+
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct mtype *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+mtype_ext_cleanup(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+ u32 id;
+
+ for (id = 0; id < map->elements; id++)
+ if (test_bit(id, map->members))
+ ip_set_ext_destroy(set, get_ext(set, map, id));
+}
+
+static void
+mtype_destroy(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ if (set->dsize) {
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ ip_set_free(map->extensions);
+ }
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+mtype_flush(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ memset(map->members, 0, map->memsize);
+}
+
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct mtype *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (mtype_do_head(skb, map) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) +
+ map->memsize +
+ set->dsize * map->elements)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+ int ret = mtype_do_test(e, map, set->dsize);
+
+ if (ret <= 0)
+ return ret;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ return 0;
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(x, set), ext, mext, flags);
+ return 1;
+}
+
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+ int ret = mtype_do_add(e, map, flags, set->dsize);
+
+ if (ret == IPSET_ADD_FAILED) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ ret = 0;
+ else if (!(flags & IPSET_FLAG_EXIST))
+ return -IPSET_ERR_EXIST;
+ /* Element is re-added, cleanup extensions */
+ ip_set_ext_destroy(set, x);
+ }
+
+ if (SET_WITH_TIMEOUT(set))
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret);
+#else
+ ip_set_timeout_set(ext_timeout(x, set), ext->timeout);
+#endif
+
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(x, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(x, set), ext);
+ return 0;
+}
+
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+
+ if (mtype_do_del(e, map))
+ return -IPSET_ERR_EXIST;
+
+ ip_set_ext_destroy(set, x);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+#ifndef IP_SET_BITMAP_STORED_TIMEOUT
+static inline bool
+mtype_is_filled(const struct mtype_elem *x)
+{
+ return true;
+}
+#endif
+
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct mtype *map = set->data;
+ struct nlattr *adt, *nested;
+ void *x;
+ u32 id, first = cb->args[IPSET_CB_ARG0];
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[IPSET_CB_ARG0] < map->elements;
+ cb->args[IPSET_CB_ARG0]++) {
+ id = cb->args[IPSET_CB_ARG0];
+ x = get_ext(set, map, id);
+ if (!test_bit(id, map->members) ||
+ (SET_WITH_TIMEOUT(set) &&
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_is_filled((const struct mtype_elem *) x) &&
+#endif
+ ip_set_timeout_expired(ext_timeout(x, set))))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_do_list(skb, map, id, set->dsize))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, x,
+ mtype_is_filled((const struct mtype_elem *) x)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ if (unlikely(id == first)) {
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, adt);
+ return 0;
+}
+
+static void
+mtype_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct mtype *map = set->data;
+ void *x;
+ u32 id;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id < map->elements; id++)
+ if (mtype_gc_test(id, map, set->dsize)) {
+ x = get_ext(set, map, id);
+ if (ip_set_timeout_expired(ext_timeout(x, set))) {
+ clear_bit(id, map->members);
+ ip_set_ext_destroy(set, x);
+ }
+ }
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static const struct ip_set_type_variant mtype = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .same_set = mtype_same_set,
+};
+
+#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
new file mode 100644
index 00000000000..6f1f9f49480
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -0,0 +1,377 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comment support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:ip");
+
+#define MTYPE bitmap_ip
+
+/* Type structure */
+struct bitmap_ip {
+ void *members; /* the set members */
+ void *extensions; /* data extensions */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ u32 hosts; /* number of hosts in a subnet */
+ size_t memsize; /* members size */
+ u8 netmask; /* subnet netmask */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* ADT structure for generic function args */
+struct bitmap_ip_adt_elem {
+ u16 id;
+};
+
+static inline u32
+ip_to_id(const struct bitmap_ip *m, u32 ip)
+{
+ return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+}
+
+/* Common functions */
+
+static inline int
+bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e,
+ struct bitmap_ip *map, size_t dsize)
+{
+ return !!test_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize)
+{
+ return !!test_bit(id, map->members);
+}
+
+static inline int
+bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
+ u32 flags, size_t dsize)
+{
+ return !!test_and_set_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
+ size_t dsize)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
+}
+
+static inline int
+bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) ||
+ (map->netmask != 32 &&
+ nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask));
+}
+
+static int
+bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ip_adt_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ u32 ip;
+
+ ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ e.id = ip_to_id(map, ip);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 ip = 0, ip_to = 0;
+ struct bitmap_ip_adt_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (adt == IPSET_TEST) {
+ e.id = ip_to_id(map, ip);
+ return adtfn(set, &e, &ext, &ext, flags);
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to) {
+ swap(ip, ip_to);
+ if (ip < map->first_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ } else
+ ip_to = ip;
+
+ if (ip_to > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; !before(ip_to, ip); ip += map->hosts) {
+ e.id = ip_to_id(map, ip);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ip *x = a->data;
+ const struct bitmap_ip *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ x->netmask == y->netmask &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+struct bitmap_ip_elem {
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_ip(struct ip_set *set, struct bitmap_ip *map,
+ u32 first_ip, u32 last_ip,
+ u32 elements, u32 hosts, u8 netmask)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ if (set->dsize) {
+ map->extensions = ip_set_alloc(set->dsize * elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->elements = elements;
+ map->hosts = hosts;
+ map->netmask = netmask;
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = NFPROTO_IPV4;
+
+ return true;
+}
+
+static int
+bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ struct bitmap_ip *map;
+ u32 first_ip = 0, last_ip = 0, hosts;
+ u64 elements;
+ u8 netmask = 32;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(first_ip, last_ip, cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if (netmask > 32)
+ return -IPSET_ERR_INVALID_NETMASK;
+
+ first_ip &= ip_set_hostmask(netmask);
+ last_ip |= ~ip_set_hostmask(netmask);
+ }
+
+ if (netmask == 32) {
+ hosts = 1;
+ elements = (u64)last_ip - first_ip + 1;
+ } else {
+ u8 mask_bits;
+ u32 mask;
+
+ mask = range_to_mask(first_ip, last_ip, &mask_bits);
+
+ if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) ||
+ netmask <= mask_bits)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
+ hosts = 2 << (32 - netmask - 1);
+ elements = 2 << (netmask - mask_bits - 1);
+ }
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ pr_debug("hosts %u, elements %llu\n",
+ hosts, (unsigned long long)elements);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ map->memsize = bitmap_bytes(0, elements - 1);
+ set->variant = &bitmap_ip;
+ set->dsize = ip_set_elem_len(set, tb, 0);
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ bitmap_ip_gc_init(set, bitmap_ip_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ip_type __read_mostly = {
+ .name = "bitmap:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_IPV4,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ip_init(void)
+{
+ return ip_set_type_register(&bitmap_ip_type);
+}
+
+static void __exit
+bitmap_ip_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ip_type);
+}
+
+module_init(bitmap_ip_init);
+module_exit(bitmap_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
new file mode 100644
index 00000000000..740eabededd
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -0,0 +1,414 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comment support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:ip,mac");
+
+#define MTYPE bitmap_ipmac
+#define IP_SET_BITMAP_STORED_TIMEOUT
+
+enum {
+ MAC_UNSET, /* element is set, without MAC */
+ MAC_FILLED, /* element is set with MAC */
+};
+
+/* Type structure */
+struct bitmap_ipmac {
+ void *members; /* the set members */
+ void *extensions; /* MAC + data extensions */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ size_t memsize; /* members size */
+ struct timer_list gc; /* garbage collector */
+};
+
+/* ADT structure for generic function args */
+struct bitmap_ipmac_adt_elem {
+ u16 id;
+ unsigned char *ether;
+};
+
+struct bitmap_ipmac_elem {
+ unsigned char ether[ETH_ALEN];
+ unsigned char filled;
+} __attribute__ ((aligned));
+
+static inline u32
+ip_to_id(const struct bitmap_ipmac *m, u32 ip)
+{
+ return ip - m->first_ip;
+}
+
+static inline struct bitmap_ipmac_elem *
+get_elem(void *extensions, u16 id, size_t dsize)
+{
+ return (struct bitmap_ipmac_elem *)(extensions + id * dsize);
+}
+
+/* Common functions */
+
+static inline int
+bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
+ const struct bitmap_ipmac *map, size_t dsize)
+{
+ const struct bitmap_ipmac_elem *elem;
+
+ if (!test_bit(e->id, map->members))
+ return 0;
+ elem = get_elem(map->extensions, e->id, dsize);
+ if (elem->filled == MAC_FILLED)
+ return e->ether == NULL ||
+ ether_addr_equal(e->ether, elem->ether);
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
+}
+
+static inline int
+bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
+{
+ const struct bitmap_ipmac_elem *elem;
+
+ if (!test_bit(id, map->members))
+ return 0;
+ elem = get_elem(map->extensions, id, dsize);
+ /* Timer not started for the incomplete elements */
+ return elem->filled == MAC_FILLED;
+}
+
+static inline int
+bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)
+{
+ return elem->filled == MAC_FILLED;
+}
+
+static inline int
+bitmap_ipmac_add_timeout(unsigned long *timeout,
+ const struct bitmap_ipmac_adt_elem *e,
+ const struct ip_set_ext *ext, struct ip_set *set,
+ struct bitmap_ipmac *map, int mode)
+{
+ u32 t = ext->timeout;
+
+ if (mode == IPSET_ADD_START_STORED_TIMEOUT) {
+ if (t == set->timeout)
+ /* Timeout was not specified, get stored one */
+ t = *timeout;
+ ip_set_timeout_set(timeout, t);
+ } else {
+ /* If MAC is unset yet, we store plain timeout value
+ * because the timer is not activated yet
+ * and we can reuse it later when MAC is filled out,
+ * possibly by the kernel */
+ if (e->ether)
+ ip_set_timeout_set(timeout, t);
+ else
+ *timeout = t;
+ }
+ return 0;
+}
+
+static inline int
+bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
+ struct bitmap_ipmac *map, u32 flags, size_t dsize)
+{
+ struct bitmap_ipmac_elem *elem;
+
+ elem = get_elem(map->extensions, e->id, dsize);
+ if (test_and_set_bit(e->id, map->members)) {
+ if (elem->filled == MAC_FILLED) {
+ if (e->ether && (flags & IPSET_FLAG_EXIST))
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ return IPSET_ADD_FAILED;
+ } else if (!e->ether)
+ /* Already added without ethernet address */
+ return IPSET_ADD_FAILED;
+ /* Fill the MAC address and trigger the timer activation */
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ elem->filled = MAC_FILLED;
+ return IPSET_ADD_START_STORED_TIMEOUT;
+ } else if (e->ether) {
+ /* We can store MAC too */
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ elem->filled = MAC_FILLED;
+ return 0;
+ } else {
+ elem->filled = MAC_UNSET;
+ /* MAC is not stored yet, don't start timer */
+ return IPSET_ADD_STORE_PLAIN_TIMEOUT;
+ }
+}
+
+static inline int
+bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,
+ struct bitmap_ipmac *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
+ u32 id, size_t dsize)
+{
+ const struct bitmap_ipmac_elem *elem =
+ get_elem(map->extensions, id, dsize);
+
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id)) ||
+ (elem->filled == MAC_FILLED &&
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether));
+}
+
+static inline int
+bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+}
+
+static int
+bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ipmac_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ u32 ip;
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_TWO_SRC))
+ return 0;
+
+ ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ /* Backward compatibility: we don't check the second flag */
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ e.id = ip_to_id(map, ip);
+ e.ether = eth_hdr(skb)->h_source;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ipmac_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ e.id = ip_to_id(map, ip);
+ if (tb[IPSET_ATTR_ETHER])
+ e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+ else
+ e.ether = NULL;
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static bool
+bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ipmac *x = a->data;
+ const struct bitmap_ipmac *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip,mac type of sets */
+
+static bool
+init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
+ u32 first_ip, u32 last_ip, u32 elements)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ if (set->dsize) {
+ map->extensions = ip_set_alloc(set->dsize * elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->elements = elements;
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = NFPROTO_IPV4;
+
+ return true;
+}
+
+static int
+bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ u32 first_ip = 0, last_ip = 0;
+ u64 elements;
+ struct bitmap_ipmac *map;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(first_ip, last_ip, cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ elements = (u64)last_ip - first_ip + 1;
+
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ map->memsize = bitmap_bytes(0, elements - 1);
+ set->variant = &bitmap_ipmac;
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct bitmap_ipmac_elem));
+ if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ bitmap_ipmac_gc_init(set, bitmap_ipmac_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ipmac_type = {
+ .name = "bitmap:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_IPV4,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_ipmac_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ipmac_init(void)
+{
+ return ip_set_type_register(&bitmap_ipmac_type);
+}
+
+static void __exit
+bitmap_ipmac_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ipmac_type);
+}
+
+module_init(bitmap_ipmac_init);
+module_exit(bitmap_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
new file mode 100644
index 00000000000..cf99676e69f
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -0,0 +1,311 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:port type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comment support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:port");
+
+#define MTYPE bitmap_port
+
+/* Type structure */
+struct bitmap_port {
+ void *members; /* the set members */
+ void *extensions; /* data extensions */
+ u16 first_port; /* host byte order, included in range */
+ u16 last_port; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ size_t memsize; /* members size */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* ADT structure for generic function args */
+struct bitmap_port_adt_elem {
+ u16 id;
+};
+
+static inline u16
+port_to_id(const struct bitmap_port *m, u16 port)
+{
+ return port - m->first_port;
+}
+
+/* Common functions */
+
+static inline int
+bitmap_port_do_test(const struct bitmap_port_adt_elem *e,
+ const struct bitmap_port *map, size_t dsize)
+{
+ return !!test_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize)
+{
+ return !!test_bit(id, map->members);
+}
+
+static inline int
+bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
+ struct bitmap_port *map, u32 flags, size_t dsize)
+{
+ return !!test_and_set_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_port_do_del(const struct bitmap_port_adt_elem *e,
+ struct bitmap_port *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
+ size_t dsize)
+{
+ return nla_put_net16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
+}
+
+static inline int
+bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
+{
+ return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
+}
+
+static int
+bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct bitmap_port *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_port_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ __be16 __port;
+ u16 port = 0;
+
+ if (!ip_set_get_ip_port(skb, opt->family,
+ opt->flags & IPSET_DIM_ONE_SRC, &__port))
+ return -EINVAL;
+
+ port = ntohs(__port);
+
+ if (port < map->first_port || port > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ e.id = port_to_id(map, port);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct bitmap_port *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_port_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port; /* wraparound */
+ u16 port_to;
+ int ret = 0;
+
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+ if (port < map->first_port || port > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (adt == IPSET_TEST) {
+ e.id = port_to_id(map, port);
+ return adtfn(set, &e, &ext, &ext, flags);
+ }
+
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to) {
+ swap(port, port_to);
+ if (port < map->first_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else
+ port_to = port;
+
+ if (port_to > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; port <= port_to; port++) {
+ e.id = port_to_id(map, port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_port *x = a->data;
+ const struct bitmap_port *y = b->data;
+
+ return x->first_port == y->first_port &&
+ x->last_port == y->last_port &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+struct bitmap_port_elem {
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+ u16 first_port, u16 last_port)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ if (set->dsize) {
+ map->extensions = ip_set_alloc(set->dsize * map->elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
+ map->first_port = first_port;
+ map->last_port = last_port;
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = NFPROTO_UNSPEC;
+
+ return true;
+}
+
+static int
+bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ struct bitmap_port *map;
+ u16 first_port, last_port;
+
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+ last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (first_port > last_port) {
+ u16 tmp = first_port;
+
+ first_port = last_port;
+ last_port = tmp;
+ }
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ map->elements = last_port - first_port + 1;
+ map->memsize = bitmap_bytes(0, map->elements);
+ set->variant = &bitmap_port;
+ set->dsize = ip_set_elem_len(set, tb, 0);
+ if (!init_map_port(set, map, first_port, last_port)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ bitmap_port_gc_init(set, bitmap_port_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_port_type = {
+ .name = "bitmap:port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_port_create,
+ .create_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_port_init(void)
+{
+ return ip_set_type_register(&bitmap_port_type);
+}
+
+static void __exit
+bitmap_port_fini(void)
+{
+ ip_set_type_unregister(&bitmap_port_type);
+}
+
+module_init(bitmap_port_init);
+module_exit(bitmap_port_fini);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
new file mode 100644
index 00000000000..ec8114fae50
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -0,0 +1,2011 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module for IP set management */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/rculist.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+static LIST_HEAD(ip_set_type_list); /* all registered set types */
+static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */
+static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */
+
+struct ip_set_net {
+ struct ip_set * __rcu *ip_set_list; /* all individual sets */
+ ip_set_id_t ip_set_max; /* max number of sets */
+ int is_deleted; /* deleted by ip_set_net_exit */
+};
+static int ip_set_net_id __read_mostly;
+
+static inline struct ip_set_net *ip_set_pernet(struct net *net)
+{
+ return net_generic(net, ip_set_net_id);
+}
+
+#define IP_SET_INC 64
+#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+
+static unsigned int max_sets;
+
+module_param(max_sets, int, 0600);
+MODULE_PARM_DESC(max_sets, "maximal number of sets");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("core IP set support");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
+
+/* When the nfnl mutex is held: */
+#define ip_set_dereference(p) \
+ rcu_dereference_protected(p, 1)
+#define ip_set(inst, id) \
+ ip_set_dereference((inst)->ip_set_list)[id]
+
+/*
+ * The set types are implemented in modules and registered set types
+ * can be found in ip_set_type_list. Adding/deleting types is
+ * serialized by ip_set_type_mutex.
+ */
+
+static inline void
+ip_set_type_lock(void)
+{
+ mutex_lock(&ip_set_type_mutex);
+}
+
+static inline void
+ip_set_type_unlock(void)
+{
+ mutex_unlock(&ip_set_type_mutex);
+}
+
+/* Register and deregister settype */
+
+static struct ip_set_type *
+find_set_type(const char *name, u8 family, u8 revision)
+{
+ struct ip_set_type *type;
+
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name) &&
+ (type->family == family ||
+ type->family == NFPROTO_UNSPEC) &&
+ revision >= type->revision_min &&
+ revision <= type->revision_max)
+ return type;
+ return NULL;
+}
+
+/* Unlock, try to load a set type module and lock again */
+static bool
+load_settype(const char *name)
+{
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ pr_debug("try to load ip_set_%s\n", name);
+ if (request_module("ip_set_%s", name) < 0) {
+ pr_warning("Can't find ip_set type %s\n", name);
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ return false;
+ }
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ return true;
+}
+
+/* Find a set type and reference it */
+#define find_set_type_get(name, family, revision, found) \
+ __find_set_type_get(name, family, revision, found, false)
+
+static int
+__find_set_type_get(const char *name, u8 family, u8 revision,
+ struct ip_set_type **found, bool retry)
+{
+ struct ip_set_type *type;
+ int err;
+
+ if (retry && !load_settype(name))
+ return -IPSET_ERR_FIND_TYPE;
+
+ rcu_read_lock();
+ *found = find_set_type(name, family, revision);
+ if (*found) {
+ err = !try_module_get((*found)->me) ? -EFAULT : 0;
+ goto unlock;
+ }
+ /* Make sure the type is already loaded
+ * but we don't support the revision */
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name)) {
+ err = -IPSET_ERR_FIND_TYPE;
+ goto unlock;
+ }
+ rcu_read_unlock();
+
+ return retry ? -IPSET_ERR_FIND_TYPE :
+ __find_set_type_get(name, family, revision, found, true);
+
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+/* Find a given set type by name and family.
+ * If we succeeded, the supported minimal and maximum revisions are
+ * filled out.
+ */
+#define find_set_type_minmax(name, family, min, max) \
+ __find_set_type_minmax(name, family, min, max, false)
+
+static int
+__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
+ bool retry)
+{
+ struct ip_set_type *type;
+ bool found = false;
+
+ if (retry && !load_settype(name))
+ return -IPSET_ERR_FIND_TYPE;
+
+ *min = 255; *max = 0;
+ rcu_read_lock();
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name) &&
+ (type->family == family ||
+ type->family == NFPROTO_UNSPEC)) {
+ found = true;
+ if (type->revision_min < *min)
+ *min = type->revision_min;
+ if (type->revision_max > *max)
+ *max = type->revision_max;
+ }
+ rcu_read_unlock();
+ if (found)
+ return 0;
+
+ return retry ? -IPSET_ERR_FIND_TYPE :
+ __find_set_type_minmax(name, family, min, max, true);
+}
+
+#define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \
+ (f) == NFPROTO_IPV6 ? "inet6" : "any")
+
+/* Register a set type structure. The type is identified by
+ * the unique triple of name, family and revision.
+ */
+int
+ip_set_type_register(struct ip_set_type *type)
+{
+ int ret = 0;
+
+ if (type->protocol != IPSET_PROTOCOL) {
+ pr_warning("ip_set type %s, family %s, revision %u:%u uses "
+ "wrong protocol version %u (want %u)\n",
+ type->name, family_name(type->family),
+ type->revision_min, type->revision_max,
+ type->protocol, IPSET_PROTOCOL);
+ return -EINVAL;
+ }
+
+ ip_set_type_lock();
+ if (find_set_type(type->name, type->family, type->revision_min)) {
+ /* Duplicate! */
+ pr_warning("ip_set type %s, family %s with revision min %u "
+ "already registered!\n", type->name,
+ family_name(type->family), type->revision_min);
+ ret = -EINVAL;
+ goto unlock;
+ }
+ list_add_rcu(&type->list, &ip_set_type_list);
+ pr_debug("type %s, family %s, revision %u:%u registered.\n",
+ type->name, family_name(type->family),
+ type->revision_min, type->revision_max);
+unlock:
+ ip_set_type_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_type_register);
+
+/* Unregister a set type. There's a small race with ip_set_create */
+void
+ip_set_type_unregister(struct ip_set_type *type)
+{
+ ip_set_type_lock();
+ if (!find_set_type(type->name, type->family, type->revision_min)) {
+ pr_warning("ip_set type %s, family %s with revision min %u "
+ "not registered\n", type->name,
+ family_name(type->family), type->revision_min);
+ goto unlock;
+ }
+ list_del_rcu(&type->list);
+ pr_debug("type %s, family %s with revision min %u unregistered.\n",
+ type->name, family_name(type->family), type->revision_min);
+unlock:
+ ip_set_type_unlock();
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ip_set_type_unregister);
+
+/* Utility functions */
+void *
+ip_set_alloc(size_t size)
+{
+ void *members = NULL;
+
+ if (size < KMALLOC_MAX_SIZE)
+ members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+
+ if (members) {
+ pr_debug("%p: allocated with kmalloc\n", members);
+ return members;
+ }
+
+ members = vzalloc(size);
+ if (!members)
+ return NULL;
+ pr_debug("%p: allocated with vmalloc\n", members);
+
+ return members;
+}
+EXPORT_SYMBOL_GPL(ip_set_alloc);
+
+void
+ip_set_free(void *members)
+{
+ pr_debug("%p: free with %s\n", members,
+ is_vmalloc_addr(members) ? "vfree" : "kfree");
+ kvfree(members);
+}
+EXPORT_SYMBOL_GPL(ip_set_free);
+
+static inline bool
+flag_nested(const struct nlattr *nla)
+{
+ return nla->nla_type & NLA_F_NESTED;
+}
+
+static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
+ [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
+ [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+};
+
+int
+ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
+{
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+ if (unlikely(!flag_nested(nla)))
+ return -IPSET_ERR_PROTOCOL;
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
+ return -IPSET_ERR_PROTOCOL;
+
+ *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
+
+int
+ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
+{
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+ if (unlikely(!flag_nested(nla)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
+ return -IPSET_ERR_PROTOCOL;
+
+ memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
+ sizeof(struct in6_addr));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+
+typedef void (*destroyer)(void *);
+/* ipset data extension types, in size order */
+
+const struct ip_set_ext_type ip_set_extensions[] = {
+ [IPSET_EXT_ID_COUNTER] = {
+ .type = IPSET_EXT_COUNTER,
+ .flag = IPSET_FLAG_WITH_COUNTERS,
+ .len = sizeof(struct ip_set_counter),
+ .align = __alignof__(struct ip_set_counter),
+ },
+ [IPSET_EXT_ID_TIMEOUT] = {
+ .type = IPSET_EXT_TIMEOUT,
+ .len = sizeof(unsigned long),
+ .align = __alignof__(unsigned long),
+ },
+ [IPSET_EXT_ID_COMMENT] = {
+ .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY,
+ .flag = IPSET_FLAG_WITH_COMMENT,
+ .len = sizeof(struct ip_set_comment),
+ .align = __alignof__(struct ip_set_comment),
+ .destroy = (destroyer) ip_set_comment_free,
+ },
+};
+EXPORT_SYMBOL_GPL(ip_set_extensions);
+
+static inline bool
+add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
+{
+ return ip_set_extensions[id].flag ?
+ (flags & ip_set_extensions[id].flag) :
+ !!tb[IPSET_ATTR_TIMEOUT];
+}
+
+size_t
+ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
+{
+ enum ip_set_ext_id id;
+ size_t offset = 0;
+ u32 cadt_flags = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
+ set->flags |= IPSET_CREATE_FLAG_FORCEADD;
+ for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
+ if (!add_extension(id, cadt_flags, tb))
+ continue;
+ offset += ALIGN(len + offset, ip_set_extensions[id].align);
+ set->offset[id] = offset;
+ set->extensions |= ip_set_extensions[id].type;
+ offset += ip_set_extensions[id].len;
+ }
+ return len + offset;
+}
+EXPORT_SYMBOL_GPL(ip_set_elem_len);
+
+int
+ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
+ struct ip_set_ext *ext)
+{
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!(set->extensions & IPSET_EXT_TIMEOUT))
+ return -IPSET_ERR_TIMEOUT;
+ ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+ if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) {
+ if (!(set->extensions & IPSET_EXT_COUNTER))
+ return -IPSET_ERR_COUNTER;
+ if (tb[IPSET_ATTR_BYTES])
+ ext->bytes = be64_to_cpu(nla_get_be64(
+ tb[IPSET_ATTR_BYTES]));
+ if (tb[IPSET_ATTR_PACKETS])
+ ext->packets = be64_to_cpu(nla_get_be64(
+ tb[IPSET_ATTR_PACKETS]));
+ }
+ if (tb[IPSET_ATTR_COMMENT]) {
+ if (!(set->extensions & IPSET_EXT_COMMENT))
+ return -IPSET_ERR_COMMENT;
+ ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_extensions);
+
+/*
+ * Creating/destroying/renaming/swapping affect the existence and
+ * the properties of a set. All of these can be executed from userspace
+ * only and serialized by the nfnl mutex indirectly from nfnetlink.
+ *
+ * Sets are identified by their index in ip_set_list and the index
+ * is used by the external references (set/SET netfilter modules).
+ *
+ * The set behind an index may change by swapping only, from userspace.
+ */
+
+static inline void
+__ip_set_get(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+static inline void
+__ip_set_put(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ BUG_ON(set->ref == 0);
+ set->ref--;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+/*
+ * Add, del and test set entries from kernel.
+ *
+ * The set behind the index must exist and must be referenced
+ * so it can't be destroyed (or changed) under our foot.
+ */
+
+static inline struct ip_set *
+ip_set_rcu_get(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ rcu_read_lock();
+ /* ip_set_list itself needs to be protected */
+ set = rcu_dereference(inst->ip_set_list)[index];
+ rcu_read_unlock();
+
+ return set;
+}
+
+int
+ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+ struct ip_set *set = ip_set_rcu_get(
+ dev_net(par->in ? par->in : par->out), index);
+ int ret = 0;
+
+ BUG_ON(set == NULL);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (opt->dim < set->type->dimension ||
+ !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+ return 0;
+
+ read_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
+ read_unlock_bh(&set->lock);
+
+ if (ret == -EAGAIN) {
+ /* Type requests element to be completed */
+ pr_debug("element must be competed, ADD is triggered\n");
+ write_lock_bh(&set->lock);
+ set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+ write_unlock_bh(&set->lock);
+ ret = 1;
+ } else {
+ /* --return-nomatch: invert matched element */
+ if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) &&
+ (set->type->features & IPSET_TYPE_NOMATCH) &&
+ (ret > 0 || ret == -ENOTEMPTY))
+ ret = -ret;
+ }
+
+ /* Convert error codes to nomatch */
+ return (ret < 0 ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+ struct ip_set *set = ip_set_rcu_get(
+ dev_net(par->in ? par->in : par->out), index);
+ int ret;
+
+ BUG_ON(set == NULL);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (opt->dim < set->type->dimension ||
+ !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ write_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+ write_unlock_bh(&set->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+ struct ip_set *set = ip_set_rcu_get(
+ dev_net(par->in ? par->in : par->out), index);
+ int ret = 0;
+
+ BUG_ON(set == NULL);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (opt->dim < set->type->dimension ||
+ !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ write_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
+ write_unlock_bh(&set->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ */
+ip_set_id_t
+ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
+{
+ ip_set_id_t i, index = IPSET_INVALID_ID;
+ struct ip_set *s;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ rcu_read_lock();
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = rcu_dereference(inst->ip_set_list)[i];
+ if (s != NULL && STREQ(s->name, name)) {
+ __ip_set_get(s);
+ index = i;
+ *set = s;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ */
+
+static inline void
+__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
+{
+ struct ip_set *set;
+
+ rcu_read_lock();
+ set = rcu_dereference(inst->ip_set_list)[index];
+ if (set != NULL)
+ __ip_set_put(set);
+ rcu_read_unlock();
+}
+
+void
+ip_set_put_byindex(struct net *net, ip_set_id_t index)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ __ip_set_put_byindex(inst, index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ */
+const char *
+ip_set_name_byindex(struct net *net, ip_set_id_t index)
+{
+ const struct ip_set *set = ip_set_rcu_get(net, index);
+
+ BUG_ON(set == NULL);
+ BUG_ON(set->ref == 0);
+
+ /* Referenced, so it's safe */
+ return set->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to call by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ if (index > inst->ip_set_max)
+ return IPSET_INVALID_ID;
+
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ set = ip_set(inst, index);
+ if (set)
+ __ip_set_get(set);
+ else
+ index = IPSET_INVALID_ID;
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ * The nfnl mutex is used in the function.
+ */
+void
+ip_set_nfnl_put(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
+ set = ip_set(inst, index);
+ if (set != NULL)
+ __ip_set_put(set);
+ }
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * The commands are serialized by the nfnl mutex.
+ */
+
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+ return !tb[IPSET_ATTR_PROTOCOL] ||
+ nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+}
+
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+ return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST;
+}
+
+static struct nlmsghdr *
+start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
+ enum ipset_cmd cmd)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
+ sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ return NULL;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = NFPROTO_IPV4;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ return nlh;
+}
+
+/* Create a set */
+
+static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1},
+ [IPSET_ATTR_REVISION] = { .type = NLA_U8 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+};
+
+static struct ip_set *
+find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
+{
+ struct ip_set *set = NULL;
+ ip_set_id_t i;
+
+ *id = IPSET_INVALID_ID;
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set != NULL && STREQ(set->name, name)) {
+ *id = i;
+ break;
+ }
+ }
+ return (*id == IPSET_INVALID_ID ? NULL : set);
+}
+
+static inline struct ip_set *
+find_set(struct ip_set_net *inst, const char *name)
+{
+ ip_set_id_t id;
+
+ return find_set_and_id(inst, name, &id);
+}
+
+static int
+find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
+ struct ip_set **set)
+{
+ struct ip_set *s;
+ ip_set_id_t i;
+
+ *index = IPSET_INVALID_ID;
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s == NULL) {
+ if (*index == IPSET_INVALID_ID)
+ *index = i;
+ } else if (STREQ(name, s->name)) {
+ /* Name clash */
+ *set = s;
+ return -EEXIST;
+ }
+ }
+ if (*index == IPSET_INVALID_ID)
+ /* No free slot remained */
+ return -IPSET_ERR_MAX_SETS;
+ return 0;
+}
+
+static int
+ip_set_none(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ return -EOPNOTSUPP;
+}
+
+static int
+ip_set_create(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct net *net = sock_net(ctnl);
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set *set, *clash = NULL;
+ ip_set_id_t index = IPSET_INVALID_ID;
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+ const char *name, *typename;
+ u8 family, revision;
+ u32 flags = flag_exist(nlh);
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_REVISION] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA]))))
+ return -IPSET_ERR_PROTOCOL;
+
+ name = nla_data(attr[IPSET_ATTR_SETNAME]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
+ pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
+ name, typename, family_name(family), revision);
+
+ /*
+ * First, and without any locks, allocate and initialize
+ * a normal base set structure.
+ */
+ set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ if (!set)
+ return -ENOMEM;
+ rwlock_init(&set->lock);
+ strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ set->family = family;
+ set->revision = revision;
+
+ /*
+ * Next, check that we know the type, and take
+ * a reference on the type, to make sure it stays available
+ * while constructing our new set.
+ *
+ * After referencing the type, we try to create the type
+ * specific part of the set without holding any locks.
+ */
+ ret = find_set_type_get(typename, family, revision, &(set->type));
+ if (ret)
+ goto out;
+
+ /*
+ * Without holding any locks, create private part.
+ */
+ if (attr[IPSET_ATTR_DATA] &&
+ nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+ set->type->create_policy)) {
+ ret = -IPSET_ERR_PROTOCOL;
+ goto put_out;
+ }
+
+ ret = set->type->create(net, set, tb, flags);
+ if (ret != 0)
+ goto put_out;
+
+ /* BTW, ret==0 here. */
+
+ /*
+ * Here, we have a valid, constructed set and we are protected
+ * by the nfnl mutex. Find the first free index in ip_set_list
+ * and check clashing.
+ */
+ ret = find_free_id(inst, set->name, &index, &clash);
+ if (ret == -EEXIST) {
+ /* If this is the same set and requested, ignore error */
+ if ((flags & IPSET_FLAG_EXIST) &&
+ STREQ(set->type->name, clash->type->name) &&
+ set->type->family == clash->type->family &&
+ set->type->revision_min == clash->type->revision_min &&
+ set->type->revision_max == clash->type->revision_max &&
+ set->variant->same_set(set, clash))
+ ret = 0;
+ goto cleanup;
+ } else if (ret == -IPSET_ERR_MAX_SETS) {
+ struct ip_set **list, **tmp;
+ ip_set_id_t i = inst->ip_set_max + IP_SET_INC;
+
+ if (i < inst->ip_set_max || i == IPSET_INVALID_ID)
+ /* Wraparound */
+ goto cleanup;
+
+ list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+ if (!list)
+ goto cleanup;
+ /* nfnl mutex is held, both lists are valid */
+ tmp = ip_set_dereference(inst->ip_set_list);
+ memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
+ rcu_assign_pointer(inst->ip_set_list, list);
+ /* Make sure all current packets have passed through */
+ synchronize_net();
+ /* Use new list */
+ index = inst->ip_set_max;
+ inst->ip_set_max = i;
+ kfree(tmp);
+ ret = 0;
+ } else if (ret)
+ goto cleanup;
+
+ /*
+ * Finally! Add our shiny new set to the list, and be done.
+ */
+ pr_debug("create: '%s' created with index %u!\n", set->name, index);
+ ip_set(inst, index) = set;
+
+ return ret;
+
+cleanup:
+ set->variant->destroy(set);
+put_out:
+ module_put(set->type->me);
+out:
+ kfree(set);
+ return ret;
+}
+
+/* Destroy sets */
+
+static const struct nla_policy
+ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static void
+ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
+{
+ struct ip_set *set = ip_set(inst, index);
+
+ pr_debug("set: %s\n", set->name);
+ ip_set(inst, index) = NULL;
+
+ /* Must call it without holding any lock */
+ set->variant->destroy(set);
+ module_put(set->type->me);
+ kfree(set);
+}
+
+static int
+ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *s;
+ ip_set_id_t i;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ /* Commands are serialized and references are
+ * protected by the ip_set_ref_lock.
+ * External systems (i.e. xt_set) must call
+ * ip_set_put|get_nfnl_* functions, that way we
+ * can safely check references here.
+ *
+ * list:set timer can only decrement the reference
+ * counter, so if it's already zero, we can proceed
+ * without holding the lock.
+ */
+ read_lock_bh(&ip_set_ref_lock);
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL && s->ref) {
+ ret = -IPSET_ERR_BUSY;
+ goto out;
+ }
+ }
+ read_unlock_bh(&ip_set_ref_lock);
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL)
+ ip_set_destroy_set(inst, i);
+ }
+ } else {
+ s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+ &i);
+ if (s == NULL) {
+ ret = -ENOENT;
+ goto out;
+ } else if (s->ref) {
+ ret = -IPSET_ERR_BUSY;
+ goto out;
+ }
+ read_unlock_bh(&ip_set_ref_lock);
+
+ ip_set_destroy_set(inst, i);
+ }
+ return 0;
+out:
+ read_unlock_bh(&ip_set_ref_lock);
+ return ret;
+}
+
+/* Flush sets */
+
+static void
+ip_set_flush_set(struct ip_set *set)
+{
+ pr_debug("set: %s\n", set->name);
+
+ write_lock_bh(&set->lock);
+ set->variant->flush(set);
+ write_unlock_bh(&set->lock);
+}
+
+static int
+ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *s;
+ ip_set_id_t i;
+
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL)
+ ip_set_flush_set(s);
+ }
+ } else {
+ s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (s == NULL)
+ return -ENOENT;
+
+ ip_set_flush_set(s);
+ }
+
+ return 0;
+}
+
+/* Rename a set */
+
+static const struct nla_policy
+ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static int
+ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set, *s;
+ const char *name2;
+ ip_set_id_t i;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ read_lock_bh(&ip_set_ref_lock);
+ if (set->ref != 0) {
+ ret = -IPSET_ERR_REFERENCED;
+ goto out;
+ }
+
+ name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL && STREQ(s->name, name2)) {
+ ret = -IPSET_ERR_EXIST_SETNAME2;
+ goto out;
+ }
+ }
+ strncpy(set->name, name2, IPSET_MAXNAMELEN);
+
+out:
+ read_unlock_bh(&ip_set_ref_lock);
+ return ret;
+}
+
+/* Swap two sets so that name/index points to the other.
+ * References and set names are also swapped.
+ *
+ * The commands are serialized by the nfnl mutex and references are
+ * protected by the ip_set_ref_lock. The kernel interfaces
+ * do not hold the mutex but the pointer settings are atomic
+ * so the ip_set_list always contains valid pointers to the sets.
+ */
+
+static int
+ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *from, *to;
+ ip_set_id_t from_id, to_id;
+ char from_name[IPSET_MAXNAMELEN];
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+ &from_id);
+ if (from == NULL)
+ return -ENOENT;
+
+ to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
+ &to_id);
+ if (to == NULL)
+ return -IPSET_ERR_EXIST_SETNAME2;
+
+ /* Features must not change.
+ * Not an artificial restriction anymore, as we must prevent
+ * possible loops created by swapping in setlist type of sets. */
+ if (!(from->type->features == to->type->features &&
+ from->family == to->family))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ strncpy(from_name, from->name, IPSET_MAXNAMELEN);
+ strncpy(from->name, to->name, IPSET_MAXNAMELEN);
+ strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+
+ write_lock_bh(&ip_set_ref_lock);
+ swap(from->ref, to->ref);
+ ip_set(inst, from_id) = to;
+ ip_set(inst, to_id) = from;
+ write_unlock_bh(&ip_set_ref_lock);
+
+ return 0;
+}
+
+/* List/save set data */
+
+#define DUMP_INIT 0
+#define DUMP_ALL 1
+#define DUMP_ONE 2
+#define DUMP_LAST 3
+
+#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF)
+#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16)
+
+static int
+ip_set_dump_done(struct netlink_callback *cb)
+{
+ struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
+ if (cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n",
+ ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
+ __ip_set_put_byindex(inst,
+ (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
+ }
+ return 0;
+}
+
+static inline void
+dump_attrs(struct nlmsghdr *nlh)
+{
+ const struct nlattr *attr;
+ int rem;
+
+ pr_debug("dump nlmsg\n");
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) {
+ pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len);
+ }
+}
+
+static int
+dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ u32 dump_type;
+ ip_set_id_t index;
+
+ /* Second pass, so parser can't fail */
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
+
+ /* cb->args[IPSET_CB_NET]: net namespace
+ * [IPSET_CB_DUMP]: dump single set/all sets
+ * [IPSET_CB_INDEX]: set index
+ * [IPSET_CB_ARG0]: type specific
+ */
+
+ if (cda[IPSET_ATTR_SETNAME]) {
+ struct ip_set *set;
+
+ set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
+ &index);
+ if (set == NULL)
+ return -ENOENT;
+
+ dump_type = DUMP_ONE;
+ cb->args[IPSET_CB_INDEX] = index;
+ } else
+ dump_type = DUMP_ALL;
+
+ if (cda[IPSET_ATTR_FLAGS]) {
+ u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
+ dump_type |= (f << 16);
+ }
+ cb->args[IPSET_CB_NET] = (unsigned long)inst;
+ cb->args[IPSET_CB_DUMP] = dump_type;
+
+ return 0;
+}
+
+static int
+ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ ip_set_id_t index = IPSET_INVALID_ID, max;
+ struct ip_set *set = NULL;
+ struct nlmsghdr *nlh = NULL;
+ unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
+ u32 dump_type, dump_flags;
+ int ret = 0;
+
+ if (!cb->args[IPSET_CB_DUMP]) {
+ ret = dump_init(cb, inst);
+ if (ret < 0) {
+ nlh = nlmsg_hdr(cb->skb);
+ /* We have to create and send the error message
+ * manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(cb->skb, nlh, ret);
+ return ret;
+ }
+ }
+
+ if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
+ goto out;
+
+ dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]);
+ dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]);
+ max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1
+ : inst->ip_set_max;
+dump_last:
+ pr_debug("dump type, flag: %u %u index: %ld\n",
+ dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
+ for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
+ index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+ set = ip_set(inst, index);
+ if (set == NULL) {
+ if (dump_type == DUMP_ONE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ continue;
+ }
+ /* When dumping all sets, we must dump "sorted"
+ * so that lists (unions of sets) are dumped last.
+ */
+ if (dump_type != DUMP_ONE &&
+ ((dump_type == DUMP_ALL) ==
+ !!(set->type->features & IPSET_DUMP_LAST)))
+ continue;
+ pr_debug("List set: %s\n", set->name);
+ if (!cb->args[IPSET_CB_ARG0]) {
+ /* Start listing: make sure set won't be destroyed */
+ pr_debug("reference set\n");
+ __ip_set_get(set);
+ }
+ nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ IPSET_CMD_LIST);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto release_refcount;
+ }
+ if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
+ goto nla_put_failure;
+ if (dump_flags & IPSET_FLAG_LIST_SETNAME)
+ goto next_set;
+ switch (cb->args[IPSET_CB_ARG0]) {
+ case 0:
+ /* Core header data */
+ if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
+ set->type->name) ||
+ nla_put_u8(skb, IPSET_ATTR_FAMILY,
+ set->family) ||
+ nla_put_u8(skb, IPSET_ATTR_REVISION,
+ set->revision))
+ goto nla_put_failure;
+ ret = set->variant->head(set, skb);
+ if (ret < 0)
+ goto release_refcount;
+ if (dump_flags & IPSET_FLAG_LIST_HEADER)
+ goto next_set;
+ /* Fall through and add elements */
+ default:
+ read_lock_bh(&set->lock);
+ ret = set->variant->list(set, skb, cb);
+ read_unlock_bh(&set->lock);
+ if (!cb->args[IPSET_CB_ARG0])
+ /* Set is done, proceed with next one */
+ goto next_set;
+ goto release_refcount;
+ }
+ }
+ /* If we dump all sets, continue with dumping last ones */
+ if (dump_type == DUMP_ALL) {
+ dump_type = DUMP_LAST;
+ cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
+ cb->args[IPSET_CB_INDEX] = 0;
+ goto dump_last;
+ }
+ goto out;
+
+nla_put_failure:
+ ret = -EFAULT;
+next_set:
+ if (dump_type == DUMP_ONE)
+ cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID;
+ else
+ cb->args[IPSET_CB_INDEX]++;
+release_refcount:
+ /* If there was an error or set is done, release set */
+ if (ret || !cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n", ip_set(inst, index)->name);
+ __ip_set_put_byindex(inst, index);
+ cb->args[IPSET_CB_ARG0] = 0;
+ }
+out:
+ if (nlh) {
+ nlmsg_end(skb, nlh);
+ pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
+ dump_attrs(nlh);
+ }
+
+ return ret < 0 ? ret : skb->len;
+}
+
+static int
+ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ {
+ struct netlink_dump_control c = {
+ .dump = ip_set_dump_start,
+ .done = ip_set_dump_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+}
+
+/* Add, del and test */
+
+static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ADT] = { .type = NLA_NESTED },
+};
+
+static int
+call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
+ struct nlattr *tb[], enum ipset_adt adt,
+ u32 flags, bool use_lineno)
+{
+ int ret;
+ u32 lineno = 0;
+ bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
+
+ do {
+ write_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
+ write_unlock_bh(&set->lock);
+ retried = true;
+ } while (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried)) == 0);
+
+ if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
+ return 0;
+ if (lineno && use_lineno) {
+ /* Error in restore/batch mode: send back lineno */
+ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
+ struct sk_buff *skb2;
+ struct nlmsgerr *errmsg;
+ size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cmdattr;
+ u32 *errline;
+
+ skb2 = nlmsg_new(payload, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+ rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ errmsg = nlmsg_data(rep);
+ errmsg->error = ret;
+ memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ cmdattr = (void *)&errmsg->msg + min_len;
+
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ cmdattr, nlh->nlmsg_len - min_len,
+ ip_set_adt_policy);
+
+ errline = nla_data(cda[IPSET_ATTR_LINENO]);
+
+ *errline = lineno;
+
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ /* Signal netlink not to send its ACK/errmsg. */
+ return -EINTR;
+ }
+
+ return ret;
+}
+
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ memset(tb, 0, sizeof(tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ memset(tb, 0, sizeof(*tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_DATA] == NULL ||
+ !flag_nested(attr[IPSET_ATTR_DATA])))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+
+ read_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
+ read_unlock_bh(&set->lock);
+ /* Userspace can't trigger element to be re-added */
+ if (ret == -EAGAIN)
+ ret = 1;
+
+ return ret > 0 ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Get headed data of a set */
+
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ const struct ip_set *set;
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_HEADER);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) ||
+ nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) ||
+ nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
+ nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get type data */
+
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ u8 family, min, max;
+ const char *typename;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ ret = find_set_type_minmax(typename, family, &min, &max);
+ if (ret)
+ return ret;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_TYPE);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) ||
+ nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) ||
+ nla_put_u8(skb2, IPSET_ATTR_REVISION, max) ||
+ nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ int ret = 0;
+
+ if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_PROTOCOL);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+ [IPSET_CMD_NONE] = {
+ .call = ip_set_none,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ },
+ [IPSET_CMD_CREATE] = {
+ .call = ip_set_create,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_create_policy,
+ },
+ [IPSET_CMD_DESTROY] = {
+ .call = ip_set_destroy,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_FLUSH] = {
+ .call = ip_set_flush,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_RENAME] = {
+ .call = ip_set_rename,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_SWAP] = {
+ .call = ip_set_swap,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_LIST] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_SAVE] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_ADD] = {
+ .call = ip_set_uadd,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_DEL] = {
+ .call = ip_set_udel,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_TEST] = {
+ .call = ip_set_utest,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_HEADER] = {
+ .call = ip_set_header,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_TYPE] = {
+ .call = ip_set_type,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_type_policy,
+ },
+ [IPSET_CMD_PROTOCOL] = {
+ .call = ip_set_protocol,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_protocol_policy,
+ },
+};
+
+static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
+ .name = "ip_set",
+ .subsys_id = NFNL_SUBSYS_IPSET,
+ .cb_count = IPSET_MSG_MAX,
+ .cb = ip_set_netlink_subsys_cb,
+};
+
+/* Interface to iptables/ip6tables */
+
+static int
+ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
+{
+ unsigned int *op;
+ void *data;
+ int copylen = *len, ret = 0;
+ struct net *net = sock_net(sk);
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (optval != SO_IP_SET)
+ return -EBADF;
+ if (*len < sizeof(unsigned int))
+ return -EINVAL;
+
+ data = vmalloc(*len);
+ if (!data)
+ return -ENOMEM;
+ if (copy_from_user(data, user, *len) != 0) {
+ ret = -EFAULT;
+ goto done;
+ }
+ op = (unsigned int *) data;
+
+ if (*op < IP_SET_OP_VERSION) {
+ /* Check the version at the beginning of operations */
+ struct ip_set_req_version *req_version = data;
+ if (req_version->version != IPSET_PROTOCOL) {
+ ret = -EPROTO;
+ goto done;
+ }
+ }
+
+ switch (*op) {
+ case IP_SET_OP_VERSION: {
+ struct ip_set_req_version *req_version = data;
+
+ if (*len != sizeof(struct ip_set_req_version)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ req_version->version = IPSET_PROTOCOL;
+ ret = copy_to_user(user, req_version,
+ sizeof(struct ip_set_req_version));
+ goto done;
+ }
+ case IP_SET_OP_GET_BYNAME: {
+ struct ip_set_req_get_set *req_get = data;
+ ip_set_id_t id;
+
+ if (*len != sizeof(struct ip_set_req_get_set)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ find_set_and_id(inst, req_get->set.name, &id);
+ req_get->set.index = id;
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ goto copy;
+ }
+ case IP_SET_OP_GET_FNAME: {
+ struct ip_set_req_get_set_family *req_get = data;
+ ip_set_id_t id;
+
+ if (*len != sizeof(struct ip_set_req_get_set_family)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ find_set_and_id(inst, req_get->set.name, &id);
+ req_get->set.index = id;
+ if (id != IPSET_INVALID_ID)
+ req_get->family = ip_set(inst, id)->family;
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ goto copy;
+ }
+ case IP_SET_OP_GET_BYINDEX: {
+ struct ip_set_req_get_set *req_get = data;
+ struct ip_set *set;
+
+ if (*len != sizeof(struct ip_set_req_get_set) ||
+ req_get->set.index >= inst->ip_set_max) {
+ ret = -EINVAL;
+ goto done;
+ }
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ set = ip_set(inst, req_get->set.index);
+ strncpy(req_get->set.name, set ? set->name : "",
+ IPSET_MAXNAMELEN);
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ goto copy;
+ }
+ default:
+ ret = -EBADMSG;
+ goto done;
+ } /* end of switch(op) */
+
+copy:
+ ret = copy_to_user(user, data, copylen);
+
+done:
+ vfree(data);
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static struct nf_sockopt_ops so_set __read_mostly = {
+ .pf = PF_INET,
+ .get_optmin = SO_IP_SET,
+ .get_optmax = SO_IP_SET + 1,
+ .get = &ip_set_sockfn_get,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init
+ip_set_net_init(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set **list;
+
+ inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX;
+ if (inst->ip_set_max >= IPSET_INVALID_ID)
+ inst->ip_set_max = IPSET_INVALID_ID - 1;
+
+ list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
+ if (!list)
+ return -ENOMEM;
+ inst->is_deleted = 0;
+ rcu_assign_pointer(inst->ip_set_list, list);
+ return 0;
+}
+
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ struct ip_set *set = NULL;
+ ip_set_id_t i;
+
+ inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set != NULL)
+ ip_set_destroy_set(inst, i);
+ }
+ kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+}
+
+static struct pernet_operations ip_set_net_ops = {
+ .init = ip_set_net_init,
+ .exit = ip_set_net_exit,
+ .id = &ip_set_net_id,
+ .size = sizeof(struct ip_set_net)
+};
+
+
+static int __init
+ip_set_init(void)
+{
+ int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+ if (ret != 0) {
+ pr_err("ip_set: cannot register with nfnetlink.\n");
+ return ret;
+ }
+ ret = nf_register_sockopt(&so_set);
+ if (ret != 0) {
+ pr_err("SO_SET registry failed: %d\n", ret);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ return ret;
+ }
+ ret = register_pernet_subsys(&ip_set_net_ops);
+ if (ret) {
+ pr_err("ip_set: cannot register pernet_subsys.\n");
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ return ret;
+ }
+ pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
+ return 0;
+}
+
+static void __exit
+ip_set_fini(void)
+{
+ unregister_pernet_subsys(&ip_set_net_ops);
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ pr_debug("these are the famous last words\n");
+}
+
+module_init(ip_set_init);
+module_exit(ip_set_fini);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
new file mode 100644
index 00000000000..29fb01ddff9
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -0,0 +1,174 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Get Layer-4 data from the packets */
+
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/sctp.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/export.h>
+
+/* We must handle non-linear skbs */
+static bool
+get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
+ bool src, __be16 *port, u8 *proto)
+{
+ switch (protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? th->source : th->dest;
+ break;
+ }
+ case IPPROTO_SCTP: {
+ sctp_sctphdr_t _sh;
+ const sctp_sctphdr_t *sh;
+
+ sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
+ if (sh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? sh->source : sh->dest;
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? uh->source : uh->dest;
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr _ich;
+ const struct icmphdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ *port = (__force __be16)htons((ic->type << 8) | ic->code);
+ break;
+ }
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _ich;
+ const struct icmp6hdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ *port = (__force __be16)
+ htons((ic->icmp6_type << 8) | ic->icmp6_code);
+ break;
+ }
+ default:
+ break;
+ }
+ *proto = protocol;
+
+ return true;
+}
+
+bool
+ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned int protooff = ip_hdrlen(skb);
+ int protocol = iph->protocol;
+
+ /* See comments at tcp_match in ip_tables.c */
+ if (protocol <= 0)
+ return false;
+
+ if (ntohs(iph->frag_off) & IP_OFFSET)
+ switch (protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_ICMP:
+ /* Port info not available for fragment offset > 0 */
+ return false;
+ default:
+ /* Other protocols doesn't have ports,
+ so we can match fragments */
+ *proto = protocol;
+ return true;
+ }
+
+ return get_port(skb, protocol, protooff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+bool
+ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ int protoff;
+ u8 nexthdr;
+ __be16 frag_off = 0;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return false;
+
+ return get_port(skb, nexthdr, protoff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
+#endif
+
+bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+ bool ret;
+ u8 proto;
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ ret = ip_set_get_ip4_port(skb, src, port, &proto);
+ break;
+ case NFPROTO_IPV6:
+ ret = ip_set_get_ip6_port(skb, src, port, &proto);
+ break;
+ default:
+ return false;
+ }
+ if (!ret)
+ return ret;
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ return true;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
new file mode 100644
index 00000000000..61c7fb05280
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -0,0 +1,1160 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _IP_SET_HASH_GEN_H
+#define _IP_SET_HASH_GEN_H
+
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#ifndef rcu_dereference_bh
+#define rcu_dereference_bh(p) rcu_dereference(p)
+#endif
+
+#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
+
+/* Hashing which uses arrays to resolve clashing. The hash table is resized
+ * (doubled) when searching becomes too long.
+ * Internally jhash is used with the assumption that the size of the
+ * stored data is a multiple of sizeof(u32). If storage supports timeout,
+ * the timeout field must be the last one in the data structure - that field
+ * is ignored when computing the hash key.
+ *
+ * Readers and resizing
+ *
+ * Resizing can be triggered by userspace command only, and those
+ * are serialized by the nfnl mutex. During resizing the set is
+ * read-locked, so the only possible concurrent operations are
+ * the kernel side readers. Those must be protected by proper RCU locking.
+ */
+
+/* Number of elements to store in an initial array block */
+#define AHASH_INIT_SIZE 4
+/* Max number of elements to store in an array block */
+#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE)
+
+/* Max number of elements can be tuned */
+#ifdef IP_SET_HASH_WITH_MULTI
+#define AHASH_MAX(h) ((h)->ahash_max)
+
+static inline u8
+tune_ahash_max(u8 curr, u32 multi)
+{
+ u32 n;
+
+ if (multi < curr)
+ return curr;
+
+ n = curr + AHASH_INIT_SIZE;
+ /* Currently, at listing one hash bucket must fit into a message.
+ * Therefore we have a hard limit here.
+ */
+ return n > curr && n <= 64 ? n : curr;
+}
+#define TUNE_AHASH_MAX(h, multi) \
+ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
+#else
+#define AHASH_MAX(h) AHASH_MAX_SIZE
+#define TUNE_AHASH_MAX(h, multi)
+#endif
+
+/* A hash bucket */
+struct hbucket {
+ void *value; /* the array of the values */
+ u8 size; /* size of the array */
+ u8 pos; /* position of the first free entry */
+};
+
+/* The hash table: the table size stored here in order to make resizing easy */
+struct htable {
+ u8 htable_bits; /* size of hash table == 2^htable_bits */
+ struct hbucket bucket[0]; /* hashtable buckets */
+};
+
+#define hbucket(h, i) (&((h)->bucket[i]))
+
+#ifndef IPSET_NET_COUNT
+#define IPSET_NET_COUNT 1
+#endif
+
+/* Book-keeping of the prefixes added to the set */
+struct net_prefixes {
+ u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */
+ u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */
+};
+
+/* Compute the hash table size */
+static size_t
+htable_size(u8 hbits)
+{
+ size_t hsize;
+
+ /* We must fit both into u32 in jhash and size_t */
+ if (hbits > 31)
+ return 0;
+ hsize = jhash_size(hbits);
+ if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket)
+ < hsize)
+ return 0;
+
+ return hsize * sizeof(struct hbucket) + sizeof(struct htable);
+}
+
+/* Compute htable_bits from the user input parameter hashsize */
+static u8
+htable_bits(u32 hashsize)
+{
+ /* Assume that hashsize == 2^htable_bits */
+ u8 bits = fls(hashsize - 1);
+ if (jhash_size(bits) != hashsize)
+ /* Round up to the first 2^n value */
+ bits = fls(hashsize);
+
+ return bits;
+}
+
+static int
+hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
+{
+ if (n->pos >= n->size) {
+ void *tmp;
+
+ if (n->size >= ahash_max)
+ /* Trigger rehashing */
+ return -EAGAIN;
+
+ tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ return -ENOMEM;
+ if (n->size) {
+ memcpy(tmp, n->value, n->size * dsize);
+ kfree(n->value);
+ }
+ n->value = tmp;
+ n->size += AHASH_INIT_SIZE;
+ }
+ return 0;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+#if IPSET_NET_COUNT > 1
+#define __CIDR(cidr, i) (cidr[i])
+#else
+#define __CIDR(cidr, i) (cidr)
+#endif
+#ifdef IP_SET_HASH_WITH_NETS_PACKED
+/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */
+#define CIDR(cidr, i) (__CIDR(cidr, i) + 1)
+#else
+#define CIDR(cidr, i) (__CIDR(cidr, i))
+#endif
+
+#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
+
+#ifdef IP_SET_HASH_WITH_MULTI
+#define NLEN(family) (SET_HOST_MASK(family) + 1)
+#else
+#define NLEN(family) SET_HOST_MASK(family)
+#endif
+
+#else
+#define NLEN(family) 0
+#endif /* IP_SET_HASH_WITH_NETS */
+
+#endif /* _IP_SET_HASH_GEN_H */
+
+/* Family dependent templates */
+
+#undef ahash_data
+#undef mtype_data_equal
+#undef mtype_do_data_match
+#undef mtype_data_set_flags
+#undef mtype_data_reset_flags
+#undef mtype_data_netmask
+#undef mtype_data_list
+#undef mtype_data_next
+#undef mtype_elem
+
+#undef mtype_ahash_destroy
+#undef mtype_ext_cleanup
+#undef mtype_add_cidr
+#undef mtype_del_cidr
+#undef mtype_ahash_memsize
+#undef mtype_flush
+#undef mtype_destroy
+#undef mtype_gc_init
+#undef mtype_same_set
+#undef mtype_kadt
+#undef mtype_uadt
+#undef mtype
+
+#undef mtype_add
+#undef mtype_del
+#undef mtype_test_cidrs
+#undef mtype_test
+#undef mtype_expire
+#undef mtype_resize
+#undef mtype_head
+#undef mtype_list
+#undef mtype_gc
+#undef mtype_gc_init
+#undef mtype_variant
+#undef mtype_data_match
+
+#undef HKEY
+
+#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
+#ifdef IP_SET_HASH_WITH_NETS
+#define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match)
+#else
+#define mtype_do_data_match(d) 1
+#endif
+#define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags)
+#define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem)
+#define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags)
+#define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask)
+#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list)
+#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next)
+#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy)
+#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
+#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr)
+#define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr)
+#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize)
+#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
+#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
+#define mtype MTYPE
+
+#define mtype_add IPSET_TOKEN(MTYPE, _add)
+#define mtype_del IPSET_TOKEN(MTYPE, _del)
+#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
+#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
+#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
+#define mtype_head IPSET_TOKEN(MTYPE, _head)
+#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
+#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
+
+#ifndef HKEY_DATALEN
+#define HKEY_DATALEN sizeof(struct mtype_elem)
+#endif
+
+#define HKEY(data, initval, htable_bits) \
+(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \
+ & jhash_mask(htable_bits))
+
+#ifndef htype
+#define htype HTYPE
+
+/* The generic hash structure */
+struct htype {
+ struct htable __rcu *table; /* the hash table */
+ u32 maxelem; /* max elements in the hash */
+ u32 elements; /* current element (vs timeout) */
+ u32 initval; /* random jhash init value */
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ u32 markmask; /* markmask value for mark mask to store */
+#endif
+ struct timer_list gc; /* garbage collection when timeout enabled */
+ struct mtype_elem next; /* temporary storage for uadd */
+#ifdef IP_SET_HASH_WITH_MULTI
+ u8 ahash_max; /* max elements in an array block */
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask; /* netmask value for subnets to store */
+#endif
+#ifdef IP_SET_HASH_WITH_RBTREE
+ struct rb_root rbtree;
+#endif
+#ifdef IP_SET_HASH_WITH_NETS
+ struct net_prefixes nets[0]; /* book-keeping of prefixes */
+#endif
+};
+#endif
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Network cidr size book keeping when the hash stores different
+ * sized networks */
+static void
+mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+{
+ int i, j;
+
+ /* Add in increasing prefix order, so larger cidr first */
+ for (i = 0, j = -1; i < nets_length && h->nets[i].nets[n]; i++) {
+ if (j != -1)
+ continue;
+ else if (h->nets[i].cidr[n] < cidr)
+ j = i;
+ else if (h->nets[i].cidr[n] == cidr) {
+ h->nets[i].nets[n]++;
+ return;
+ }
+ }
+ if (j != -1) {
+ for (; i > j; i--) {
+ h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
+ h->nets[i].nets[n] = h->nets[i - 1].nets[n];
+ }
+ }
+ h->nets[i].cidr[n] = cidr;
+ h->nets[i].nets[n] = 1;
+}
+
+static void
+mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+{
+ u8 i, j, net_end = nets_length - 1;
+
+ for (i = 0; i < nets_length; i++) {
+ if (h->nets[i].cidr[n] != cidr)
+ continue;
+ if (h->nets[i].nets[n] > 1 || i == net_end ||
+ h->nets[i + 1].nets[n] == 0) {
+ h->nets[i].nets[n]--;
+ return;
+ }
+ for (j = i; j < net_end && h->nets[j].nets[n]; j++) {
+ h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
+ h->nets[j].nets[n] = h->nets[j + 1].nets[n];
+ }
+ h->nets[j].nets[n] = 0;
+ return;
+ }
+}
+#endif
+
+/* Calculate the actual memory size of the set data */
+static size_t
+mtype_ahash_memsize(const struct htype *h, const struct htable *t,
+ u8 nets_length, size_t dsize)
+{
+ u32 i;
+ size_t memsize = sizeof(*h)
+ + sizeof(*t)
+#ifdef IP_SET_HASH_WITH_NETS
+ + sizeof(struct net_prefixes) * nets_length
+#endif
+ + jhash_size(t->htable_bits) * sizeof(struct hbucket);
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++)
+ memsize += t->bucket[i].size * dsize;
+
+ return memsize;
+}
+
+/* Get the ith element from the array block n */
+#define ahash_data(n, i, dsize) \
+ ((struct mtype_elem *)((n)->value + ((i) * (dsize))))
+
+static void
+mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
+{
+ int i;
+
+ for (i = 0; i < n->pos; i++)
+ ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
+}
+
+/* Flush a hash type of set: destroy all elements */
+static void
+mtype_flush(struct ip_set *set)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ struct hbucket *n;
+ u32 i;
+
+ t = rcu_dereference_bh_nfnl(h->table);
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ if (n->size) {
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ n->size = n->pos = 0;
+ /* FIXME: use slab cache */
+ kfree(n->value);
+ }
+ }
+#ifdef IP_SET_HASH_WITH_NETS
+ memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
+#endif
+ h->elements = 0;
+}
+
+/* Destroy the hashtable part of the set */
+static void
+mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
+{
+ struct hbucket *n;
+ u32 i;
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ if (n->size) {
+ if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ kfree(n->value);
+ }
+ }
+
+ ip_set_free(t);
+}
+
+/* Destroy a hash type of set */
+static void
+mtype_destroy(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (set->extensions & IPSET_EXT_TIMEOUT)
+ del_timer_sync(&h->gc);
+
+ mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true);
+#ifdef IP_SET_HASH_WITH_RBTREE
+ rbtree_destroy(&h->rbtree);
+#endif
+ kfree(h);
+
+ set->data = NULL;
+}
+
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct htype *h = set->data;
+
+ init_timer(&h->gc);
+ h->gc.data = (unsigned long) set;
+ h->gc.function = gc;
+ h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&h->gc);
+ pr_debug("gc initialized, run in every %u\n",
+ IPSET_GC_PERIOD(set->timeout));
+}
+
+static bool
+mtype_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct htype *x = a->data;
+ const struct htype *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ a->timeout == b->timeout &&
+#ifdef IP_SET_HASH_WITH_NETMASK
+ x->netmask == y->netmask &&
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ x->markmask == y->markmask &&
+#endif
+ a->extensions == b->extensions;
+}
+
+/* Delete expired elements from the hashtable */
+static void
+mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
+{
+ struct htable *t;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ u32 i;
+ int j;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 k;
+#endif
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, dsize);
+ if (ip_set_timeout_expired(ext_timeout(data, set))) {
+ pr_debug("expired %u/%u\n", i, j);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (k = 0; k < IPSET_NET_COUNT; k++)
+ mtype_del_cidr(h, CIDR(data->cidr, k),
+ nets_length, k);
+#endif
+ ip_set_ext_destroy(set, data);
+ if (j != n->pos - 1)
+ /* Not last one */
+ memcpy(data,
+ ahash_data(n, n->pos - 1, dsize),
+ dsize);
+ n->pos--;
+ h->elements--;
+ }
+ }
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ /* Still try to delete expired elements */
+ continue;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ }
+ rcu_read_unlock_bh();
+}
+
+static void
+mtype_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct htype *h = set->data;
+
+ pr_debug("called\n");
+ write_lock_bh(&set->lock);
+ mtype_expire(set, h, NLEN(set->family), set->dsize);
+ write_unlock_bh(&set->lock);
+
+ h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&h->gc);
+}
+
+/* Resize a hash: create a new hash table with doubling the hashsize
+ * and inserting the elements to it. Repeat until we succeed or
+ * fail due to memory pressures. */
+static int
+mtype_resize(struct ip_set *set, bool retried)
+{
+ struct htype *h = set->data;
+ struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table);
+ u8 htable_bits = orig->htable_bits;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 flags;
+#endif
+ struct mtype_elem *data;
+ struct mtype_elem *d;
+ struct hbucket *n, *m;
+ u32 i, j;
+ int ret;
+
+ /* Try to cleanup once */
+ if (SET_WITH_TIMEOUT(set) && !retried) {
+ i = h->elements;
+ write_lock_bh(&set->lock);
+ mtype_expire(set, set->data, NLEN(set->family), set->dsize);
+ write_unlock_bh(&set->lock);
+ if (h->elements < i)
+ return 0;
+ }
+
+retry:
+ ret = 0;
+ htable_bits++;
+ pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+ set->name, orig->htable_bits, htable_bits, orig);
+ if (!htable_bits) {
+ /* In case we have plenty of memory :-) */
+ pr_warning("Cannot increase the hashsize of set %s further\n",
+ set->name);
+ return -IPSET_ERR_HASH_FULL;
+ }
+ t = ip_set_alloc(sizeof(*t)
+ + jhash_size(htable_bits) * sizeof(struct hbucket));
+ if (!t)
+ return -ENOMEM;
+ t->htable_bits = htable_bits;
+
+ read_lock_bh(&set->lock);
+ for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+ n = hbucket(orig, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ flags = 0;
+ mtype_data_reset_flags(data, &flags);
+#endif
+ m = hbucket(t, HKEY(data, h->initval, htable_bits));
+ ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize);
+ if (ret < 0) {
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_reset_flags(data, &flags);
+#endif
+ read_unlock_bh(&set->lock);
+ mtype_ahash_destroy(set, t, false);
+ if (ret == -EAGAIN)
+ goto retry;
+ return ret;
+ }
+ d = ahash_data(m, m->pos++, set->dsize);
+ memcpy(d, data, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_reset_flags(d, &flags);
+#endif
+ }
+ }
+
+ rcu_assign_pointer(h->table, t);
+ read_unlock_bh(&set->lock);
+
+ /* Give time to other readers of the set */
+ synchronize_rcu_bh();
+
+ pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
+ orig->htable_bits, orig, t->htable_bits, t);
+ mtype_ahash_destroy(set, orig, false);
+
+ return 0;
+}
+
+/* Add an element to a hash and update the internal counters when succeeded,
+ * otherwise report the proper error code. */
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ const struct mtype_elem *d = value;
+ struct mtype_elem *data;
+ struct hbucket *n;
+ int i, ret = 0;
+ int j = AHASH_MAX(h) + 1;
+ bool flag_exist = flags & IPSET_FLAG_EXIST;
+ u32 key, multi = 0;
+
+ if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) {
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t,key);
+ if (n->pos) {
+ /* Choosing the first entry in the array to replace */
+ j = 0;
+ goto reuse_slot;
+ }
+ rcu_read_unlock_bh();
+ }
+ if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
+ /* FIXME: when set is full, we slow down here */
+ mtype_expire(set, h, NLEN(set->family), set->dsize);
+
+ if (h->elements >= h->maxelem) {
+ if (net_ratelimit())
+ pr_warning("Set %s is full, maxelem %u reached\n",
+ set->name, h->maxelem);
+ return -IPSET_ERR_HASH_FULL;
+ }
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (mtype_data_equal(data, d, &multi)) {
+ if (flag_exist ||
+ (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)))) {
+ /* Just the extensions could be overwritten */
+ j = i;
+ goto reuse_slot;
+ } else {
+ ret = -IPSET_ERR_EXIST;
+ goto out;
+ }
+ }
+ /* Reuse first timed out entry */
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)) &&
+ j != AHASH_MAX(h) + 1)
+ j = i;
+ }
+reuse_slot:
+ if (j != AHASH_MAX(h) + 1) {
+ /* Fill out reused slot */
+ data = ahash_data(n, j, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (i = 0; i < IPSET_NET_COUNT; i++) {
+ mtype_del_cidr(h, CIDR(data->cidr, i),
+ NLEN(set->family), i);
+ mtype_add_cidr(h, CIDR(d->cidr, i),
+ NLEN(set->family), i);
+ }
+#endif
+ ip_set_ext_destroy(set, data);
+ } else {
+ /* Use/create a new slot */
+ TUNE_AHASH_MAX(h, multi);
+ ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize);
+ if (ret != 0) {
+ if (ret == -EAGAIN)
+ mtype_data_next(&h->next, d);
+ goto out;
+ }
+ data = ahash_data(n, n->pos++, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ mtype_add_cidr(h, CIDR(d->cidr, i), NLEN(set->family),
+ i);
+#endif
+ h->elements++;
+ }
+ memcpy(data, d, sizeof(struct mtype_elem));
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_set_flags(data, flags);
+#endif
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(data, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(data, set), ext);
+
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+/* Delete an element from the hash: swap it with the last element
+ * and free up space if possible.
+ */
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ const struct mtype_elem *d = value;
+ struct mtype_elem *data;
+ struct hbucket *n;
+ int i, ret = -IPSET_ERR_EXIST;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 j;
+#endif
+ u32 key, multi = 0;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)))
+ goto out;
+ if (i != n->pos - 1)
+ /* Not last one */
+ memcpy(data, ahash_data(n, n->pos - 1, set->dsize),
+ set->dsize);
+
+ n->pos--;
+ h->elements--;
+#ifdef IP_SET_HASH_WITH_NETS
+ for (j = 0; j < IPSET_NET_COUNT; j++)
+ mtype_del_cidr(h, CIDR(d->cidr, j), NLEN(set->family),
+ j);
+#endif
+ ip_set_ext_destroy(set, data);
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * set->dsize,
+ GFP_ATOMIC);
+ if (!tmp) {
+ ret = 0;
+ goto out;
+ }
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * set->dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ ret = 0;
+ goto out;
+ }
+
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+static inline int
+mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, struct ip_set *set, u32 flags)
+{
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(data, set),
+ ext, mext, flags);
+ return mtype_do_data_match(data);
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Special test function which takes into account the different network
+ * sizes added to the set */
+static int
+mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
+ const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t = rcu_dereference_bh(h->table);
+ struct hbucket *n;
+ struct mtype_elem *data;
+#if IPSET_NET_COUNT == 2
+ struct mtype_elem orig = *d;
+ int i, j = 0, k;
+#else
+ int i, j = 0;
+#endif
+ u32 key, multi = 0;
+ u8 nets_length = NLEN(set->family);
+
+ pr_debug("test by nets\n");
+ for (; j < nets_length && h->nets[j].nets[0] && !multi; j++) {
+#if IPSET_NET_COUNT == 2
+ mtype_data_reset_elem(d, &orig);
+ mtype_data_netmask(d, h->nets[j].cidr[0], false);
+ for (k = 0; k < nets_length && h->nets[k].nets[1] && !multi;
+ k++) {
+ mtype_data_netmask(d, h->nets[k].cidr[1], true);
+#else
+ mtype_data_netmask(d, h->nets[j].cidr[0]);
+#endif
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set)) {
+ if (!ip_set_timeout_expired(
+ ext_timeout(data, set)))
+ return mtype_data_match(data, ext,
+ mext, set,
+ flags);
+#ifdef IP_SET_HASH_WITH_MULTI
+ multi = 0;
+#endif
+ } else
+ return mtype_data_match(data, ext,
+ mext, set, flags);
+ }
+#if IPSET_NET_COUNT == 2
+ }
+#endif
+ }
+ return 0;
+}
+#endif
+
+/* Test whether the element is added to the set */
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ struct mtype_elem *d = value;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ int i, ret = 0;
+ u32 key, multi = 0;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+#ifdef IP_SET_HASH_WITH_NETS
+ /* If we test an IP address and not a network address,
+ * try all possible network sizes */
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ if (CIDR(d->cidr, i) != SET_HOST_MASK(set->family))
+ break;
+ if (i == IPSET_NET_COUNT) {
+ ret = mtype_test_cidrs(set, d, ext, mext, flags);
+ goto out;
+ }
+#endif
+
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (mtype_data_equal(data, d, &multi) &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)))) {
+ ret = mtype_data_match(data, ext, mext, set, flags);
+ goto out;
+ }
+ }
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+/* Reply a HEADER request: fill out the header part of the set */
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct htype *h = set->data;
+ const struct htable *t;
+ struct nlattr *nested;
+ size_t memsize;
+
+ t = rcu_dereference_bh_nfnl(h->table);
+ memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
+ htonl(jhash_size(t->htable_bits))) ||
+ nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
+ goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (h->netmask != HOST_MASK &&
+ nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+ goto nla_put_failure;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
+ goto nla_put_failure;
+#endif
+ if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Reply a LIST/SAVE request: dump the elements of the specified set */
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct htype *h = set->data;
+ const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+ struct nlattr *atd, *nested;
+ const struct hbucket *n;
+ const struct mtype_elem *e;
+ u32 first = cb->args[IPSET_CB_ARG0];
+ /* We assume that one hash bucket fills into one page */
+ void *incomplete;
+ int i;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ pr_debug("list hash set %s\n", set->name);
+ for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
+ cb->args[IPSET_CB_ARG0]++) {
+ incomplete = skb_tail_pointer(skb);
+ n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ pr_debug("cb->arg bucket: %lu, t %p n %p\n",
+ cb->args[IPSET_CB_ARG0], t, n);
+ for (i = 0; i < n->pos; i++) {
+ e = ahash_data(n, i, set->dsize);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ pr_debug("list hash %lu hbucket %p i %u, data %p\n",
+ cb->args[IPSET_CB_ARG0], n, i, e);
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (cb->args[IPSET_CB_ARG0] == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_data_list(skb, e))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, e, true))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, incomplete);
+ if (unlikely(first == cb->args[IPSET_CB_ARG0])) {
+ pr_warning("Can't list set %s: one bucket does not fit into "
+ "a message. Please report it!\n", set->name);
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, atd);
+ return 0;
+}
+
+static int
+IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt);
+
+static int
+IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+
+static const struct ip_set_type_variant mtype_variant = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .resize = mtype_resize,
+ .same_set = mtype_same_set,
+};
+
+#ifdef IP_SET_EMIT_CREATE
+static int
+IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
+ struct nlattr *tb[], u32 flags)
+{
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ u32 markmask;
+#endif
+ u8 hbits;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask;
+#endif
+ size_t hsize;
+ struct HTYPE *h;
+ struct htable *t;
+
+ if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ markmask = 0xffffffff;
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+ netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+ pr_debug("Create set %s with family %s\n",
+ set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+#endif
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
+#endif
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if ((set->family == NFPROTO_IPV4 && netmask > 32) ||
+ (set->family == NFPROTO_IPV6 && netmask > 128) ||
+ netmask == 0)
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ if (tb[IPSET_ATTR_MARKMASK]) {
+ markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+
+ if ((markmask > 4294967295u) || markmask == 0)
+ return -IPSET_ERR_INVALID_MARKMASK;
+ }
+#endif
+
+ hsize = sizeof(*h);
+#ifdef IP_SET_HASH_WITH_NETS
+ hsize += sizeof(struct net_prefixes) *
+ (set->family == NFPROTO_IPV4 ? 32 : 128);
+#endif
+ h = kzalloc(hsize, GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ h->netmask = netmask;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ h->markmask = markmask;
+#endif
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ hsize = htable_size(hbits);
+ if (hsize == 0) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ t = ip_set_alloc(hsize);
+ if (!t) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ t->htable_bits = hbits;
+ rcu_assign_pointer(h->table, t);
+
+ set->data = h;
+ if (set->family == NFPROTO_IPV4) {
+ set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
+ } else {
+ set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)));
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ if (set->family == NFPROTO_IPV4)
+ IPSET_TOKEN(HTYPE, 4_gc_init)(set,
+ IPSET_TOKEN(HTYPE, 4_gc));
+ else
+ IPSET_TOKEN(HTYPE, 6_gc_init)(set,
+ IPSET_TOKEN(HTYPE, 6_gc));
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(t->htable_bits),
+ t->htable_bits, h->maxelem, set->data, t);
+
+ return 0;
+}
+#endif /* IP_SET_EMIT_CREATE */
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
new file mode 100644
index 00000000000..dd40607f878
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -0,0 +1,315 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counters support */
+/* 2 Comments support */
+#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip");
+
+/* Type specific function prefix */
+#define HTYPE hash_ip
+#define IP_SET_HASH_WITH_NETMASK
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ip4_elem {
+ /* Zero valued IP addresses cannot be stored */
+ __be32 ip;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ip4_data_equal(const struct hash_ip4_elem *e1,
+ const struct hash_ip4_elem *e2,
+ u32 *multi)
+{
+ return e1->ip == e2->ip;
+}
+
+static inline bool
+hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
+{
+ next->ip = e->ip;
+}
+
+#define MTYPE hash_ip4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip4_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ __be32 ip;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
+ ip &= ip_set_netmask(h->netmask);
+ if (ip == 0)
+ return -EINVAL;
+
+ e.ip = ip;
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip4_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, hosts;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ip &= ip_set_hostmask(h->netmask);
+
+ if (adt == IPSET_TEST) {
+ e.ip = htonl(ip);
+ if (e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
+ return adtfn(set, &e, &ext, &ext, flags);
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip += hosts) {
+ e.ip = htonl(ip);
+ if (e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+/* Member elements */
+struct hash_ip6_elem {
+ union nf_inet_addr ip;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
+ const struct hash_ip6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
+}
+
+static inline void
+hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip6_netmask(ip, prefix);
+}
+
+static bool
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ip6
+#define PF 6
+#define HOST_MASK 128
+
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip6_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ hash_ip6_netmask(&e.ip, h->netmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip6_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ hash_ip6_netmask(&e.ip, h->netmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -IPSET_ERR_HASH_ELEM;
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_ip_type __read_mostly = {
+ .name = "hash:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ip_init(void)
+{
+ return ip_set_type_register(&hash_ip_type);
+}
+
+static void __exit
+hash_ip_fini(void)
+{
+ ip_set_type_unregister(&hash_ip_type);
+}
+
+module_init(hash_ip_init);
+module_exit(hash_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
new file mode 100644
index 00000000000..4eff0a29725
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -0,0 +1,321 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mark type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
+IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mark");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipmark
+#define IP_SET_HASH_WITH_MARKMASK
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmark4_elem {
+ __be32 ip;
+ __u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
+ const struct hash_ipmark4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark4_data_list(struct sk_buff *skb,
+ const struct hash_ipmark4_elem *data)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
+ const struct hash_ipmark4_elem *d)
+{
+ next->ip = d->ip;
+}
+
+#define MTYPE hash_ipmark4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.mark = skb->mark;
+ e.mark &= h->markmask;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip, ip_to = 0;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ e.mark &= h->markmask;
+
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip = ntohl(e.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ e.ip = htonl(ip);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipmark6_elem {
+ union nf_inet_addr ip;
+ __u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
+ const struct hash_ipmark6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark6_data_list(struct sk_buff *skb,
+ const struct hash_ipmark6_elem *data)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+ const struct hash_ipmark6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipmark6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+
+static int
+hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.mark = skb->mark;
+ e.mark &= h->markmask;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ e.mark &= h->markmask;
+
+ if (adt == IPSET_TEST) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static struct ip_set_type hash_ipmark_type __read_mostly = {
+ .name = "hash:ip,mark",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MARK,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipmark_create,
+ .create_policy = {
+ [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_MARK] = { .type = NLA_U32 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipmark_init(void)
+{
+ return ip_set_type_register(&hash_ipmark_type);
+}
+
+static void __exit
+hash_ipmark_fini(void)
+{
+ ip_set_type_unregister(&hash_ipmark_type);
+}
+
+module_init(hash_ipmark_init);
+module_exit(hash_ipmark_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
new file mode 100644
index 00000000000..7597b82a8b0
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -0,0 +1,390 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Counters support added */
+/* 3 Comments support added */
+#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipport
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
+ const struct hash_ipport4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipport4_data_list(struct sk_buff *skb,
+ const struct hash_ipport4_elem *data)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipport4_data_next(struct hash_ipport4_elem *next,
+ const struct hash_ipport4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+}
+
+#define MTYPE hash_ipport4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipport4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip, ip_to = 0, p = 0, port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip = ntohl(e.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ port_to = port = ntohs(e.port);
+ if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.ip = htonl(ip);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
+ const struct hash_ipport6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipport6_data_list(struct sk_buff *skb,
+ const struct hash_ipport6_elem *data)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipport6_data_next(struct hash_ipport4_elem *next,
+ const struct hash_ipport6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipport6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_ipport6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_ipport_type __read_mostly = {
+ .name = "hash:ip,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipport_init(void)
+{
+ return ip_set_type_register(&hash_ipport_type);
+}
+
+static void __exit
+hash_ipport_fini(void)
+{
+ ip_set_type_unregister(&hash_ipport_type);
+}
+
+module_init(hash_ipport_init);
+module_exit(hash_ipport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
new file mode 100644
index 00000000000..672655ffd57
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -0,0 +1,402 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Counters support added */
+/* 3 Comments support added */
+#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port,ip");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipportip
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipportip4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+static inline bool
+hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
+ const struct hash_ipportip4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->ip2 == ip2->ip2 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipportip4_data_list(struct sk_buff *skb,
+ const struct hash_ipportip4_elem *data)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
+ const struct hash_ipportip4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+}
+
+/* Common functions */
+#define MTYPE hash_ipportip4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip, ip_to = 0, p = 0, port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip = ntohl(e.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ port_to = port = ntohs(e.port);
+ if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.ip = htonl(ip);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipportip6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
+ const struct hash_ipportip6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipportip6_data_list(struct sk_buff *skb,
+ const struct hash_ipportip6_elem *data)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
+ const struct hash_ipportip6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_ipportip6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_ipportip_type __read_mostly = {
+ .name = "hash:ip,port,ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipportip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipportip_init(void)
+{
+ return ip_set_type_register(&hash_ipportip_type);
+}
+
+static void __exit
+hash_ipportip_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportip_type);
+}
+
+module_init(hash_ipportip_init);
+module_exit(hash_ipportip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
new file mode 100644
index 00000000000..7308d84f927
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -0,0 +1,561 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Range as input support for IPv4 added */
+/* 3 nomatch flag support added */
+/* 4 Counters support added */
+/* 5 Comments support added */
+#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipportnet
+
+/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0
+ * However this way we have to store internally cidr - 1,
+ * dancing back and forth.
+ */
+#define IP_SET_HASH_WITH_NETS_PACKED
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipportnet4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr:7;
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
+ const struct hash_ipportnet4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->ip2 == ip2->ip2 &&
+ ip1->cidr == ip2->cidr &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
+{
+ elem->ip2 &= ip_set_netmask(cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_ipportnet4_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
+ const struct hash_ipportnet4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+ next->ip2 = d->ip2;
+}
+
+#define MTYPE hash_ipportnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+ e.ip2 &= ip_set_netmask(e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, p = 0, port, port_to;
+ u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports ||
+ tb[IPSET_ATTR_IP2_TO])) {
+ e.ip = htonl(ip);
+ e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ port_to = port = ntohs(e.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ ip2_to = ip2_from;
+ if (tb[IPSET_ATTR_IP2_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ if (ret)
+ return ret;
+ if (ip2_from > ip2_to)
+ swap(ip2_from, ip2_to);
+ if (ip2_from + UINT_MAX == ip2_to)
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ e.ip = htonl(ip);
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.port = htons(p);
+ ip2 = retried &&
+ ip == ntohl(h->next.ip) &&
+ p == ntohs(h->next.port)
+ ? ntohl(h->next.ip2) : ip2_from;
+ while (!after(ip2, ip2_to)) {
+ e.ip2 = htonl(ip2);
+ ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+ &cidr);
+ e.cidr = cidr - 1;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip2 = ip2_last + 1;
+ }
+ }
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipportnet6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr:7;
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
+ const struct hash_ipportnet6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) &&
+ ip1->cidr == ip2->cidr &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip2, cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_ipportnet6_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
+ const struct hash_ipportnet6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_ipportnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+ ip6_netmask(&e.ip2, e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+
+ ip6_netmask(&e.ip2, e.cidr + 1);
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_ipportnet_type __read_mostly = {
+ .name = "hash:ip,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipportnet_init(void)
+{
+ return ip_set_type_register(&hash_ipportnet_type);
+}
+
+static void __exit
+hash_ipportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportnet_type);
+}
+
+module_init(hash_ipportnet_init);
+module_exit(hash_ipportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
new file mode 100644
index 00000000000..4c7d495783a
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -0,0 +1,397 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Range as input support for IPv4 added */
+/* 2 nomatch flag support added */
+/* 3 Counters support added */
+/* 4 Comments support added */
+#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net");
+
+/* Type specific function prefix */
+#define HTYPE hash_net
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_net4_elem {
+ __be32 ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+};
+
+/* Common functions */
+
+static inline bool
+hash_net4_data_equal(const struct hash_net4_elem *ip1,
+ const struct hash_net4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_net4_do_data_match(const struct hash_net4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_net4_data_next(struct hash_net4_elem *next,
+ const struct hash_net4_elem *d)
+{
+ next->ip = d->ip;
+}
+
+#define MTYPE hash_net4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_net *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_net *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net4_elem e = { .cidr = HOST_MASK };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, last;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr || e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret:
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ }
+ if (retried)
+ ip = ntohl(h->next.ip);
+ while (!after(ip, ip_to)) {
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_net6_elem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+};
+
+/* Common functions */
+
+static inline bool
+hash_net6_data_equal(const struct hash_net6_elem *ip1,
+ const struct hash_net6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_net6_do_data_match(const struct hash_net6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_net6_data_next(struct hash_net4_elem *next,
+ const struct hash_net6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_net6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_net *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net6_elem e = { .cidr = HOST_MASK };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!e.cidr || e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&e.ip, e.cidr);
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_net_type __read_mostly = {
+ .name = "hash:net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_net_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_net_init(void)
+{
+ return ip_set_type_register(&hash_net_type);
+}
+
+static void __exit
+hash_net_fini(void)
+{
+ ip_set_type_unregister(&hash_net_type);
+}
+
+module_init(hash_net_init);
+module_exit(hash_net_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
new file mode 100644
index 00000000000..db2606805b3
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -0,0 +1,610 @@
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,iface type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 nomatch flag support added */
+/* 2 /0 support added */
+/* 3 Counters support added */
+/* 4 Comments support added */
+#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,iface");
+
+/* Interface name rbtree */
+
+struct iface_node {
+ struct rb_node node;
+ char iface[IFNAMSIZ];
+};
+
+#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface)
+
+static void
+rbtree_destroy(struct rb_root *root)
+{
+ struct iface_node *node, *next;
+
+ rbtree_postorder_for_each_entry_safe(node, next, root, node)
+ kfree(node);
+
+ *root = RB_ROOT;
+}
+
+static int
+iface_test(struct rb_root *root, const char **iface)
+{
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ const char *d = iface_data(n);
+ int res = strcmp(*iface, d);
+
+ if (res < 0)
+ n = n->rb_left;
+ else if (res > 0)
+ n = n->rb_right;
+ else {
+ *iface = d;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+iface_add(struct rb_root *root, const char **iface)
+{
+ struct rb_node **n = &(root->rb_node), *p = NULL;
+ struct iface_node *d;
+
+ while (*n) {
+ char *ifname = iface_data(*n);
+ int res = strcmp(*iface, ifname);
+
+ p = *n;
+ if (res < 0)
+ n = &((*n)->rb_left);
+ else if (res > 0)
+ n = &((*n)->rb_right);
+ else {
+ *iface = ifname;
+ return 0;
+ }
+ }
+
+ d = kzalloc(sizeof(*d), GFP_ATOMIC);
+ if (!d)
+ return -ENOMEM;
+ strcpy(d->iface, *iface);
+
+ rb_link_node(&d->node, p, n);
+ rb_insert_color(&d->node, root);
+
+ *iface = d->iface;
+ return 0;
+}
+
+/* Type specific function prefix */
+#define HTYPE hash_netiface
+#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_RBTREE
+#define IP_SET_HASH_WITH_MULTI
+
+#define STREQ(a, b) (strcmp(a, b) == 0)
+
+/* IPv4 variant */
+
+struct hash_netiface4_elem_hashed {
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+};
+
+/* Member elements */
+struct hash_netiface4_elem {
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
+ const struct hash_netiface4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->cidr == ip2->cidr &&
+ (++*multi) &&
+ ip1->physdev == ip2->physdev &&
+ ip1->iface == ip2->iface;
+}
+
+static inline int
+hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_netiface4_data_list(struct sk_buff *skb,
+ const struct hash_netiface4_elem *data)
+{
+ u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+ if (data->nomatch)
+ flags |= IPSET_FLAG_NOMATCH;
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netiface4_data_next(struct hash_netiface4_elem *next,
+ const struct hash_netiface4_elem *d)
+{
+ next->ip = d->ip;
+}
+
+#define MTYPE hash_netiface4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .elem = 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret;
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
+
+#define IFACE(dir) (par->dir ? par->dir->name : NULL)
+#define PHYSDEV(dir) (nf_bridge->dir ? nf_bridge->dir->name : NULL)
+#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
+
+ if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+#ifdef CONFIG_BRIDGE_NETFILTER
+ const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+ if (!nf_bridge)
+ return -EINVAL;
+ e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
+ e.physdev = 1;
+#else
+ e.iface = NULL;
+#endif
+ } else
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+ if (!e.iface)
+ return -EINVAL;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, last;
+ char iface[IFNAMSIZ];
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_IFACE] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
+ strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
+ e.iface = iface;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_PHYSDEV)
+ e.physdev = 1;
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+ if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ while (!after(ip, ip_to)) {
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netiface6_elem_hashed {
+ union nf_inet_addr ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+};
+
+struct hash_netiface6_elem {
+ union nf_inet_addr ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
+ const struct hash_netiface6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->cidr == ip2->cidr &&
+ (++*multi) &&
+ ip1->physdev == ip2->physdev &&
+ ip1->iface == ip2->iface;
+}
+
+static inline int
+hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_netiface6_data_list(struct sk_buff *skb,
+ const struct hash_netiface6_elem *data)
+{
+ u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+ if (data->nomatch)
+ flags |= IPSET_FLAG_NOMATCH;
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netiface6_data_next(struct hash_netiface4_elem *next,
+ const struct hash_netiface6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_netiface6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .elem = 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret;
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr);
+
+ if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+#ifdef CONFIG_BRIDGE_NETFILTER
+ const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+ if (!nf_bridge)
+ return -EINVAL;
+ e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
+ e.physdev = 1;
+#else
+ e.iface = NULL;
+#endif
+ } else
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+ if (!e.iface)
+ return -EINVAL;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ char iface[IFNAMSIZ];
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_IFACE] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip6_netmask(&e.ip, e.cidr);
+
+ strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
+ e.iface = iface;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_PHYSDEV)
+ e.physdev = 1;
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_netiface_type __read_mostly = {
+ .name = "hash:net,iface",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netiface_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netiface_init(void)
+{
+ return ip_set_type_register(&hash_netiface_type);
+}
+
+static void __exit
+hash_netiface_fini(void)
+{
+ ip_set_type_unregister(&hash_netiface_type);
+}
+
+module_init(hash_netiface_init);
+module_exit(hash_netiface_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
new file mode 100644
index 00000000000..3e99987e4bf
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -0,0 +1,481 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netnet
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variants */
+
+/* Member elements */
+struct hash_netnet4_elem {
+ union {
+ __be32 ip[2];
+ __be64 ipcmp;
+ };
+ u8 nomatch;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
+ const struct hash_netnet4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ipcmp == ip2->ipcmp &&
+ ip1->ccmp == ip2->ccmp;
+}
+
+static inline int
+hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
+ struct hash_netnet4_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
+{
+ if (inner) {
+ elem->ip[1] &= ip_set_netmask(cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ elem->ip[0] &= ip_set_netmask(cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netnet4_data_list(struct sk_buff *skb,
+ const struct hash_netnet4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_netnet4_data_next(struct hash_netnet4_elem *next,
+ const struct hash_netnet4_elem *d)
+{
+ next->ipcmp = d->ipcmp;
+}
+
+#define MTYPE hash_netnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
+ e.ip[0] &= ip_set_netmask(e.cidr[0]);
+ e.ip[1] &= ip_set_netmask(e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, last;
+ u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
+ u8 cidr, cidr2;
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[0] = cidr;
+ }
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr2 || cidr2 > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[1] = cidr2;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] &&
+ tb[IPSET_ATTR_IP2_TO])) {
+ e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ }
+
+ ip2_to = ip2_from;
+ if (tb[IPSET_ATTR_IP2_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ if (ret)
+ return ret;
+ if (ip2_to < ip2_from)
+ swap(ip2_from, ip2_to);
+ if (ip2_from + UINT_MAX == ip2_to)
+ return -IPSET_ERR_HASH_RANGE;
+
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip[0]);
+
+ while (!after(ip, ip_to)) {
+ e.ip[0] = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+ e.cidr[0] = cidr;
+ ip2 = (retried &&
+ ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
+ : ip2_from;
+ while (!after(ip2, ip2_to)) {
+ e.ip[1] = htonl(ip2);
+ last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2);
+ e.cidr[1] = cidr2;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip2 = last2 + 1;
+ }
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variants */
+
+struct hash_netnet6_elem {
+ union nf_inet_addr ip[2];
+ u8 nomatch;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
+ const struct hash_netnet6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
+ ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
+ ip1->ccmp == ip2->ccmp;
+}
+
+static inline int
+hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
+ struct hash_netnet6_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
+{
+ if (inner) {
+ ip6_netmask(&elem->ip[1], cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ ip6_netmask(&elem->ip[0], cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netnet6_data_list(struct sk_buff *skb,
+ const struct hash_netnet6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_netnet6_data_next(struct hash_netnet4_elem *next,
+ const struct hash_netnet6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
+ ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (tb[IPSET_ATTR_CIDR2])
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
+ e.cidr[1] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_netnet_type __read_mostly = {
+ .name = "hash:net,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netnet_init(void)
+{
+ return ip_set_type_register(&hash_netnet_type);
+}
+
+static void __exit
+hash_netnet_fini(void)
+{
+ ip_set_type_unregister(&hash_netnet_type);
+}
+
+module_init(hash_netnet_init);
+module_exit(hash_netnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
new file mode 100644
index 00000000000..1c645fbd09c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -0,0 +1,509 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Range as input support for IPv4 added */
+/* 3 nomatch flag support added */
+/* 4 Counters support added */
+/* 5 Comments support added */
+#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port");
+
+/* Type specific function prefix */
+#define HTYPE hash_netport
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0
+ * However this way we have to store internally cidr - 1,
+ * dancing back and forth.
+ */
+#define IP_SET_HASH_WITH_NETS_PACKED
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_netport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
+ const struct hash_netport4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_netport4_do_data_match(const struct hash_netport4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_netport4_data_list(struct sk_buff *skb,
+ const struct hash_netport4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netport4_data_next(struct hash_netport4_elem *next,
+ const struct hash_netport4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+}
+
+#define MTYPE hash_netport4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to, p = 0, ip = 0, ip_to = 0, last;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) {
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = port_to = ntohs(e.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port_to < port)
+ swap(port, port_to);
+ }
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ while (!after(ip, ip_to)) {
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+ e.cidr = cidr - 1;
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
+ const struct hash_netport6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_netport6_do_data_match(const struct hash_netport6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_netport6_data_list(struct sk_buff *skb,
+ const struct hash_netport6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netport6_data_next(struct hash_netport4_elem *next,
+ const struct hash_netport6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netport6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+ ip6_netmask(&e.ip, e.cidr + 1);
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_netport_type __read_mostly = {
+ .name = "hash:net,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netport_init(void)
+{
+ return ip_set_type_register(&hash_netport_type);
+}
+
+static void __exit
+hash_netport_fini(void)
+{
+ ip_set_type_unregister(&hash_netport_type);
+}
+
+module_init(hash_netport_init);
+module_exit(hash_netport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
new file mode 100644
index 00000000000..c0d2ba73f8b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -0,0 +1,587 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 0 Comments support added */
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netportnet
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_netportnet4_elem {
+ union {
+ __be32 ip[2];
+ __be64 ipcmp;
+ };
+ __be16 port;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
+ const struct hash_netportnet4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ipcmp == ip2->ipcmp &&
+ ip1->ccmp == ip2->ccmp &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
+ struct hash_netportnet4_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
+ u8 cidr, bool inner)
+{
+ if (inner) {
+ elem->ip[1] &= ip_set_netmask(cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ elem->ip[0] &= ip_set_netmask(cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netportnet4_data_list(struct sk_buff *skb,
+ const struct hash_netportnet4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
+ const struct hash_netportnet4_elem *d)
+{
+ next->ipcmp = d->ipcmp;
+ next->port = d->port;
+}
+
+#define MTYPE hash_netportnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]);
+ e.ip[0] &= ip_set_netmask(e.cidr[0]);
+ e.ip[1] &= ip_set_netmask(e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
+ u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
+ bool with_ports = false;
+ u8 cidr, cidr2;
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[0] = cidr;
+ }
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[1] = cidr;
+ }
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) {
+ e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ if (unlikely(ip + UINT_MAX == ip_to))
+ return -IPSET_ERR_HASH_RANGE;
+ }
+
+ port_to = port = ntohs(e.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ ip2_to = ip2_from;
+ if (tb[IPSET_ATTR_IP2_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ if (ret)
+ return ret;
+ if (ip2_from > ip2_to)
+ swap(ip2_from, ip2_to);
+ if (unlikely(ip2_from + UINT_MAX == ip2_to))
+ return -IPSET_ERR_HASH_RANGE;
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip[0]);
+
+ while (!after(ip, ip_to)) {
+ e.ip[0] = htonl(ip);
+ ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+ e.cidr[0] = cidr;
+ p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.port = htons(p);
+ ip2 = (retried && ip == ntohl(h->next.ip[0]) &&
+ p == ntohs(h->next.port)) ? ntohl(h->next.ip[1])
+ : ip2_from;
+ while (!after(ip2, ip2_to)) {
+ e.ip[1] = htonl(ip2);
+ ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+ &cidr2);
+ e.cidr[1] = cidr2;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip2 = ip2_last + 1;
+ }
+ }
+ ip = ip_last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netportnet6_elem {
+ union nf_inet_addr ip[2];
+ __be16 port;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
+ const struct hash_netportnet6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
+ ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
+ ip1->ccmp == ip2->ccmp &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
+ struct hash_netportnet6_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
+ u8 cidr, bool inner)
+{
+ if (inner) {
+ ip6_netmask(&elem->ip[1], cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ ip6_netmask(&elem->ip[0], cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netportnet6_data_list(struct sk_buff *skb,
+ const struct hash_netportnet6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
+ const struct hash_netportnet6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netportnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6);
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
+ ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (tb[IPSET_ATTR_CIDR2])
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+ if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
+ e.cidr[1] > HOST_MASK))
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_netportnet_type __read_mostly = {
+ .name = "hash:net,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netportnet_init(void)
+{
+ return ip_set_type_register(&hash_netportnet_type);
+}
+
+static void __exit
+hash_netportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_netportnet_type);
+}
+
+module_init(hash_netportnet_init);
+module_exit(hash_netportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
new file mode 100644
index 00000000000..3e2317f3cf6
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -0,0 +1,685 @@
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the list:set type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_list.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counters support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comments support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_list:set");
+
+/* Member elements */
+struct set_elem {
+ ip_set_id_t id;
+};
+
+struct set_adt_elem {
+ ip_set_id_t id;
+ ip_set_id_t refid;
+ int before;
+};
+
+/* Type structure */
+struct list_set {
+ u32 size; /* size of set list array */
+ struct timer_list gc; /* garbage collection */
+ struct net *net; /* namespace */
+ struct set_elem members[0]; /* the set members */
+};
+
+#define list_set_elem(set, map, id) \
+ (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize)
+
+static int
+list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i, cmdflags = opt->cmdflags;
+ int ret;
+
+ /* Don't lookup sub-counters at all */
+ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
+ if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
+ opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ ret = ip_set_test(e->id, skb, par, opt);
+ if (ret > 0) {
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(e, set),
+ ext, &opt->ext,
+ cmdflags);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static int
+list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ ret = ip_set_add(e->id, skb, par, opt);
+ if (ret == 0)
+ return ret;
+ }
+ return 0;
+}
+
+static int
+list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ ret = ip_set_del(e->id, skb, par, opt);
+ if (ret == 0)
+ return ret;
+ }
+ return 0;
+}
+
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ switch (adt) {
+ case IPSET_TEST:
+ return list_set_ktest(set, skb, par, opt, &ext);
+ case IPSET_ADD:
+ return list_set_kadd(set, skb, par, opt, &ext);
+ case IPSET_DEL:
+ return list_set_kdel(set, skb, par, opt, &ext);
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+static bool
+id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)
+{
+ const struct list_set *map = set->data;
+ const struct set_elem *e;
+
+ if (i >= map->size)
+ return 0;
+
+ e = list_set_elem(set, map, i);
+ return !!(e->id == id &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set))));
+}
+
+static int
+list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
+ const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(set, map, i);
+
+ if (e->id != IPSET_INVALID_ID) {
+ if (i == map->size - 1) {
+ /* Last element replaced: e.g. add new,before,last */
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ } else {
+ struct set_elem *x = list_set_elem(set, map,
+ map->size - 1);
+
+ /* Last element pushed off */
+ if (x->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(map->net, x->id);
+ ip_set_ext_destroy(set, x);
+ }
+ memmove(list_set_elem(set, map, i + 1), e,
+ set->dsize * (map->size - (i + 1)));
+ /* Extensions must be initialized to zero */
+ memset(e, 0, set->dsize);
+ }
+ }
+
+ e->id = d->id;
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ return 0;
+}
+
+static int
+list_set_del(struct ip_set *set, u32 i)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(set, map, i);
+
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+
+ if (i < map->size - 1)
+ memmove(e, list_set_elem(set, map, i + 1),
+ set->dsize * (map->size - (i + 1)));
+
+ /* Last element */
+ e = list_set_elem(set, map, map->size - 1);
+ e->id = IPSET_INVALID_ID;
+ return 0;
+}
+
+static void
+set_cleanup_entries(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i = 0;
+
+ while (i < map->size) {
+ e = list_set_elem(set, map, i);
+ if (e->id != IPSET_INVALID_ID &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ list_set_del(set, i);
+ /* Check element moved to position i in next loop */
+ else
+ i++;
+ }
+}
+
+static int
+list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ else if (e->id != d->id)
+ continue;
+
+ if (d->before == 0)
+ return 1;
+ else if (d->before > 0)
+ ret = id_eq(set, i + 1, d->refid);
+ else
+ ret = i > 0 && id_eq(set, i - 1, d->refid);
+ return ret;
+ }
+ return 0;
+}
+
+
+static int
+list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ bool flag_exist = flags & IPSET_FLAG_EXIST;
+ u32 i, ret = 0;
+
+ if (SET_WITH_TIMEOUT(set))
+ set_cleanup_entries(set);
+
+ /* Check already added element */
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto insert;
+ else if (e->id != d->id)
+ continue;
+
+ if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) ||
+ (d->before < 0 &&
+ (i == 0 || !id_eq(set, i - 1, d->refid))))
+ /* Before/after doesn't match */
+ return -IPSET_ERR_REF_EXIST;
+ if (!flag_exist)
+ /* Can't re-add */
+ return -IPSET_ERR_EXIST;
+ /* Update extensions */
+ ip_set_ext_destroy(set, e);
+
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ /* Set is already added to the list */
+ ip_set_put_byindex(map->net, d->id);
+ return 0;
+ }
+insert:
+ ret = -IPSET_ERR_LIST_FULL;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
+ : list_set_add(set, i, d, ext);
+ else if (e->id != d->refid)
+ continue;
+ else if (d->before > 0)
+ ret = list_set_add(set, i, d, ext);
+ else if (i + 1 < map->size)
+ ret = list_set_add(set, i + 1, d, ext);
+ }
+
+ return ret;
+}
+
+static int
+list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return d->before != 0 ? -IPSET_ERR_REF_EXIST
+ : -IPSET_ERR_EXIST;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ else if (e->id != d->id)
+ continue;
+
+ if (d->before == 0)
+ return list_set_del(set, i);
+ else if (d->before > 0) {
+ if (!id_eq(set, i + 1, d->refid))
+ return -IPSET_ERR_REF_EXIST;
+ return list_set_del(set, i);
+ } else if (i == 0 || !id_eq(set, i - 1, d->refid))
+ return -IPSET_ERR_REF_EXIST;
+ else
+ return list_set_del(set, i);
+ }
+ return -IPSET_ERR_EXIST;
+}
+
+static int
+list_set_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct list_set *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct set_adt_elem e = { .refid = IPSET_INVALID_ID };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ struct ip_set *s;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s);
+ if (e.id == IPSET_INVALID_ID)
+ return -IPSET_ERR_NAME;
+ /* "Loop detection" */
+ if (s->type->features & IPSET_TYPE_NAME) {
+ ret = -IPSET_ERR_LOOP;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ e.before = f & IPSET_FLAG_BEFORE;
+ }
+
+ if (e.before && !tb[IPSET_ATTR_NAMEREF]) {
+ ret = -IPSET_ERR_BEFORE;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_NAMEREF]) {
+ e.refid = ip_set_get_byname(map->net,
+ nla_data(tb[IPSET_ATTR_NAMEREF]),
+ &s);
+ if (e.refid == IPSET_INVALID_ID) {
+ ret = -IPSET_ERR_NAMEREF;
+ goto finish;
+ }
+ if (!e.before)
+ e.before = -1;
+ }
+ if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set))
+ set_cleanup_entries(set);
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+finish:
+ if (e.refid != IPSET_INVALID_ID)
+ ip_set_put_byindex(map->net, e.refid);
+ if (adt != IPSET_ADD || ret)
+ ip_set_put_byindex(map->net, e.id);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static void
+list_set_flush(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ e->id = IPSET_INVALID_ID;
+ }
+ }
+}
+
+static void
+list_set_destroy(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+ list_set_flush(set);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->size * set->dsize)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+list_set_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 i, first = cb->args[IPSET_CB_ARG0];
+ const struct set_elem *e;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[IPSET_CB_ARG0] < map->size;
+ cb->args[IPSET_CB_ARG0]++) {
+ i = cb->args[IPSET_CB_ARG0];
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto finish;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (i == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (nla_put_string(skb, IPSET_ATTR_NAME,
+ ip_set_name_byindex(map->net, e->id)))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, e, true))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+finish:
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ if (unlikely(i == first)) {
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, atd);
+ return 0;
+}
+
+static bool
+list_set_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct list_set *x = a->data;
+ const struct list_set *y = b->data;
+
+ return x->size == y->size &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+static const struct ip_set_type_variant set_variant = {
+ .kadt = list_set_kadt,
+ .uadt = list_set_uadt,
+ .adt = {
+ [IPSET_ADD] = list_set_uadd,
+ [IPSET_DEL] = list_set_udel,
+ [IPSET_TEST] = list_set_utest,
+ },
+ .destroy = list_set_destroy,
+ .flush = list_set_flush,
+ .head = list_set_head,
+ .list = list_set_list,
+ .same_set = list_set_same_set,
+};
+
+static void
+list_set_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct list_set *map = set->data;
+
+ write_lock_bh(&set->lock);
+ set_cleanup_entries(set);
+ write_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct list_set *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create list:set type of sets */
+
+static bool
+init_list_set(struct net *net, struct ip_set *set, u32 size)
+{
+ struct list_set *map;
+ struct set_elem *e;
+ u32 i;
+
+ map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL);
+ if (!map)
+ return false;
+
+ map->size = size;
+ map->net = net;
+ set->data = map;
+
+ for (i = 0; i < size; i++) {
+ e = list_set_elem(set, map, i);
+ e->id = IPSET_INVALID_ID;
+ }
+
+ return true;
+}
+
+static int
+list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ u32 size = IP_SET_LIST_DEFAULT_SIZE;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_SIZE])
+ size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]);
+ if (size < IP_SET_LIST_MIN_SIZE)
+ size = IP_SET_LIST_MIN_SIZE;
+
+ set->variant = &set_variant;
+ set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem));
+ if (!init_list_set(net, set, size))
+ return -ENOMEM;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ list_set_gc_init(set, list_set_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type list_set_type __read_mostly = {
+ .name = "list:set",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = list_set_create,
+ .create_policy = {
+ [IPSET_ATTR_SIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_NAME] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+list_set_init(void)
+{
+ return ip_set_type_register(&list_set_type);
+}
+
+static void __exit
+list_set_fini(void)
+{
+ ip_set_type_unregister(&list_set_type);
+}
+
+module_init(list_set_init);
+module_exit(list_set_fini);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
new file mode 100644
index 00000000000..04d15fdc99e
--- /dev/null
+++ b/net/netfilter/ipset/pfxlen.c
@@ -0,0 +1,313 @@
+#include <linux/export.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+
+/*
+ * Prefixlen maps for fast conversions, by Jan Engelhardt.
+ */
+
+#define E(a, b, c, d) \
+ {.ip6 = { \
+ htonl(a), htonl(b), \
+ htonl(c), htonl(d), \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use prefixlen_netmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_netmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_netmask_map);
+
+#undef E
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32) a, (__force __be32) b, \
+ (__force __be32) c, (__force __be32) d, \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use prefixlen_hostmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_hostmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
+
+/* Find the largest network which matches the range from left, in host order. */
+u32
+ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+ u32 last;
+ u8 i;
+
+ for (i = 1; i < 32; i++) {
+ if ((from & ip_set_hostmask(i)) != from)
+ continue;
+ last = from | ~ip_set_hostmask(i);
+ if (!after(last, to)) {
+ *cidr = i;
+ return last;
+ }
+ }
+ *cidr = 32;
+ return from;
+}
+EXPORT_SYMBOL_GPL(ip_set_range_to_cidr);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 70bd1d0774c..0c3b1670b0d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -28,12 +28,11 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
+ select IP6_NF_IPTABLES
---help---
- Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+ Add IPv6 support to IPVS.
- See http://www.mindbasket.com/ipvs for more information.
-
- Say N if unsure.
+ Say Y if unsure.
config IP_VS_DEBUG
bool "IP virtual server debugging"
@@ -122,7 +121,6 @@ config IP_VS_RR
config IP_VS_WRR
tristate "weighted round-robin scheduling"
- select GCD
---help---
The weighted robin-robin scheduling algorithm directs network
connections to different real servers based on server weights
@@ -232,11 +230,27 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+comment 'IPVS SH scheduler'
+
+config IP_VS_SH_TAB_BITS
+ int "IPVS source hashing table size (the Nth power of 2)"
+ range 4 20
+ default 8
+ ---help---
+ The source hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is tiled by each destination
+ until all slots in the table are filled. When using weights to
+ allow destinations to receive more connections, the table is
+ tiled an amount proportional to the weights specified. The table
+ needs to be large enough to effectively fit all the destinations
+ multiplied by their respective weights.
+
comment 'IPVS application helper'
config IP_VS_FTP
tristate "FTP protocol helper"
- depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
+ NF_CONNTRACK_FTP
select IP_VS_NFCT
---help---
FTP is a protocol that transfers IP address and/or port number in
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a475edee091..dfd7b65b3d2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -31,7 +31,6 @@
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/tcp.h>
-#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -43,11 +42,8 @@ EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);
-/* ipvs application list head */
-static LIST_HEAD(ip_vs_app_list);
static DEFINE_MUTEX(__ip_vs_app_mutex);
-
/*
* Get an ip_vs_app object
*/
@@ -62,12 +58,25 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
module_put(app->module);
}
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
/*
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
-ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
struct ip_vs_protocol *pp;
struct ip_vs_app *inc;
@@ -98,7 +107,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
}
}
- ret = pp->register_app(inc);
+ ret = pp->register_app(net, inc);
if (ret)
goto out;
@@ -109,8 +118,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
return 0;
out:
- kfree(inc->timeout_table);
- kfree(inc);
+ ip_vs_app_inc_destroy(inc);
return ret;
}
@@ -119,7 +127,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
* Release app incarnation
*/
static void
-ip_vs_app_inc_release(struct ip_vs_app *inc)
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
@@ -127,15 +135,14 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
return;
if (pp->unregister_app)
- pp->unregister_app(inc);
+ pp->unregister_app(net, inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, ntohs(inc->port));
list_del(&inc->a_list);
- kfree(inc->timeout_table);
- kfree(inc);
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
}
@@ -147,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
- atomic_inc(&inc->usecnt);
- if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
- atomic_dec(&inc->usecnt);
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
return result;
}
@@ -159,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
- ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
}
@@ -168,13 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
* Register an application incarnation in protocol applications
*/
int
-register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
int result;
mutex_lock(&__ip_vs_app_mutex);
- result = ip_vs_app_inc_new(app, proto, port);
+ result = ip_vs_app_inc_new(net, app, proto, port);
mutex_unlock(&__ip_vs_app_mutex);
@@ -182,51 +190,79 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
}
-/*
- * ip_vs_app registration routine
- */
-int register_ip_vs_app(struct ip_vs_app *app)
+/* Register application for netns */
+struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
- /* increase the module use count */
- ip_vs_use_count_inc();
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a;
+ int err = 0;
+
+ if (!ipvs)
+ return ERR_PTR(-ENOENT);
mutex_lock(&__ip_vs_app_mutex);
- list_add(&app->a_list, &ip_vs_app_list);
+ list_for_each_entry(a, &ipvs->app_list, a_list) {
+ if (!strcmp(app->name, a->name)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ a = kmemdup(app, sizeof(*app), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ INIT_LIST_HEAD(&a->incs_list);
+ list_add(&a->a_list, &ipvs->app_list);
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+out_unlock:
mutex_unlock(&__ip_vs_app_mutex);
- return 0;
+ return err ? ERR_PTR(err) : a;
}
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
*/
-void unregister_ip_vs_app(struct ip_vs_app *app)
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
- struct ip_vs_app *inc, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a, *anxt, *inc, *nxt;
+
+ if (!ipvs)
+ return;
mutex_lock(&__ip_vs_app_mutex);
- list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
- ip_vs_app_inc_release(inc);
- }
+ list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
+ if (app && strcmp(app->name, a->name))
+ continue;
+ list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
+ ip_vs_app_inc_release(net, inc);
+ }
- list_del(&app->a_list);
+ list_del(&a->a_list);
+ kfree(a);
- mutex_unlock(&__ip_vs_app_mutex);
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+ }
- /* decrease the module use count */
- ip_vs_use_count_dec();
+ mutex_unlock(&__ip_vs_app_mutex);
}
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
-int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
{
return pp->app_conn_bind(cp);
}
@@ -313,17 +349,17 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
* Assumes already checked proto==IPPROTO_TCP and diff!=0.
*/
static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
- unsigned flag, __u32 seq, int diff)
+ unsigned int flag, __u32 seq, int diff)
{
/* spinlock is to keep updating cp->flags atomic */
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
vseq->previous_delta = vseq->delta;
vseq->delta += diff;
vseq->init_seq = seq;
cp->flags |= flag;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
@@ -481,11 +517,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
* /proc/net/ip_vs_app entry function
*/
-static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
{
struct ip_vs_app *app, *inc;
- list_for_each_entry(app, &ip_vs_app_list, a_list) {
+ list_for_each_entry(app, &ipvs->app_list, a_list) {
list_for_each_entry(inc, &app->incs_list, a_list) {
if (pos-- == 0)
return inc;
@@ -497,19 +533,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
mutex_lock(&__ip_vs_app_mutex);
- return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+ return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_app *inc, *app;
struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_app_idx(0);
+ return ip_vs_app_idx(ipvs, 0);
inc = v;
app = inc->app;
@@ -518,7 +559,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return list_entry(e, struct ip_vs_app, a_list);
/* go on to next application */
- for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
app = list_entry(e, struct ip_vs_app, a_list);
list_for_each_entry(inc, &app->incs_list, a_list) {
return inc;
@@ -557,7 +598,8 @@ static const struct seq_operations ip_vs_app_seq_ops = {
static int ip_vs_app_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_app_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations ip_vs_app_fops = {
@@ -565,19 +607,21 @@ static const struct file_operations ip_vs_app_fops = {
.open = ip_vs_app_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
#endif
-int __init ip_vs_app_init(void)
+int __net_init ip_vs_app_net_init(struct net *net)
{
- /* we will replace it with proc_net_ipvs_create() soon */
- proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
return 0;
}
-
-void ip_vs_app_cleanup(void)
+void __net_exit ip_vs_app_net_cleanup(struct net *net)
{
- proc_net_remove(&init_net, "ip_vs_app");
+ unregister_ip_vs_app(net, NULL /* all */);
+ remove_proc_entry("ip_vs_app", net->proc_net);
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e9adecdc8ca..610e19c0e13 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -48,104 +48,71 @@
/*
* Connection hash size. Default is what was selected at compile time.
*/
-int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
/* size and mask values */
-int ip_vs_conn_tab_size;
-int ip_vs_conn_tab_mask;
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
-static struct list_head *ip_vs_conn_tab;
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-/* counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
+static unsigned int ip_vs_conn_rnd __read_mostly;
/*
* Fine locking granularity for big connection hash table
*/
-#define CT_LOCKARRAY_BITS 4
+#define CT_LOCKARRAY_BITS 5
#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
struct ip_vs_aligned_lock
{
- rwlock_t l;
+ spinlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-static inline void ct_read_lock(unsigned key)
-{
- read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned key)
-{
- read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock(unsigned key)
-{
- write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock(unsigned key)
-{
- write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned key)
-{
- read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned key)
-{
- read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned key)
+static inline void ct_write_lock_bh(unsigned int key)
{
- write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
-static inline void ct_write_unlock_bh(unsigned key)
+static inline void ct_write_unlock_bh(unsigned int key)
{
- write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
const union nf_inet_addr *addr,
__be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
- return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
- (__force u32)port, proto, ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
#endif
- return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
- ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
}
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -166,18 +133,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
port = p->vport;
}
- return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+ return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
}
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
- NULL, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
- if (cp->dest && cp->dest->svc->pe) {
- p.pe = cp->dest->svc->pe;
+ if (cp->pe) {
+ p.pe = cp->pe;
p.pe_data = cp->pe_data;
p.pe_data_len = cp->pe_data_len;
}
@@ -186,12 +153,12 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
}
/*
- * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
* returns bool success.
*/
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
- unsigned hash;
+ unsigned int hash;
int ret;
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
@@ -200,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/* Hash by protocol, client address and port */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
ret = 1;
} else {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -215,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
}
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -223,21 +190,21 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
- * returns bool success.
+ * returns bool success. Caller should hold conn reference.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
- unsigned hash;
+ unsigned int hash;
int ret;
/* unhash it and decrease its reference counter */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- list_del(&cp->c_list);
+ hlist_del_rcu(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -245,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
ret = 0;
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -260,28 +257,30 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_conn *cp;
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
return cp;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
return NULL;
}
@@ -309,33 +308,31 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
static int
ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
- unsigned int proto_off, int inverse,
- struct ip_vs_conn_param *p)
+ int inverse, struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return 1;
if (likely(!inverse))
- ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
- &iph->daddr, pptr[1], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
else
- ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
- &iph->saddr, pptr[0], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
return 0;
}
struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off, int inverse)
+ const struct ip_vs_iphdr *iph, int inverse)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
return NULL;
return ip_vs_conn_in_get(&p);
@@ -345,17 +342,21 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_conn *cp;
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (p->pe_data && p->pe->ct_match) {
- if (p->pe->ct_match(p, cp))
- goto out;
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
continue;
}
@@ -365,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
* p->vaddr is a fwmark */
ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ p->vport == cp->vport && p->cport == cp->cport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- p->protocol == cp->protocol)
- goto out;
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
}
cp = NULL;
out:
- if (cp)
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -392,7 +394,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
* p->vaddr, p->vport: pkt dest address (foreign host) */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_conn *cp, *ret=NULL;
/*
@@ -400,22 +402,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
*/
hash = ip_vs_conn_hashkey_param(p, true);
- ct_read_lock(hash);
+ rcu_read_lock();
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
- p->vport == cp->cport && p->cport == cp->dport &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -428,13 +432,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off, int inverse)
+ const struct ip_vs_iphdr *iph, int inverse)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
return NULL;
return ip_vs_conn_out_get(&p);
@@ -460,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
if (ip_vs_conn_unhash(cp)) {
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
atomic_dec(&ip_vs_conn_no_cport_cnt);
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
cp->cport = cport;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
/* hash on new dport */
ip_vs_conn_hash(cp);
@@ -545,28 +547,31 @@ static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
unsigned int conn_flags;
+ __u32 flags;
/* if dest is NULL, then return directly */
if (!dest)
return;
/* Increase the refcnt counter of the dest */
- atomic_inc(&dest->refcnt);
+ ip_vs_dest_hold(dest);
conn_flags = atomic_read(&dest->conn_flags);
if (cp->protocol != IPPROTO_UDP)
conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
+ flags = cp->flags;
/* Bind with the destination and its corresponding transmitter */
- if (cp->flags & IP_VS_CONN_F_SYNC) {
+ if (flags & IP_VS_CONN_F_SYNC) {
/* if the connection is not template and is created
* by sync, preserve the activity flag.
*/
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
conn_flags &= ~IP_VS_CONN_F_INACTIVE;
/* connections inherit forwarding method from dest */
- cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+ flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
}
- cp->flags |= conn_flags;
+ flags |= conn_flags;
+ cp->flags = flags;
cp->dest = dest;
IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -581,18 +586,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
- /* It is a normal connection, so increase the inactive
- connection counter because it is in TCP SYNRECV
- state (inactive) or other protocol inacive state */
- if ((cp->flags & IP_VS_CONN_F_SYNC) &&
- (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so modify the counters
+ * according to the flags, later the protocol can
+ * update them on state change
+ */
+ if (!(flags & IP_VS_CONN_F_INACTIVE))
atomic_inc(&dest->activeconns);
else
atomic_inc(&dest->inactconns);
} else {
/* It is a persistent connection/template, so increase
- the peristent connection counter */
+ the persistent connection counter */
atomic_inc(&dest->persistconns);
}
@@ -606,18 +611,46 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
* Check if there is a destination for the connection, if so
* bind the connection to the destination.
*/
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest;
- if ((cp) && (!cp->dest)) {
- dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
- &cp->vaddr, cp->vport,
- cp->protocol);
+ rcu_read_lock();
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark, cp->flags);
+ if (dest) {
+ struct ip_vs_proto_data *pd;
+
+ spin_lock_bh(&cp->lock);
+ if (cp->dest) {
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Applications work depending on the forwarding method
+ * but better to reassign them always when binding dest */
+ if (cp->app)
+ ip_vs_unbind_app(cp);
+
ip_vs_bind_dest(cp, dest);
- return dest;
- } else
- return NULL;
+ spin_unlock_bh(&cp->lock);
+
+ /* Update its packet transmitter */
+ cp->packet_xmit = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ if (pd && atomic_read(&pd->appcnt))
+ ip_vs_bind_app(cp, pd->pp);
+ }
+ rcu_read_unlock();
}
@@ -654,7 +687,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
}
} else {
/* It is a persistent connection/template, so decrease
- the peristent connection counter */
+ the persistent connection counter */
atomic_dec(&dest->persistconns);
}
@@ -669,14 +702,19 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
}
- /*
- * Simply decrease the refcnt of the dest, because the
- * dest will be either in service's destination list
- * or in the trash.
- */
- atomic_dec(&dest->refcnt);
+ ip_vs_dest_put(dest);
}
+static int expire_quiescent_template(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest)
+{
+#ifdef CONFIG_SYSCTL
+ return ipvs->sysctl_expire_quiescent_template &&
+ (atomic_read(&dest->weight) == 0);
+#else
+ return 0;
+#endif
+}
/*
* Checking if the destination of a connection template is available.
@@ -686,14 +724,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
/*
* Checking the dest server status.
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- (sysctl_ip_vs_expire_quiescent_template &&
- (atomic_read(&dest->weight) == 0))) {
+ expire_quiescent_template(ipvs, dest)) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
"-> d:%s:%d\n",
@@ -721,22 +759,27 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
- atomic_dec(&ct->refcnt);
+ __ip_vs_conn_put(ct);
return 0;
}
return 1;
}
-static void ip_vs_conn_expire(unsigned long data)
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
{
- struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
- cp->timeout = 60*HZ;
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
- /*
- * hey, I'm using it
- */
- atomic_inc(&cp->refcnt);
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct net *net = ip_vs_conn_net(cp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
/*
* do I control anybody?
@@ -744,55 +787,60 @@ static void ip_vs_conn_expire(unsigned long data)
if (atomic_read(&cp->n_control))
goto expire_later;
- /*
- * unhash it if it is hashed in the conn table
- */
- if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
- goto expire_later;
-
- /*
- * refcnt==1 implies I'm the only one referrer
- */
- if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
/* delete the timer if it is activated by other users */
- if (timer_pending(&cp->timer))
- del_timer(&cp->timer);
+ del_timer(&cp->timer);
/* does anybody control me? */
if (cp->control)
ip_vs_control_del(cp);
- if (cp->flags & IP_VS_CONN_F_NFCT)
- ip_vs_conn_drop_conntrack(cp);
+ if (cp->flags & IP_VS_CONN_F_NFCT) {
+ /* Do not access conntracks during subsys cleanup
+ * because nf_conntrack_find_get can not be used after
+ * conntrack cleanup for the net.
+ */
+ smp_rmb();
+ if (ipvs->enable)
+ ip_vs_conn_drop_conntrack(cp);
+ }
- kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- atomic_dec(&ip_vs_conn_count);
-
- kmem_cache_free(ip_vs_conn_cachep, cp);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ atomic_dec(&ipvs->conn_count);
return;
}
- /* hash it back to the table */
- ip_vs_conn_hash(cp);
-
expire_later:
- IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
- atomic_read(&cp->refcnt)-1,
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
atomic_read(&cp->n_control));
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
ip_vs_conn_put(cp);
}
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer))
- mod_timer(&cp->timer, jiffies);
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
}
@@ -801,34 +849,44 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
*/
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p,
- const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
- struct ip_vs_dest *dest)
+ const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
+ struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
- cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NULL;
}
- INIT_LIST_HEAD(&cp->c_list);
+ INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
- ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
- ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+ /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->vaddr, p->vaddr);
cp->vport = p->vport;
- /* proto should only be IPPROTO_IP if d_addr is a fwmark */
- ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
- &cp->daddr, daddr);
+ ip_vs_addr_set(p->af, &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
- if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->fwmark = fwmark;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
@@ -839,19 +897,30 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
*/
atomic_set(&cp->refcnt, 1);
+ cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
- atomic_inc(&ip_vs_conn_count);
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
+ atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
+ cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
+ cp->old_state = 0;
cp->timeout = 3*HZ;
+ cp->sync_endtime = jiffies & ~3UL;
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
@@ -861,8 +930,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
#endif
ip_vs_bind_xmit(cp);
- if (unlikely(pp && atomic_read(&pp->appcnt)))
- ip_vs_bind_app(cp, pp);
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
@@ -871,7 +940,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* IP_VS_CONN_F_ONE_PACKET too.
*/
- if (ip_vs_conntrack_enabled())
+ if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
@@ -884,36 +953,49 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* /proc/net/ip_vs_conn entries
*/
#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+ struct seq_net_private p;
+ struct hlist_head *l;
+};
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
int idx;
struct ip_vs_conn *cp;
+ struct ip_vs_iter_state *iter = seq->private;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
if (pos-- == 0) {
- seq->private = &ip_vs_conn_tab[idx];
- return cp;
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
}
}
- ct_read_unlock_bh(idx);
+ cond_resched_rcu();
}
return NULL;
}
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
- seq->private = NULL;
+ struct ip_vs_iter_state *iter = seq->private;
+
+ iter->l = NULL;
+ rcu_read_lock();
return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
- struct list_head *e, *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
+ struct hlist_head *l = iter->l;
int idx;
++*pos;
@@ -921,30 +1003,26 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if ((e = cp->c_list.next) != l)
- return list_entry(e, struct ip_vs_conn, c_list);
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_conn, c_list);
idx = l - ip_vs_conn_tab;
- ct_read_unlock_bh(idx);
-
while (++idx < ip_vs_conn_tab_size) {
- ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- seq->private = &ip_vs_conn_tab[idx];
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ iter->l = &ip_vs_conn_tab[idx];
return cp;
}
- ct_read_unlock_bh(idx);
+ cond_resched_rcu();
}
- seq->private = NULL;
+ iter->l = NULL;
return NULL;
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
- struct list_head *l = seq->private;
-
- if (l)
- ct_read_unlock_bh(l - ip_vs_conn_tab);
+ rcu_read_unlock();
}
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -955,18 +1033,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
size_t len = 0;
- if (cp->dest && cp->pe_data &&
- cp->dest->svc->pe->show_pe_data) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ if (cp->pe_data) {
pe_data[0] = ' ';
- len = strlen(cp->dest->svc->pe->name);
- memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
pe_data[len + 1] = ' ';
len += 2;
- len += cp->dest->svc->pe->show_pe_data(cp,
- pe_data + len);
+ len += cp->pe->show_pe_data(cp, pe_data + len);
}
pe_data[len] = '\0';
@@ -1004,7 +1083,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_fops = {
@@ -1012,10 +1092,10 @@ static const struct file_operations ip_vs_conn_fops = {
.open = ip_vs_conn_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
-static const char *ip_vs_origin_name(unsigned flags)
+static const char *ip_vs_origin_name(unsigned int flags)
{
if (flags & IP_VS_CONN_F_SYNC)
return "SYNC";
@@ -1031,6 +1111,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -1067,7 +1151,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_sync_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1075,7 +1160,7 @@ static const struct file_operations ip_vs_conn_sync_fops = {
.open = ip_vs_conn_sync_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
#endif
@@ -1113,27 +1198,24 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
}
/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
+void ip_vs_random_dropentry(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
+ rcu_read_lock();
/*
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned hash = net_random() & ip_vs_conn_tab_mask;
+ unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
- /*
- * Lock is actually needed in this loop.
- */
- ct_write_lock_bh(hash);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
-
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1148,6 +1230,18 @@ void ip_vs_random_dropentry(void)
default:
continue;
}
+ } else if (cp->protocol == IPPROTO_SCTP) {
+ switch (cp->state) {
+ case IP_VS_SCTP_S_INIT1:
+ case IP_VS_SCTP_S_INIT:
+ break;
+ case IP_VS_SCTP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+ default:
+ continue;
+ }
} else {
if (!todrop_entry(cp))
continue;
@@ -1155,51 +1249,78 @@ void ip_vs_random_dropentry(void)
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(hash);
+ cond_resched_rcu();
}
+ rcu_read_unlock();
}
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
-static void ip_vs_conn_flush(void)
+static void ip_vs_conn_flush(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- flush_again:
+flush_again:
+ rcu_read_lock();
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- /*
- * Lock is actually needed in this loop.
- */
- ct_write_lock_bh(idx);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(idx);
+ cond_resched_rcu();
}
+ rcu_read_unlock();
/* the counter may be not NULL, because maybe some conn entries
are run by slow timer handler or unhashed but still referred */
- if (atomic_read(&ip_vs_conn_count) != 0) {
+ if (atomic_read(&ipvs->conn_count) != 0) {
schedule();
goto flush_again;
}
}
+/*
+ * per netns init and exit
+ */
+int __net_init ip_vs_conn_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ atomic_set(&ipvs->conn_count, 0);
+
+ proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+ return 0;
+}
+
+void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ remove_proc_entry("ip_vs_conn", net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+}
int __init ip_vs_conn_init(void)
{
@@ -1212,8 +1333,7 @@ int __init ip_vs_conn_init(void)
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
- sizeof(struct list_head));
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
if (!ip_vs_conn_tab)
return -ENOMEM;
@@ -1233,32 +1353,24 @@ int __init ip_vs_conn_init(void)
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
- }
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+ INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
- rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
- proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
- proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
-
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
return 0;
}
-
void ip_vs_conn_cleanup(void)
{
- /* flush all the connection entries first */
- ip_vs_conn_flush();
-
+ /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- proc_net_remove(&init_net, "ip_vs_conn");
- proc_net_remove(&init_net, "ip_vs_conn_sync");
vfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9c5a0..e6836755c45 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -41,6 +41,7 @@
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -68,12 +69,15 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+static int ip_vs_net_id __read_mostly;
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
/* ID used in ICMP lookups */
#define icmp_id(icmph) (((icmph)->un).echo.id)
#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
-const char *ip_vs_proto_name(unsigned proto)
+const char *ip_vs_proto_name(unsigned int proto)
{
static char buf[20];
@@ -93,7 +97,7 @@ const char *ip_vs_proto_name(unsigned proto)
return "ICMPv6";
#endif
default:
- sprintf(buf, "IP_%d", proto);
+ sprintf(buf, "IP_%u", proto);
return buf;
}
}
@@ -108,21 +112,32 @@ static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.inpkts++;
- dest->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.inpkts++;
- dest->svc->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.inpkts++;
- ip_vs_stats.ustats.inbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -131,21 +146,32 @@ static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.outpkts++;
- dest->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.outpkts++;
- dest->svc->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.outpkts++;
- ip_vs_stats.ustats.outbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -153,41 +179,43 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
- spin_lock(&cp->dest->stats.lock);
- cp->dest->stats.ustats.conns++;
- spin_unlock(&cp->dest->stats.lock);
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_cpu_stats *s;
- spin_lock(&svc->stats.lock);
- svc->stats.ustats.conns++;
- spin_unlock(&svc->stats.lock);
+ s = this_cpu_ptr(cp->dest->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.conns++;
- spin_unlock(&ip_vs_stats.lock);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ s->ustats.conns++;
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s->ustats.conns++;
}
-static inline int
+static inline void
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- if (unlikely(!pp->state_transition))
- return 0;
- return pp->state_transition(cp, direction, skb, pp);
+ if (likely(pd->pp->state_transition))
+ pd->pp->state_transition(cp, direction, skb, pd);
}
-static inline void
+static inline int
ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
struct sk_buff *skb, int protocol,
const union nf_inet_addr *caddr, __be16 cport,
const union nf_inet_addr *vaddr, __be16 vport,
struct ip_vs_conn_param *p)
{
- ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
- p->pe = svc->pe;
+ ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ vport, p);
+ p->pe = rcu_dereference(svc->pe);
if (p->pe && p->pe->fill_param)
- p->pe->fill_param(p, skb);
+ return p->pe->fill_param(p, skb);
+
+ return 0;
}
/*
@@ -199,33 +227,32 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
- struct sk_buff *skb,
- __be16 ports[2])
+ struct sk_buff *skb, __be16 src_port, __be16 dst_port,
+ int *ignored, struct ip_vs_iphdr *iph)
{
struct ip_vs_conn *cp = NULL;
- struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
struct ip_vs_conn *ct;
__be16 dport = 0; /* destination port to forward */
unsigned int flags;
struct ip_vs_conn_param param;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
union nf_inet_addr snet; /* source network of the client,
after masking */
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
/* Mask saddr with the netmask to adjust template granularity */
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+ ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ (__force __u32) svc->netmask);
else
#endif
- snet.ip = iph.saddr.ip & svc->netmask;
+ snet.ip = iph->saddr.ip & svc->netmask;
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
- IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -242,19 +269,18 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* is created for other persistent services.
*/
{
- int protocol = iph.protocol;
- const union nf_inet_addr *vaddr = &iph.daddr;
- const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ int protocol = iph->protocol;
+ const union nf_inet_addr *vaddr = &iph->daddr;
__be16 vport = 0;
- if (ports[1] == svc->port) {
+ if (dst_port == svc->port) {
/* non-FTP template:
* <protocol, caddr, 0, vaddr, vport, daddr, dport>
* FTP template:
* <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- vport = ports[1];
+ vport = dst_port;
} else {
/* Note: persistent fwmark-based services and
* persistent port zero service are handled here.
@@ -268,24 +294,34 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
vaddr = &fwmark;
}
}
- ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
- vaddr, vport, &param);
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
}
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
- /* No template found or the dest of the connection
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * No template found or the dest of the connection
* template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
*/
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
+ *ignored = 0;
return NULL;
}
- if (ports[1] == svc->port && svc->port != FTPPORT)
+ if (dst_port == svc->port && svc->port != FTPPORT)
dport = dest->port;
/* Create a template
@@ -293,9 +329,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* and thus param.pe_data will be destroyed
* when the template expires */
ct = ip_vs_conn_new(&param, &dest->addr, dport,
- IP_VS_CONN_F_TEMPLATE, dest);
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
if (ct == NULL) {
kfree(param.pe_data);
+ *ignored = -1;
return NULL;
}
@@ -306,22 +343,24 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
kfree(param.pe_data);
}
- dport = ports[1];
+ dport = dst_port;
if (dport == svc->port && dest->port)
dport = dest->port;
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
- && iph.protocol == IPPROTO_UDP)?
+ && iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
- &iph.daddr, ports[1], &param);
- cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
+ src_port, &iph->daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
if (cp == NULL) {
ip_vs_conn_put(ct);
+ *ignored = -1;
return NULL;
}
@@ -341,20 +380,39 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* It selects a server according to the virtual service, and
* creates a connection entry.
* Protocols supported: TCP, UDP
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp, int *ignored)
+ struct ip_vs_proto_data *pd, int *ignored,
+ struct ip_vs_iphdr *iph)
{
+ struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
- struct ip_vs_iphdr iph;
+ struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
*ignored = 1;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
- pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+ /*
+ * IPv6 frags, only the first hit here.
+ */
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return NULL;
@@ -371,12 +429,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
/*
- * Do not schedule replies from local real server. It is risky
- * for fwmark services but mostly for persistent services.
+ * Do not schedule replies from local real server.
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
- (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+ (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
"Not scheduling reply for existing connection");
__ip_vs_conn_put(cp);
@@ -386,10 +442,11 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Persistent service
*/
- if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
- *ignored = 0;
- return ip_vs_sched_persist(svc, skb, pptr);
- }
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ iph);
+
+ *ignored = 0;
/*
* Non-persistent service
@@ -402,16 +459,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- *ignored = 0;
-
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
- && iph.protocol == IPPROTO_UDP)?
+ && iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
@@ -419,13 +475,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
- pptr[0], &iph.daddr, pptr[1], &p);
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0], &iph->daddr,
+ pptr[1], &p);
cp = ip_vs_conn_new(&p, &dest->addr,
dest->port ? dest->port : pptr[1],
- flags, dest);
- if (!cp)
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
return NULL;
+ }
}
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
@@ -447,49 +507,52 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
__be16 _ports[2], *pptr;
- struct ip_vs_iphdr iph;
+#ifdef CONFIG_SYSCTL
+ struct net *net;
+ struct netns_ipvs *ipvs;
int unicast;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+#endif
- pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL) {
- ip_vs_service_put(svc);
return NF_DROP;
}
+#ifdef CONFIG_SYSCTL
+ net = skb_net(skb);
+
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+ unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
else
#endif
- unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+ unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
- if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
- int ret, cs;
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+ int ret;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
- iph.protocol == IPPROTO_UDP)?
+ iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
- ip_vs_service_put(svc);
-
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol,
- &iph.saddr, pptr[0],
- &iph.daddr, pptr[1], &p);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, &daddr, 0,
IP_VS_CONN_F_BYPASS | flags,
- NULL);
+ NULL, skb->mark);
if (!cp)
return NF_DROP;
}
@@ -498,16 +561,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_in_stats(cp, skb);
/* set state */
- cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* transmit the first SYN packet */
- ret = cp->packet_xmit(skb, cp, pp);
+ ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
+#endif
/*
* When the virtual ftp service is presented, packets destined
@@ -515,12 +579,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
- ip_vs_service_put(svc);
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
return NF_ACCEPT;
- }
-
- ip_vs_service_put(svc);
/*
* Notify the client that the destination is unreachable, and
@@ -532,9 +592,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6) {
if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net_ = dev_net(skb_dst(skb)->dev);
- skb->dev = net->loopback_dev;
+ skb->dev = net_->loopback_dev;
}
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
} else
@@ -544,6 +604,33 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
return NF_DROP;
}
+#ifdef CONFIG_SYSCTL
+
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ return ipvs->sysctl_snat_reroute;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ return ipvs->sysctl_nat_icmp_send;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+ return ipvs->sysctl_expire_nodest_conn;
+}
+
+#else
+
+static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
+static int sysctl_nat_icmp_send(struct net *net) { return 0; }
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
+
+#endif
+
__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -560,21 +647,32 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
- int err = ip_defrag(skb, user);
+ int err;
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
if (!err)
ip_send_check(ip_hdr(skb));
return err;
}
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
{
- /* TODO IPv6: Find out what to do here for IPv6 */
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0)
+ return 1;
+ } else
+#endif
+ if ((sysctl_snat_reroute(skb) ||
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ return 1;
+
return 0;
}
-#endif
/*
* Packet has been made sufficiently writable in caller
@@ -630,10 +728,19 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int inout)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- unsigned int icmp_offset = sizeof(struct ipv6hdr);
- struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
- icmp_offset);
- struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
+ unsigned int icmp_offset = 0;
+ unsigned int offs = 0; /* header offset*/
+ int protocol;
+ struct icmp6hdr *icmph;
+ struct ipv6hdr *ciph;
+ unsigned short fragoffs;
+
+ ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
+ icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
+ offs = icmp_offset + sizeof(struct icmp6hdr);
+ ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
+
+ protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
if (inout) {
iph->saddr = cp->vaddr.in6;
@@ -644,10 +751,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
}
/* the TCP/UDP/SCTP port */
- if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
- IPPROTO_SCTP == ciph->nexthdr) {
- __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
+ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)) {
+ __be16 *ports = (void *)(skb_network_header(skb) + offs);
+ IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
+ ntohs(inout ? ports[1] : ports[0]),
+ ntohs(inout ? cp->vport : cp->dport));
if (inout)
ports[1] = cp->vport;
else
@@ -674,7 +784,7 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* Handle relevant response ICMP messages - forward to the right
- * destination host. Used for NAT and local client.
+ * destination host.
*/
static int handle_response_icmp(int af, struct sk_buff *skb,
union nf_inet_addr *snet,
@@ -710,16 +820,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
-#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
- goto out;
- } else
-#endif
- if ((sysctl_ip_vs_snat_reroute ||
- skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(skb, RTN_LOCAL) != 0)
- goto out;
+ if (ip_vs_route_me_harder(af, skb))
+ goto out;
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
@@ -757,7 +859,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
*related = 1;
/* reassemble IP fragments */
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -804,51 +906,35 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
"Checking outgoing ICMP for");
- offset += cih->ihl * 4;
-
- ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
if (!cp)
return NF_ACCEPT;
snet.ip = iph->saddr;
return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
- pp, offset, ihl);
+ pp, ciph.len, ihl);
}
#ifdef CONFIG_IP_VS_IPV6
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
- unsigned int hooknum)
+ unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
{
- struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
- struct ipv6hdr _ciph, *cih; /* The ip header contained
- within the ICMP */
- struct ip_vs_iphdr ciph;
+ struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
- unsigned int offset;
union nf_inet_addr snet;
+ unsigned int writable;
*related = 1;
-
- /* reassemble IP fragments */
- if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
- return NF_STOLEN;
- }
-
- iph = ipv6_hdr(skb);
- offset = sizeof(struct ipv6hdr);
- ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
if (ic == NULL)
return NF_DROP;
- IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
- ic->icmp6_type, ntohs(icmpv6_id(ic)),
- &iph->saddr, &iph->daddr);
-
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy
@@ -856,42 +942,45 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
- if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
- (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
- (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
*related = 0;
return NF_ACCEPT;
}
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (ipvsh->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &ipvsh->saddr, &ipvsh->daddr);
/* Now find the contained IP header */
- offset += sizeof(_icmph);
- cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
- if (cih == NULL)
+ ciph.len = ipvsh->len + sizeof(_icmph);
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
-
- pp = ip_vs_proto_get(cih->nexthdr);
+ ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ pp = ip_vs_proto_get(ciph.protocol);
if (!pp)
return NF_ACCEPT;
- /* Is the embedded protocol header present? */
- /* TODO: we don't support fragmentation at the moment anyways */
- if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
- return NF_ACCEPT;
-
- IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
- "Checking outgoing ICMPv6 for");
-
- offset += sizeof(struct ipv6hdr);
-
- ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
if (!cp)
return NF_ACCEPT;
- ipv6_addr_copy(&snet.in6, &iph->saddr);
- return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
- pp, offset, sizeof(struct ipv6hdr));
+ snet.in6 = ciph.saddr.in6;
+ writable = ciph.len;
+ return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
+ pp, writable, sizeof(struct ipv6hdr));
}
#endif
@@ -920,20 +1009,47 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
return th->rst;
}
+static inline bool is_new_conn(const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+ return th->syn;
+ }
+ case IPPROTO_SCTP: {
+ sctp_chunkhdr_t *sch, schunk;
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return false;
+ return sch->type == SCTP_CID_INIT;
+ }
+ default:
+ return false;
+ }
+}
+
/* Handle response packets: rewrite addresses and send away...
- * Used for NAT and local client.
*/
static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
- struct ip_vs_conn *cp, int ihl)
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
+ struct ip_vs_protocol *pp = pd->pp;
+
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
- if (!skb_make_writable(skb, ihl))
+ if (!skb_make_writable(skb, iph->len))
goto drop;
/* mangle the packet */
- if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
goto drop;
#ifdef CONFIG_IP_VS_IPV6
@@ -961,21 +1077,13 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* if it came from this machine itself. So re-compute
* the routing information.
*/
-#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
- goto drop;
- } else
-#endif
- if ((sysctl_ip_vs_snat_reroute ||
- skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(skb, RTN_LOCAL) != 0)
- goto drop;
+ if (ip_vs_route_me_harder(af, skb))
+ goto drop;
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
- ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
@@ -999,8 +1107,10 @@ drop:
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
EnterFunction(11);
@@ -1022,17 +1132,20 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ net = skb_net(skb);
+ if (!net_ipvs(net)->enable)
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(skb, &related,
- hooknum);
+ hooknum, &iph);
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
@@ -1042,54 +1155,44 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
+ pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb,
- ip_vs_defrag_user(hooknum)))
- return NF_STOLEN;
- }
-
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
- } else
+ if (af == AF_INET)
#endif
- if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
- !pp->dont_defrag)) {
+ if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
if (ip_vs_gather_frags(skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_out_get(af, skb, &iph, 0);
if (likely(cp))
- return handle_response(af, skb, pp, cp, iph.len);
- if (sysctl_ip_vs_nat_icmp_send &&
+ return handle_response(af, skb, pd, cp, &iph);
+ if (sysctl_nat_icmp_send(net) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
__be16 _ports[2], *pptr;
- pptr = skb_header_pointer(skb, iph.len,
- sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph.len,
+ sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
/*
* Notify the real server: there is no
* existing entry if it is not RST
@@ -1104,9 +1207,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- struct net *net =
- dev_net(skb_dst(skb)->dev);
-
if (!skb->dev)
skb->dev = net->loopback_dev;
icmpv6_send(skb,
@@ -1133,11 +1233,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_out(hooknum, skb, AF_INET);
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
}
/*
@@ -1145,17 +1245,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1166,11 +1260,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_out(hooknum, skb, AF_INET6);
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
}
/*
@@ -1178,17 +1272,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
}
#endif
@@ -1202,19 +1290,21 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
static int
ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
- unsigned int offset, ihl, verdict;
- union nf_inet_addr snet;
+ struct ip_vs_proto_data *pd;
+ unsigned int offset, offset2, ihl, verdict;
+ bool ipip;
*related = 1;
/* reassemble IP fragments */
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1249,9 +1339,27 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->protocol);
- if (!pp)
+ net = skb_net(skb);
+
+ /* Special case for errors for IPIP packets */
+ ipip = false;
+ if (cih->protocol == IPPROTO_IPIP) {
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ /* Error for our IPIP must arrive at LOCAL_IN */
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+ return NF_ACCEPT;
+ offset += cih->ihl * 4;
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ipip = true;
+ }
+
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
@@ -1261,22 +1369,16 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
"Checking incoming ICMP for");
- offset += cih->ihl * 4;
-
- ip_vs_fill_iphdr(AF_INET, cih, &ciph);
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
- if (!cp) {
- /* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
- if (cp) {
- snet.ip = iph->saddr;
- return handle_response_icmp(AF_INET, skb, &snet,
- cih->protocol, cp, pp,
- offset, ihl);
- }
+ offset2 = offset;
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ offset = ciph.len;
+ /* The embedded headers contain source and dest in reverse order.
+ * For IPIP this is error for request, not for reply.
+ */
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
+ if (!cp)
return NF_ACCEPT;
- }
verdict = NF_DROP;
@@ -1288,59 +1390,95 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
goto out;
}
+ if (ipip) {
+ __be32 info = ic->un.gateway;
+ __u8 type = ic->type;
+ __u8 code = ic->code;
+
+ /* Update the MTU */
+ if (ic->type == ICMP_DEST_UNREACH &&
+ ic->code == ICMP_FRAG_NEEDED) {
+ struct ip_vs_dest *dest = cp->dest;
+ u32 mtu = ntohs(ic->un.frag.mtu);
+ __be16 frag_off = cih->frag_off;
+
+ /* Strip outer IP and ICMP, go to IPIP header */
+ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
+ goto ignore_ipip;
+ offset2 -= ihl + sizeof(_icmph);
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ ipv4_update_pmtu(skb, dev_net(skb->dev),
+ mtu, 0, 0, 0, 0);
+ /* Client uses PMTUD? */
+ if (!(frag_off & htons(IP_DF)))
+ goto ignore_ipip;
+ /* Prefer the resulting PMTU */
+ if (dest) {
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
+ }
+ if (mtu > 68 + sizeof(struct iphdr))
+ mtu -= sizeof(struct iphdr);
+ info = htonl(mtu);
+ }
+ /* Strip outer IP, ICMP and IPIP, go to IP header of
+ * original request.
+ */
+ if (pskb_pull(skb, offset2) == NULL)
+ goto ignore_ipip;
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ type, code, ntohl(info));
+ icmp_send(skb, type, code, info);
+ /* ICMP can be shorter but anyways, account it */
+ ip_vs_out_stats(cp, skb);
+
+ignore_ipip:
+ consume_skb(skb);
+ verdict = NF_STOLEN;
+ goto out;
+ }
+
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
- if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+ IPPROTO_SCTP == cih->protocol)
offset += 2 * sizeof(__u16);
- verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
- /* LOCALNODE from FORWARD hook is not supported */
- if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
- skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
- IP_VS_DBG(1, "%s(): "
- "local delivery to %pI4 but in FORWARD\n",
- __func__, &skb_rtable(skb)->rt_dst);
- verdict = NF_DROP;
- }
+ verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
- out:
+out:
__ip_vs_conn_put(cp);
return verdict;
}
#ifdef CONFIG_IP_VS_IPV6
-static int
-ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *iph)
{
- struct ipv6hdr *iph;
+ struct net *net = NULL;
+ struct ipv6hdr _ip6h, *ip6h;
struct icmp6hdr _icmph, *ic;
- struct ipv6hdr _ciph, *cih; /* The ip header contained
- within the ICMP */
- struct ip_vs_iphdr ciph;
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
- unsigned int offset, verdict;
- union nf_inet_addr snet;
- struct rt6_info *rt;
+ struct ip_vs_proto_data *pd;
+ unsigned int offs_ciph, writable, verdict;
*related = 1;
- /* reassemble IP fragments */
- if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
- return NF_STOLEN;
- }
-
- iph = ipv6_hdr(skb);
- offset = sizeof(struct ipv6hdr);
- ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
if (ic == NULL)
return NF_DROP;
- IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
- ic->icmp6_type, ntohs(icmpv6_id(ic)),
- &iph->saddr, &iph->daddr);
-
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy
@@ -1348,66 +1486,71 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
- if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
- (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
- (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
*related = 0;
return NF_ACCEPT;
}
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (iph->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &iph->saddr, &iph->daddr);
/* Now find the contained IP header */
- offset += sizeof(_icmph);
- cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
- if (cih == NULL)
+ ciph.len = iph->len + sizeof(_icmph);
+ offs_ciph = ciph.len; /* Save ip header offset */
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
-
- pp = ip_vs_proto_get(cih->nexthdr);
- if (!pp)
+ ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, ciph.protocol);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
- /* Is the embedded protocol header present? */
- /* TODO: we don't support fragmentation at the moment anyways */
- if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+ /* Cannot handle fragmented embedded protocol */
+ if (ciph.fragoffs)
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
"Checking incoming ICMPv6 for");
- offset += sizeof(struct ipv6hdr);
+ /* The embedded headers contain source and dest in reverse order
+ * if not from localhost
+ */
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph,
+ (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
- ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
- if (!cp) {
- /* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
- if (cp) {
- ipv6_addr_copy(&snet.in6, &iph->saddr);
- return handle_response_icmp(AF_INET6, skb, &snet,
- cih->nexthdr,
- cp, pp, offset,
- sizeof(struct ipv6hdr));
- }
+ if (!cp)
+ return NF_ACCEPT;
+ /* VS/TUN, VS/DR and LOCALNODE just let it go */
+ if ((hooknum == NF_INET_LOCAL_OUT) &&
+ (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+ __ip_vs_conn_put(cp);
return NF_ACCEPT;
}
- verdict = NF_DROP;
-
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
- if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
- IPPROTO_SCTP == cih->nexthdr)
- offset += 2 * sizeof(__u16);
- verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
- /* LOCALNODE from FORWARD hook is not supported */
- if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
- (rt = (struct rt6_info *) skb_dst(skb)) &&
- rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
- IP_VS_DBG(1, "%s(): "
- "local delivery to %pI6 but in FORWARD\n",
- __func__, &rt->rt6i_dst);
- verdict = NF_DROP;
- }
+
+ /* Need to mangle contained IPv6 header in ICMPv6 packet */
+ writable = ciph.len;
+ if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
+ IPPROTO_SCTP == ciph.protocol)
+ writable += 2 * sizeof(__u16); /* Also mangle ports */
+
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
__ip_vs_conn_put(cp);
@@ -1423,10 +1566,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
- int ret, restart, pkts;
+ int ret, pkts;
+ struct netns_ipvs *ipvs;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1440,14 +1586,20 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(af, skb, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
/* Bad... Do not break raw sockets */
if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
@@ -1463,11 +1615,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
- int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
+ &iph);
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
@@ -1477,23 +1629,34 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
/* Protocol supported? */
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
-
+ pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_in_get(af, skb, &iph, 0);
- if (unlikely(!cp)) {
+ if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
+ is_new_conn(skb, &iph)) {
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ }
+
+ if (unlikely(!cp) && !iph.fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
int v;
- if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+ /* Schedule and create new connection entry into &cp */
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
return v;
}
@@ -1501,16 +1664,22 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, af, pp, skb, 0,
"ip_vs_in: packet continues traversal as normal");
+ if (iph.fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+ }
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
- if (sysctl_ip_vs_expire_nodest_conn) {
+ if (sysctl_expire_nodest_conn(ipvs)) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
@@ -1521,9 +1690,9 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
ip_vs_in_stats(cp, skb);
- restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
- ret = cp->packet_xmit(skb, cp, pp);
+ ret = cp->packet_xmit(skb, cp, pp, &iph);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
@@ -1535,37 +1704,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
*
* Sync connection if it is about to close to
* encorage the standby servers to update the connections timeout
+ *
+ * For ONE_PKT let ip_vs_sync_conn() do the filter work.
*/
- pkts = atomic_add_return(1, &cp->in_pkts);
- if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
- cp->protocol == IPPROTO_SCTP) {
- if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
- (cp->old_state != cp->state &&
- ((cp->state == IP_VS_SCTP_S_CLOSED) ||
- (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
- (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
- ip_vs_sync_conn(cp);
- goto out;
- }
- }
- /* Keep this block last: TCP and others with pp->num_states <= 1 */
- else if (af == AF_INET &&
- (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
- (((cp->protocol != IPPROTO_TCP ||
- cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
- ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
- ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
- (cp->state == IP_VS_TCP_S_CLOSE) ||
- (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
- (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
- ip_vs_sync_conn(cp);
-out:
- cp->old_state = cp->state;
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = sysctl_sync_threshold(ipvs);
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, pkts);
ip_vs_conn_put(cp);
return ret;
@@ -1576,12 +1725,12 @@ out:
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_in(hooknum, skb, AF_INET);
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
}
/*
@@ -1589,17 +1738,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1609,12 +1752,12 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_in(hooknum, skb, AF_INET6);
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
}
/*
@@ -1622,17 +1765,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
}
#endif
@@ -1648,30 +1785,48 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
* and send them to ip_vs_in_icmp.
*/
static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
int r;
+ struct net *net;
+ struct netns_ipvs *ipvs;
if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
- return ip_vs_in_icmp(skb, &r, hooknum);
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ return ip_vs_in_icmp(skb, &r, ops->hooknum);
}
#ifdef CONFIG_IP_VS_IPV6
static unsigned int
-ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
int r;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ struct ip_vs_iphdr iphdr;
+
+ ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+ if (iphdr.protocol != IPPROTO_ICMPV6)
+ return NF_ACCEPT;
- if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp_v6(skb, &r, hooknum);
+ return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
}
#endif
@@ -1681,9 +1836,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 99,
+ .priority = NF_IP_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
@@ -1691,32 +1846,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 101,
+ .priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -99,
+ .priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -98,
+ .priority = NF_IP_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
@@ -1724,7 +1879,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
@@ -1733,9 +1888,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 99,
+ .priority = NF_IP6_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
@@ -1743,32 +1898,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_remote_request6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 101,
+ .priority = NF_IP6_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply6,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -99,
+ .priority = NF_IP6_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -98,
+ .priority = NF_IP6_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp_v6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
@@ -1776,13 +1931,102 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
#endif
};
+/*
+ * Initialize IP Virtual Server netns mem.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL)
+ return -ENOMEM;
+
+ /* Hold the beast until a service is registerd */
+ ipvs->enable = 0;
+ ipvs->net = net;
+ /* Counters used for creating unique names */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+
+ if (ip_vs_estimator_net_init(net) < 0)
+ goto estimator_fail;
+
+ if (ip_vs_control_net_init(net) < 0)
+ goto control_fail;
+
+ if (ip_vs_protocol_net_init(net) < 0)
+ goto protocol_fail;
+
+ if (ip_vs_app_net_init(net) < 0)
+ goto app_fail;
+
+ if (ip_vs_conn_net_init(net) < 0)
+ goto conn_fail;
+ if (ip_vs_sync_net_init(net) < 0)
+ goto sync_fail;
+
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+/*
+ * Error handling
+ */
+
+sync_fail:
+ ip_vs_conn_net_cleanup(net);
+conn_fail:
+ ip_vs_app_net_cleanup(net);
+app_fail:
+ ip_vs_protocol_net_cleanup(net);
+protocol_fail:
+ ip_vs_control_net_cleanup(net);
+control_fail:
+ ip_vs_estimator_net_cleanup(net);
+estimator_fail:
+ net->ipvs = NULL;
+ return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+ ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(net);
+ ip_vs_app_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(net);
+ ip_vs_control_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(net);
+ IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ net->ipvs = NULL;
+}
+
+static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ net_ipvs(net)->enable = 0; /* Disable packet reception */
+ smp_wmb();
+ ip_vs_sync_net_cleanup(net);
+ LeaveFunction(2);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+ .init = __ip_vs_init,
+ .exit = __ip_vs_cleanup,
+ .id = &ip_vs_net_id,
+ .size = sizeof(struct netns_ipvs),
+};
+
+static struct pernet_operations ipvs_core_dev_ops = {
+ .exit = __ip_vs_dev_cleanup,
+};
/*
* Initialize IP Virtual Server
@@ -1791,57 +2035,68 @@ static int __init ip_vs_init(void)
{
int ret;
- ip_vs_estimator_init();
-
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
- goto cleanup_estimator;
+ goto exit;
}
ip_vs_protocol_init();
- ret = ip_vs_app_init();
- if (ret < 0) {
- pr_err("can't setup application helper.\n");
- goto cleanup_protocol;
- }
-
ret = ip_vs_conn_init();
if (ret < 0) {
pr_err("can't setup connection table.\n");
- goto cleanup_app;
+ goto cleanup_protocol;
}
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ goto cleanup_conn;
+
+ ret = register_pernet_device(&ipvs_core_dev_ops);
+ if (ret < 0)
+ goto cleanup_sub;
+
ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
pr_err("can't register hooks.\n");
- goto cleanup_conn;
+ goto cleanup_dev;
+ }
+
+ ret = ip_vs_register_nl_ioctl();
+ if (ret < 0) {
+ pr_err("can't register netlink/ioctl.\n");
+ goto cleanup_hooks;
}
pr_info("ipvs loaded.\n");
+
return ret;
- cleanup_conn:
+cleanup_hooks:
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+cleanup_dev:
+ unregister_pernet_device(&ipvs_core_dev_ops);
+cleanup_sub:
+ unregister_pernet_subsys(&ipvs_core_ops);
+cleanup_conn:
ip_vs_conn_cleanup();
- cleanup_app:
- ip_vs_app_cleanup();
- cleanup_protocol:
+cleanup_protocol:
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
- cleanup_estimator:
- ip_vs_estimator_cleanup();
+exit:
return ret;
}
static void __exit ip_vs_cleanup(void)
{
+ ip_vs_unregister_nl_ioctl();
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ unregister_pernet_device(&ipvs_core_dev_ops);
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
ip_vs_conn_cleanup();
- ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
- ip_vs_estimator_cleanup();
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c6f29363922..581a6584ed0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -38,6 +38,7 @@
#include <linux/mutex.h>
#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
@@ -54,45 +55,7 @@
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-#ifdef CONFIG_IP_VS_NFCT
-int sysctl_ip_vs_conntrack;
-#endif
-int sysctl_ip_vs_snat_reroute = 1;
-
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
@@ -103,29 +66,35 @@ int ip_vs_get_debug_level(void)
}
#endif
+
+/* Protos */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
+
+
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
{
- struct rt6_info *rt;
- struct flowi fl = {
- .oif = 0,
- .fl6_dst = *addr,
- .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
+ struct flowi6 fl6 = {
+ .daddr = *addr,
};
+ struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+ bool is_local;
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
- if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
- return 1;
+ is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
- return 0;
+ dst_release(dst);
+ return is_local;
}
#endif
+
+#ifdef CONFIG_SYSCTL
/*
* update_defense_level is called from keventd and from sysctl,
* so it needs to protect itself from softirqs
*/
-static void update_defense_level(void)
+static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
static int old_secure_tcp = 0;
@@ -141,73 +110,73 @@ static void update_defense_level(void)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < sysctl_ip_vs_amemthresh);
+ nomem = (availmem < ipvs->sysctl_amemthresh);
local_bh_disable();
/* drop_entry */
- spin_lock(&__ip_vs_dropentry_lock);
- switch (sysctl_ip_vs_drop_entry) {
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
case 0:
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
break;
case 1:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
- sysctl_ip_vs_drop_entry = 2;
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
} else {
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
}
break;
case 2:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
} else {
- atomic_set(&ip_vs_dropentry, 0);
- sysctl_ip_vs_drop_entry = 1;
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
};
break;
case 3:
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
break;
}
- spin_unlock(&__ip_vs_dropentry_lock);
+ spin_unlock(&ipvs->dropentry_lock);
/* drop_packet */
- spin_lock(&__ip_vs_droppacket_lock);
- switch (sysctl_ip_vs_drop_packet) {
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
case 0:
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
break;
case 1:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
- sysctl_ip_vs_drop_packet = 2;
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
} else {
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
}
break;
case 2:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
} else {
- ip_vs_drop_rate = 0;
- sysctl_ip_vs_drop_packet = 1;
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
}
break;
case 3:
- ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
break;
}
- spin_unlock(&__ip_vs_droppacket_lock);
+ spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp */
- spin_lock(&ip_vs_securetcp_lock);
- switch (sysctl_ip_vs_secure_tcp) {
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
@@ -216,7 +185,7 @@ static void update_defense_level(void)
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
- sysctl_ip_vs_secure_tcp = 2;
+ ipvs->sysctl_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
@@ -229,7 +198,7 @@ static void update_defense_level(void)
} else {
if (old_secure_tcp >= 2)
to_change = 0;
- sysctl_ip_vs_secure_tcp = 1;
+ ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
@@ -237,10 +206,11 @@ static void update_defense_level(void)
to_change = 1;
break;
}
- old_secure_tcp = sysctl_ip_vs_secure_tcp;
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
- ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- spin_unlock(&ip_vs_securetcp_lock);
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
local_bh_enable();
}
@@ -250,17 +220,18 @@ static void update_defense_level(void)
* Timer for checking the defense
*/
#define DEFENSE_TIMER_PERIOD 1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
static void defense_work_handler(struct work_struct *work)
{
- update_defense_level();
- if (atomic_read(&ip_vs_dropentry))
- ip_vs_random_dropentry();
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, defense_work.work);
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+ update_defense_level(ipvs);
+ if (atomic_read(&ipvs->dropentry))
+ ip_vs_random_dropentry(ipvs->net);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
+#endif
int
ip_vs_use_count_inc(void)
@@ -283,67 +254,50 @@ ip_vs_use_count_dec(void)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-
-/*
- * Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- * Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- * FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
* Returns hash value for virtual service
*/
-static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
- __be16 port)
+static inline unsigned int
+ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
+ const union nf_inet_addr *addr, __be16 port)
{
- register unsigned porth = ntohs(port);
+ register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
+ __u32 ahash;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
+ ahash = ntohl(addr_fold);
+ ahash ^= ((size_t) net >> 8);
- return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
- & IP_VS_SVC_TAB_MASK;
+ return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+ IP_VS_SVC_TAB_MASK;
}
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
{
- return fwmark & IP_VS_SVC_TAB_MASK;
+ return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
- * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
- unsigned hash;
+ unsigned int hash;
if (svc->flags & IP_VS_SVC_F_HASHED) {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -353,17 +307,17 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/*
- * Hash it by <protocol,addr,port> in ip_vs_svc_table
+ * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
- hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
- svc->port);
- list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ &svc->addr, svc->port);
+ hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
- * Hash it by fwmark in ip_vs_svc_fwm_table
+ * Hash it by fwmark in svc_fwm_table
*/
- hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
- list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
+ hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
svc->flags |= IP_VS_SVC_F_HASHED;
@@ -374,7 +328,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/*
- * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ * Unhashes a service from svc_table / svc_fwm_table.
* Should be called with locked tables.
*/
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
@@ -386,11 +340,11 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
}
if (svc->fwmark == 0) {
- /* Remove it from the ip_vs_svc_table table */
- list_del(&svc->s_list);
+ /* Remove it from the svc_table table */
+ hlist_del_rcu(&svc->s_list);
} else {
- /* Remove it from the ip_vs_svc_fwm_table table */
- list_del(&svc->f_list);
+ /* Remove it from the svc_fwm_table table */
+ hlist_del_rcu(&svc->f_list);
}
svc->flags &= ~IP_VS_SVC_F_HASHED;
@@ -400,23 +354,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
/*
- * Get service by {proto,addr,port} in the service table.
+ * Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
- __be16 vport)
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
- list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
- && (svc->protocol == protocol)) {
+ && (svc->protocol == protocol)
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -430,16 +385,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(fwmark);
+ hash = ip_vs_svc_fwm_hashkey(net, fwmark);
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
- if (svc->fwmark == fwmark && svc->af == af) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ if (svc->fwmark == fwmark && svc->af == af
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -448,50 +404,49 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)
return NULL;
}
+/* Find service, called under RCU lock */
struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
- const union nf_inet_addr *vaddr, __be16 vport)
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
-
- read_lock(&__ip_vs_svc_lock);
+ struct netns_ipvs *ipvs = net_ipvs(net);
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
- goto out;
+ if (fwmark) {
+ svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ if (svc)
+ goto out;
+ }
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
- && atomic_read(&ip_vs_ftpsvc_counter)
+ && atomic_read(&ipvs->ftpsvc_counter)
&& (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
- && atomic_read(&ip_vs_nullsvc_counter)) {
+ && atomic_read(&ipvs->nullsvc_counter)) {
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
}
out:
- if (svc)
- atomic_inc(&svc->usecnt);
- read_unlock(&__ip_vs_svc_lock);
-
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
@@ -505,21 +460,35 @@ static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
atomic_inc(&svc->refcnt);
- dest->svc = svc;
+ rcu_assign_pointer(dest->svc, svc);
}
-static void
-__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{
+ if (svc->stats.cpustats)
+ free_percpu(svc->stats.cpustats);
+ kfree(svc);
+}
+
+static void ip_vs_service_rcu_free(struct rcu_head *head)
{
- struct ip_vs_service *svc = dest->svc;
+ struct ip_vs_service *svc;
- dest->svc = NULL;
+ svc = container_of(head, struct ip_vs_service, rcu_head);
+ ip_vs_service_free(svc);
+}
+
+static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+{
if (atomic_dec_and_test(&svc->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- kfree(svc);
+ ntohs(svc->port));
+ if (do_delay)
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
+ else
+ ip_vs_service_free(svc);
}
}
@@ -527,11 +496,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
/*
* Returns hash value for real service
*/
-static inline unsigned ip_vs_rs_hashkey(int af,
+static inline unsigned int ip_vs_rs_hashkey(int af,
const union nf_inet_addr *addr,
__be16 port)
{
- register unsigned porth = ntohs(port);
+ register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
#ifdef CONFIG_IP_VS_IPV6
@@ -544,17 +513,13 @@ static inline unsigned ip_vs_rs_hashkey(int af,
& IP_VS_RTAB_MASK;
}
-/*
- * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
- * should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
- unsigned hash;
+ unsigned int hash;
- if (!list_empty(&dest->d_list)) {
- return 0;
- }
+ if (dest->in_rs_table)
+ return;
/*
* Hash by proto,addr,port,
@@ -562,64 +527,51 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ip_vs_rtable[hash]);
-
- return 1;
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
}
-/*
- * UNhashes ip_vs_dest from ip_vs_rtable.
- * should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
- * Remove it from the ip_vs_rtable table.
+ * Remove it from the rs_table table.
*/
- if (!list_empty(&dest->d_list)) {
- list_del(&dest->d_list);
- INIT_LIST_HEAD(&dest->d_list);
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
}
-
- return 1;
}
-/*
- * Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
- const union nf_inet_addr *daddr,
- __be16 dport)
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
{
- unsigned hash;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int hash;
struct ip_vs_dest *dest;
- /*
- * Check for "full" addressed entries
- * Return the first found entry
- */
+ /* Check for "full" addressed entries */
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&__ip_vs_rs_lock);
- list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
- if ((dest->af == af)
- && ip_vs_addr_equal(af, &dest->addr, daddr)
- && (dest->port == dport)
- && ((dest->protocol == protocol) ||
- dest->vfwmark)) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
/* HIT */
- read_unlock(&__ip_vs_rs_lock);
- return dest;
+ rcu_read_unlock();
+ return true;
}
}
- read_unlock(&__ip_vs_rs_lock);
+ rcu_read_unlock();
- return NULL;
+ return false;
}
-/*
- * Lookup destination by {addr,port} in the given service
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
*/
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
@@ -630,7 +582,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find the destination for the given service
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->af == svc->af)
&& ip_vs_addr_equal(svc->af, &dest->addr, daddr)
&& (dest->port == dport)) {
@@ -644,32 +596,56 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
+ * Created to be used in ip_vs_process_message() in
* the backup synchronization daemon. It finds the
* destination to be bound to the received connection
* on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
+ * Called under RCU lock, no refcnt is returned.
*/
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
+ const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
- __be16 vport, __u16 protocol)
+ __be16 vport, __u16 protocol, __u32 fwmark,
+ __u32 flags)
{
struct ip_vs_dest *dest;
struct ip_vs_service *svc;
+ __be16 port = dport;
- svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+ svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
- dest = ip_vs_lookup_dest(svc, daddr, dport);
- if (dest)
- atomic_inc(&dest->refcnt);
- ip_vs_service_put(svc);
+ if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+ port = 0;
+ dest = ip_vs_lookup_dest(svc, daddr, port);
+ if (!dest)
+ dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
return dest;
}
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_dst *dest_dst = container_of(head,
+ struct ip_vs_dest_dst,
+ rcu_head);
+
+ dst_release(dest_dst->dst_cache);
+ kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst, 1);
+ if (old) {
+ RCU_INIT_POINTER(dest->dest_dst, NULL);
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+ }
+}
+
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
@@ -684,12 +660,14 @@ static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
- struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_dest *dest;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
@@ -705,28 +683,29 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
(ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
dest->vport == svc->port))) {
/* HIT */
- return dest;
- }
-
- /*
- * Try to purge the destination from trash if not referenced
- */
- if (atomic_read(&dest->refcnt) == 1) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
- "from trash\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(svc->af, &dest->addr),
- ntohs(dest->port));
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- kfree(dest);
+ list_del(&dest->t_list);
+ ip_vs_dest_hold(dest);
+ goto out;
}
}
- return NULL;
+ dest = NULL;
+
+out:
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ return dest;
}
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+ struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
+
+ __ip_vs_dst_cache_reset(dest);
+ __ip_vs_svc_put(svc, false);
+ free_percpu(dest->stats.cpustats);
+ ip_vs_dest_put_and_free(dest);
+}
/*
* Clean up all the destinations in the trash
@@ -735,27 +714,54 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* When the ip_vs_control_clearup is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
- * be 1, so we simply release them here.
+ * be 0, so we simply release them here.
*/
-static void ip_vs_trash_cleanup(void)
+static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- kfree(dest);
+ del_timer_sync(&ipvs->dest_trash_timer);
+ /* No need to use dest_trash_lock */
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
}
}
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
+
+ spin_lock_bh(&src->lock);
+
+ IP_VS_SHOW_STATS_COUNTER(conns);
+ IP_VS_SHOW_STATS_COUNTER(inpkts);
+ IP_VS_SHOW_STATS_COUNTER(outpkts);
+ IP_VS_SHOW_STATS_COUNTER(inbytes);
+ IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+ ip_vs_read_estimator(dst, src);
+
+ spin_unlock_bh(&src->lock);
+}
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
spin_lock_bh(&stats->lock);
- memset(&stats->ustats, 0, sizeof(stats->ustats));
+ /* get current counters as zero point, rates are zeroed */
+
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
+
+ IP_VS_ZERO_STATS_COUNTER(conns);
+ IP_VS_ZERO_STATS_COUNTER(inpkts);
+ IP_VS_ZERO_STATS_COUNTER(outpkts);
+ IP_VS_ZERO_STATS_COUNTER(inbytes);
+ IP_VS_ZERO_STATS_COUNTER(outbytes);
+
ip_vs_zero_estimator(stats);
spin_unlock_bh(&stats->lock);
@@ -768,6 +774,9 @@ static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_service *old_svc;
+ struct ip_vs_scheduler *sched;
int conn_flags;
/* set the weight and the flags */
@@ -780,23 +789,22 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
- * Put the real service in ip_vs_rtable if not present.
+ * Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&__ip_vs_rs_lock);
- ip_vs_rs_hash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ ip_vs_rs_hash(ipvs, dest);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
- if (!dest->svc) {
+ old_svc = rcu_dereference_protected(dest->svc, 1);
+ if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
- if (dest->svc != svc) {
- __ip_vs_unbind_svc(dest);
+ if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
+ __ip_vs_svc_put(old_svc, true);
}
}
@@ -808,28 +816,21 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
- spin_lock(&dest->dst_lock);
- ip_vs_dst_reset(dest);
- spin_unlock(&dest->dst_lock);
-
- if (add)
- ip_vs_new_estimator(&dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ spin_lock_bh(&dest->dst_lock);
+ __ip_vs_dst_cache_reset(dest);
+ spin_unlock_bh(&dest->dst_lock);
+ sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
- list_add(&dest->n_list, &svc->destinations);
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
}
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -841,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest **dest_p)
{
struct ip_vs_dest *dest;
- unsigned atype;
+ unsigned int atype, i;
EnterFunction(2);
@@ -850,20 +851,28 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
- !__ip_vs_addr_is_local_v6(&udest->addr.in6))
+ !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
- atype = inet_addr_type(&init_net, udest->addr.ip);
+ atype = inet_addr_type(svc->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
- if (dest == NULL) {
- pr_err("%s(): no memory.\n", __func__);
+ if (dest == NULL)
return -ENOMEM;
+
+ dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!dest->stats.cpustats)
+ goto err_alloc;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_dest_stats;
+ ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
+ u64_stats_init(&ip_vs_dest_stats->syncp);
}
dest->af = svc->af;
@@ -879,7 +888,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 1);
- INIT_LIST_HEAD(&dest->d_list);
+ INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
@@ -888,6 +897,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
LeaveFunction(2);
return 0;
+
+err_alloc:
+ kfree(dest);
+ return -ENOMEM;
}
@@ -917,10 +930,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Check if the dest already exists in the list
- */
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
@@ -942,11 +955,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- /*
- * Get the destination from the trash
- */
- list_del(&dest->n_list);
-
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
@@ -986,10 +994,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Lookup the destination list
- */
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
@@ -1002,48 +1010,33 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return 0;
}
-
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+ bool cleanup)
{
- ip_vs_kill_estimator(&dest->stats);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_stop_estimator(net, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&__ip_vs_rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
- /*
- * Decrease the refcnt of the dest, and free the dest
- * if nobody refers to it (refcnt=0). Otherwise, throw
- * the destination into the trash.
- */
- if (atomic_dec_and_test(&dest->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port));
- ip_vs_dst_reset(dest);
- /* simply decrease svc->refcnt here, let the caller check
- and release the service if nobody refers to it.
- Only user context can release destination and service,
- and only one user context can update virtual service at a
- time, so the operation here is OK */
- atomic_dec(&dest->svc->refcnt);
- kfree(dest);
- } else {
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
- "dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port),
- atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ip_vs_dest_trash);
- atomic_inc(&dest->refcnt);
- }
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash without reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = 0;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_dest_put(dest);
}
@@ -1059,14 +1052,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
/*
* Remove it from the d-linked destination list.
*/
- list_del(&dest->n_list);
+ list_del_rcu(&dest->n_list);
svc->num_dests--;
- /*
- * Call the update_service function of its scheduler
- */
- if (svcupd && svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
+ if (svcupd) {
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched->del_dest)
+ sched->del_dest(svc, dest);
+ }
}
@@ -1081,49 +1076,75 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
EnterFunction(2);
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
return -ENOENT;
}
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
/*
* Unlink dest from the service
*/
__ip_vs_unlink_dest(svc, dest, 1);
- write_unlock_bh(&__ip_vs_svc_lock);
-
/*
* Delete the destination
*/
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest, false);
LeaveFunction(2);
return 0;
}
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+ struct net *net = (struct net *) data;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest *dest, *next;
+ unsigned long now = jiffies;
+
+ spin_lock(&ipvs->dest_trash_lock);
+ list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+ if (atomic_read(&dest->refcnt) > 0)
+ continue;
+ if (dest->idle_start) {
+ if (time_before(now, dest->idle_start +
+ IP_VS_DEST_TRASH_PERIOD))
+ continue;
+ } else {
+ dest->idle_start = max(1UL, now);
+ continue;
+ }
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+ if (!list_empty(&ipvs->dest_trash))
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ spin_unlock(&ipvs->dest_trash_lock);
+}
/*
* Add a service into the service hash table
*/
static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
- int ret = 0;
+ int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
@@ -1137,7 +1158,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1147,9 +1168,13 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
#ifdef CONFIG_IP_VS_IPV6
- if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
- ret = -EINVAL;
- goto out_err;
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out_err;
+ }
}
#endif
@@ -1159,9 +1184,20 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ret = -ENOMEM;
goto out_err;
}
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_stats;
+ ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
+ u64_stats_init(&ip_vs_stats->syncp);
+ }
+
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1172,9 +1208,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
+ svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
- rwlock_init(&svc->sched_lock);
+ spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
@@ -1184,38 +1221,34 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
sched = NULL;
/* Bind the ct retriever */
- ip_vs_bind_pe(svc, pe);
+ RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
- atomic_inc(&ip_vs_ftpsvc_counter);
+ atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_inc(&ip_vs_nullsvc_counter);
+ atomic_inc(&ipvs->nullsvc_counter);
- ip_vs_new_estimator(&svc->stats);
+ ip_vs_start_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services++;
+ ipvs->num_services++;
/* Hash the service into the service table */
- write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_hash(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
*svc_p = svc;
+ /* Now there is a service - full throttle */
+ ipvs->enable = 1;
return 0;
+
out_err:
if (svc != NULL) {
- ip_vs_unbind_scheduler(svc);
- if (svc->inc) {
- local_bh_disable();
- ip_vs_app_inc_put(svc->inc);
- local_bh_enable();
- }
- kfree(svc);
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
@@ -1248,7 +1281,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = sched;
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1259,18 +1292,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
#ifdef CONFIG_IP_VS_IPV6
- if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
- ret = -EINVAL;
- goto out;
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out;
+ }
}
#endif
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched;
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
/*
* Set the flags and timeout value
@@ -1279,112 +1321,65 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
- old_sched = svc->scheduler;
- if (sched != old_sched) {
- /*
- * Unbind the old scheduler
- */
- if ((ret = ip_vs_unbind_scheduler(svc))) {
- old_sched = sched;
- goto out_unlock;
- }
-
- /*
- * Bind the new scheduler
- */
- if ((ret = ip_vs_bind_scheduler(svc, sched))) {
- /*
- * If ip_vs_bind_scheduler fails, restore the old
- * scheduler.
- * The main reason of failure is out of memory.
- *
- * The question is if the old scheduler can be
- * restored all the time. TODO: if it cannot be
- * restored some time, we must delete the service,
- * otherwise the system may crash.
- */
- ip_vs_bind_scheduler(svc, old_sched);
- old_sched = sched;
- goto out_unlock;
- }
- }
-
- old_pe = svc->pe;
- if (pe != old_pe) {
- ip_vs_unbind_pe(svc);
- ip_vs_bind_pe(svc, pe);
- }
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
- out_unlock:
- write_unlock_bh(&__ip_vs_svc_lock);
- out:
+out:
ip_vs_scheduler_put(old_sched);
ip_vs_pe_put(old_pe);
return ret;
}
-
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
* - We are called under _bh lock
*/
-static void __ip_vs_del_service(struct ip_vs_service *svc)
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services--;
+ ipvs->num_services--;
- ip_vs_kill_estimator(&svc->stats);
+ ip_vs_stop_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
- old_sched = svc->scheduler;
- ip_vs_unbind_scheduler(svc);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ ip_vs_unbind_scheduler(svc, old_sched);
ip_vs_scheduler_put(old_sched);
- /* Unbind persistence engine */
- old_pe = svc->pe;
- ip_vs_unbind_pe(svc);
+ /* Unbind persistence engine, keep svc->pe */
+ old_pe = rcu_dereference_protected(svc->pe, 1);
ip_vs_pe_put(old_pe);
- /* Unbind app inc */
- if (svc->inc) {
- ip_vs_app_inc_put(svc->inc);
- svc->inc = NULL;
- }
-
/*
* Unlink the whole destination list
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest, cleanup);
}
/*
* Update the virtual service counters
*/
if (svc->port == FTPPORT)
- atomic_dec(&ip_vs_ftpsvc_counter);
+ atomic_dec(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_dec(&ip_vs_nullsvc_counter);
+ atomic_dec(&ipvs->nullsvc_counter);
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
- svc->fwmark,
- IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- kfree(svc);
- }
+ __ip_vs_svc_put(svc, true);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1393,23 +1388,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static void ip_vs_unlink_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
+ /* Hold svc to avoid double release from dest_trash */
+ atomic_inc(&svc->refcnt);
/*
* Unhash it from the service table
*/
- write_lock_bh(&__ip_vs_svc_lock);
-
ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
- __ip_vs_del_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
+ __ip_vs_del_service(svc, cleanup);
}
/*
@@ -1419,7 +1407,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
{
if (svc == NULL)
return -EEXIST;
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, false);
return 0;
}
@@ -1428,17 +1416,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(void)
+static int ip_vs_flush(struct net *net, bool cleanup)
{
int idx;
- struct ip_vs_service *svc, *nxt;
+ struct ip_vs_service *svc;
+ struct hlist_node *n;
/*
- * Flush the service table hashed by <protocol,addr,port>
+ * Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- ip_vs_unlink_service(svc);
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+ s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1446,15 +1437,97 @@ static int ip_vs_flush(void)
* Flush the service table hashed by fwmark
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt,
- &ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_unlink_service(svc);
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+ f_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc, cleanup);
}
}
return 0;
}
+/*
+ * Delete service by {netns} in the service table.
+ * Called by __ip_vs_cleanup()
+ */
+void ip_vs_service_net_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ /* Check for "full" addressed entries */
+ mutex_lock(&__ip_vs_mutex);
+ ip_vs_flush(net, true);
+ mutex_unlock(&__ip_vs_mutex);
+ LeaveFunction(2);
+}
+
+/* Put all references for device (dst_cache) */
+static inline void
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
+{
+ struct ip_vs_dest_dst *dest_dst;
+
+ spin_lock_bh(&dest->dst_lock);
+ dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
+ if (dest_dst && dest_dst->dst_cache->dev == dev) {
+ IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
+ dev->name,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ __ip_vs_dst_cache_reset(dest);
+ }
+ spin_unlock_bh(&dest->dst_lock);
+
+}
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
+ */
+static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_service *svc;
+ struct ip_vs_dest *dest;
+ unsigned int idx;
+
+ if (event != NETDEV_DOWN || !ipvs)
+ return NOTIFY_DONE;
+ IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+ EnterFunction(2);
+ mutex_lock(&__ip_vs_mutex);
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net)) {
+ list_for_each_entry(dest, &svc->destinations,
+ n_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ }
+ }
+
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (net_eq(svc->net, net)) {
+ list_for_each_entry(dest, &svc->destinations,
+ n_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ }
+
+ }
+ }
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ mutex_unlock(&__ip_vs_mutex);
+ LeaveFunction(2);
+ return NOTIFY_DONE;
+}
/*
* Zero counters in a service or all services
@@ -1463,41 +1536,46 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
- write_lock_bh(&__ip_vs_svc_lock);
list_for_each_entry(dest, &svc->destinations, n_list) {
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
- write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
-static int ip_vs_zero_all(void)
+static int ip_vs_zero_all(struct net *net)
{
int idx;
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- ip_vs_zero_service(svc);
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_zero_service(svc);
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
- ip_vs_zero_stats(&ip_vs_stats);
+ ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
return 0;
}
+#ifdef CONFIG_SYSCTL
+
+static int zero;
+static int three = 3;
static int
-proc_do_defense_mode(ctl_table *table, int write,
+proc_do_defense_mode(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = current->nsproxy->net_ns;
int *valp = table->data;
int val = *valp;
int rc;
@@ -1508,15 +1586,14 @@ proc_do_defense_mode(ctl_table *table, int write,
/* Restore the correct value */
*valp = val;
} else {
- update_defense_level();
+ update_defense_level(net_ipvs(net));
}
}
return rc;
}
-
static int
-proc_do_sync_threshold(ctl_table *table, int write,
+proc_do_sync_threshold(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
@@ -1527,52 +1604,77 @@ proc_do_sync_threshold(ctl_table *table, int write,
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+ if (write && (valp[0] < 0 || valp[1] < 0 ||
+ (valp[0] >= valp[1] && valp[1]))) {
/* Restore the correct value */
memcpy(valp, val, sizeof(val));
}
return rc;
}
+static int
+proc_do_sync_mode(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 1)) {
+ /* Restore the correct value */
+ *valp = val;
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if (*valp < 1 || !is_power_of_2(*valp)) {
+ /* Restore the correct value */
+ *valp = val;
+ }
+ }
+ return rc;
+}
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ * Do not change order or insert new entries without
+ * align with netns init in ip_vs_control_net_init()
*/
static struct ctl_table vs_vars[] = {
{
.procname = "amemthresh",
- .data = &sysctl_ip_vs_amemthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_IP_VS_DEBUG
- {
- .procname = "debug_level",
- .data = &sysctl_ip_vs_debug_level,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#endif
{
.procname = "am_droprate",
- .data = &sysctl_ip_vs_am_droprate,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "drop_entry",
- .data = &sysctl_ip_vs_drop_entry,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "drop_packet",
- .data = &sysctl_ip_vs_drop_packet,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
@@ -1580,7 +1682,6 @@ static struct ctl_table vs_vars[] = {
#ifdef CONFIG_IP_VS_NFCT
{
.procname = "conntrack",
- .data = &sysctl_ip_vs_conntrack,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
@@ -1588,18 +1689,124 @@ static struct ctl_table vs_vars[] = {
#endif
{
.procname = "secure_tcp",
- .data = &sysctl_ip_vs_secure_tcp,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "snat_reroute",
- .data = &sysctl_ip_vs_snat_reroute,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .procname = "sync_version",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_mode,
+ },
+ {
+ .procname = "sync_ports",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_ports,
+ },
+ {
+ .procname = "sync_persist_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_qlen_max",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "sync_sock_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cache_bypass",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_nodest_conn",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_tcp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_sctp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_quiescent_template",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_threshold",
+ .maxlen =
+ sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+ .mode = 0644,
+ .proc_handler = proc_do_sync_threshold,
+ },
+ {
+ .procname = "sync_refresh_period",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "sync_retries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &three,
+ },
+ {
+ .procname = "nat_icmp_send",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "pmtu_disc",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "backup_only",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#if 0
{
.procname = "timeout_established",
@@ -1686,58 +1893,16 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
- {
- .procname = "cache_bypass",
- .data = &sysctl_ip_vs_cache_bypass,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_nodest_conn",
- .data = &sysctl_ip_vs_expire_nodest_conn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_quiescent_template",
- .data = &sysctl_ip_vs_expire_quiescent_template,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sync_threshold",
- .data = &sysctl_ip_vs_sync_threshold,
- .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
- .mode = 0644,
- .proc_handler = proc_do_sync_threshold,
- },
- {
- .procname = "nat_icmp_send",
- .data = &sysctl_ip_vs_nat_icmp_send,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-
-const struct ctl_path net_vs_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "vs", },
{ }
};
-EXPORT_SYMBOL_GPL(net_vs_ctl_path);
-static struct ctl_table_header * sysctl_header;
+#endif
#ifdef CONFIG_PROC_FS
struct ip_vs_iter {
- struct list_head *table;
+ struct seq_net_private p; /* Do not move this, netns depends upon it*/
+ struct hlist_head *table;
int bucket;
};
@@ -1745,7 +1910,7 @@ struct ip_vs_iter {
* Write the contents of the VS rule table to a PROCfs file.
* (It is kept just for backward compatibility)
*/
-static inline const char *ip_vs_fwd_name(unsigned flags)
+static inline const char *ip_vs_fwd_name(unsigned int flags)
{
switch (flags & IP_VS_CONN_F_FWD_MASK) {
case IP_VS_CONN_F_LOCALNODE:
@@ -1763,14 +1928,15 @@ static inline const char *ip_vs_fwd_name(unsigned flags)
/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
+ struct net *net = seq_file_net(seq);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (pos-- == 0){
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
@@ -1780,8 +1946,9 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (pos-- == 0) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+ f_list) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
@@ -1793,17 +1960,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
}
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
+ __acquires(RCU)
{
-
- read_lock_bh(&__ip_vs_svc_lock);
+ rcu_read_lock();
return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *e;
+ struct hlist_node *e;
struct ip_vs_iter *iter;
struct ip_vs_service *svc;
@@ -1816,13 +1982,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (iter->table == ip_vs_svc_table) {
/* next service in table hashed by protocol */
- if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, s_list);
-
+ e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, s_list);
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
- s_list) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_table[iter->bucket],
+ s_list) {
return svc;
}
}
@@ -1833,13 +2000,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* next service in hashed by fwmark */
- if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, f_list);
+ e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, f_list);
scan_fwmark:
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
- f_list)
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
return svc;
}
@@ -1847,9 +2016,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
+ __releases(RCU)
{
- read_unlock_bh(&__ip_vs_svc_lock);
+ rcu_read_unlock();
}
@@ -1867,6 +2036,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_service *svc = v;
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
@@ -1875,18 +2045,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
- svc->scheduler->name);
+ sched->name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- svc->scheduler->name,
+ sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
- svc->fwmark, svc->scheduler->name,
+ svc->fwmark, sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
@@ -1897,7 +2067,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
else
seq_putc(seq, '\n');
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
if (dest->af == AF_INET6)
seq_printf(seq,
@@ -1935,7 +2105,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &ip_vs_info_seq_ops,
+ return seq_open_net(inode, file, &ip_vs_info_seq_ops,
sizeof(struct ip_vs_iter));
}
@@ -1944,18 +2114,13 @@ static const struct file_operations ip_vs_info_fops = {
.open = ip_vs_info_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-#endif
-
-struct ip_vs_stats ip_vs_stats = {
- .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
+ .release = seq_release_net,
};
-#ifdef CONFIG_PROC_FS
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats_user show;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
@@ -1963,29 +2128,25 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_printf(seq,
" Conns Packets Packets Bytes Bytes\n");
- spin_lock_bh(&ip_vs_stats.lock);
- seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
- ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
- (unsigned long long) ip_vs_stats.ustats.inbytes,
- (unsigned long long) ip_vs_stats.ustats.outbytes);
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
+ show.inpkts, show.outpkts,
+ (unsigned long long) show.inbytes,
+ (unsigned long long) show.outbytes);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
- seq_printf(seq,"%8X %8X %8X %16X %16X\n",
- ip_vs_stats.ustats.cps,
- ip_vs_stats.ustats.inpps,
- ip_vs_stats.ustats.outpps,
- ip_vs_stats.ustats.inbps,
- ip_vs_stats.ustats.outbps);
- spin_unlock_bh(&ip_vs_stats.lock);
+ seq_printf(seq, "%8X %8X %8X %16X %16X\n",
+ show.cps, show.inpps, show.outpps,
+ show.inbps, show.outbps);
return 0;
}
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, ip_vs_stats_show, NULL);
+ return single_open_net(inode, file, ip_vs_stats_show);
}
static const struct file_operations ip_vs_stats_fops = {
@@ -1993,16 +2154,88 @@ static const struct file_operations ip_vs_stats_fops = {
.open = ip_vs_stats_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = single_release_net,
};
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
+ struct ip_vs_stats_user rates;
+ int i;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ "CPU Conns Packets Packets Bytes Bytes\n");
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&u->syncp);
+ inbytes = u->ustats.inbytes;
+ outbytes = u->ustats.outbytes;
+ } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+
+ seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+ i, u->ustats.conns, u->ustats.inpkts,
+ u->ustats.outpkts, (__u64)inbytes,
+ (__u64)outbytes);
+ }
+
+ spin_lock_bh(&tot_stats->lock);
+
+ seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
+ tot_stats->ustats.conns, tot_stats->ustats.inpkts,
+ tot_stats->ustats.outpkts,
+ (unsigned long long) tot_stats->ustats.inbytes,
+ (unsigned long long) tot_stats->ustats.outbytes);
+
+ ip_vs_read_estimator(&rates, tot_stats);
+
+ spin_unlock_bh(&tot_stats->lock);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, " %8X %8X %8X %16X %16X\n",
+ rates.cps,
+ rates.inpps,
+ rates.outpps,
+ rates.inbps,
+ rates.outbps);
+
+ return 0;
+}
+
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+static const struct file_operations ip_vs_stats_percpu_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_percpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
#endif
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
u->tcp_timeout,
u->tcp_fin_timeout,
@@ -2010,19 +2243,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
#endif
@@ -2087,6 +2323,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
+ struct net *net = sock_net(sk);
int ret;
unsigned char arg[MAX_ARG_LEN];
struct ip_vs_service_user *usvc_compat;
@@ -2094,8 +2331,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
struct ip_vs_service *svc;
struct ip_vs_dest_user *udest_compat;
struct ip_vs_dest_user_kern udest;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
@@ -2114,6 +2352,24 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* increase the module use count */
ip_vs_use_count_inc();
+ /* Handle daemons since they have another lock */
+ if (cmd == IP_VS_SO_SET_STARTDAEMON ||
+ cmd == IP_VS_SO_SET_STOPDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+
+ if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
+ ret = -ERESTARTSYS;
+ goto out_dec;
+ }
+ if (cmd == IP_VS_SO_SET_STARTDAEMON)
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
+ else
+ ret = stop_sync_thread(net, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ goto out_dec;
+ }
+
if (mutex_lock_interruptible(&__ip_vs_mutex)) {
ret = -ERESTARTSYS;
goto out_dec;
@@ -2121,19 +2377,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
- goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
- struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
- goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
- struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = stop_sync_thread(dm->state);
+ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
}
@@ -2148,7 +2396,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out_unlock;
}
}
@@ -2164,11 +2412,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
}
/* Lookup the exact service by <protocol, addr, port> or fwmark */
+ rcu_read_lock();
if (usvc.fwmark == 0)
- svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2181,7 +2431,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (svc != NULL)
ret = -EEXIST;
else
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
@@ -2218,21 +2468,16 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
static void
-ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
-{
- spin_lock_bh(&src->lock);
- memcpy(dst, &src->ustats, sizeof(*dst));
- spin_unlock_bh(&src->lock);
-}
-
-static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(src->scheduler, 1);
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+ strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2241,7 +2486,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
}
static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net,
+ const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
int idx, count=0;
@@ -2250,9 +2496,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
int ret = 0;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2269,9 +2515,9 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2286,29 +2532,32 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
count++;
}
}
- out:
+out:
return ret;
}
static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
union nf_inet_addr addr = { .ip = get->addr };
int ret = 0;
+ rcu_read_lock();
if (get->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
+ rcu_read_unlock();
if (svc) {
int count = 0;
struct ip_vs_dest *dest;
struct ip_vs_dest_entry entry;
+ memset(&entry, 0, sizeof(entry));
list_for_each_entry(dest, &svc->destinations, n_list) {
if (count >= get->num_dests)
break;
@@ -2336,17 +2585,23 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
+ memset(u, 0, sizeof (*u));
+
#ifdef CONFIG_IP_VS_PROTO_TCP
- u->tcp_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
- u->tcp_fin_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
u->udp_timeout =
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+ pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
@@ -2375,8 +2630,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
unsigned char arg[128];
int ret = 0;
unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (!capable(CAP_NET_ADMIN))
+ BUG_ON(!net);
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
@@ -2394,6 +2652,33 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
if (copy_from_user(arg, user, copylen) != 0)
return -EFAULT;
+ /*
+ * Handle daemons first since it has its own locking
+ */
+ if (cmd == IP_VS_SO_GET_DAEMON) {
+ struct ip_vs_daemon_user d[2];
+
+ memset(&d, 0, sizeof(d));
+ if (mutex_lock_interruptible(&ipvs->sync_mutex))
+ return -ERESTARTSYS;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
+ }
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
+ }
if (mutex_lock_interruptible(&__ip_vs_mutex))
return -ERESTARTSYS;
@@ -2418,7 +2703,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
info.size = ip_vs_conn_tab_size;
- info.num_services = ip_vs_num_services;
+ info.num_services = ipvs->num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
}
@@ -2437,7 +2722,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_service_entries(get, user);
+ ret = __ip_vs_get_service_entries(net, get, user);
}
break;
@@ -2449,11 +2734,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
+ rcu_read_lock();
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
+ rcu_read_unlock();
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2476,7 +2764,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_dest_entries(get, user);
+ ret = __ip_vs_get_dest_entries(net, get, user);
}
break;
@@ -2484,37 +2772,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
break;
- case IP_VS_SO_GET_DAEMON:
- {
- struct ip_vs_daemon_user d[2];
-
- memset(&d, 0, sizeof(d));
- if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
- d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
- d[0].syncid = ip_vs_master_syncid;
- }
- if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
- d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
- d[1].syncid = ip_vs_backup_syncid;
- }
- if (copy_to_user(user, &d, sizeof(d)) != 0)
- ret = -EFAULT;
- }
- break;
-
default:
ret = -EINVAL;
}
- out:
+out:
mutex_unlock(&__ip_vs_mutex);
return ret;
}
@@ -2542,6 +2810,7 @@ static struct genl_family ip_vs_genl_family = {
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
.maxattr = IPVS_CMD_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
};
/* Policy used for first-level command attributes */
@@ -2599,31 +2868,29 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
struct ip_vs_stats *stats)
{
+ struct ip_vs_stats_user ustats;
struct nlattr *nl_stats = nla_nest_start(skb, container_type);
if (!nl_stats)
return -EMSGSIZE;
- spin_lock_bh(&stats->lock);
-
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
- NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
- NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
-
- spin_unlock_bh(&stats->lock);
-
+ ip_vs_copy_stats(&ustats, stats);
+
+ if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
+ goto nla_put_failure;
nla_nest_end(skb, nl_stats);
return 0;
nla_put_failure:
- spin_unlock_bh(&stats->lock);
nla_nest_cancel(skb, nl_stats);
return -EMSGSIZE;
}
@@ -2631,6 +2898,8 @@ nla_put_failure:
static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_service *svc)
{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_pe *pe;
struct nlattr *nl_service;
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
@@ -2639,23 +2908,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
if (!nl_service)
return -EMSGSIZE;
- NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
-
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
+ goto nla_put_failure;
if (svc->fwmark) {
- NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+ if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
+ goto nla_put_failure;
} else {
- NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
- NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
- NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
+ nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
+ nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
+ goto nla_put_failure;
}
- NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
- if (svc->pe)
- NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
- NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
- NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
- NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
-
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ pe = rcu_dereference_protected(svc->pe, 1);
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
+ nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
+ nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
+ nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
+ goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
goto nla_put_failure;
@@ -2674,7 +2946,7 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb,
{
void *hdr;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_SERVICE);
if (!hdr)
@@ -2696,11 +2968,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
- if (++idx <= start)
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2710,8 +2983,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
}
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
- if (++idx <= start)
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2727,7 +3000,8 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+static int ip_vs_genl_parse_service(struct net *net,
+ struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
{
@@ -2765,15 +3039,17 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
} else {
usvc->protocol = nla_get_u16(nla_protocol);
nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
- usvc->port = nla_get_u16(nla_port);
+ usvc->port = nla_get_be16(nla_port);
usvc->fwmark = 0;
}
+ rcu_read_lock();
if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
- svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
+ rcu_read_unlock();
*ret_svc = svc;
/* If a full entry was requested, check for the additional fields */
@@ -2803,19 +3079,20 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
usvc->sched_name = nla_data(nla_sched);
usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
usvc->timeout = nla_get_u32(nla_timeout);
- usvc->netmask = nla_get_u32(nla_netmask);
+ usvc->netmask = nla_get_be32(nla_netmask);
}
return 0;
}
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+ struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -2827,21 +3104,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
if (!nl_dest)
return -EMSGSIZE;
- NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
- NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
-
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
- atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
- atomic_read(&dest->activeconns));
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
- atomic_read(&dest->inactconns));
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
- atomic_read(&dest->persistconns));
-
+ if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+ (atomic_read(&dest->conn_flags) &
+ IP_VS_CONN_F_FWD_MASK)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
+ atomic_read(&dest->weight)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+ atomic_read(&dest->activeconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+ atomic_read(&dest->inactconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+ atomic_read(&dest->persistconns)))
+ goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
goto nla_put_failure;
@@ -2859,7 +3137,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
{
void *hdr;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_DEST);
if (!hdr)
@@ -2883,6 +3161,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
@@ -2891,7 +3170,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
goto out_err;
- svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+
+ svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
@@ -2934,7 +3214,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
memset(udest, 0, sizeof(*udest));
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
- udest->port = nla_get_u16(nla_port);
+ udest->port = nla_get_be16(nla_port);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
@@ -2959,8 +3239,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
return 0;
}
-static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
- const char *mcast_ifn, __be32 syncid)
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid)
{
struct nlattr *nl_daemon;
@@ -2968,10 +3248,10 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
if (!nl_daemon)
return -EMSGSIZE;
- NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
- NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
- NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
-
+ if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
+ nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
+ nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+ goto nla_put_failure;
nla_nest_end(skb, nl_daemon);
return 0;
@@ -2981,12 +3261,12 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
- const char *mcast_ifn, __be32 syncid,
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid,
struct netlink_callback *cb)
{
void *hdr;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_DAEMON);
if (!hdr)
@@ -3005,56 +3285,61 @@ nla_put_failure:
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
- mutex_lock(&__ip_vs_mutex);
- if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+ struct net *net = skb_sknet(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&ipvs->sync_mutex);
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
- ip_vs_master_mcast_ifn,
- ip_vs_master_syncid, cb) < 0)
+ ipvs->master_mcast_ifn,
+ ipvs->master_syncid, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
}
- if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+ if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
- ip_vs_backup_mcast_ifn,
- ip_vs_backup_syncid, cb) < 0)
+ ipvs->backup_mcast_ifn,
+ ipvs->backup_syncid, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
}
nla_put_failure:
- mutex_unlock(&__ip_vs_mutex);
+ mutex_unlock(&ipvs->sync_mutex);
return skb->len;
}
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
{
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+ return start_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
{
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ return stop_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
}
-static int ip_vs_genl_set_config(struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3066,32 +3351,23 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
- return ip_vs_set_timeout(&t);
+ return ip_vs_set_timeout(net, &t);
}
-static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
- struct ip_vs_service *svc = NULL;
- struct ip_vs_service_user_kern usvc;
- struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
- int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
- mutex_lock(&__ip_vs_mutex);
-
- if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush();
- goto out;
- } else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(info->attrs);
- goto out;
- } else if (cmd == IPVS_CMD_NEW_DAEMON ||
- cmd == IPVS_CMD_DEL_DAEMON) {
-
+ if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+ mutex_lock(&ipvs->sync_mutex);
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
info->attrs[IPVS_CMD_ATTR_DAEMON],
@@ -3101,13 +3377,38 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
if (cmd == IPVS_CMD_NEW_DAEMON)
- ret = ip_vs_genl_new_daemon(daemon_attrs);
+ ret = ip_vs_genl_new_daemon(net, daemon_attrs);
else
- ret = ip_vs_genl_del_daemon(daemon_attrs);
+ ret = ip_vs_genl_del_daemon(net, daemon_attrs);
+out:
+ mutex_unlock(&ipvs->sync_mutex);
+ }
+ return ret;
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ip_vs_service *svc = NULL;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_dest_user_kern udest;
+ int ret = 0, cmd;
+ int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ if (cmd == IPVS_CMD_FLUSH) {
+ ret = ip_vs_flush(net, false);
+ goto out;
+ } else if (cmd == IPVS_CMD_SET_CONFIG) {
+ ret = ip_vs_genl_set_config(net, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out;
}
@@ -3117,7 +3418,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
- ret = ip_vs_genl_parse_service(&usvc,
+ ret = ip_vs_genl_parse_service(net, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
@@ -3147,7 +3448,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
else
ret = -EEXIST;
break;
@@ -3185,7 +3486,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
+ struct net *net;
+ net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3214,7 +3517,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc;
- svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
goto out_err;
@@ -3234,23 +3538,28 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
- NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
- NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
- t.tcp_fin_timeout);
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
+ t.tcp_timeout) ||
+ nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+ t.tcp_fin_timeout))
+ goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
- NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
+ goto nla_put_failure;
#endif
break;
}
case IPVS_CMD_GET_INFO:
- NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
- NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
- ip_vs_conn_tab_size);
+ if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
+ IP_VS_VERSION_CODE) ||
+ nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+ ip_vs_conn_tab_size))
+ goto nla_put_failure;
break;
}
@@ -3271,7 +3580,7 @@ out:
}
-static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+static const struct genl_ops ip_vs_genl_ops[] = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
.flags = GENL_ADMIN_PERM,
@@ -3325,13 +3634,13 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
.cmd = IPVS_CMD_NEW_DAEMON,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
- .doit = ip_vs_genl_set_cmd,
+ .doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_DEL_DAEMON,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
- .doit = ip_vs_genl_set_cmd,
+ .doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_GET_DAEMON,
@@ -3370,7 +3679,7 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
static int __init ip_vs_genl_register(void)
{
return genl_register_family_with_ops(&ip_vs_genl_family,
- ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
+ ip_vs_genl_ops);
}
static void ip_vs_genl_unregister(void)
@@ -3380,46 +3689,212 @@ static void ip_vs_genl_unregister(void)
/* End of Generic Netlink interface definitions */
-
-int __init ip_vs_control_init(void)
+/*
+ * per netns intit/exit func.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
{
- int ret;
int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ ipvs->sysctl_sync_ports = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ports;
+ tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+ ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+ ipvs->sysctl_sync_sock_size = 0;
+ tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+ ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+ tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+ ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+ tbl[idx++].data = &ipvs->sysctl_sync_retries;
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+ ipvs->sysctl_pmtu_disc = 1;
+ tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+ tbl[idx++].data = &ipvs->sysctl_backup_only;
+
+
+ ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return -ENOMEM;
+ }
+ ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
- EnterFunction(2);
+ return 0;
+}
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
- }
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ cancel_delayed_work_sync(&ipvs->defense_work);
+ cancel_work_sync(&ipvs->defense_work.work);
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ ip_vs_stop_estimator(net, &ipvs->tot_stats);
+}
+
+#else
+
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+
+#endif
+
+static struct notifier_block ip_vs_dst_notifier = {
+ .notifier_call = ip_vs_dst_event,
+};
+
+int __net_init ip_vs_control_net_init(struct net *net)
+{
+ int i, idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /* Initialize rs_table */
+ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+ INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
+
+ INIT_LIST_HEAD(&ipvs->dest_trash);
+ spin_lock_init(&ipvs->dest_trash_lock);
+ setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+ (unsigned long) net);
+ atomic_set(&ipvs->ftpsvc_counter, 0);
+ atomic_set(&ipvs->nullsvc_counter, 0);
+
+ /* procfs stats */
+ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!ipvs->tot_stats.cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ipvs_tot_stats;
+ ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
+ u64_stats_init(&ipvs_tot_stats->syncp);
}
- smp_wmb();
+
+ spin_lock_init(&ipvs->tot_stats.lock);
+
+ proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
+ proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
+ proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+ &ip_vs_stats_percpu_fops);
+
+ if (ip_vs_control_net_init_sysctl(net))
+ goto err;
+
+ return 0;
+
+err:
+ free_percpu(ipvs->tot_stats.cpustats);
+ return -ENOMEM;
+}
+
+void __net_exit ip_vs_control_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_control_net_cleanup_sysctl(net);
+ remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
+ remove_proc_entry("ip_vs_stats", net->proc_net);
+ remove_proc_entry("ip_vs", net->proc_net);
+ free_percpu(ipvs->tot_stats.cpustats);
+}
+
+int __init ip_vs_register_nl_ioctl(void)
+{
+ int ret;
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
- return ret;
+ goto err_sock;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
- nf_unregister_sockopt(&ip_vs_sockopts);
- return ret;
+ goto err_genl;
}
+ return 0;
- proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
+err_genl:
+ nf_unregister_sockopt(&ip_vs_sockopts);
+err_sock:
+ return ret;
+}
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
+void ip_vs_unregister_nl_ioctl(void)
+{
+ ip_vs_genl_unregister();
+ nf_unregister_sockopt(&ip_vs_sockopts);
+}
- ip_vs_new_estimator(&ip_vs_stats);
+int __init ip_vs_control_init(void)
+{
+ int idx;
+ int ret;
- /* Hook the defense timer */
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+ EnterFunction(2);
+
+ /* Initialize svc_table, ip_vs_svc_fwm_table */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+
+ smp_wmb(); /* Do we really need it now ? */
+
+ ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+ if (ret < 0)
+ return ret;
LeaveFunction(2);
return 0;
@@ -3429,14 +3904,6 @@ int __init ip_vs_control_init(void)
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
- ip_vs_trash_cleanup();
- cancel_rearming_delayed_work(&defense_work);
- cancel_work_sync(&defense_work.work);
- ip_vs_kill_estimator(&ip_vs_stats);
- unregister_sysctl_table(sysctl_header);
- proc_net_remove(&init_net, "ip_vs_stats");
- proc_net_remove(&init_net, "ip_vs");
- ip_vs_genl_unregister();
- nf_unregister_sockopt(&ip_vs_sockopts);
+ unregister_netdevice_notifier(&ip_vs_dst_notifier);
LeaveFunction(2);
}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 95fd0d14200..c3b84546ea9 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -51,7 +51,7 @@
* IPVS DH bucket
*/
struct ip_vs_dh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -64,11 +64,15 @@ struct ip_vs_dh_bucket {
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
/*
* Returns hash value for IPVS DH entry
*/
-static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
{
__be32 addr_fold = addr->ip;
@@ -85,10 +89,9 @@ static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
{
- return (tbl[ip_vs_dh_hashkey(af, addr)]).dest;
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
}
@@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_dh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
p = p->next;
}
@@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
{
int i;
struct ip_vs_dh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -145,52 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl;
+ struct ip_vs_dh_state *s;
/* allocate the DH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
- GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- }
- svc->sched_data = tbl;
+
+ svc->sched_data = s;
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_dh_reassign(s, svc);
return 0;
}
-static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
+ struct ip_vs_dh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ ip_vs_dh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ struct ip_vs_dh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ ip_vs_dh_reassign(s, svc);
return 0;
}
@@ -210,27 +214,26 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Destination hashing scheduling
*/
static struct ip_vs_dest *
-ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
- struct ip_vs_dh_bucket *tbl;
- struct ip_vs_iphdr iph;
-
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+ struct ip_vs_dh_state *s;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
- dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr);
+ s = (struct ip_vs_dh_state *) svc->sched_data;
+ dest = ip_vs_dh_get(svc->af, s, &iph->daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
@@ -249,7 +252,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
.init_service = ip_vs_dh_init_svc,
.done_service = ip_vs_dh_done_svc,
- .update_service = ip_vs_dh_update_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
.schedule = ip_vs_dh_schedule,
};
@@ -263,6 +267,7 @@ static int __init ip_vs_dh_init(void)
static void __exit ip_vs_dh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801962e..1425e9a924c 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -8,8 +8,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
- *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats()) do the per cpu summing.
*/
#define KMSG_COMPONENT "IPVS"
@@ -48,11 +52,44 @@
*/
-static void estimation_timer(unsigned long arg);
+/*
+ * Make a summary from each cpu
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+ struct ip_vs_cpu_stats __percpu *stats)
+{
+ int i;
+ bool add = false;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+ if (add) {
+ sum->conns += s->ustats.conns;
+ sum->inpkts += s->ustats.inpkts;
+ sum->outpkts += s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ inbytes = s->ustats.inbytes;
+ outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ sum->inbytes += inbytes;
+ sum->outbytes += outbytes;
+ } else {
+ add = true;
+ sum->conns = s->ustats.conns;
+ sum->inpkts = s->ustats.inpkts;
+ sum->outpkts = s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ sum->inbytes = s->ustats.inbytes;
+ sum->outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ }
+ }
+}
-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
-static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
static void estimation_timer(unsigned long arg)
{
@@ -62,12 +99,16 @@ static void estimation_timer(unsigned long arg)
u32 n_inpkts, n_outpkts;
u64 n_inbytes, n_outbytes;
u32 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
- spin_lock(&est_lock);
- list_for_each_entry(e, &est_list, list) {
+ ipvs = net_ipvs(net);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
spin_lock(&s->lock);
+ ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
n_conns = s->ustats.conns;
n_inpkts = s->ustats.inpkts;
n_outpkts = s->ustats.outpkts;
@@ -75,81 +116,64 @@ static void estimation_timer(unsigned long arg)
n_outbytes = s->ustats.outbytes;
/* scaled by 2^10, but divided 2 seconds */
- rate = (n_conns - e->last_conns)<<9;
+ rate = (n_conns - e->last_conns) << 9;
e->last_conns = n_conns;
- e->cps += ((long)rate - (long)e->cps)>>2;
- s->ustats.cps = (e->cps+0x1FF)>>10;
+ e->cps += ((long)rate - (long)e->cps) >> 2;
- rate = (n_inpkts - e->last_inpkts)<<9;
+ rate = (n_inpkts - e->last_inpkts) << 9;
e->last_inpkts = n_inpkts;
- e->inpps += ((long)rate - (long)e->inpps)>>2;
- s->ustats.inpps = (e->inpps+0x1FF)>>10;
+ e->inpps += ((long)rate - (long)e->inpps) >> 2;
- rate = (n_outpkts - e->last_outpkts)<<9;
+ rate = (n_outpkts - e->last_outpkts) << 9;
e->last_outpkts = n_outpkts;
- e->outpps += ((long)rate - (long)e->outpps)>>2;
- s->ustats.outpps = (e->outpps+0x1FF)>>10;
+ e->outpps += ((long)rate - (long)e->outpps) >> 2;
- rate = (n_inbytes - e->last_inbytes)<<4;
+ rate = (n_inbytes - e->last_inbytes) << 4;
e->last_inbytes = n_inbytes;
- e->inbps += ((long)rate - (long)e->inbps)>>2;
- s->ustats.inbps = (e->inbps+0xF)>>5;
+ e->inbps += ((long)rate - (long)e->inbps) >> 2;
- rate = (n_outbytes - e->last_outbytes)<<4;
+ rate = (n_outbytes - e->last_outbytes) << 4;
e->last_outbytes = n_outbytes;
- e->outbps += ((long)rate - (long)e->outbps)>>2;
- s->ustats.outbps = (e->outbps+0xF)>>5;
+ e->outbps += ((long)rate - (long)e->outbps) >> 2;
spin_unlock(&s->lock);
}
- spin_unlock(&est_lock);
- mod_timer(&est_timer, jiffies + 2*HZ);
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_new_estimator(struct ip_vs_stats *stats)
+void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
- est->last_conns = stats->ustats.conns;
- est->cps = stats->ustats.cps<<10;
-
- est->last_inpkts = stats->ustats.inpkts;
- est->inpps = stats->ustats.inpps<<10;
-
- est->last_outpkts = stats->ustats.outpkts;
- est->outpps = stats->ustats.outpps<<10;
-
- est->last_inbytes = stats->ustats.inbytes;
- est->inbps = stats->ustats.inbps<<5;
-
- est->last_outbytes = stats->ustats.outbytes;
- est->outbps = stats->ustats.outbps<<5;
-
- spin_lock_bh(&est_lock);
- list_add(&est->list, &est_list);
- spin_unlock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
}
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
- spin_lock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
list_del(&est->list);
- spin_unlock_bh(&est_lock);
+ spin_unlock_bh(&ipvs->est_lock);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
-
- /* set counters zero, caller must hold the stats->lock lock */
- est->last_inbytes = 0;
- est->last_outbytes = 0;
- est->last_conns = 0;
- est->last_inpkts = 0;
- est->last_outpkts = 0;
+ struct ip_vs_stats_user *u = &stats->ustats;
+
+ /* reset counters, caller must hold the stats->lock lock */
+ est->last_inbytes = u->inbytes;
+ est->last_outbytes = u->outbytes;
+ est->last_conns = u->conns;
+ est->last_inpkts = u->inpkts;
+ est->last_outpkts = u->outpkts;
est->cps = 0;
est->inpps = 0;
est->outpps = 0;
@@ -157,13 +181,31 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
est->outbps = 0;
}
-int __init ip_vs_estimator_init(void)
+/* Get decoded rates */
+void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
+ struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *e = &stats->est;
+
+ dst->cps = (e->cps + 0x1FF) >> 10;
+ dst->inpps = (e->inpps + 0x1FF) >> 10;
+ dst->outpps = (e->outpps + 0x1FF) >> 10;
+ dst->inbps = (e->inbps + 0xF) >> 5;
+ dst->outbps = (e->outbps + 0xF) >> 5;
+}
+
+int __net_init ip_vs_estimator_net_init(struct net *net)
{
- mod_timer(&est_timer, jiffies + 2 * HZ);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
return 0;
}
-void ip_vs_estimator_cleanup(void)
+void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
{
- del_timer_sync(&est_timer);
+ del_timer_sync(&net_ipvs(net)->est_timer);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 75455000ad1..77c173282f3 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -44,16 +44,17 @@
#include <net/ip_vs.h>
-#define SERVER_STRING "227 Entering Passive Mode ("
-#define CLIENT_STRING "PORT "
+#define SERVER_STRING "227 "
+#define CLIENT_STRING "PORT"
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
*/
+static unsigned int ports_count = 1;
static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
-module_param_array(ports, ushort, NULL, 0);
+module_param_array(ports, ushort, &ports_count, 0444);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
@@ -79,14 +80,17 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
/*
* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern" and terminated with the "term" character.
+ * with the "pattern", ignoring before "skip" and terminated with
+ * the "term" character.
* <addr,port> is in network order.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
- const char *pattern, size_t plen, char term,
+ const char *pattern, size_t plen,
+ char skip, char term,
__be32 *addr, __be16 *port,
char **start, char **end)
{
+ char *s, c;
unsigned char p[6];
int i = 0;
@@ -101,19 +105,38 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (strnicmp(data, pattern, plen) != 0) {
return 0;
}
- *start = data + plen;
+ s = data + plen;
+ if (skip) {
+ int found = 0;
+
+ for (;; s++) {
+ if (s == data_limit)
+ return -1;
+ if (!found) {
+ if (*s == skip)
+ found = 1;
+ } else if (*s != skip) {
+ break;
+ }
+ }
+ }
- for (data = *start; *data != term; data++) {
+ for (data = s; ; data++) {
if (data == data_limit)
return -1;
+ if (*data == term)
+ break;
}
*end = data;
memset(p, 0, sizeof(p));
- for (data = *start; data != *end; data++) {
- if (*data >= '0' && *data <= '9') {
- p[i] = p[i]*10 + *data - '0';
- } else if (*data == ',' && i < 5) {
+ for (data = s; ; data++) {
+ c = *data;
+ if (c == term)
+ break;
+ if (c >= '0' && c <= '9') {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
i++;
} else {
/* unexpected character */
@@ -124,8 +147,9 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (i != 5)
return -1;
- *addr = get_unaligned((__be32 *)p);
- *port = get_unaligned((__be16 *)(p + 4));
+ *start = s;
+ *addr = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
return 1;
}
@@ -153,10 +177,11 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
__be16 port;
struct ip_vs_conn *n_cp;
char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
- unsigned buf_len;
+ unsigned int buf_len;
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -184,7 +209,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (ip_vs_ftp_get_addrport(data, data_limit,
SERVER_STRING,
- sizeof(SERVER_STRING)-1, ')',
+ sizeof(SERVER_STRING)-1,
+ '(', ')',
&from.ip, &port,
&start, &end) != 1)
return 1;
@@ -197,18 +223,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol,
- &from, port, &cp->caddr, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
n_cp = ip_vs_conn_out_get(&p);
}
if (!n_cp) {
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
0, &cp->vaddr, port, &p);
n_cp = ip_vs_conn_new(&p, &from, port,
IP_VS_CONN_F_NO_CPORT |
IP_VS_CONN_F_NFCT,
- cp->dest);
+ cp->dest, skb->mark);
if (!n_cp)
return 0;
@@ -239,9 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* hopefully it will succeed on the retransmitted
* packet.
*/
+ rcu_read_lock();
ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ iph->ihl * 4,
start-data, end-start,
buf, buf_len);
+ rcu_read_unlock();
if (ret) {
ip_vs_nfct_expect_related(skb, ct, n_cp,
IPPROTO_TCP, 0, 0);
@@ -257,8 +288,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* would be adjusted twice.
*/
+ net = skb_net(skb);
cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
@@ -287,6 +319,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -340,7 +373,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
if (ip_vs_ftp_get_addrport(data_start, data_limit,
CLIENT_STRING, sizeof(CLIENT_STRING)-1,
- '\r', &to.ip, &port,
+ ' ', '\r', &to.ip, &port,
&start, &end) != 1)
return 1;
@@ -358,14 +391,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
- &cp->vaddr, htons(ntohs(cp->vport)-1),
- &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
n_cp = ip_vs_conn_new(&p, &cp->daddr,
htons(ntohs(cp->dport)-1),
- IP_VS_CONN_F_NFCT, cp->dest);
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
if (!n_cp)
return 0;
@@ -377,7 +411,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Move tunnel to listen state
*/
- ip_vs_tcp_conn_listen(n_cp);
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return 1;
@@ -398,42 +433,66 @@ static struct ip_vs_app ip_vs_ftp = {
.pkt_in = ip_vs_ftp_in,
};
-
/*
- * ip_vs_ftp initialization
+ * per netns ip_vs_ftp initialization
*/
-static int __init ip_vs_ftp_init(void)
+static int __net_init __ip_vs_ftp_init(struct net *net)
{
int i, ret;
- struct ip_vs_app *app = &ip_vs_ftp;
+ struct ip_vs_app *app;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- ret = register_ip_vs_app(app);
- if (ret)
- return ret;
+ if (!ipvs)
+ return -ENOENT;
+
+ app = register_ip_vs_app(net, &ip_vs_ftp);
+ if (IS_ERR(app))
+ return PTR_ERR(app);
- for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+ for (i = 0; i < ports_count; i++) {
if (!ports[i])
continue;
- ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+ ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
if (ret)
- break;
+ goto err_unreg;
pr_info("%s: loaded support on port[%d] = %d\n",
app->name, i, ports[i]);
}
+ return 0;
- if (ret)
- unregister_ip_vs_app(app);
-
+err_unreg:
+ unregister_ip_vs_app(net, &ip_vs_ftp);
return ret;
}
+/*
+ * netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+}
+static struct pernet_operations ip_vs_ftp_ops = {
+ .init = __ip_vs_ftp_init,
+ .exit = __ip_vs_ftp_exit,
+};
+
+static int __init ip_vs_ftp_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
+ return rv;
+}
/*
* ip_vs_ftp finish.
*/
static void __exit ip_vs_ftp_exit(void)
{
- unregister_ip_vs_app(&ip_vs_ftp);
+ unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9323f894419..547ff33c1ef 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -63,6 +63,8 @@
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
@@ -70,7 +72,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
/*
@@ -89,11 +90,12 @@ static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
* IP address and its destination server
*/
struct ip_vs_lblc_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -101,48 +103,53 @@ struct ip_vs_lblc_entry {
* IPVS lblc hash table
*/
struct ip_vs_lblc_table {
- struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
- struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
/*
* IPVS LBLC sysctl table
*/
-
-static ctl_table vs_vars_table[] = {
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vs_vars_table[] = {
{
.procname = "lblc_expiration",
- .data = &sysctl_ip_vs_lblc_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{ }
};
+#endif
-static struct ctl_table_header * sysctl_header;
-
-static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
- list_del(&en->list);
- /*
- * We don't kfree dest because it is refered either by its service
- * or the trash dest list.
- */
- atomic_dec(&en->dest->refcnt);
+ struct ip_vs_lblc_entry *en = container_of(head,
+ struct ip_vs_lblc_entry,
+ rcu_head);
+
+ ip_vs_dest_put_and_free(en->dest);
kfree(en);
}
+static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
+}
/*
* Returns hash value for IPVS LBLC entry
*/
-static inline unsigned
+static inline unsigned int
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
__be32 addr_fold = addr->ip;
@@ -163,25 +170,22 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
- unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr);
+ unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- * lock
- */
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
const union nf_inet_addr *addr)
{
- unsigned hash = ip_vs_lblc_hashkey(af, addr);
+ unsigned int hash = ip_vs_lblc_hashkey(af, addr);
struct ip_vs_lblc_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -191,7 +195,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
/*
* Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
+ * address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
@@ -200,26 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
struct ip_vs_lblc_entry *en;
en = ip_vs_lblc_get(dest->af, tbl, daddr);
- if (!en) {
- en = kmalloc(sizeof(*en), GFP_ATOMIC);
- if (!en) {
- pr_err("%s(): no memory\n", __func__);
- return NULL;
- }
+ if (en) {
+ if (en->dest == dest)
+ return en;
+ ip_vs_lblc_del(en);
+ }
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
- en->af = dest->af;
- ip_vs_addr_copy(dest->af, &en->addr, daddr);
- en->lastuse = jiffies;
+ en->af = dest->af;
+ ip_vs_addr_copy(dest->af, &en->addr, daddr);
+ en->lastuse = jiffies;
- atomic_inc(&dest->refcnt);
- en->dest = dest;
+ ip_vs_dest_hold(dest);
+ en->dest = dest;
- ip_vs_lblc_hash(tbl, en);
- } else if (en->dest != dest) {
- atomic_dec(&en->dest->refcnt);
- atomic_inc(&dest->refcnt);
- en->dest = dest;
- }
+ ip_vs_lblc_hash(tbl, en);
return en;
}
@@ -228,40 +229,56 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
int i;
- for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
- ip_vs_lblc_free(en);
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
+static int sysctl_lblc_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblc_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
unsigned long now = jiffies;
int i, j;
- for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now,
- en->lastuse + sysctl_ip_vs_lblc_expiration))
+ en->lastuse +
+ sysctl_lblc_expiration(svc)))
continue;
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -285,7 +302,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -303,26 +321,26 @@ static void ip_vs_lblc_check_expire(unsigned long data)
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
- for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
continue;
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
tbl->rover = j;
out:
- mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}
@@ -334,11 +352,10 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblc_table for this service
*/
- tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
return -ENOMEM;
- }
+
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
"current service\n", sizeof(*tbl));
@@ -346,12 +363,13 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
/*
* Initialize the hash buckets
*/
- for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -364,7 +382,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
@@ -372,14 +390,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblc_flush(tbl);
+ ip_vs_lblc_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -390,12 +406,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
int loh, doh;
/*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
+ * We use the following formula to estimate the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
@@ -406,13 +417,12 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -422,14 +432,13 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -457,7 +466,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -472,20 +481,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
- struct ip_vs_iphdr iph;
struct ip_vs_dest *dest = NULL;
struct ip_vs_lblc_entry *en;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
- en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
+ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
@@ -499,30 +505,28 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* free up entries from the trash at any time.
*/
- if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
- dest = en->dest;
+ dest = en->dest;
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
}
- read_unlock(&svc->sched_lock);
-
- /* If the destination has a weight and is not overloaded, use it */
- if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
- goto out;
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc);
if (!dest) {
- IP_VS_ERR_RL("LBLC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblc_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph->daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
return dest;
@@ -532,8 +536,7 @@ out:
/*
* IPVS LBLC Scheduler structure
*/
-static struct ip_vs_scheduler ip_vs_lblc_scheduler =
-{
+static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
.name = "lblc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
@@ -543,23 +546,85 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
.schedule = ip_vs_lblc_schedule,
};
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblc_ctl_table[0].procname = NULL;
+
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+ ipvs->lblc_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblc_ops = {
+ .init = __ip_vs_lblc_init,
+ .exit = __ip_vs_lblc_exit,
+};
static int __init ip_vs_lblc_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblc_ops);
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
return ret;
}
-
static void __exit ip_vs_lblc_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
+ rcu_barrier();
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index dbeed8ea421..3f21a2f47de 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -63,6 +63,8 @@
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
@@ -70,8 +72,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
/*
* for IPVS lblcr entry hash table
@@ -89,42 +89,49 @@ static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
*/
struct ip_vs_dest_set_elem {
struct list_head list; /* list link */
- struct ip_vs_dest *dest; /* destination server */
+ struct ip_vs_dest *dest; /* destination server */
+ struct rcu_head rcu_head;
};
struct ip_vs_dest_set {
atomic_t size; /* set size */
unsigned long lastmod; /* last modified time */
struct list_head list; /* destination list */
- rwlock_t lock; /* lock for this list */
};
-static struct ip_vs_dest_set_elem *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
{
struct ip_vs_dest_set_elem *e;
- list_for_each_entry(e, &set->list, list) {
- if (e->dest == dest)
- /* already existed */
- return NULL;
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest)
+ return;
+ }
}
e = kmalloc(sizeof(*e), GFP_ATOMIC);
- if (e == NULL) {
- pr_err("%s(): no memory\n", __func__);
- return NULL;
- }
+ if (e == NULL)
+ return;
- atomic_inc(&dest->refcnt);
+ ip_vs_dest_hold(dest);
e->dest = dest;
- list_add(&e->list, &set->list);
+ list_add_rcu(&e->list, &set->list);
atomic_inc(&set->size);
set->lastmod = jiffies;
- return e;
+}
+
+static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
+ ip_vs_dest_put_and_free(e->dest);
+ kfree(e);
}
static void
@@ -137,9 +144,8 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
/* HIT */
atomic_dec(&set->size);
set->lastmod = jiffies;
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
break;
}
}
@@ -149,17 +155,10 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
struct ip_vs_dest_set_elem *e, *ep;
- write_lock(&set->lock);
list_for_each_entry_safe(e, ep, &set->list, list) {
- /*
- * We don't kfree dest because it is refered either
- * by its service or by the trash dest list.
- */
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
}
- write_unlock(&set->lock);
}
/* get weighted least-connection node in the destination set */
@@ -169,19 +168,15 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
struct ip_vs_dest *dest, *least;
int loh, doh;
- if (set == NULL)
- return NULL;
-
/* select the first destination server, whose weight > 0 */
- list_for_each_entry(e, &set->list, list) {
+ list_for_each_entry_rcu(e, &set->list, list) {
least = e->dest;
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -189,15 +184,14 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* find the destination with the weighted least load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
dest = e->dest;
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if ((loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))
+ doh = ip_vs_dest_conn_overhead(dest);
+ if (((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
least = dest;
loh = doh;
@@ -230,8 +224,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
list_for_each_entry(e, &set->list, list) {
most = e->dest;
if (atomic_read(&most->weight) > 0) {
- moh = atomic_read(&most->activeconns) * 50
- + atomic_read(&most->inactconns);
+ moh = ip_vs_dest_conn_overhead(most);
goto nextstage;
}
}
@@ -239,13 +232,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* find the destination with the weighted most load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
+ list_for_each_entry_continue(e, &set->list, list) {
dest = e->dest;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
- if ((moh * atomic_read(&dest->weight) <
- doh * atomic_read(&most->weight))
+ if (((__s64)moh * atomic_read(&dest->weight) <
+ (__s64)doh * atomic_read(&most->weight))
&& (atomic_read(&dest->weight) > 0)) {
most = dest;
moh = doh;
@@ -268,11 +260,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
* IP address and its destination server set
*/
struct ip_vs_lblcr_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
struct ip_vs_dest_set set; /* destination server set */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -280,44 +273,46 @@ struct ip_vs_lblcr_entry {
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
- struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
+#ifdef CONFIG_SYSCTL
/*
* IPVS LBLCR sysctl table
*/
-static ctl_table vs_vars_table[] = {
+static struct ctl_table vs_vars_table[] = {
{
.procname = "lblcr_expiration",
- .data = &sysctl_ip_vs_lblcr_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{ }
};
-
-static struct ctl_table_header * sysctl_header;
+#endif
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
- list_del(&en->list);
+ hlist_del_rcu(&en->list);
ip_vs_dest_set_eraseall(&en->set);
- kfree(en);
+ kfree_rcu(en, rcu_head);
}
/*
* Returns hash value for IPVS LBLCR entry
*/
-static inline unsigned
+static inline unsigned int
ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
{
__be32 addr_fold = addr->ip;
@@ -338,25 +333,22 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
- unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
+ unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- * read lock.
- */
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
const union nf_inet_addr *addr)
{
- unsigned hash = ip_vs_lblcr_hashkey(af, addr);
+ unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
struct ip_vs_lblcr_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -366,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
/*
* Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
+ * IP address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
@@ -377,10 +369,8 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
en = ip_vs_lblcr_get(dest->af, tbl, daddr);
if (!en) {
en = kmalloc(sizeof(*en), GFP_ATOMIC);
- if (!en) {
- pr_err("%s(): no memory\n", __func__);
+ if (!en)
return NULL;
- }
en->af = dest->af;
ip_vs_addr_copy(dest->af, &en->addr, daddr);
@@ -389,14 +379,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/* initialize its dest set */
atomic_set(&(en->set.size), 0);
INIT_LIST_HEAD(&en->set.list);
- rwlock_init(&en->set.lock);
+
+ ip_vs_dest_set_insert(&en->set, dest, false);
ip_vs_lblcr_hash(tbl, en);
+ return en;
}
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest, true);
return en;
}
@@ -405,40 +395,54 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
int i;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
- /* No locking required, only called during cleanup. */
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
+static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblcr_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
- for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
- now))
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_after(en->lastuse +
+ sysctl_lblcr_expiration(svc), now))
continue;
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -462,7 +466,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -480,11 +485,11 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
- for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
@@ -492,7 +497,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -510,11 +515,10 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblcr_table for this service
*/
- tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
return -ENOMEM;
- }
+
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
"current service\n", sizeof(*tbl));
@@ -522,12 +526,13 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Initialize the hash buckets
*/
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -540,7 +545,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
@@ -548,14 +553,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblcr_flush(tbl);
+ ip_vs_lblcr_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -566,12 +569,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
int loh, doh;
/*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
+ * We use the following formula to estimate the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
@@ -582,14 +580,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -599,14 +596,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -634,7 +630,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -649,65 +645,56 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
- struct ip_vs_iphdr iph;
- struct ip_vs_dest *dest = NULL;
+ struct ip_vs_dest *dest;
struct ip_vs_lblcr_entry *en;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
- en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
+ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
if (en) {
- /* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
/* Get the least loaded destination */
- read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
- read_unlock(&en->set.lock);
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
- time_after(jiffies, en->set.lastmod +
- sysctl_ip_vs_lblcr_expiration)) {
- struct ip_vs_dest *m;
-
- write_lock(&en->set.lock);
- m = ip_vs_dest_set_max(&en->set);
- if (m)
- ip_vs_dest_set_erase(&en->set, m);
- write_unlock(&en->set.lock);
+ time_after(jiffies, en->set.lastmod +
+ sysctl_lblcr_expiration(svc))) {
+ spin_lock_bh(&svc->sched_lock);
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
+
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
}
/* If the destination is not overloaded, use it */
- if (dest && !is_overloaded(dest, svc)) {
- read_unlock(&svc->sched_lock);
+ if (dest && !is_overloaded(dest, svc))
goto out;
- }
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
if (!dest) {
- IP_VS_ERR_RL("LBLCR: no destination available\n");
- read_unlock(&svc->sched_lock);
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
/* Update our cache entry */
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
- }
- read_unlock(&svc->sched_lock);
-
- if (dest)
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
goto out;
+ }
/* No cache entry, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
@@ -717,13 +704,14 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblcr_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph->daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
return dest;
@@ -744,23 +732,84 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
.schedule = ip_vs_lblcr_schedule,
};
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblcr_ctl_table[0].procname = NULL;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+ .init = __ip_vs_lblcr_init,
+ .exit = __ip_vs_lblcr_exit,
+};
static int __init ip_vs_lblcr_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
return ret;
}
-
static void __exit ip_vs_lblcr_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ rcu_barrier();
}
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 4f69db1fac5..2bdcb1cf212 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -22,27 +22,12 @@
#include <net/ip_vs.h>
-
-static inline unsigned int
-ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
/*
* Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
@@ -58,11 +43,11 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* served, but no new connection is assigned to the server.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
- doh = ip_vs_lc_dest_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest);
if (!least || doh < loh) {
least = dest;
loh = doh;
@@ -70,7 +55,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least)
- IP_VS_ERR_RL("LC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
else
IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
"inactconns %d\n",
@@ -100,6 +85,7 @@ static int __init ip_vs_lc_init(void)
static void __exit ip_vs_lc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_lc_init);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 4680647cd45..5882bbfd198 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -19,8 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*
* Authors:
@@ -63,6 +62,7 @@
#include <net/ip_vs.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_zones.h>
@@ -82,7 +82,7 @@ void
ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
{
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
struct nf_conntrack_tuple new_tuple;
if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
@@ -97,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return;
+ /* Applications may adjust TCP seqs */
+ if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP &&
+ !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct))
+ return;
+
/*
* The connection is not yet in the hashtable, so we update it.
* CIP->VIP will remain the same, so leave the tuple in
@@ -127,7 +132,7 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
nf_conntrack_alter_reply(ct, &new_tuple);
}
-int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+int ip_vs_confirm_conntrack(struct sk_buff *skb)
{
return nf_conntrack_confirm(skb);
}
@@ -141,6 +146,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct nf_conntrack_tuple *orig, new_reply;
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
if (exp->tuple.src.l3num != PF_INET)
return;
@@ -155,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
/* RS->CLIENT */
orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
&orig->src.u3, orig->src.u.tcp.port,
&orig->dst.u3, orig->dst.u.tcp.port, &p);
cp = ip_vs_conn_out_get(&p);
@@ -268,7 +274,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
" for conn " FMT_CONN "\n",
__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
- h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
/* Show what happens instead of calling nf_ct_kill() */
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index c413e183082..961a6de9bb2 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -40,7 +40,7 @@
#include <net/ip_vs.h>
-static inline unsigned int
+static inline int
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
{
/*
@@ -55,10 +55,11 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
- unsigned int loh = 0, doh;
+ int loh = 0, doh;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
@@ -75,7 +76,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
!atomic_read(&dest->weight))
@@ -91,15 +92,15 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least ||
- (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))) {
+ ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))) {
least = dest;
loh = doh;
}
}
if (!least) {
- IP_VS_ERR_RL("NQ: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
@@ -133,6 +134,7 @@ static int __init ip_vs_nq_init(void)
static void __exit ip_vs_nq_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_nq_init);
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 3414af70ee1..1a82b29ce8e 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -13,33 +13,19 @@
/* IPVS pe list */
static LIST_HEAD(ip_vs_pe);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_pe_lock);
-
-/* Bind a service with a pe */
-void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
-{
- svc->pe = pe;
-}
-
-/* Unbind a service from its pe */
-void ip_vs_unbind_pe(struct ip_vs_service *svc)
-{
- svc->pe = NULL;
-}
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
/* Get pe in the pe list by name */
-static struct ip_vs_pe *
-ip_vs_pe_getbyname(const char *pe_name)
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
{
struct ip_vs_pe *pe;
- IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
- spin_lock_bh(&ip_vs_pe_lock);
-
- list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
/* Test and get the modules atomically */
if (pe->module &&
!try_module_get(pe->module)) {
@@ -48,40 +34,34 @@ ip_vs_pe_getbyname(const char *pe_name)
}
if (strcmp(pe_name, pe->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_pe_lock);
+ rcu_read_unlock();
return pe;
}
if (pe->module)
module_put(pe->module);
}
+ rcu_read_unlock();
- spin_unlock_bh(&ip_vs_pe_lock);
return NULL;
}
/* Lookup pe and try to load it if it doesn't exist */
-struct ip_vs_pe *ip_vs_pe_get(const char *name)
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
{
struct ip_vs_pe *pe;
/* Search for the pe by name */
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
/* If pe not found, load the module and search again */
if (!pe) {
request_module("ip_vs_pe_%s", name);
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
}
return pe;
}
-void ip_vs_pe_put(struct ip_vs_pe *pe)
-{
- if (pe && pe->module)
- module_put(pe->module);
-}
-
/* Register a pe in the pe list */
int register_ip_vs_pe(struct ip_vs_pe *pe)
{
@@ -90,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_pe_lock);
-
- if (!list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- ip_vs_use_count_dec();
- pr_err("%s(): [%s] pe already linked\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
* in the pe list.
*/
list_for_each_entry(tmp, &ip_vs_pe, n_list) {
if (strcmp(tmp->name, pe->name) == 0) {
- spin_unlock_bh(&ip_vs_pe_lock);
+ mutex_unlock(&ip_vs_pe_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] pe already existed "
"in the system\n", __func__, pe->name);
@@ -113,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
}
}
/* Add it into the d-linked pe list */
- list_add(&pe->n_list, &ip_vs_pe);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
pr_info("[%s] pe registered.\n", pe->name);
@@ -125,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);
/* Unregister a pe from the pe list */
int unregister_ip_vs_pe(struct ip_vs_pe *pe)
{
- spin_lock_bh(&ip_vs_pe_lock);
- if (list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- pr_err("%s(): [%s] pe is not in the list. failed\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Remove it from the d-linked pe list */
- list_del(&pe->n_list);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index b8b4e9620f3..bed5f704252 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
const char *callid, size_t callid_len,
int *idx)
{
- size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+ size_t max_len = 64;
+ size_t len = min3(max_len, callid_len, buf_len - *idx - 1);
memcpy(buf + *idx, callid, len);
buf[*idx+len] = '\0';
*idx += len + 1;
@@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff,
if (ret > 0)
break;
if (!ret)
- return 0;
+ return -EINVAL;
dataoff += *matchoff;
}
- /* Empty callid is useless */
- if (!*matchlen)
- return -EINVAL;
-
/* Too large is useless */
if (*matchlen > IP_VS_PEDATA_MAXLEN)
return -EINVAL;
@@ -71,32 +68,36 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
struct ip_vs_iphdr iph;
unsigned int dataoff, datalen, matchoff, matchlen;
const char *dptr;
+ int retc;
- ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(p->af, skb, &iph);
/* Only useful with UDP */
if (iph.protocol != IPPROTO_UDP)
return -EINVAL;
-
- /* No Data ? */
+ /* todo: IPv6 fragments:
+ * I think this only should be done for the first fragment. /HS
+ */
dataoff = iph.len + sizeof(struct udphdr);
+
if (dataoff >= skb->len)
return -EINVAL;
-
+ retc = skb_linearize(skb);
+ if (retc < 0)
+ return retc;
dptr = skb->data + dataoff;
datalen = skb->len - dataoff;
if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
return -EINVAL;
- p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
- if (!p->pe_data)
- return -ENOMEM;
-
/* N.B: pe_data is only set on success,
* this allows fallback to the default persistence logic on failure
*/
- memcpy(p->pe_data, dptr + matchoff, matchlen);
+ p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC);
+ if (!p->pe_data)
+ return -ENOMEM;
+
p->pe_data_len = matchlen;
return 0;
@@ -106,7 +107,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
struct ip_vs_conn *ct)
{
- bool ret = 0;
+ bool ret = false;
if (ct->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
@@ -119,7 +120,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
ct->protocol == p->protocol &&
ct->pe_data && ct->pe_data_len == p->pe_data_len &&
!memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
- ret = 1;
+ ret = true;
IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -162,6 +163,7 @@ static int __init ip_vs_sip_init(void)
static void __exit ip_vs_sip_cleanup(void)
{
unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
}
module_init(ip_vs_sip_init);
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index c5399839087..939f7fbe9b4 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -25,7 +25,6 @@
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
-#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
@@ -49,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
*/
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
@@ -60,6 +59,37 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
return 0;
}
+/*
+ * register an ipvs protocols netns related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+ struct ip_vs_proto_data *pd =
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
+
+ if (!pd)
+ return -ENOMEM;
+
+ pd->pp = pp; /* For speed issues */
+ pd->next = ipvs->proto_data_table[hash];
+ ipvs->proto_data_table[hash] = pd;
+ atomic_set(&pd->appcnt, 0); /* Init app counter */
+
+ if (pp->init_netns != NULL) {
+ int ret = pp->init_netns(net, pd);
+ if (ret) {
+ /* unlink an free proto data */
+ ipvs->proto_data_table[hash] = pd->next;
+ kfree(pd);
+ return ret;
+ }
+ }
+
+ return 0;
+}
/*
* unregister an ipvs protocol
@@ -67,7 +97,7 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
{
struct ip_vs_protocol **pp_p;
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp_p = &ip_vs_proto_table[hash];
for (; *pp_p; pp_p = &(*pp_p)->next) {
@@ -82,6 +112,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
return -ESRCH;
}
+/*
+ * unregister an ipvs protocols netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data **pd_p;
+ unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+ pd_p = &ipvs->proto_data_table[hash];
+ for (; *pd_p; pd_p = &(*pd_p)->next) {
+ if (*pd_p == pd) {
+ *pd_p = pd->next;
+ if (pd->pp->exit_netns != NULL)
+ pd->pp->exit_netns(net, pd);
+ kfree(pd);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
/*
* get ip_vs_protocol object by its proto.
@@ -89,7 +142,7 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
{
struct ip_vs_protocol *pp;
- unsigned hash = IP_VS_PROTO_HASH(proto);
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
if (pp->protocol == proto)
@@ -100,19 +153,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
}
EXPORT_SYMBOL(ip_vs_proto_get);
+/*
+ * get ip_vs_protocol object data by netns and proto
+ */
+static struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+ struct ip_vs_proto_data *pd;
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
+
+ for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+ if (pd->pp->protocol == proto)
+ return pd;
+ }
+
+ return NULL;
+}
+
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ return __ipvs_proto_data_get(ipvs, proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
/*
* Propagate event for state change to all protocols
*/
-void ip_vs_protocol_timeout_change(int flags)
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
{
- struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
int i;
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
- for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
- if (pp->timeout_change)
- pp->timeout_change(pp, flags);
+ for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) {
+ if (pd->pp->timeout_change)
+ pd->pp->timeout_change(pd, flags);
}
}
}
@@ -121,7 +199,7 @@ void ip_vs_protocol_timeout_change(int flags)
int *
ip_vs_create_timeout_table(int *table, int size)
{
- return kmemdup(table, size, GFP_ATOMIC);
+ return kmemdup(table, size, GFP_KERNEL);
}
@@ -202,17 +280,17 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
if (ih == NULL)
sprintf(buf, "TRUNCATED");
else if (ih->nexthdr == IPPROTO_FRAGMENT)
- sprintf(buf, "%pI6->%pI6 frag", &ih->saddr, &ih->daddr);
+ sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr);
else {
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
sizeof(_ports), _ports);
if (pptr == NULL)
- sprintf(buf, "TRUNCATED %pI6->%pI6",
+ sprintf(buf, "TRUNCATED %pI6c->%pI6c",
&ih->saddr, &ih->daddr);
else
- sprintf(buf, "%pI6:%u->%pI6:%u",
+ sprintf(buf, "%pI6c:%u->%pI6c:%u",
&ih->saddr, ntohs(pptr[0]),
&ih->daddr, ntohs(pptr[1]));
}
@@ -236,6 +314,54 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
+/*
+ * per network name-space init
+ */
+int __net_init ip_vs_protocol_net_init(struct net *net)
+{
+ int i, ret;
+ static struct ip_vs_protocol *protos[] = {
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ &ip_vs_protocol_tcp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ &ip_vs_protocol_udp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ &ip_vs_protocol_sctp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ &ip_vs_protocol_ah,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ &ip_vs_protocol_esp,
+#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(protos); i++) {
+ ret = register_ip_vs_proto_netns(net, protos[i]);
+ if (ret < 0)
+ goto cleanup;
+ }
+ return 0;
+
+cleanup:
+ ip_vs_protocol_net_cleanup(net);
+ return ret;
+}
+
+void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ /* unregister all the ipvs proto data for this netns */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pd = ipvs->proto_data_table[i]) != NULL)
+ unregister_ip_vs_proto_netns(net, pd);
+ }
+}
int __init ip_vs_protocol_init(void)
{
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 3a0461117d3..5de3dd312c0 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,28 +41,30 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
static void
-ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
- int inverse, struct ip_vs_conn_param *p)
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+ const struct ip_vs_iphdr *iph, int inverse,
+ struct ip_vs_conn_param *p)
{
if (likely(!inverse))
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
&iph->saddr, htons(PORT_ISAKMP),
&iph->daddr, htons(PORT_ISAKMP), p);
else
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
&iph->daddr, htons(PORT_ISAKMP),
&iph->saddr, htons(PORT_ISAKMP), p);
}
static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph, unsigned int proto_off,
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
@@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -83,21 +85,19 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off,
- int inverse)
+ const struct ip_vs_iphdr *iph, int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -107,8 +107,9 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
/*
* AH/ESP is only related traffic. Pass the packet to IP stack.
@@ -117,26 +118,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
#ifdef CONFIG_IP_VS_PROTO_AH
struct ip_vs_protocol ip_vs_protocol_ah = {
.name = "AH",
.protocol = IPPROTO_AH,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
@@ -149,7 +138,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.app_conn_bind = NULL,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
- .set_state_timeout = NULL,
};
#endif
@@ -159,8 +147,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.protocol = IPPROTO_ESP,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bcd342..2f7ea756404 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,36 +9,43 @@
#include <net/ip_vs.h>
static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
+ struct net *net;
struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
- struct ip_vs_iphdr iph;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
- sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
- if (sh == NULL)
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
- sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t),
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
sizeof(_schunkh), &_schunkh);
- if (sch == NULL)
+ if (sch == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
- if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
- &iph.daddr, sh->dest))) {
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ rcu_read_lock();
+ if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -46,101 +53,119 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
-
+ rcu_read_unlock();
+ /* NF_ACCEPT */
return 1;
}
+static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
+ unsigned int sctphoff)
+{
+ sctph->checksum = sctp_compute_cksum(skb, sctphoff);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
static int
-sctp_snat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
sctp_sctphdr_t *sctph;
- unsigned int sctphoff;
- struct sk_buff *iter;
- __be32 crc32;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- sctphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- sctphoff = ip_hdrlen(skb);
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->source = cp->vport;
- /* Calculate the checksum */
- crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- skb_walk_frags(skb, iter)
- crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
- crc32);
- crc32 = sctp_end_cksum(crc32);
- sctph->checksum = crc32;
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
static int
-sctp_dnat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
sctp_sctphdr_t *sctph;
- unsigned int sctphoff;
- struct sk_buff *iter;
- __be32 crc32;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- sctphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- sctphoff = ip_hdrlen(skb);
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_in(cp, skb))
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->dest = cp->dport;
- /* Calculate the checksum */
- crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- skb_walk_frags(skb, iter)
- crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
- crc32);
- crc32 = sctp_end_cksum(crc32);
- sctph->checksum = crc32;
+ /* Only update csum if we really have to */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
@@ -150,10 +175,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
unsigned int sctphoff;
struct sctphdr *sh, _sctph;
- struct sk_buff *iter;
- __le32 cmp;
- __le32 val;
- __u32 tmp;
+ __le32 cmp, val;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
@@ -167,13 +189,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 0;
cmp = sh->checksum;
-
- tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
- skb_walk_frags(skb, iter)
- tmp = sctp_update_cksum((__u8 *) iter->data,
- skb_headlen(iter), tmp);
-
- val = sctp_end_cksum(tmp);
+ val = sctp_compute_cksum(skb, sctphoff);
if (val != cmp) {
/* CRC failure, dump it. */
@@ -184,710 +200,159 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 1;
}
-struct ipvs_sctp_nextstate {
- int next_state;
-};
enum ipvs_sctp_event_t {
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_SER,
- IP_VS_SCTP_EVE_INIT_CLI,
- IP_VS_SCTP_EVE_INIT_SER,
- IP_VS_SCTP_EVE_INIT_ACK_CLI,
- IP_VS_SCTP_EVE_INIT_ACK_SER,
- IP_VS_SCTP_EVE_COOKIE_ECHO_CLI,
- IP_VS_SCTP_EVE_COOKIE_ECHO_SER,
- IP_VS_SCTP_EVE_COOKIE_ACK_CLI,
- IP_VS_SCTP_EVE_COOKIE_ACK_SER,
- IP_VS_SCTP_EVE_ABORT_CLI,
- IP_VS_SCTP_EVE__ABORT_SER,
- IP_VS_SCTP_EVE_SHUT_CLI,
- IP_VS_SCTP_EVE_SHUT_SER,
- IP_VS_SCTP_EVE_SHUT_ACK_CLI,
- IP_VS_SCTP_EVE_SHUT_ACK_SER,
- IP_VS_SCTP_EVE_SHUT_COM_CLI,
- IP_VS_SCTP_EVE_SHUT_COM_SER,
- IP_VS_SCTP_EVE_LAST
+ IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */
+ IP_VS_SCTP_INIT,
+ IP_VS_SCTP_INIT_ACK,
+ IP_VS_SCTP_COOKIE_ECHO,
+ IP_VS_SCTP_COOKIE_ACK,
+ IP_VS_SCTP_SHUTDOWN,
+ IP_VS_SCTP_SHUTDOWN_ACK,
+ IP_VS_SCTP_SHUTDOWN_COMPLETE,
+ IP_VS_SCTP_ERROR,
+ IP_VS_SCTP_ABORT,
+ IP_VS_SCTP_EVENT_LAST
};
-static enum ipvs_sctp_event_t sctp_events[255] = {
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_INIT_CLI,
- IP_VS_SCTP_EVE_INIT_ACK_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_ABORT_CLI,
- IP_VS_SCTP_EVE_SHUT_CLI,
- IP_VS_SCTP_EVE_SHUT_ACK_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_COOKIE_ECHO_CLI,
- IP_VS_SCTP_EVE_COOKIE_ACK_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_SHUT_COM_CLI,
+/* RFC 2960, 3.2 Chunk Field Descriptions */
+static __u8 sctp_events[] = {
+ [SCTP_CID_DATA] = IP_VS_SCTP_DATA,
+ [SCTP_CID_INIT] = IP_VS_SCTP_INIT,
+ [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK,
+ [SCTP_CID_SACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT,
+ [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN,
+ [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK,
+ [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR,
+ [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO,
+ [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK,
+ [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA,
+ [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE,
};
-static struct ipvs_sctp_nextstate
- sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = {
- /*
- * STATE : IP_VS_SCTP_S_NONE
- */
- /*next state *//*event */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ },
- },
- /*
- * STATE : IP_VS_SCTP_S_INIT_CLI
- * Cient sent INIT and is waiting for reply from server(In ECHO_WAIT)
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ECHO_CLI */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_ECHO_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_INIT_SER
- * Server sent INIT and waiting for INIT ACK from the client
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_INIT_ACK_CLI
- * Client sent INIT ACK and waiting for ECHO from the server
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK has been resent by the client, let us stay is in
- * the same state
- */
- {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK sent by the server, close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * ECHO by client, it should not happen, close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO by server, this is what we are expecting, move to ECHO_SER
- */
- {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, it should not happen, close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * Unexpected COOKIE ACK from server, staty in the same state
- */
- {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_INIT_ACK_SER
- * Server sent INIT ACK and waiting for ECHO from the client
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * Unexpected INIT_ACK by the client, let us close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK resent by the server, let us move to same state
- */
- {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client send the ECHO, this is what we are expecting,
- * move to ECHO_CLI
- */
- {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO received from the server, Not sure what to do,
- * let us close it
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, let us stay in the same state
- */
- {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * COOKIE ACK from server, hmm... this should not happen, lets close
- * the connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_ECHO_CLI
- * Cient sent ECHO and waiting COOKEI ACK from the Server
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK has been by the client, let us close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client resent the ECHO, let us stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO received from the server, Not sure what to do,
- * let us close it
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, this shoud not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * COOKIE ACK from server, this is what we are awaiting,lets move to
- * ESTABLISHED.
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_ECHO_SER
- * Server sent ECHO and waiting COOKEI ACK from the client
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK has been by the server, let us close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent the ECHO, not sure what to do, let's close the
- * connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO resent by the server, stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, this is what we are expecting, let's move
- * to ESTABLISHED.
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * COOKIE ACK from server, this should not happen, lets close the
- * connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_ESTABLISHED
- * Association established
- */
- {{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this should not happen, let's close
- * the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_SHUT_CLI
- * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server
- */
- /*
- * We recieved the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
-
- {{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN resent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this should not happen, let's close
- * the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server sent SHUTDOWN ACK, this is what we are expecting, let's move
- * to SHUDOWN_ACK_SER
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_SHUT_SER
- * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client
- */
- /*
- * We recieved the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
-
- {{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN resent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN resent from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this is what we are expecting, let's
- * move to SHUT_ACK_CLI
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server sent SHUTDOWN ACK, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
-
- /*
- * State : IP_VS_SCTP_S_SHUT_ACK_CLI
- * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server
- */
- /*
- * We recieved the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
-
- {{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN sent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN sent from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client resent SHUDTDOWN_ACK, let's stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server sent SHUTDOWN ACK, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- /*
- * SHUTDOWN COMPLETE from server this is what we are expecting.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
-
- /*
- * State : IP_VS_SCTP_S_SHUT_ACK_SER
- * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client
- */
- /*
- * We recieved the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
+/* SCTP States:
+ * See RFC 2960, 4. SCTP Association State Diagram
+ *
+ * New states (not in diagram):
+ * - INIT1 state: use shorter timeout for dropped INIT packets
+ * - REJECTED state: use shorter timeout if INIT is rejected with ABORT
+ * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging
+ *
+ * The states are as seen in real server. In the diagram, INIT1, INIT,
+ * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state.
+ *
+ * States as per packets from client (C) and server (S):
+ *
+ * Setup of client connection:
+ * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK
+ *
+ * Setup of server connection:
+ * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK
+ */
- {{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN sent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN sent from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this should not happen let's close
- * the connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server resent SHUTDOWN ACK, stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this what we are expecting, let's close
- * the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- /*
- * SHUTDOWN COMPLETE from server this should not happen.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_CLOSED
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- }
+#define sNO IP_VS_SCTP_S_NONE
+#define sI1 IP_VS_SCTP_S_INIT1
+#define sIN IP_VS_SCTP_S_INIT
+#define sCS IP_VS_SCTP_S_COOKIE_SENT
+#define sCR IP_VS_SCTP_S_COOKIE_REPLIED
+#define sCW IP_VS_SCTP_S_COOKIE_WAIT
+#define sCO IP_VS_SCTP_S_COOKIE
+#define sCE IP_VS_SCTP_S_COOKIE_ECHOED
+#define sES IP_VS_SCTP_S_ESTABLISHED
+#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT
+#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED
+#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
+#define sRJ IP_VS_SCTP_S_REJECTED
+#define sCL IP_VS_SCTP_S_CLOSED
+
+static const __u8 sctp_states
+ [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = {
+ { /* INPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* OUTPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW},
+/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL},
+/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* INPUT-ONLY */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
};
-/*
- * Timeout table[state]
- */
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
- [IP_VS_SCTP_S_NONE] = 2 * HZ,
- [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_INIT_ACK_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_INIT_ACK_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_ECHO_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_ECHO_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_ACK_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_ACK_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_CLOSED] = 10 * HZ,
- [IP_VS_SCTP_S_LAST] = 2 * HZ,
+#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ)
+
+/* Timeout table[state] */
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+ [IP_VS_SCTP_S_NONE] = 2 * HZ,
+ [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_LAST] = 2 * HZ,
};
static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = {
- [IP_VS_SCTP_S_NONE] = "NONE",
- [IP_VS_SCTP_S_INIT_CLI] = "INIT_CLI",
- [IP_VS_SCTP_S_INIT_SER] = "INIT_SER",
- [IP_VS_SCTP_S_INIT_ACK_CLI] = "INIT_ACK_CLI",
- [IP_VS_SCTP_S_INIT_ACK_SER] = "INIT_ACK_SER",
- [IP_VS_SCTP_S_ECHO_CLI] = "COOKIE_ECHO_CLI",
- [IP_VS_SCTP_S_ECHO_SER] = "COOKIE_ECHO_SER",
- [IP_VS_SCTP_S_ESTABLISHED] = "ESTABISHED",
- [IP_VS_SCTP_S_SHUT_CLI] = "SHUTDOWN_CLI",
- [IP_VS_SCTP_S_SHUT_SER] = "SHUTDOWN_SER",
- [IP_VS_SCTP_S_SHUT_ACK_CLI] = "SHUTDOWN_ACK_CLI",
- [IP_VS_SCTP_S_SHUT_ACK_SER] = "SHUTDOWN_ACK_SER",
- [IP_VS_SCTP_S_CLOSED] = "CLOSED",
- [IP_VS_SCTP_S_LAST] = "BUG!"
+ [IP_VS_SCTP_S_NONE] = "NONE",
+ [IP_VS_SCTP_S_INIT1] = "INIT1",
+ [IP_VS_SCTP_S_INIT] = "INIT",
+ [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT",
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED",
+ [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT",
+ [IP_VS_SCTP_S_COOKIE] = "COOKIE",
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED",
+ [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT",
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED",
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT",
+ [IP_VS_SCTP_S_REJECTED] = "REJECTED",
+ [IP_VS_SCTP_S_CLOSED] = "CLOSED",
+ [IP_VS_SCTP_S_LAST] = "BUG!",
};
@@ -900,26 +365,14 @@ static const char *sctp_state_name(int state)
return "?";
}
-static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-}
-
-static int
-sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-
-return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
- sctp_state_name_table, sname, to);
-}
-
-static inline int
-set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+static inline void
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, const struct sk_buff *skb)
{
sctp_chunkhdr_t _sctpch, *sch;
unsigned char chunk_type;
int event, next_state;
- int ihl;
+ int ihl, cofs;
#ifdef CONFIG_IP_VS_IPV6
ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -927,10 +380,10 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
ihl = ip_hdrlen(skb);
#endif
- sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t),
- sizeof(_sctpch), &_sctpch);
+ cofs = ihl + sizeof(sctp_sctphdr_t);
+ sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
if (sch == NULL)
- return 0;
+ return;
chunk_type = sch->type;
/*
@@ -946,32 +399,37 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
*/
if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
(sch->type == SCTP_CID_COOKIE_ACK)) {
- sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) +
- sch->length), sizeof(_sctpch), &_sctpch);
- if (sch) {
- if (sch->type == SCTP_CID_ABORT)
+ int clen = ntohs(sch->length);
+
+ if (clen >= sizeof(sctp_chunkhdr_t)) {
+ sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+ sizeof(_sctpch), &_sctpch);
+ if (sch && sch->type == SCTP_CID_ABORT)
chunk_type = sch->type;
}
}
- event = sctp_events[chunk_type];
+ event = (chunk_type < sizeof(sctp_events)) ?
+ sctp_events[chunk_type] : IP_VS_SCTP_DATA;
- /*
- * If the direction is IP_VS_DIR_OUTPUT, this event is from server
- */
- if (direction == IP_VS_DIR_OUTPUT)
- event++;
- /*
- * get next state
+ /* Update direction to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
*/
- next_state = sctp_states_table[cp->state][event].next_state;
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (direction == IP_VS_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ direction = IP_VS_DIR_INPUT_ONLY;
+ }
+
+ next_state = sctp_states[direction][event][cp->state];
if (next_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG_BUF(8, "%s %s %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((direction == IP_VS_DIR_OUTPUT) ?
"output " : "input "),
IP_VS_DBG_ADDR(cp->af, &cp->daddr),
@@ -995,75 +453,62 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
}
-
- cp->timeout = pp->timeout_table[cp->state = next_state];
-
- return 1;
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = next_state];
+ else /* What to do ? */
+ cp->timeout = sctp_timeouts[cp->state = next_state];
}
-static int
+static void
sctp_state_transition(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb, struct ip_vs_protocol *pp)
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
- int ret = 0;
-
- spin_lock(&cp->lock);
- ret = set_sctp_state(pp, cp, direction, skb);
- spin_unlock(&cp->lock);
-
- return ret;
+ spin_lock_bh(&cp->lock);
+ set_sctp_state(pd, cp, direction, skb);
+ spin_unlock_bh(&cp->lock);
}
-/*
- * Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS 4
-#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
static inline __u16 sctp_app_hashkey(__be16 port)
{
return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
& SCTP_APP_TAB_MASK;
}
-static int sctp_register_app(struct ip_vs_app *inc)
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
hash = sctp_app_hashkey(port);
- spin_lock_bh(&sctp_app_lock);
- list_for_each_entry(i, &sctp_apps[hash], p_list) {
+ list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &sctp_apps[hash]);
- atomic_inc(&ip_vs_protocol_sctp.appcnt);
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&sctp_app_lock);
return ret;
}
-static void sctp_unregister_app(struct ip_vs_app *inc)
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&sctp_app_lock);
- atomic_dec(&ip_vs_protocol_sctp.appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&sctp_app_lock);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -1074,12 +519,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&sctp_app_lock);
- list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&sctp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1095,43 +540,52 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&sctp_app_lock);
+ rcu_read_unlock();
out:
return result;
}
-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(sctp_apps);
- pp->timeout_table = sctp_timeouts;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+ sizeof(sctp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
}
-
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
-
+ kfree(pd->timeout_table);
}
struct ip_vs_protocol ip_vs_protocol_sctp = {
- .name = "SCTP",
- .protocol = IPPROTO_SCTP,
- .num_states = IP_VS_SCTP_S_LAST,
- .dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_sctp_init,
- .exit = ip_vs_sctp_exit,
- .register_app = sctp_register_app,
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
.unregister_app = sctp_unregister_app,
- .conn_schedule = sctp_conn_schedule,
- .conn_in_get = ip_vs_conn_in_get_proto,
- .conn_out_get = ip_vs_conn_out_get_proto,
- .snat_handler = sctp_snat_handler,
- .dnat_handler = sctp_dnat_handler,
- .csum_check = sctp_csum_check,
- .state_name = sctp_state_name,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
.state_transition = sctp_state_transition,
- .app_conn_bind = sctp_app_conn_bind,
- .debug_packet = ip_vs_tcpudp_debug_packet,
- .timeout_change = sctp_timeout_change,
- .set_state_timeout = sctp_set_state_timeout,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200e214..e3a697234a9 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
*
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * tcp_timeouts table has copy per netns in a hash table per
+ * protocol ip_vs_proto_data and is handled by netns
*/
#define KMSG_COMPONENT "IPVS"
@@ -28,33 +32,35 @@
#include <net/ip_vs.h>
static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
+ struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
- struct ip_vs_iphdr iph;
+ struct netns_ipvs *ipvs;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
- th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
-
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
- if (th->syn &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
- th->dest))) {
+ rcu_read_lock();
+ if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -63,13 +69,18 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
+ /* NF_ACCEPT */
return 1;
}
@@ -117,20 +128,18 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
static int
-tcp_snat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct tcphdr *tcph;
- unsigned int tcphoff;
+ unsigned int tcphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- tcphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- tcphoff = ip_hdrlen(skb);
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
@@ -197,20 +206,18 @@ tcp_snat_handler(struct sk_buff *skb,
static int
-tcp_dnat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct tcphdr *tcph;
- unsigned int tcphoff;
+ unsigned int tcphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- tcphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- tcphoff = ip_hdrlen(skb);
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
@@ -338,7 +345,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
/*
* Timeout table[state]
*/
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
@@ -396,7 +403,7 @@ static struct tcp_states_t tcp_states [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT */
@@ -410,7 +417,7 @@ static struct tcp_states_t tcp_states [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
@@ -419,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
/* OUTPUT */
@@ -433,14 +440,11 @@ static struct tcp_states_t tcp_states_dos [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
{
int on = (flags & 1); /* secure_tcp */
@@ -450,14 +454,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
** for most if not for all of the applications. Something
** like "capabilities" (flags) for each object.
*/
- tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
- tcp_state_name_table, sname, to);
+ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
}
static inline int tcp_state_idx(struct tcphdr *th)
@@ -474,7 +471,7 @@ static inline int tcp_state_idx(struct tcphdr *th)
}
static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, struct tcphdr *th)
{
int state_idx;
@@ -497,7 +494,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
goto tcp_state_out;
}
- new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
@@ -505,7 +503,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
th->syn ? 'S' : '.',
@@ -535,17 +533,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
- cp->timeout = pp->timeout_table[cp->state = new_state];
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
}
-
/*
* Handle state transitions
*/
-static int
+static void
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
struct tcphdr _tcph, *th;
@@ -557,26 +557,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
if (th == NULL)
- return 0;
+ return;
- spin_lock(&cp->lock);
- set_tcp_state(pp, cp, direction, th);
- spin_unlock(&cp->lock);
-
- return 1;
+ spin_lock_bh(&cp->lock);
+ set_tcp_state(pd, cp, direction, th);
+ spin_unlock_bh(&cp->lock);
}
-
-/*
- * Hash table for TCP application incarnations
- */
-#define TCP_APP_TAB_BITS 4
-#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
-#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
static inline __u16 tcp_app_hashkey(__be16 port)
{
return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -584,44 +571,45 @@ static inline __u16 tcp_app_hashkey(__be16 port)
}
-static int tcp_register_app(struct ip_vs_app *inc)
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
hash = tcp_app_hashkey(port);
- spin_lock_bh(&tcp_app_lock);
- list_for_each_entry(i, &tcp_apps[hash], p_list) {
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &tcp_apps[hash]);
- atomic_inc(&ip_vs_protocol_tcp.appcnt);
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&tcp_app_lock);
return ret;
}
static void
-tcp_unregister_app(struct ip_vs_app *inc)
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&tcp_app_lock);
- atomic_dec(&ip_vs_protocol_tcp.appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&tcp_app_lock);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
}
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -633,12 +621,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&tcp_app_lock);
- list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&tcp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -655,7 +643,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&tcp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -665,24 +653,37 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/*
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
*/
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
- spin_lock(&cp->lock);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
- cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
- spin_unlock(&cp->lock);
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
+ spin_unlock_bh(&cp->lock);
}
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(tcp_apps);
- pp->timeout_table = tcp_timeouts;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+ sizeof(tcp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ pd->tcp_state_table = tcp_states;
+ return 0;
}
-
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -691,9 +692,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_tcp_init,
- .exit = ip_vs_tcp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
@@ -707,5 +709,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
- .set_state_timeout = tcp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a06bb0..b62a3c0ff9b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,7 +9,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
*
*/
@@ -28,32 +29,33 @@
#include <net/ip6_checksum.h>
static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
+ struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
- struct ip_vs_iphdr iph;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
- uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+ /* IPv6 fragments, only first fragment will hit this */
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
if (uh == NULL) {
*verdict = NF_DROP;
return 0;
}
-
- svc = ip_vs_service_get(af, skb->mark, iph.protocol,
- &iph.daddr, uh->dest);
+ net = skb_net(skb);
+ rcu_read_lock();
+ svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, uh->dest);
if (svc) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -62,13 +64,18 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
+ /* NF_ACCEPT */
return 1;
}
@@ -117,20 +124,18 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
static int
-udp_snat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct udphdr *udph;
- unsigned int udphoff;
+ unsigned int udphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- udphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- udphoff = ip_hdrlen(skb);
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
@@ -202,20 +207,18 @@ udp_snat_handler(struct sk_buff *skb,
static int
-udp_dnat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct udphdr *udph;
- unsigned int udphoff;
+ unsigned int udphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- udphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- udphoff = ip_hdrlen(skb);
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
@@ -338,19 +341,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 1;
}
-
-/*
- * Note: the caller guarantees that only one of register_app,
- * unregister_app or app_conn_bind is called each time.
- */
-
-#define UDP_APP_TAB_BITS 4
-#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
-#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
static inline __u16 udp_app_hashkey(__be16 port)
{
return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
@@ -358,44 +348,44 @@ static inline __u16 udp_app_hashkey(__be16 port)
}
-static int udp_register_app(struct ip_vs_app *inc)
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
hash = udp_app_hashkey(port);
-
- spin_lock_bh(&udp_app_lock);
- list_for_each_entry(i, &udp_apps[hash], p_list) {
+ list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &udp_apps[hash]);
- atomic_inc(&ip_vs_protocol_udp.appcnt);
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&udp_app_lock);
return ret;
}
static void
-udp_unregister_app(struct ip_vs_app *inc)
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&udp_app_lock);
- atomic_dec(&ip_vs_protocol_udp.appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&udp_app_lock);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
}
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -407,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&udp_app_lock);
- list_for_each_entry(inc, &udp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&udp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -429,14 +419,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&udp_app_lock);
+ rcu_read_unlock();
out:
return result;
}
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
[IP_VS_UDP_S_LAST] = 2*HZ,
};
@@ -446,14 +436,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_LAST] = "BUG!",
};
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
- udp_state_name_table, sname, to);
-}
-
static const char * udp_state_name(int state)
{
if (state >= IP_VS_UDP_S_LAST)
@@ -461,23 +443,34 @@ static const char * udp_state_name(int state)
return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
}
-static int
+static void
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
- return 1;
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
}
-static void udp_init(struct ip_vs_protocol *pp)
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(udp_apps);
- pp->timeout_table = udp_timeouts;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
}
-static void udp_exit(struct ip_vs_protocol *pp)
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -486,8 +479,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.protocol = IPPROTO_UDP,
.num_states = IP_VS_UDP_S_LAST,
.dont_defrag = 0,
- .init = udp_init,
- .exit = udp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
.conn_schedule = udp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
@@ -501,5 +496,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.app_conn_bind = udp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL,
- .set_state_timeout = udp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index e210f37d8ea..176b87c35e3 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
{
- svc->sched_data = &svc->destinations;
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -46,38 +55,44 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
* Round-Robin Scheduling
*/
static struct ip_vs_dest *
-ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
- struct list_head *p, *q;
- struct ip_vs_dest *dest;
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- write_lock(&svc->sched_lock);
- p = (struct list_head *)svc->sched_data;
- p = p->next;
- q = p;
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
do {
- /* skip list head */
- if (q == &svc->destinations) {
- q = q->next;
- continue;
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop;
}
-
- dest = list_entry(q, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0)
- /* HIT */
- goto out;
- q = q->next;
- } while (q != p);
- write_unlock(&svc->sched_lock);
- IP_VS_ERR_RL("RR: no destination available\n");
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
out:
- svc->sched_data = q;
- write_unlock(&svc->sched_lock);
+ svc->sched_data = &dest->n_list;
+ spin_unlock_bh(&svc->sched_lock);
IP_VS_DBG_BUF(6, "RR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
@@ -94,7 +109,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
.init_service = ip_vs_rr_init_svc,
- .update_service = ip_vs_rr_update_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
.schedule = ip_vs_rr_schedule,
};
@@ -106,6 +122,7 @@ static int __init ip_vs_rr_init(void)
static void __exit ip_vs_rr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_rr_init);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 076ebe00435..4dbcda6258b 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -29,13 +29,14 @@
#include <net/ip_vs.h>
+EXPORT_SYMBOL(ip_vs_scheduler_err);
/*
* IPVS scheduler list
*/
static LIST_HEAD(ip_vs_schedulers);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_sched_lock);
+/* semaphore for schedulers */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
/*
@@ -46,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
{
int ret;
- svc->scheduler = scheduler;
-
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
@@ -55,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
return ret;
}
}
-
+ rcu_assign_pointer(svc->scheduler, scheduler);
return 0;
}
@@ -63,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
/*
* Unbind a service with its scheduler
*/
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *sched)
{
- struct ip_vs_scheduler *sched = svc->scheduler;
+ struct ip_vs_scheduler *cur_sched;
- if (!sched)
- return 0;
-
- if (sched->done_service) {
- if (sched->done_service(svc) != 0) {
- pr_err("%s(): done error\n", __func__);
- return -EINVAL;
- }
- }
+ cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+ /* This check proves that old 'sched' was installed */
+ if (!cur_sched)
+ return;
- svc->scheduler = NULL;
- return 0;
+ if (sched->done_service)
+ sched->done_service(svc);
+ /* svc->scheduler can not be set to NULL */
}
@@ -91,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
@@ -105,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return sched;
}
if (sched->module)
module_put(sched->module);
}
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return NULL;
}
@@ -146,6 +142,30 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
module_put(scheduler->module);
}
+/*
+ * Common error output helper for schedulers
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
+ if (svc->fwmark) {
+ IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+ sched->name, svc->fwmark, svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+ } else if (svc->af == AF_INET6) {
+ IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+ } else {
+ IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.ip, ntohs(svc->port), msg);
+ }
+}
/*
* Register a scheduler in the scheduler list
@@ -167,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (!list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
@@ -183,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
@@ -194,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
@@ -212,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
return -EINVAL;
}
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_err("%s(): [%s] scheduler is not in the list. failed\n",
__func__, scheduler->name);
return -EINVAL;
@@ -224,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 1ab75a9dc40..e446b9fa742 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -44,7 +44,7 @@
#include <net/ip_vs.h>
-static inline unsigned int
+static inline int
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
{
/*
@@ -59,10 +59,11 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
- unsigned int loh, doh;
+ int loh, doh;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
@@ -79,7 +80,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -87,19 +88,19 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
goto nextstage;
}
}
- IP_VS_ERR_RL("SED: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -134,6 +135,7 @@ static int __init ip_vs_sed_init(void)
static void __exit ip_vs_sed_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_sed_init);
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e6cc174fbc0..cc65b2f42cd 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -30,6 +30,11 @@
* server is dead or overloaded, the load balancer can bypass the cache
* server and send requests to the original server directly.
*
+ * The weight destination attribute can be used to control the
+ * distribution of connections to the destinations in servernode. The
+ * greater the weight, the more connections the destination
+ * will receive.
+ *
*/
#define KMSG_COMPONENT "IPVS"
@@ -43,12 +48,16 @@
#include <net/ip_vs.h>
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
/*
* IPVS SH bucket
*/
struct ip_vs_sh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -61,11 +70,24 @@ struct ip_vs_sh_bucket {
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+};
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
/*
* Returns hash value for IPVS SH entry
*/
-static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, unsigned int offset)
{
__be32 addr_fold = addr->ip;
@@ -74,7 +96,8 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+ return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ IP_VS_SH_TAB_MASK;
}
@@ -82,38 +105,102 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
{
- return (tbl[ip_vs_sh_hashkey(af, addr)]).dest;
+ unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
}
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
+ struct ip_vs_dest *dest;
+
+ /* first try the dest it's supposed to go to */
+ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ dest = rcu_dereference(s->buckets[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+ /* if the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
+ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+ hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
+ dest = rcu_dereference(s->buckets[hash].dest);
+ if (!dest)
+ break;
+ if (!is_unavailable(dest))
+ return dest;
+ IP_VS_DBG_BUF(6, "SH: selected unavailable "
+ "server %s:%d (offset %d), reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ ntohs(dest->port), roffset);
+ }
+
+ return NULL;
+}
+
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
+ int d_count;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
+ d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
+ i, IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ atomic_read(&dest->weight));
+
+ /* Don't move to next dest until filling weight */
+ if (++d_count >= atomic_read(&dest->weight)) {
+ p = p->next;
+ d_count = 0;
+ }
- p = p->next;
}
b++;
}
@@ -124,16 +211,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
{
int i;
struct ip_vs_sh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -142,64 +231,84 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl;
+ struct ip_vs_sh_state *s;
/* allocate the SH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
- GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- }
- svc->sched_data = tbl;
+
+ svc->sched_data = s;
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_sh_reassign(s, svc);
return 0;
}
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
+ struct ip_vs_sh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ ip_vs_sh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ ip_vs_sh_reassign(s, svc);
return 0;
}
-/*
- * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- * consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
- return dest->flags & IP_VS_DEST_F_OVERLOAD;
+ __be16 port;
+ struct tcphdr _tcph, *th;
+ struct udphdr _udph, *uh;
+ sctp_sctphdr_t _sctph, *sh;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (unlikely(th == NULL))
+ return 0;
+ port = th->source;
+ break;
+ case IPPROTO_UDP:
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (unlikely(uh == NULL))
+ return 0;
+ port = uh->source;
+ break;
+ case IPPROTO_SCTP:
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (unlikely(sh == NULL))
+ return 0;
+ port = sh->source;
+ break;
+ default:
+ port = 0;
+ }
+
+ return port;
}
@@ -207,28 +316,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Source Hashing scheduling
*/
static struct ip_vs_dest *
-ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
- struct ip_vs_sh_bucket *tbl;
- struct ip_vs_iphdr iph;
-
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+ struct ip_vs_sh_state *s;
+ __be16 port = 0;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
- tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
- dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
- if (!dest
- || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
- || atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) {
- IP_VS_ERR_RL("SH: no destination available\n");
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+ port = ip_vs_sh_get_port(skb, iph);
+
+ s = (struct ip_vs_sh_state *) svc->sched_data;
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+ dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+ else
+ dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
@@ -247,7 +360,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
- .update_service = ip_vs_sh_update_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
@@ -261,6 +376,7 @@ static int __init ip_vs_sh_init(void)
static void __exit ip_vs_sh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ab85aedea17..db801263ee9 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -5,6 +5,18 @@
* high-performance and highly available server based on a
* cluster of servers.
*
+ * Version 1, is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handle IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* ip_vs_sync: sync connection info from master load balancer to backups
@@ -15,6 +27,8 @@
* Alexandre Cassen : Added SyncID support for incoming sync
* messages filtering.
* Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
*/
#define KMSG_COMPONENT "IPVS"
@@ -35,6 +49,8 @@
#include <linux/wait.h>
#include <linux/kernel.h>
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
#include <net/ip.h>
#include <net/sock.h>
@@ -43,11 +59,14 @@
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT 8848 /* multicast port */
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
+static struct lock_class_key __ipvs_sync_key;
/*
* IPVS sync connection entry
+ * Version 0, i.e. original version.
*/
-struct ip_vs_sync_conn {
+struct ip_vs_sync_conn_v0 {
__u8 reserved;
/* Protocol, addresses and port numbers */
@@ -71,51 +90,177 @@ struct ip_vs_sync_conn_options {
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
struct ip_vs_sync_thread_data {
+ struct net *net;
struct socket *sock;
char *buf;
+ int id;
};
-#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
/*
- The master mulitcasts messages to the backup load balancers in the
- following format.
+ The master mulitcasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | Count Conns | SyncID | Size |
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (1) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| . |
- | . |
+ ~ . ~
| . |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (n) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
*/
#define SYNC_MESG_HEADER_LEN 4
#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-struct ip_vs_sync_mesg {
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
__u8 syncid;
- __u16 size;
+ __be16 size;
/* ip_vs_sync_conn entries start here */
};
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __be16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
struct ip_vs_sync_buff {
struct list_head list;
@@ -127,70 +272,75 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
-};
-
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&ip_vs_sync_lock);
- if (list_empty(&ip_vs_sync_queue)) {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ms->sync_queue)) {
sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
} else {
- sb = list_entry(ip_vs_sync_queue.next,
- struct ip_vs_sync_buff,
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
list);
list_del(&sb->list);
+ ms->sync_queue_len--;
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
}
- spin_unlock_bh(&ip_vs_sync_lock);
+ spin_unlock_bh(&ipvs->sync_lock);
return sb;
}
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
kfree(sb);
return NULL;
}
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
+ sb->mesg->version = SYNC_PROTO_VER;
+ sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
sb->mesg->nr_conns = 0;
- sb->mesg->syncid = ip_vs_master_syncid;
- sb->mesg->size = 4;
- sb->head = (unsigned char *)sb->mesg + 4;
- sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+ sb->mesg->spare = 0;
+ sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+ sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
sb->firstuse = jiffies;
return sb;
}
@@ -201,14 +351,24 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
{
- spin_lock(&ip_vs_sync_lock);
- if (ip_vs_sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ip_vs_sync_queue);
- else
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
ip_vs_sync_buff_release(sb);
- spin_unlock(&ip_vs_sync_lock);
+ spin_unlock(&ipvs->sync_lock);
}
/*
@@ -216,47 +376,209 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&curr_sb_lock);
- if (curr_sb && (time == 0 ||
- time_before(jiffies - curr_sb->firstuse, time))) {
- sb = curr_sb;
- curr_sb = NULL;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
} else
sb = NULL;
- spin_unlock_bh(&curr_sb_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return sb;
+}
+
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
+{
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
+}
+
+/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg_v0 *mesg;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+ mesg->nr_conns = 0;
+ mesg->syncid = ipvs->master_syncid;
+ mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
+ sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+ sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->firstuse = jiffies;
return sb;
}
+/* Check if connection is controlled by persistence */
+static inline bool in_persistence(struct ip_vs_conn *cp)
+{
+ for (cp = cp->control; cp; cp = cp->control) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ return true;
+ }
+ return false;
+}
+
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
+ return 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
+ (1 << IP_VS_SCTP_S_CLOSED))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
/*
+ * Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
- * Called by ip_vs_in.
*/
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
{
- struct ip_vs_sync_mesg *m;
- struct ip_vs_sync_conn *s;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
int len;
- spin_lock(&curr_sb_lock);
- if (!curr_sb) {
- if (!(curr_sb=ip_vs_sync_buff_create())) {
- spin_unlock(&curr_sb_lock);
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
}
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = curr_sb->mesg;
- s = (struct ip_vs_sync_conn *)curr_sb->head;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
/* copy members */
+ s->reserved = 0;
s->protocol = cp->protocol;
s->cport = cp->cport;
s->vport = cp->vport;
@@ -273,84 +595,364 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
}
m->nr_conns++;
- m->size += len;
- curr_sb->head += len;
+ m->size = htons(ntohs(m->size) + len);
+ buff->head += len;
/* check if there is a space for next one */
- if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
- sb_queue_tail(curr_sb);
- curr_sb = NULL;
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
}
- spin_unlock(&curr_sb_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
- if (cp->control)
- ip_vs_sync_conn(cp->control);
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ ip_vs_sync_conn(net, cp->control, pkts);
+ }
}
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ * Sending Version 1 messages
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
+ __u8 *p;
+ unsigned int len, pe_name_len, pad;
+
+ /* Handle old version of the protocol */
+ if (sysctl_sync_ver(ipvs) == 0) {
+ ip_vs_sync_conn_v0(net, cp, pkts);
+ return;
+ }
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ goto control;
+sloop:
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ goto control;
+
+ /* Sanity checks */
+ pe_name_len = 0;
+ if (cp->pe_data_len) {
+ if (!cp->pe_data || !cp->dest) {
+ IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+ return;
+ }
+ pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+ }
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ len = sizeof(struct ip_vs_sync_v6);
+ else
+#endif
+ len = sizeof(struct ip_vs_sync_v4);
+
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+ len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+ if (cp->pe_data_len)
+ len += cp->pe_data_len + 2; /* + Param hdr field */
+ if (pe_name_len)
+ len += pe_name_len + 2;
+
+ /* check if there is a space for this one */
+ pad = 0;
+ buff = ms->sync_buff;
+ if (buff) {
+ m = buff->mesg;
+ pad = (4 - (size_t) buff->head) & 3;
+ /* Send buffer if it is for v0 */
+ if (buff->head + len + pad > buff->end || m->reserved) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ pad = 0;
+ }
+ }
+
+ if (!buff) {
+ buff = ip_vs_sync_buff_create(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ ms->sync_buff = buff;
+ m = buff->mesg;
+ }
+
+ p = buff->head;
+ buff->head += pad + len;
+ m->size = htons(ntohs(m->size) + pad + len);
+ /* Add ev. padding from prev. sync_conn */
+ while (pad--)
+ *(p++) = 0;
+
+ s = (union ip_vs_sync_conn *)p;
+
+ /* Set message type & copy members */
+ s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+ s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
+ s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->v4.state = htons(cp->state);
+ s->v4.protocol = cp->protocol;
+ s->v4.cport = cp->cport;
+ s->v4.vport = cp->vport;
+ s->v4.dport = cp->dport;
+ s->v4.fwmark = htonl(cp->fwmark);
+ s->v4.timeout = htonl(cp->timeout / HZ);
+ m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6) {
+ p += sizeof(struct ip_vs_sync_v6);
+ s->v6.caddr = cp->caddr.in6;
+ s->v6.vaddr = cp->vaddr.in6;
+ s->v6.daddr = cp->daddr.in6;
+ } else
+#endif
+ {
+ p += sizeof(struct ip_vs_sync_v4); /* options ptr */
+ s->v4.caddr = cp->caddr.ip;
+ s->v4.vaddr = cp->vaddr.ip;
+ s->v4.daddr = cp->daddr.ip;
+ }
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ *(p++) = IPVS_OPT_SEQ_DATA;
+ *(p++) = sizeof(struct ip_vs_sync_conn_options);
+ hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+ p += sizeof(struct ip_vs_seq);
+ hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+ p += sizeof(struct ip_vs_seq);
+ }
+ /* Handle pe data */
+ if (cp->pe_data_len && cp->pe_data) {
+ *(p++) = IPVS_OPT_PE_DATA;
+ *(p++) = cp->pe_data_len;
+ memcpy(p, cp->pe_data, cp->pe_data_len);
+ p += cp->pe_data_len;
+ if (pe_name_len) {
+ /* Add PE_NAME */
+ *(p++) = IPVS_OPT_PE_NAME;
+ *(p++) = pe_name_len;
+ memcpy(p, cp->pe->name, pe_name_len);
+ p += pe_name_len;
+ }
+ }
+
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+control:
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (!cp)
+ return;
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ goto sloop;
+}
+
+/*
+ * fill_param used by version 1
+ */
static inline int
-ip_vs_conn_fill_param_sync(int af, int protocol,
- const union nf_inet_addr *caddr, __be16 cport,
- const union nf_inet_addr *vaddr, __be16 vport,
- struct ip_vs_conn_param *p)
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ struct ip_vs_conn_param *p,
+ __u8 *pe_data, unsigned int pe_data_len,
+ __u8 *pe_name, unsigned int pe_name_len)
{
- /* XXX: Need to take into account persistence engine */
- ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ (const union nf_inet_addr *)&sc->v6.caddr,
+ sc->v6.cport,
+ (const union nf_inet_addr *)&sc->v6.vaddr,
+ sc->v6.vport, p);
+ else
+#endif
+ ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ (const union nf_inet_addr *)&sc->v4.caddr,
+ sc->v4.cport,
+ (const union nf_inet_addr *)&sc->v4.vaddr,
+ sc->v4.vport, p);
+ /* Handle pe data */
+ if (pe_data_len) {
+ if (pe_name_len) {
+ char buff[IP_VS_PENAME_MAXLEN+1];
+
+ memcpy(buff, pe_name, pe_name_len);
+ buff[pe_name_len]=0;
+ p->pe = __ip_vs_pe_getbyname(buff);
+ if (!p->pe) {
+ IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+ buff);
+ return 1;
+ }
+ } else {
+ IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+ return 1;
+ }
+
+ p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
+ if (!p->pe_data) {
+ if (p->pe->module)
+ module_put(p->pe->module);
+ return -ENOMEM;
+ }
+ p->pe_data_len = pe_data_len;
+ }
return 0;
}
/*
- * Process received multicast message and create the corresponding
- * ip_vs_conn entries.
+ * Connection Add / Update.
+ * Common for version 0 and 1 reception of backup sync_conns.
+ * Param: ...
+ * timeout is in sec.
*/
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+ unsigned int flags, unsigned int state,
+ unsigned int protocol, unsigned int type,
+ const union nf_inet_addr *daddr, __be16 dport,
+ unsigned long timeout, __u32 fwmark,
+ struct ip_vs_sync_conn_options *opt)
{
- struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
- struct ip_vs_sync_conn *s;
- struct ip_vs_sync_conn_options *opt;
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
- struct ip_vs_conn_param param;
- char *p;
- int i;
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (buflen < sizeof(struct ip_vs_sync_mesg)) {
- IP_VS_ERR_RL("sync message header too short\n");
- return;
- }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(param);
+ else
+ cp = ip_vs_ct_in_get(param);
- /* Convert size back to host byte order */
- m->size = ntohs(m->size);
+ if (cp) {
+ /* Free pe_data */
+ kfree(param->pe_data);
- if (buflen != m->size) {
- IP_VS_ERR_RL("bogus sync message size\n");
- return;
+ dest = cp->dest;
+ spin_lock_bh(&cp->lock);
+ if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+ !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+ if (flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ } else {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ }
+ }
+ flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+ flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+ cp->flags = flags;
+ spin_unlock_bh(&cp->lock);
+ if (!dest)
+ ip_vs_try_bind_dest(cp);
+ } else {
+ /*
+ * Find the appropriate destination for the connection.
+ * If it is not found the connection will remain unbound
+ * but still handled.
+ */
+ rcu_read_lock();
+ dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+ param->vport, protocol, fwmark, flags);
+
+ cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+ rcu_read_unlock();
+ if (!cp) {
+ if (param->pe_data)
+ kfree(param->pe_data);
+ IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+ return;
+ }
}
- /* SyncID sanity check */
- if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
- IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
- m->syncid);
- return;
+ if (opt)
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+ cp->state = state;
+ cp->old_state = cp->state;
+ /*
+ * For Ver 0 messages style
+ * - Not possible to recover the right timeout for templates
+ * - can not find the right fwmark
+ * virtual service. If needed, we can do it for
+ * non-fwmark persistent services.
+ * Ver 1 messages style.
+ * - No problem.
+ */
+ if (timeout) {
+ if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+ timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+ cp->timeout = timeout*HZ;
+ } else {
+ struct ip_vs_proto_data *pd;
+
+ pd = ip_vs_proto_data_get(net, protocol);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+ cp->timeout = pd->timeout_table[state];
+ else
+ cp->timeout = (3*60*HZ);
}
+ ip_vs_conn_put(cp);
+}
- p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+/*
+ * Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
for (i=0; i<m->nr_conns; i++) {
- unsigned flags, state;
+ unsigned int flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
return;
}
- s = (struct ip_vs_sync_conn *) p;
+ s = (struct ip_vs_sync_conn_v0 *) p;
flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
flags &= ~IP_VS_CONN_F_HASHED;
if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
p += FULL_CONN_SIZE;
if (p > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn options in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
return;
}
} else {
@@ -362,123 +964,311 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
pp = ip_vs_proto_get(s->protocol);
if (!pp) {
- IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
s->protocol);
continue;
}
if (state >= pp->num_states) {
- IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
pp->name, state);
continue;
}
} else {
/* protocol in templates is not used for state/timeout */
- pp = NULL;
if (state > 0) {
- IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
state);
state = 0;
}
}
- {
- if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport, &param)) {
- pr_err("ip_vs_conn_fill_param_sync failed");
- return;
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+ __u32 *opt_flags,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_sync_conn_options *topt;
+
+ topt = (struct ip_vs_sync_conn_options *)p;
+
+ if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+ IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+ return -EINVAL;
+ }
+ if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+ IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+ return -EINVAL;
+ }
+ ntoh_seq(&topt->in_seq, &opt->in_seq);
+ ntoh_seq(&topt->out_seq, &opt->out_seq);
+ *opt_flags |= IPVS_OPT_F_SEQ_DATA;
+ return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+ __u8 **data, unsigned int maxlen,
+ __u32 *opt_flags, __u32 flag)
+{
+ if (plen > maxlen) {
+ IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+ return -EINVAL;
+ }
+ if (*opt_flags & flag) {
+ IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+ return -EINVAL;
+ }
+ *data_len = plen;
+ *data = p;
+ *opt_flags |= flag;
+ return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
}
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(&param);
- else
- cp = ip_vs_ct_in_get(&param);
}
- if (!cp) {
- /*
- * Find the appropriate destination for the connection.
- * If it is not found the connection will remain unbound
- * but still handled.
- */
- dest = ip_vs_find_dest(AF_INET,
- (union nf_inet_addr *)&s->daddr,
- s->dport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport,
- s->protocol);
- /* Set the approprite ativity flag */
- if (s->protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (s->protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4, just silent skip IPv6 */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ return 0;
+ /* Error exit */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+
+ if (buflen != ntohs(m2->size)) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned int size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ return;
}
- cp = ip_vs_conn_new(&param,
- (union nf_inet_addr *)&s->daddr,
- s->dport, flags, dest);
- if (dest)
- atomic_dec(&dest->refcnt);
- if (!cp) {
- pr_err("ip_vs_conn_new failed\n");
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
}
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
}
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
}
-
- if (opt)
- memcpy(&cp->in_seq, opt, sizeof(*opt));
- atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
- cp->state = state;
- cp->old_state = cp->state;
- /*
- * We can not recover the right timeout for templates
- * in all cases, we can not find the right fwmark
- * virtual service. If needed, we can do it for
- * non-fwmark persistent services.
- */
- if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
- cp->timeout = pp->timeout_table[state];
- else
- cp->timeout = (3*60*HZ);
- ip_vs_conn_put(cp);
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
}
}
/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
* Setup loopback of outgoing multicasts on a sending socket
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
@@ -511,8 +1301,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)
{
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -531,30 +1323,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
* Set the maximum length of sync message according to the
* specified interface's MTU.
*/
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
sizeof(struct udphdr) -
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", sync_send_mesg_maxlen);
+ "message %d.\n", ipvs->send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
return -ENODEV;
- sync_recv_mesg_maxlen = dev->mtu -
+ ipvs->recv_mesg_maxlen = dev->mtu -
sizeof(struct iphdr) - sizeof(struct udphdr);
IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", sync_recv_mesg_maxlen);
+ "message %d.\n", ipvs->recv_mesg_maxlen);
}
return 0;
@@ -569,6 +1364,7 @@ static int set_sync_mesg_maxlen(int sync_state)
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
+ struct net *net = sock_net(sk);
struct ip_mreqn mreq;
struct net_device *dev;
int ret;
@@ -576,7 +1372,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -593,11 +1390,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
+ struct net *net = sock_net(sock->sk);
struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -619,19 +1418,31 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket * make_send_sock(void)
+static struct socket *make_send_sock(struct net *net, int id)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
- /* First create a socket */
+ /* First create a socket move it to right name space later */
result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
-
- result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+ /*
+ * Kernel sockets that are a part of a namespace, should not
+ * hold a reference to a namespace in order to allow to stop it.
+ * After sk_change_net should be released using sk_release_kernel.
+ */
+ sk_change_net(sock->sk, net);
+ result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
@@ -639,8 +1450,11 @@ static struct socket * make_send_sock(void)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 1, result);
- result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+ result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error binding address of the mcast interface\n");
goto error;
@@ -655,8 +1469,8 @@ static struct socket * make_send_sock(void)
return sock;
- error:
- sock_release(sock);
+error:
+ sk_release_kernel(sock->sk);
return ERR_PTR(result);
}
@@ -664,8 +1478,15 @@ static struct socket * make_send_sock(void)
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket * make_receive_sock(void)
+static struct socket *make_receive_sock(struct net *net, int id)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -675,9 +1496,17 @@ static struct socket * make_receive_sock(void)
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
-
+ /*
+ * Kernel sockets that are a part of a namespace, should not
+ * hold a reference to a namespace in order to allow to stop it.
+ * After sk_change_net should be released using sk_release_kernel.
+ */
+ sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 0, result);
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
sizeof(struct sockaddr));
@@ -689,7 +1518,7 @@ static struct socket * make_receive_sock(void)
/* join the multicast group */
result = join_mcast_group(sock->sk,
(struct in_addr *) &mcast_addr.sin_addr,
- ip_vs_backup_mcast_ifn);
+ ipvs->backup_mcast_ifn);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
@@ -697,8 +1526,8 @@ static struct socket * make_receive_sock(void)
return sock;
- error:
- sock_release(sock);
+error:
+ sk_release_kernel(sock->sk);
return ERR_PTR(result);
}
@@ -720,18 +1549,19 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
return len;
}
-static void
+static int
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
int msize;
+ int ret;
- msize = msg->size;
-
- /* Put size in network byte order */
- msg->size = htons(msg->size);
+ msize = ntohs(msg->size);
- if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
- pr_err("ip_vs_send_async error\n");
+ ret = ip_vs_send_async(sock, (char *)msg, msize);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ pr_err("ip_vs_send_async error %d\n", ret);
+ return 0;
}
static int
@@ -747,53 +1577,95 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
iov.iov_base = buffer;
iov.iov_len = (size_t)buflen;
- len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
if (len < 0)
- return -1;
+ return len;
LeaveFunction(7);
return len;
}
+/* Wakeup the master thread for sending */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+ struct ipvs_master_sync_state *ms =
+ container_of(work, struct ipvs_master_sync_state,
+ master_wakeup_work.work);
+ struct netns_ipvs *ipvs = ms->ipvs;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ms->sync_queue_len &&
+ ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+ wake_up_process(ms->master_thread);
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/* Get next buffer to send */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ sb = sb_dequeue(ipvs, ms);
+ if (sb)
+ return sb;
+ /* Do not delay entries in buffer for more than 2 seconds */
+ return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+}
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
- "syncid = %d\n",
- ip_vs_master_mcast_ifn, ip_vs_master_syncid);
-
- while (!kthread_should_stop()) {
- while ((sb = sb_dequeue())) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
}
-
- /* check if entries stay in curr_sb for 2 seconds */
- sb = get_curr_sync_buff(2 * HZ);
- if (sb) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ /* (Ab)use interruptible sleep to avoid increasing
+ * the load avg.
+ */
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop());
+ if (unlikely(kthread_should_stop()))
+ goto done;
}
-
- schedule_timeout_interruptible(HZ);
+ ip_vs_sync_buff_release(sb);
}
+done:
+ __set_current_state(TASK_RUNNING);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
/* clean up the sync_buff queue */
- while ((sb=sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs, ms)))
ip_vs_sync_buff_release(sb);
- }
+ __set_current_state(TASK_RUNNING);
/* clean up the current sync_buff */
- if ((sb = get_curr_sync_buff(0))) {
+ sb = get_curr_sync_buff(ipvs, ms, 0);
+ if (sb)
ip_vs_sync_buff_release(sb);
- }
/* release the sending multicast socket */
- sock_release(tinfo->sock);
+ sk_release_kernel(tinfo->sock->sk);
kfree(tinfo);
return 0;
@@ -803,11 +1675,12 @@ static int sync_thread_master(void *data)
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
- "syncid = %d\n",
- ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -817,22 +1690,19 @@ static int sync_thread_backup(void *data)
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
- sync_recv_mesg_maxlen);
+ ipvs->recv_mesg_maxlen);
if (len <= 0) {
- pr_err("receiving message error\n");
+ if (len != -EAGAIN)
+ pr_err("receiving message error\n");
break;
}
- /* disable bottom half, because it accesses the data
- shared by softirq while getting/creating conns */
- local_bh_disable();
- ip_vs_process_message(tinfo->buf, len);
- local_bh_enable();
+ ip_vs_process_message(tinfo->net, tinfo->buf, len);
}
}
/* release the sending multicast socket */
- sock_release(tinfo->sock);
+ sk_release_kernel(tinfo->sock->sk);
kfree(tinfo->buf);
kfree(tinfo);
@@ -840,128 +1710,239 @@ static int sync_thread_backup(void *data)
}
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
- struct task_struct **realtask, *task;
+ struct task_struct **array = NULL, *task;
struct socket *sock;
- char *name, *buf = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ char *name;
int (*threadfn)(void *data);
+ int id, count;
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
- sizeof(struct ip_vs_sync_conn));
+ sizeof(struct ip_vs_sync_conn_v0));
+
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
if (state == IP_VS_STATE_MASTER) {
- if (sync_master_thread)
+ if (ipvs->ms)
return -EEXIST;
- strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_master_mcast_ifn));
- ip_vs_master_syncid = syncid;
- realtask = &sync_master_thread;
- name = "ipvs_syncmaster";
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
- sock = make_send_sock();
} else if (state == IP_VS_STATE_BACKUP) {
- if (sync_backup_thread)
+ if (ipvs->backup_threads)
return -EEXIST;
- strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_backup_mcast_ifn));
- ip_vs_backup_syncid = syncid;
- realtask = &sync_backup_thread;
- name = "ipvs_syncbackup";
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock();
} else {
return -EINVAL;
}
- if (IS_ERR(sock)) {
- result = PTR_ERR(sock);
- goto out;
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
+
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
}
+ set_sync_mesg_maxlen(net, state);
- set_sync_mesg_maxlen(state);
- if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
- if (!buf)
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
goto outsocket;
- }
-
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- goto outbuf;
-
- tinfo->sock = sock;
- tinfo->buf = buf;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ } else {
+ tinfo->buf = NULL;
+ }
+ tinfo->id = id;
- task = kthread_run(threadfn, tinfo, name);
- if (IS_ERR(task)) {
- result = PTR_ERR(task);
- goto outtinfo;
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
}
/* mark as active */
- *realtask = task;
- ip_vs_sync_state |= state;
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
-outtinfo:
- kfree(tinfo);
-outbuf:
- kfree(buf);
outsocket:
- sock_release(sock);
+ sk_release_kernel(sock->sk);
+
+outtinfo:
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
out:
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
return result;
}
-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct task_struct **array;
+ int id;
+ int retc = -EINVAL;
+
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!sync_master_thread)
+ if (!ipvs->ms)
return -ESRCH;
- pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(sync_master_thread));
-
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
* add sync buffers to the queue, when we are already in
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ip_vs_sync_lock);
- ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ip_vs_sync_lock);
- kthread_stop(sync_master_thread);
- sync_master_thread = NULL;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ spin_lock(&ipvs->sync_lock);
+ ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+ spin_unlock(&ipvs->sync_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+ int ret;
+
+ pr_info("stopping master sync thread %d ...\n",
+ task_pid_nr(ms->master_thread));
+ cancel_delayed_work_sync(&ms->master_wakeup_work);
+ ret = kthread_stop(ms->master_thread);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!sync_backup_thread)
+ if (!ipvs->backup_threads)
return -ESRCH;
- pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(sync_backup_thread));
-
- ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
- kthread_stop(sync_backup_thread);
- sync_backup_thread = NULL;
- } else {
- return -EINVAL;
+ ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+ array = ipvs->backup_threads;
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ int ret;
+
+ pr_info("stopping backup sync thread %d ...\n",
+ task_pid_nr(array[id]));
+ ret = kthread_stop(array[id]);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(array);
+ ipvs->backup_threads = NULL;
}
/* decrease the module use count */
ip_vs_use_count_dec();
+ return retc;
+}
+
+/*
+ * Initialize data struct for each netns
+ */
+int __net_init ip_vs_sync_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
+ spin_lock_init(&ipvs->sync_lock);
+ spin_lock_init(&ipvs->sync_buff_lock);
return 0;
}
+
+void ip_vs_sync_net_cleanup(struct net *net)
+{
+ int retc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&ipvs->sync_mutex);
+ retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
+ if (retc && retc != -ESRCH)
+ pr_err("Failed to stop Master Daemon\n");
+
+ retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+ if (retc && retc != -ESRCH)
+ pr_err("Failed to stop Backup Daemon\n");
+ mutex_unlock(&ipvs->sync_mutex);
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bbddfdb10db..b5b4650d50a 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -27,30 +27,15 @@
#include <net/ip_vs.h>
-
-static inline unsigned int
-ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
/*
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
- unsigned int loh, doh;
+ int loh, doh;
IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
@@ -67,27 +52,27 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
- loh = ip_vs_wlc_dest_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
- IP_VS_ERR_RL("WLC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_wlc_dest_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -122,6 +107,7 @@ static int __init ip_vs_wlc_init(void)
static void __exit ip_vs_wlc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wlc_init);
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 30db633f88f..0546cd572d6 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -29,14 +29,45 @@
#include <net/ip_vs.h>
+/* The WRR algorithm depends on some caclulations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, it is possible some dest weight to be reduced
+ * below di, bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
/*
* current destination pointer for weighted round-robin scheduling
*/
struct ip_vs_wrr_mark {
- struct list_head *cl; /* current list head */
+ struct ip_vs_dest *cl; /* current dest or head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
+ struct rcu_head rcu_head;
};
@@ -84,41 +115,45 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the mark variable for WRR scheduling
*/
- mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
- if (mark == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
+ if (mark == NULL)
return -ENOMEM;
- }
- mark->cl = &svc->destinations;
- mark->cw = 0;
- mark->mw = ip_vs_wrr_max_weight(svc);
+
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ mark->cw = mark->mw;
svc->sched_data = mark;
return 0;
}
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
/*
* Release the mark variable
*/
- kfree(svc->sched_data);
-
- return 0;
+ kfree_rcu(mark, rcu_head);
}
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
struct ip_vs_wrr_mark *mark = svc->sched_data;
- mark->cl = &svc->destinations;
- mark->mw = ip_vs_wrr_max_weight(svc);
+ spin_lock_bh(&svc->sched_lock);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
- if (mark->cw > mark->mw)
- mark->cw = 0;
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ if (mark->cw > mark->mw || !mark->cw)
+ mark->cw = mark->mw;
+ else if (mark->di > 1)
+ mark->cw = (mark->cw / mark->di) * mark->di + 1;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -127,80 +162,82 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
* Weighted Round-Robin Scheduling
*/
static struct ip_vs_dest *
-ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
- struct ip_vs_dest *dest;
+ struct ip_vs_dest *dest, *last, *stop = NULL;
struct ip_vs_wrr_mark *mark = svc->sched_data;
- struct list_head *p;
+ bool last_pass = false, restarted = false;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- /*
- * This loop will always terminate, because mark->cw in (0, max_weight]
- * and at least one server has its weight equal to max_weight.
- */
- write_lock(&svc->sched_lock);
- p = mark->cl;
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
while (1) {
- if (mark->cl == &svc->destinations) {
- /* it is at the head of the destination list */
-
- if (mark->cl == mark->cl->next) {
- /* no dest entry */
- IP_VS_ERR_RL("WRR: no destination available: "
- "no destinations present\n");
- dest = NULL;
- goto out;
- }
-
- mark->cl = svc->destinations.next;
- mark->cw -= mark->di;
- if (mark->cw <= 0) {
- mark->cw = mark->mw;
- /*
- * Still zero, which means no available servers.
- */
- if (mark->cw == 0) {
- mark->cl = &svc->destinations;
- IP_VS_ERR_RL("WRR: no destination "
- "available\n");
- dest = NULL;
- goto out;
- }
- }
- } else
- mark->cl = mark->cl->next;
-
- if (mark->cl != &svc->destinations) {
- /* not at the head of the list */
- dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) >= mark->cw) {
- /* got it */
- break;
- }
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ if (dest == stop)
+ goto err_over;
}
-
- if (mark->cl == p && mark->cw == mark->di) {
- /* back to the start, and no dest is found.
- It is only possible when all dests are OVERLOADED */
- dest = NULL;
- IP_VS_ERR_RL("WRR: no destination available: "
- "all destinations are overloaded\n");
- goto out;
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
}
}
+found:
IP_VS_DBG_BUF(6, "WRR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
+ mark->cl = dest;
out:
- write_unlock(&svc->sched_lock);
+ spin_unlock_bh(&svc->sched_lock);
return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
}
@@ -211,7 +248,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
.n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
.init_service = ip_vs_wrr_init_svc,
.done_service = ip_vs_wrr_done_svc,
- .update_service = ip_vs_wrr_update_svc,
+ .add_dest = ip_vs_wrr_dest_changed,
+ .del_dest = ip_vs_wrr_dest_changed,
+ .upd_dest = ip_vs_wrr_dest_changed,
.schedule = ip_vs_wrr_schedule,
};
@@ -223,6 +262,7 @@ static int __init ip_vs_wrr_init(void)
static void __exit ip_vs_wrr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wrr_init);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 5325a3fbe4a..73ba1cc7a88 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -17,6 +17,8 @@
* - not all connections have destination server, for example,
* connections in backup server when fwmark is used
* - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
* LOCAL_OUT rules:
* - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
* - skb->pkt_type is not set yet
@@ -43,158 +45,259 @@
#include <net/ip_vs.h>
+enum {
+ IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */
+ IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
+ IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
+ * local
+ */
+ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
+ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
+};
+
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
/*
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
- u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
{
- struct dst_entry *old_dst;
+ struct ip_vs_dest_dst *old;
- old_dst = dest->dst_cache;
- dest->dst_cache = dst;
- dest->dst_rtos = rtos;
- dest->dst_cookie = dst_cookie;
- dst_release(old_dst);
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
+
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
{
- struct dst_entry *dst = dest->dst_cache;
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
- if (!dst)
+ if (!dest_dst)
return NULL;
- if ((dst->obsolete || rtos != dest->dst_rtos) &&
- dst->ops->check(dst, dest->dst_cookie) == NULL) {
- dest->dst_cache = NULL;
- dst_release(dst);
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
+ return dest_dst;
+}
+
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+ if (IP6CB(skb)->frag_max_size) {
+ /* frag_max_size tell us that, this packet have been
+ * defragmented by netfilter IPv6 conntrack module.
+ */
+ if (IP6CB(skb)->frag_max_size > mtu)
+ return true; /* largest fragment violate MTU */
}
- dst_hold(dst);
- return dst;
+ else if (skb->len > mtu && !skb_is_gso(skb)) {
+ return true; /* Packet size violate MTU size */
+ }
+ return false;
}
-/*
- * Get route to destination or remote server
- * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
- * &4=Allow redirect from remote daddr to local
- */
-static struct rtable *
+/* Get route to daddr, update *saddr, optionally bind route to saddr */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
+ fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
+ FLOWI_FLAG_KNOWN_NH : 0;
+
+retry:
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ /* Invalid saddr ? */
+ if (PTR_ERR(rt) == -EINVAL && *saddr &&
+ rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
+ *saddr = 0;
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
+ goto retry;
+ }
+ IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
+ return NULL;
+ } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ ip_rt_put(rt);
+ *saddr = fl4.saddr;
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
+ loop++;
+ goto retry;
+ }
+ *saddr = fl4.saddr;
+ return rt;
+}
+
+/* Get route to destination or remote server */
+static int
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
- __be32 daddr, u32 rtos, int rt_mode)
+ __be32 daddr, int rt_mode, __be32 *ret_saddr)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
struct rtable *ort; /* Original route */
- int local;
+ struct iphdr *iph;
+ __be16 df;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos))) {
- struct flowi fl = {
- .fl4_dst = dest->addr.ip,
- .fl4_tos = rtos,
- };
-
- if (ip_route_output_key(net, &rt, &fl)) {
- spin_unlock(&dest->dst_lock);
- IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &dest->addr.ip);
- return NULL;
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
- IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
- &dest->addr.ip,
- atomic_read(&rt->dst.__refcnt), rtos);
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
+ if (!rt) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
}
- spin_unlock(&dest->dst_lock);
+ daddr = dest->addr.ip;
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.ip;
} else {
- struct flowi fl = {
- .fl4_dst = daddr,
- .fl4_tos = rtos,
- };
-
- if (ip_route_output_key(net, &rt, &fl)) {
- IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &daddr);
- return NULL;
- }
+ __be32 saddr = htonl(INADDR_ANY);
+
+ noref = 0;
+
+ /* For such unconfigured boxes avoid many route lookups
+ * for performance reasons because we do not remember saddr
+ */
+ rt_mode &= ~IP_VS_RT_MODE_CONNECT;
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ if (!rt)
+ goto err_unreach;
+ if (ret_saddr)
+ *ret_saddr = saddr;
}
- local = rt->rt_flags & RTCF_LOCAL;
- if (!((local ? 1 : 2) & rt_mode)) {
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
+ if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+ rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
- "local":"non-local", &rt->rt_dst);
- ip_rt_put(rt);
- return NULL;
+ "local":"non-local", &daddr);
+ goto err_put;
}
- if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
- ort->rt_flags & RTCF_LOCAL)) {
- IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
- "requires NAT method, dest: %pI4\n",
- &ip_hdr(skb)->daddr, &rt->rt_dst);
- ip_rt_put(rt);
- return NULL;
- }
- if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
- "to non-local address, dest: %pI4\n",
- &ip_hdr(skb)->saddr, &rt->rt_dst);
- ip_rt_put(rt);
- return NULL;
+ iph = ip_hdr(skb);
+ if (likely(!local)) {
+ if (unlikely(ipv4_is_loopback(iph->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI4 to non-local address, dest: %pI4\n",
+ &iph->saddr, &daddr);
+ goto err_put;
+ }
+ } else {
+ ort = skb_rtable(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !(ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
+ "local requires NAT method, dest: %pI4\n",
+ &iph->daddr, &daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
}
- return rt;
-}
-
-/* Reroute packet to local IPv4 stack after DNAT */
-static int
-__ip_vs_reroute_locally(struct sk_buff *skb)
-{
- struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->dst.dev;
- struct net *net = dev_net(dev);
- struct iphdr *iph = ip_hdr(skb);
-
- if (rt_is_input_route(rt)) {
- unsigned long orefdst = skb->_skb_refdst;
-
- if (ip_route_input(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev))
- return 0;
- refdst_drop(orefdst);
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ df = iph->frag_off & htons(IP_DF);
} else {
- struct flowi fl = {
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .mark = skb->mark,
- };
- struct rtable *rt;
-
- if (ip_route_output_key(net, &rt, &fl))
- return 0;
- if (!(rt->rt_flags & RTCF_LOCAL)) {
- ip_rt_put(rt);
- return 0;
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
}
- /* Drop old route. */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ort = skb_rtable(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ /* MTU check allowed? */
+ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0;
}
- return 1;
+
+ /* MTU checking */
+ if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
- return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+ return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
static struct dst_entry *
@@ -202,22 +305,27 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
struct in6_addr *ret_saddr, int do_xfrm)
{
struct dst_entry *dst;
- struct flowi fl = {
- .fl6_dst = *daddr,
+ struct flowi6 fl6 = {
+ .daddr = *daddr,
};
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
if (dst->error)
goto out_err;
if (!ret_saddr)
return dst;
- if (ipv6_addr_any(&fl.fl6_src) &&
+ if (ipv6_addr_any(&fl6.saddr) &&
ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
- &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+ &fl6.daddr, 0, &fl6.saddr) < 0)
goto out_err;
- if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
- goto out_err;
- ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+ if (do_xfrm) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ dst = NULL;
+ goto out_err;
+ }
+ }
+ *ret_saddr = fl6.saddr;
return dst;
out_err:
@@ -228,133 +336,197 @@ out_err:
/*
* Get route to destination or remote server
- * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
- * &4=Allow redirect from remote daddr to local
*/
-static struct rt6_info *
+static int
__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
- int do_xfrm, int rt_mode)
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct rt6_info *ort; /* Original route */
struct dst_entry *dst;
- int local;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
- if (!rt) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
u32 cookie;
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
- &dest->dst_saddr,
+ &dest_dst->dst_saddr.in6,
do_xfrm);
if (!dst) {
- spin_unlock(&dest->dst_lock);
- return NULL;
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
rt = (struct rt6_info *) dst;
cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest->dst_saddr,
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
atomic_read(&rt->dst.__refcnt));
}
if (ret_saddr)
- ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.in6;
} else {
+ noref = 0;
dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
if (!dst)
- return NULL;
+ goto err_unreach;
rt = (struct rt6_info *) dst;
}
local = __ip_vs_is_local_route6(rt);
- if (!((local ? 1 : 2) & rt_mode)) {
- IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+ if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+ rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
local ? "local":"non-local", daddr);
- dst_release(&rt->dst);
- return NULL;
+ goto err_put;
}
- if (local && !(rt_mode & 4) &&
- !((ort = (struct rt6_info *) skb_dst(skb)) &&
- __ip_vs_is_local_route6(ort))) {
- IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
- "requires NAT method, dest: %pI6\n",
- &ipv6_hdr(skb)->daddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+ if (likely(!local)) {
+ if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI6c to non-local address, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ goto err_put;
+ }
+ } else {
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !__ip_vs_is_local_route6(ort)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6c "
+ "to local requires NAT method, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
}
- if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
- IPV6_ADDR_LOOPBACK)) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
- "to non-local address, dest: %pI6\n",
- &ipv6_hdr(skb)->saddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
- return rt;
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#endif
-/*
- * Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
{
- struct dst_entry *old_dst;
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
}
-#define IP_VS_XMIT_TUNNEL(skb, cp) \
-({ \
- int __ret = NF_ACCEPT; \
- \
- (skb)->ipvs_property = 1; \
- if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
- __ret = ip_vs_confirm_conntrack(skb, cp); \
- if (__ret == NF_ACCEPT) { \
- nf_reset(skb); \
- skb_forward_csum(skb); \
- } \
- __ret; \
-})
-
-#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- else \
- ip_vs_update_conntrack(skb, cp, 1); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
-
-#define IP_VS_XMIT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
/*
@@ -362,10 +534,10 @@ do { \
*/
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
/* we do not touch skb and do not need pskb ptr */
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -376,53 +548,31 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
struct iphdr *iph = ip_hdr(skb);
- int mtu;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
- RT_TOS(iph->tos), 2)))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+ NULL) < 0)
goto tx_error;
- }
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(ip_hdr(skb));
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ip_send_check(iph);
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -430,57 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- struct ipv6hdr *iph = ipv6_hdr(skb);
- int mtu;
-
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
+ ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -492,92 +612,71 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
- struct iphdr *iph = ip_hdr(skb);
- int local;
+ int local, rc, was_input;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos), 1|2|4)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(rt->rt_dst) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
- "ip_vs_nat_xmit(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct iphdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
- goto tx_error_put;
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- if (!local) {
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
@@ -585,64 +684,64 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
- int local;
+ int local, rc;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
- sizeof(_pt), &_pt);
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, 1|2|4)))
- goto tx_error_icmp;
- local = __ip_vs_is_local_route6(rt);
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -653,43 +752,20 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
- "ip_vs_nat_xmit_v6(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
- ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
-
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
+ ipv6_hdr(skb)->daddr = cp->daddr.in6;
IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
@@ -698,22 +774,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
LeaveFunction(10);
kfree_skb(skb);
+ rcu_read_unlock();
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -739,63 +812,52 @@ tx_error_put:
*/
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
+ __be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
u8 tos = old_iph->tos;
- __be16 df = old_iph->frag_off;
+ __be16 df;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), 1|2)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
+ rt = skb_rtable(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
- if (mtu < 68) {
- IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
- goto tx_error_put;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
-
- df |= (old_iph->frag_off & htons(IP_DF));
-
- if ((old_iph->frag_off & htons(IP_DF))
- && mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
+ /* Copy DF, reset fragment offset and MF */
+ df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
- kfree_skb(skb);
+
+ if (!new_skb)
+ goto tx_error;
+ consume_skb(skb);
skb = new_skb;
old_iph = ip_hdr(skb);
}
@@ -809,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -822,39 +880,36 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
iph->frag_off = df;
iph->protocol = IPPROTO_IPIP;
iph->tos = tos;
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
+ iph->daddr = cp->daddr.ip;
+ iph->saddr = saddr;
iph->ttl = old_iph->ttl;
- ip_select_ident(iph, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
struct in6_addr saddr; /* Source for tunnel */
@@ -862,57 +917,38 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *old_iph = ipv6_hdr(skb);
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
- &saddr, 1, 1|2)))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
+ rt = (struct rt6_info *) skb_dst(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
- if (mtu < IPV6_MIN_MTU) {
- IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
- IPV6_MIN_MTU);
- goto tx_error_put;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
-
- if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
-
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- dst_release(&rt->dst);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
- kfree_skb(skb);
+
+ if (!new_skb)
+ goto tx_error;
+ consume_skb(skb);
skb = new_skb;
old_iph = ipv6_hdr(skb);
}
@@ -923,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -937,32 +969,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
iph->priority = old_iph->priority;
memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
- ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
- ipv6_addr_copy(&iph->saddr, &saddr);
+ iph->daddr = cp->daddr.in6;
+ iph->saddr = saddr;
iph->hop_limit = old_iph->hop_limit;
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip6_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -973,57 +1002,38 @@ tx_error_put:
*/
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
- struct iphdr *iph = ip_hdr(skb);
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos), 1|2)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
ip_send_check(ip_hdr(skb));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1031,61 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, 1|2)))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1098,12 +1083,13 @@ tx_error:
*/
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, int offset)
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
+ int rt_mode, was_input;
EnterFunction(10);
@@ -1112,7 +1098,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp);
+ rc = cp->packet_xmit(skb, cp, pp, iph);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
@@ -1123,101 +1109,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/*
* mangle and send the packet here (only for VS/NAT)
*/
-
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ was_input = rt_is_input_route(skb_rtable(skb));
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(rt->rt_dst) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp(skb, pp, cp, 0);
- if (!local) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
goto out;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, int offset)
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
+ int rt_mode;
EnterFunction(10);
@@ -1226,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp);
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
@@ -1238,25 +1202,30 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
* mangle and send the packet here (only for VS/NAT)
*/
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, 1|2|4)))
- goto tx_error_icmp;
-
- local = __ip_vs_is_local_route6(rt);
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -1267,58 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp_v6(skb, pp, cp, 0);
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
goto out;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 5178c691ecb..a4b5e2a435a 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -12,12 +12,13 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>
+#include <linux/export.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
-static int nf_ct_acct __read_mostly;
+static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
@@ -38,21 +39,23 @@ static struct ctl_table acct_sysctl_table[] = {
unsigned int
seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
{
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
+ struct nf_conn_counter *counter;
acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
+ counter = acct->counter;
return seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)acct[dir].packets,
- (unsigned long long)acct[dir].bytes);
+ (unsigned long long)atomic64_read(&counter[dir].packets),
+ (unsigned long long)atomic64_read(&counter[dir].bytes));
};
EXPORT_SYMBOL_GPL(seq_print_acct);
static struct nf_ct_ext_type acct_extend __read_mostly = {
- .len = sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]),
- .align = __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]),
+ .len = sizeof(struct nf_conn_acct),
+ .align = __alignof__(struct nf_conn_acct),
.id = NF_CT_EXT_ACCT,
};
@@ -68,8 +71,12 @@ static int nf_conntrack_acct_init_sysctl(struct net *net)
table[0].data = &net->ct.sysctl_acct;
- net->ct.acct_sysctl_header = register_net_sysctl_table(net,
- nf_net_netfilter_sysctl_path, table);
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
+ table);
if (!net->ct.acct_sysctl_header) {
printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
goto out_register;
@@ -101,36 +108,26 @@ static void nf_conntrack_acct_fini_sysctl(struct net *net)
}
#endif
-int nf_conntrack_acct_init(struct net *net)
+int nf_conntrack_acct_pernet_init(struct net *net)
{
- int ret;
-
net->ct.sysctl_acct = nf_ct_acct;
+ return nf_conntrack_acct_init_sysctl(net);
+}
- if (net_eq(net, &init_net)) {
- ret = nf_ct_extend_register(&acct_extend);
- if (ret < 0) {
- printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
- goto out_extend_register;
- }
- }
+void nf_conntrack_acct_pernet_fini(struct net *net)
+{
+ nf_conntrack_acct_fini_sysctl(net);
+}
- ret = nf_conntrack_acct_init_sysctl(net);
+int nf_conntrack_acct_init(void)
+{
+ int ret = nf_ct_extend_register(&acct_extend);
if (ret < 0)
- goto out_sysctl;
-
- return 0;
-
-out_sysctl:
- if (net_eq(net, &init_net))
- nf_ct_extend_unregister(&acct_extend);
-out_extend_register:
+ pr_err("nf_conntrack_acct: Unable to register extension\n");
return ret;
}
-void nf_conntrack_acct_fini(struct net *net)
+void nf_conntrack_acct_fini(void)
{
- nf_conntrack_acct_fini_sysctl(net);
- if (net_eq(net, &init_net))
- nf_ct_extend_unregister(&acct_extend);
+ nf_ct_extend_unregister(&acct_extend);
}
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index 13fd2c55e32..b8b95f4027c 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -2,6 +2,7 @@
*
* (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
* based on HW's ip_conntrack_irc.c as well as other modules
+ * (C) 2006 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -40,6 +41,7 @@ MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned int matchoff,
unsigned int matchlen,
struct nf_conntrack_expect *exp)
@@ -107,8 +109,7 @@ static int amanda_help(struct sk_buff *skb,
/* No data? */
dataoff = protoff + sizeof(struct udphdr);
if (dataoff >= skb->len) {
- if (net_ratelimit())
- printk(KERN_ERR "amanda_help: skblen = %u\n", skb->len);
+ net_err_ratelimited("amanda_help: skblen = %u\n", skb->len);
return NF_ACCEPT;
}
@@ -145,6 +146,7 @@ static int amanda_help(struct sk_buff *skb,
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
ret = NF_DROP;
goto out;
}
@@ -156,10 +158,12 @@ static int amanda_help(struct sk_buff *skb,
nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);
if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
- ret = nf_nat_amanda(skb, ctinfo, off - dataoff,
- len, exp);
- else if (nf_ct_expect_related(exp) != 0)
+ ret = nf_nat_amanda(skb, ctinfo, protoff,
+ off - dataoff, len, exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
+ }
nf_ct_expect_put(exp);
}
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 00000000000..4e99cca6161
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ * broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int timeout)
+{
+ struct nf_conntrack_expect *exp;
+ struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct nf_conn_help *help = nfct_help(ct);
+ __be32 mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if (skb->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (mask == 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ exp->mask.src.u3.ip = mask;
+ exp->mask.src.u.udp.port = htons(0xFFFF);
+
+ exp->expectfn = NULL;
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
+ exp->helper = NULL;
+
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+
+ nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 27a5ea6b6a0..1f4f954c4b4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -5,6 +5,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -38,13 +39,19 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -53,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
+EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+ h1 %= CONNTRACK_LOCKS;
+ h2 %= CONNTRACK_LOCKS;
+ spin_unlock(&nf_conntrack_locks[h1]);
+ if (h1 != h2)
+ spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+ unsigned int h2, unsigned int sequence)
+{
+ h1 %= CONNTRACK_LOCKS;
+ h2 %= CONNTRACK_LOCKS;
+ if (h1 <= h2) {
+ spin_lock(&nf_conntrack_locks[h1]);
+ if (h1 != h2)
+ spin_lock_nested(&nf_conntrack_locks[h2],
+ SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&nf_conntrack_locks[h2]);
+ spin_lock_nested(&nf_conntrack_locks[h1],
+ SINGLE_DEPTH_NESTING);
+ }
+ if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ nf_conntrack_double_unlock(h1, h2);
+ return true;
+ }
+ return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+ int i;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+ int i;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_unlock(&nf_conntrack_locks[i]);
+}
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
@@ -65,7 +123,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-static unsigned int nf_conntrack_hash_rnd __read_mostly;
+unsigned int nf_conntrack_hash_rnd __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
{
@@ -184,6 +243,50 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_dying_list(struct nf_conn *ct)
+{
+ struct ct_pcpu *pcpu;
+
+ /* add this conntrack to the (per cpu) dying list */
+ ct->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->dying);
+ spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
+{
+ struct ct_pcpu *pcpu;
+
+ /* add this conntrack to the (per cpu) unconfirmed list */
+ ct->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->unconfirmed);
+ spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
+{
+ struct ct_pcpu *pcpu;
+
+ /* We overload first tuple to link into unconfirmed or dying list.*/
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ spin_unlock(&pcpu->lock);
+}
+
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
@@ -195,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
- /* To make sure we don't get any weird locking issues here:
- * destroy_conntrack() MUST NOT be called with a write lock
- * to nf_conntrack_lock!!! -HW */
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (l4proto && l4proto->destroy)
@@ -205,21 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct)
rcu_read_unlock();
- spin_lock_bh(&nf_conntrack_lock);
+ local_bh_disable();
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
- * too. */
+ * too.
+ */
nf_ct_remove_expectations(ct);
- /* We overload first tuple to link into unconfirmed list. */
- if (!nf_ct_is_confirmed(ct)) {
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- }
+ nf_ct_del_from_dying_or_unconfirmed_list(ct);
NF_CT_STAT_INC(net, delete);
- spin_unlock_bh(&nf_conntrack_lock);
+ local_bh_enable();
if (ct->master)
nf_ct_put(ct->master);
@@ -228,79 +325,114 @@ destroy_conntrack(struct nf_conntrack *nfct)
nf_conntrack_free(ct);
}
-void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ unsigned int hash, reply_hash;
+ u16 zone = nf_ct_zone(ct);
+ unsigned int sequence;
nf_ct_helper_destroy(ct);
- spin_lock_bh(&nf_conntrack_lock);
- /* Inside lock so preempt is disabled on module removal path.
- * Otherwise we can get spurious warnings. */
- NF_CT_STAT_INC(net, delete_list);
+
+ local_bh_disable();
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
clean_from_lists(ct);
- spin_unlock_bh(&nf_conntrack_lock);
+ nf_conntrack_double_unlock(hash, reply_hash);
+
+ nf_ct_add_to_dying_list(ct);
+
+ NF_CT_STAT_INC(net, delete_list);
+ local_bh_enable();
}
-EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
static void death_by_event(unsigned long ul_conntrack)
{
struct nf_conn *ct = (void *)ul_conntrack;
struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
+
+ BUG_ON(ecache == NULL);
if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
/* bad luck, let's retry again */
- ct->timeout.expires = jiffies +
- (random32() % net->ct.sysctl_events_retry_timeout);
- add_timer(&ct->timeout);
+ ecache->timeout.expires = jiffies +
+ (prandom_u32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ecache->timeout);
return;
}
/* we've got the event delivered, now it's dying */
set_bit(IPS_DYING_BIT, &ct->status);
- spin_lock(&nf_conntrack_lock);
- hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- spin_unlock(&nf_conntrack_lock);
nf_ct_put(ct);
}
-void nf_ct_insert_dying_list(struct nf_conn *ct)
+static void nf_ct_dying_timeout(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
+
+ BUG_ON(ecache == NULL);
- /* add this conntrack to the dying list */
- spin_lock_bh(&nf_conntrack_lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &net->ct.dying);
- spin_unlock_bh(&nf_conntrack_lock);
/* set a new timer to retry event delivery */
- setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
- ct->timeout.expires = jiffies +
- (random32() % net->ct.sysctl_events_retry_timeout);
- add_timer(&ct->timeout);
+ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct);
+ ecache->timeout.expires = jiffies +
+ (prandom_u32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ecache->timeout);
}
-EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
-static void death_by_timeout(unsigned long ul_conntrack)
+bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
- struct nf_conn *ct = (void *)ul_conntrack;
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_to_ns(ktime_get_real());
- if (!test_bit(IPS_DYING_BIT, &ct->status) &&
- unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+ if (!nf_ct_is_dying(ct) &&
+ unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct,
+ portid, report) < 0)) {
/* destroy event was not delivered */
nf_ct_delete_from_lists(ct);
- nf_ct_insert_dying_list(ct);
- return;
+ nf_ct_dying_timeout(ct);
+ return false;
}
set_bit(IPS_DYING_BIT, &ct->status);
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
+}
+
+static inline bool
+nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
+ const struct nf_conntrack_tuple *tuple,
+ u16 zone)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ /* A conntrack can be recreated with the equal tuple,
+ * so we need to check that the conntrack is confirmed
+ */
+ return nf_ct_tuple_equal(tuple, &h->tuple) &&
+ nf_ct_zone(ct) == zone &&
+ nf_ct_is_confirmed(ct);
}
/*
* Warning :
* - Caller must take a reference on returned object
* and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
*/
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, u16 zone,
@@ -316,8 +448,7 @@ ____nf_conntrack_find(struct net *net, u16 zone,
local_bh_disable();
begin:
hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
- if (nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
+ if (nf_ct_key_equal(h, tuple, zone)) {
NF_CT_STAT_INC(net, found);
local_bh_enable();
return h;
@@ -338,15 +469,6 @@ begin:
return NULL;
}
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
-{
- return ____nf_conntrack_find(net, zone, tuple,
- hash_conntrack_raw(tuple, zone));
-}
-EXPORT_SYMBOL_GPL(__nf_conntrack_find);
-
/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, u16 zone,
@@ -364,8 +486,7 @@ begin:
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
- if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
- nf_ct_zone(ct) != zone)) {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
nf_ct_put(ct);
goto begin;
}
@@ -387,42 +508,103 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
unsigned int hash,
- unsigned int repl_hash)
+ unsigned int reply_hash)
{
struct net *net = nf_ct_net(ct);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
&net->ct.hash[hash]);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
- &net->ct.hash[repl_hash]);
+ &net->ct.hash[reply_hash]);
}
-void nf_conntrack_hash_insert(struct nf_conn *ct)
+int
+nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
- unsigned int hash, repl_hash;
+ unsigned int hash, reply_hash;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
u16 zone;
+ unsigned int sequence;
zone = nf_ct_zone(ct);
- hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- __nf_conntrack_hash_insert(ct, hash, repl_hash);
+ local_bh_disable();
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
+ /* See if there's one in the list already, including reverse */
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+
+ add_timer(&ct->timeout);
+ smp_wmb();
+ /* The caller holds a reference to this object */
+ atomic_set(&ct->ct_general.use, 2);
+ __nf_conntrack_hash_insert(ct, hash, reply_hash);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert);
+ local_bh_enable();
+ return 0;
+
+out:
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert_failed);
+ local_bh_enable();
+ return -EEXIST;
}
-EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
+
+/* deletion from this larval template list happens via nf_ct_put() */
+void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
+{
+ struct ct_pcpu *pcpu;
+
+ __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ nf_conntrack_get(&tmpl->ct_general);
+
+ /* add this conntrack to the (per cpu) tmpl list */
+ local_bh_disable();
+ tmpl->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
+
+ spin_lock(&pcpu->lock);
+ /* Overload tuple linked list to put us in template list. */
+ hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->tmpl);
+ spin_unlock_bh(&pcpu->lock);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
- unsigned int hash, repl_hash;
+ unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
+ struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
u16 zone;
+ unsigned int sequence;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
@@ -435,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
zone = nf_ct_zone(ct);
- /* reuse the hash saved before */
- hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
- hash = hash_bucket(hash, net);
- repl_hash = hash_conntrack(net, zone,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ local_bh_disable();
+
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ /* reuse the hash saved before */
+ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+ hash = hash_bucket(hash, net);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
- connections for unconfirmed conns. But packet copies and
- REJECT will give spurious warnings here. */
+ * connections for unconfirmed conns. But packet copies and
+ * REJECT will give spurious warnings here.
+ */
/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
- /* No external references means noone else could have
- confirmed us. */
+ /* No external references means no one else could have
+ * confirmed us.
+ */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
pr_debug("Confirming conntrack %p\n", ct);
-
- spin_lock_bh(&nf_conntrack_lock);
-
/* We have to check the DYING flag inside the lock to prevent
a race against nf_ct_get_next_corpse() possibly called from
user context, else we insert an already 'dead' hash, blocking
further use of that particular connection -JM */
if (unlikely(nf_ct_is_dying(ct))) {
- spin_unlock_bh(&nf_conntrack_lock);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ local_bh_enable();
return NF_ACCEPT;
}
@@ -471,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb)
&h->tuple) &&
zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
goto out;
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple) &&
zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
goto out;
- /* Remove from unconfirmed list */
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ nf_ct_del_from_dying_or_unconfirmed_list(ct);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
@@ -486,16 +673,25 @@ __nf_conntrack_confirm(struct sk_buff *skb)
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ ct->status |= IPS_CONFIRMED;
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp(skb);
+
+ tstamp->start = ktime_to_ns(skb->tstamp);
+ }
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
* stores are visible.
*/
- __nf_conntrack_hash_insert(ct, hash, repl_hash);
+ __nf_conntrack_hash_insert(ct, hash, reply_hash);
+ nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
- spin_unlock_bh(&nf_conntrack_lock);
+ local_bh_enable();
help = nfct_help(ct);
if (help && help->helper)
@@ -506,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
out:
+ nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
- spin_unlock_bh(&nf_conntrack_lock);
+ local_bh_enable();
return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -550,52 +747,77 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
/* There's a small race here where we may free a just-assured
connection. Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
{
/* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct = NULL, *tmp;
struct hlist_nulls_node *n;
- unsigned int i, cnt = 0;
+ unsigned int i = 0, cnt = 0;
int dropped = 0;
+ unsigned int hash, sequence;
+ spinlock_t *lockp;
- rcu_read_lock();
- for (i = 0; i < net->ct.htable_size; i++) {
+ local_bh_disable();
+restart:
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_bucket(_hash, net);
+ for (; i < net->ct.htable_size; i++) {
+ lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+ spin_lock(lockp);
+ if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ spin_unlock(lockp);
+ goto restart;
+ }
hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+ if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+ !nf_ct_is_dying(tmp) &&
+ atomic_inc_not_zero(&tmp->ct_general.use)) {
ct = tmp;
+ break;
+ }
cnt++;
}
- if (ct != NULL) {
- if (likely(!nf_ct_is_dying(ct) &&
- atomic_inc_not_zero(&ct->ct_general.use)))
- break;
- else
- ct = NULL;
- }
+ hash = (hash + 1) % net->ct.htable_size;
+ spin_unlock(lockp);
- if (cnt >= NF_CT_EVICTION_RANGE)
+ if (ct || cnt >= NF_CT_EVICTION_RANGE)
break;
- hash = (hash + 1) % net->ct.htable_size;
}
- rcu_read_unlock();
+ local_bh_enable();
if (!ct)
return dropped;
if (del_timer(&ct->timeout)) {
- death_by_timeout((unsigned long)ct);
- dropped = 1;
- NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ if (nf_ct_delete(ct, 0, 0)) {
+ dropped = 1;
+ NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ }
}
nf_ct_put(ct);
return dropped;
}
+void init_nf_conntrack_hash_rnd(void)
+{
+ unsigned int rand;
+
+ /*
+ * Why not initialize nf_conntrack_rnd in a "init()" function ?
+ * Because there isn't enough entropy when system initializing,
+ * and we initialize it as late as possible.
+ */
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+}
+
static struct nf_conn *
__nf_conntrack_alloc(struct net *net, u16 zone,
const struct nf_conntrack_tuple *orig,
@@ -605,18 +827,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
struct nf_conn *ct;
if (unlikely(!nf_conntrack_hash_rnd)) {
- unsigned int rand;
-
- /*
- * Why not initialize nf_conntrack_rnd in a "init()" function ?
- * Because there isn't enough entropy when system initializing,
- * and we initialize it as late as possible.
- */
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
-
+ init_nf_conntrack_hash_rnd();
/* recompute the hash as nf_conntrack_hash_rnd is initialized */
hash = hash_conntrack_raw(orig, zone);
}
@@ -626,12 +837,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
- if (!early_drop(net, hash_bucket(hash, net))) {
+ if (!early_drop(net, hash)) {
atomic_dec(&net->ct.count);
- if (net_ratelimit())
- printk(KERN_WARNING
- "nf_conntrack: table full, dropping"
- " packet.\n");
+ net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
return ERR_PTR(-ENOMEM);
}
}
@@ -642,7 +850,6 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
*/
ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
if (ct == NULL) {
- pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
}
@@ -651,7 +858,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
* and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
*/
memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
- sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+ offsetof(struct nf_conn, proto) -
+ offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -671,15 +879,15 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
nf_ct_zone->id = zone;
}
#endif
- /*
- * changes to lookup keys must be done before setting refcnt to 1
+ /* Because we use RCU lookups, we set ct_general.use to zero before
+ * this is inserted in any list.
*/
- smp_wmb();
- atomic_set(&ct->ct_general.use, 1);
+ atomic_set(&ct->ct_general.use, 0);
return ct;
#ifdef CONFIG_NF_CONNTRACK_ZONES
out_free:
+ atomic_dec(&net->ct.count);
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
return ERR_PTR(-ENOMEM);
#endif
@@ -698,13 +906,20 @@ void nf_conntrack_free(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ /* A freed object has refcnt == 0, that's
+ * the golden rule for SLAB_DESTROY_BY_RCU
+ */
+ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
+
nf_ct_ext_destroy(ct);
- atomic_dec(&net->ct.count);
nf_ct_ext_free(ct);
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ smp_mb__before_atomic();
+ atomic_dec(&net->ct.count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
+
/* Allocate a new conntrack: we return -ENOMEM if classification
failed due to stress. Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
@@ -719,8 +934,10 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_ecache *ecache;
- struct nf_conntrack_expect *exp;
+ struct nf_conntrack_expect *exp = NULL;
u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ struct nf_conn_timeout *timeout_ext;
+ unsigned int *timeouts;
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
pr_debug("Can't invert tuple.\n");
@@ -729,56 +946,76 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
- if (IS_ERR(ct)) {
- pr_debug("Can't allocate conntrack.\n");
+ if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
+
+ if (tmpl && nfct_synproxy(tmpl)) {
+ nfct_seqadj_ext_add(ct);
+ nfct_synproxy_ext_add(ct);
}
- if (!l4proto->new(ct, skb, dataoff)) {
+ timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
+ if (timeout_ext)
+ timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
+ else
+ timeouts = l4proto->get_timeouts(net);
+
+ if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
pr_debug("init conntrack: can't track with proto module\n");
return NULL;
}
+ if (timeout_ext)
+ nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);
+
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+ nf_ct_labels_ext_add(ct);
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
ecache ? ecache->expmask : 0,
GFP_ATOMIC);
- spin_lock_bh(&nf_conntrack_lock);
- exp = nf_ct_find_expectation(net, zone, tuple);
- if (exp) {
- pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
- ct, exp);
- /* Welcome, Mr. Bond. We've been expecting you... */
- __set_bit(IPS_EXPECTED_BIT, &ct->status);
- ct->master = exp->master;
- if (exp->helper) {
- help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
- if (help)
- rcu_assign_pointer(help->helper, exp->helper);
- }
+ local_bh_disable();
+ if (net->ct.expect_count) {
+ spin_lock(&nf_conntrack_expect_lock);
+ exp = nf_ct_find_expectation(net, zone, tuple);
+ if (exp) {
+ pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ ct, exp);
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ __set_bit(IPS_EXPECTED_BIT, &ct->status);
+ /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
+ ct->master = exp->master;
+ if (exp->helper) {
+ help = nf_ct_helper_ext_add(ct, exp->helper,
+ GFP_ATOMIC);
+ if (help)
+ rcu_assign_pointer(help->helper, exp->helper);
+ }
#ifdef CONFIG_NF_CONNTRACK_MARK
- ct->mark = exp->master->mark;
+ ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- ct->secmark = exp->master->secmark;
+ ct->secmark = exp->master->secmark;
#endif
- nf_conntrack_get(&ct->master->ct_general);
- NF_CT_STAT_INC(net, expect_new);
- } else {
+ NF_CT_STAT_INC(net, expect_new);
+ }
+ spin_unlock(&nf_conntrack_expect_lock);
+ }
+ if (!exp) {
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
NF_CT_STAT_INC(net, new);
}
- /* Overload tuple linked list to put us in unconfirmed list. */
- hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &net->ct.unconfirmed);
+ /* Now it is inserted into the unconfirmed list, bump refcount */
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_add_to_unconfirmed_list(ct);
- spin_unlock_bh(&nf_conntrack_lock);
+ local_bh_enable();
if (exp) {
if (exp->expectfn)
@@ -829,7 +1066,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
/* It exists; we have (non-exclusive) reference. */
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
- *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+ *ctinfo = IP_CT_ESTABLISHED_REPLY;
/* Please set reply bit if this packet OK */
*set_reply = 1;
} else {
@@ -860,6 +1097,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
enum ip_conntrack_info ctinfo;
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
+ unsigned int *timeouts;
unsigned int dataoff;
u_int8_t protonum;
int set_reply = 0;
@@ -880,7 +1118,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
&dataoff, &protonum);
if (ret <= 0) {
- pr_debug("not prepared to track yet or error occured\n");
+ pr_debug("not prepared to track yet or error occurred\n");
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
ret = -ret;
@@ -901,6 +1139,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
ret = -ret;
goto out;
}
+ /* ICMP[v6] protocol trackers may assign one conntrack. */
+ if (skb->nfct)
+ goto out;
}
ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
@@ -921,7 +1162,10 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
NF_CT_ASSERT(skb->nfct);
- ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
+ /* Decide what timeout policy we want to apply to this flow. */
+ timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
+
+ ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
@@ -938,8 +1182,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
- if (tmpl)
- nf_ct_put(tmpl);
+ if (tmpl) {
+ /* Special case: we have to repeat this hook, assign the
+ * template again to this packet. We assume that this packet
+ * has no conntrack assigned. This is used by nf_ct_tcp. */
+ if (ret == NF_REPEAT)
+ skb->nfct = (struct nf_conntrack *)tmpl;
+ else
+ nf_ct_put(tmpl);
+ }
return ret;
}
@@ -1012,14 +1263,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
acct:
if (do_acct) {
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
- spin_lock_bh(&ct->lock);
- acct[CTINFO2DIR(ctinfo)].packets++;
- acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
- spin_unlock_bh(&ct->lock);
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
}
@@ -1031,15 +1282,15 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
int do_acct)
{
if (do_acct) {
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
- spin_lock_bh(&ct->lock);
- acct[CTINFO2DIR(ctinfo)].packets++;
- acct[CTINFO2DIR(ctinfo)].bytes +=
- skb->len - skb_network_offset(skb);
- spin_unlock_bh(&ct->lock);
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len - skb_network_offset(skb),
+ &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
@@ -1059,7 +1310,7 @@ static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
};
#endif
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -1071,8 +1322,9 @@ static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
- NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
+ if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
+ nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -1107,7 +1359,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif
/* Used by ipt_REJECT and ip6t_REJECT. */
-static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -1115,7 +1367,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
/* This ICMP is in reverse direction to the packet which caused it */
ct = nf_ct_get(skb, &ctinfo);
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
- ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+ ctinfo = IP_CT_RELATED_REPLY;
else
ctinfo = IP_CT_RELATED;
@@ -1133,31 +1385,48 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct hlist_nulls_node *n;
+ int cpu;
+ spinlock_t *lockp;
- spin_lock_bh(&nf_conntrack_lock);
for (; *bucket < net->ct.htable_size; (*bucket)++) {
- hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+ local_bh_disable();
+ spin_lock(lockp);
+ if (*bucket < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ goto found;
+ }
+ }
+ spin_unlock(lockp);
+ local_bh_enable();
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (iter(ct, data))
- goto found;
+ set_bit(IPS_DYING_BIT, &ct->status);
}
+ spin_unlock_bh(&pcpu->lock);
}
- hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
- set_bit(IPS_DYING_BIT, &ct->status);
- }
- spin_unlock_bh(&nf_conntrack_lock);
return NULL;
found:
atomic_inc(&ct->ct_general.use);
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock(lockp);
+ local_bh_enable();
return ct;
}
void nf_ct_iterate_cleanup(struct net *net,
int (*iter)(struct nf_conn *i, void *data),
- void *data)
+ void *data, u32 portid, int report)
{
struct nf_conn *ct;
unsigned int bucket = 0;
@@ -1165,7 +1434,8 @@ void nf_ct_iterate_cleanup(struct net *net,
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
- death_by_timeout((unsigned long)ct);
+ nf_ct_delete(ct, portid, report);
+
/* ... else the timer will get him soon. */
nf_ct_put(ct);
@@ -1173,33 +1443,14 @@ void nf_ct_iterate_cleanup(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
-struct __nf_ct_flush_report {
- u32 pid;
- int report;
-};
-
-static int kill_report(struct nf_conn *i, void *data)
-{
- struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
-
- /* If we fail to deliver the event, death_by_timeout() will retry */
- if (nf_conntrack_event_report(IPCT_DESTROY, i,
- fr->pid, fr->report) < 0)
- return 1;
-
- /* Avoid the delivery of the destroy event in death_by_timeout(). */
- set_bit(IPS_DYING_BIT, &i->status);
- return 1;
-}
-
static int kill_all(struct nf_conn *i, void *data)
{
return 1;
}
-void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
+void nf_ct_free_hashtable(void *hash, unsigned int size)
{
- if (vmalloced)
+ if (is_vmalloc_addr(hash))
vfree(hash);
else
free_pages((unsigned long)hash,
@@ -1207,13 +1458,9 @@ void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
+void nf_conntrack_flush_report(struct net *net, u32 portid, int report)
{
- struct __nf_ct_flush_report fr = {
- .pid = pid,
- .report = report,
- };
- nf_ct_iterate_cleanup(net, kill_report, &fr);
+ nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
@@ -1222,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net)
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct hlist_nulls_node *n;
+ int cpu;
- spin_lock_bh(&nf_conntrack_lock);
- hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- /* never fails to remove them, no listeners at this point */
- nf_ct_kill(ct);
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* never fails to remove them, no listeners at this point */
+ nf_ct_kill(ct);
+ }
+ spin_unlock_bh(&pcpu->lock);
}
- spin_unlock_bh(&nf_conntrack_lock);
}
static int untrack_refs(void)
@@ -1244,76 +1496,96 @@ static int untrack_refs(void)
return cnt;
}
-static void nf_conntrack_cleanup_init_net(void)
+void nf_conntrack_cleanup_start(void)
{
+ RCU_INIT_POINTER(ip_ct_attach, NULL);
+}
+
+void nf_conntrack_cleanup_end(void)
+{
+ RCU_INIT_POINTER(nf_ct_destroy, NULL);
while (untrack_refs() > 0)
schedule();
- nf_conntrack_helper_fini();
- nf_conntrack_proto_fini();
#ifdef CONFIG_NF_CONNTRACK_ZONES
nf_ct_extend_unregister(&nf_ct_zone_extend);
#endif
+ nf_conntrack_proto_fini();
+ nf_conntrack_seqadj_fini();
+ nf_conntrack_labels_fini();
+ nf_conntrack_helper_fini();
+ nf_conntrack_timeout_fini();
+ nf_conntrack_ecache_fini();
+ nf_conntrack_tstamp_fini();
+ nf_conntrack_acct_fini();
+ nf_conntrack_expect_fini();
}
-static void nf_conntrack_cleanup_net(struct net *net)
+/*
+ * Mishearing the voices in his head, our hero wonders how he's
+ * supposed to kill the mall.
+ */
+void nf_conntrack_cleanup_net(struct net *net)
{
- i_see_dead_people:
- nf_ct_iterate_cleanup(net, kill_all, NULL);
- nf_ct_release_dying_list(net);
- if (atomic_read(&net->ct.count) != 0) {
- schedule();
- goto i_see_dead_people;
- }
+ LIST_HEAD(single);
- nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
- net->ct.htable_size);
- nf_conntrack_ecache_fini(net);
- nf_conntrack_acct_fini(net);
- nf_conntrack_expect_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
- free_percpu(net->ct.stat);
+ list_add(&net->exit_list, &single);
+ nf_conntrack_cleanup_net_list(&single);
}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void nf_conntrack_cleanup(struct net *net)
+void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
- if (net_eq(net, &init_net))
- rcu_assign_pointer(ip_ct_attach, NULL);
+ int busy;
+ struct net *net;
- /* This makes sure all current packets have passed through
- netfilter framework. Roll on, two-stage module
- delete... */
+ /*
+ * This makes sure all current packets have passed through
+ * netfilter framework. Roll on, two-stage module
+ * delete...
+ */
synchronize_net();
+i_see_dead_people:
+ busy = 0;
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
+ nf_ct_release_dying_list(net);
+ if (atomic_read(&net->ct.count) != 0)
+ busy = 1;
+ }
+ if (busy) {
+ schedule();
+ goto i_see_dead_people;
+ }
- nf_conntrack_cleanup_net(net);
-
- if (net_eq(net, &init_net)) {
- rcu_assign_pointer(nf_ct_destroy, NULL);
- nf_conntrack_cleanup_init_net();
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+ nf_conntrack_proto_pernet_fini(net);
+ nf_conntrack_helper_pernet_fini(net);
+ nf_conntrack_ecache_pernet_fini(net);
+ nf_conntrack_tstamp_pernet_fini(net);
+ nf_conntrack_acct_pernet_fini(net);
+ nf_conntrack_expect_pernet_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
+ free_percpu(net->ct.stat);
+ free_percpu(net->ct.pcpu_lists);
}
}
-void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
+void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
size_t sz;
- *vmalloced = 0;
-
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
sz = nr_slots * sizeof(struct hlist_nulls_head);
hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
if (!hash) {
- *vmalloced = 1;
printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
- hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ hash = vzalloc(sz);
}
if (hash && nulls)
@@ -1326,7 +1598,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
- int i, bucket, vmalloced, old_vmalloced;
+ int i, bucket, rc;
unsigned int hashsize, old_size;
struct hlist_nulls_head *hash, *old_hash;
struct nf_conntrack_tuple_hash *h;
@@ -1339,20 +1611,26 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
if (!nf_conntrack_htable_size)
return param_set_uint(val, kp);
- hashsize = simple_strtoul(val, NULL, 0);
+ rc = kstrtouint(val, 0, &hashsize);
+ if (rc)
+ return rc;
if (!hashsize)
return -EINVAL;
- hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
+ hash = nf_ct_alloc_hashtable(&hashsize, 1);
if (!hash)
return -ENOMEM;
+ local_bh_disable();
+ nf_conntrack_all_lock();
+ write_seqcount_begin(&init_net.ct.generation);
+
/* Lookups in the old hash might happen in parallel, which means we
* might get false negatives during connection lookup. New connections
* created because of a false negative won't make it into the hash
- * though since that required taking the lock.
+ * though since that required taking the locks.
*/
- spin_lock_bh(&nf_conntrack_lock);
+
for (i = 0; i < init_net.ct.htable_size; i++) {
while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1365,15 +1643,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
}
}
old_size = init_net.ct.htable_size;
- old_vmalloced = init_net.ct.hash_vmalloc;
old_hash = init_net.ct.hash;
init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash_vmalloc = vmalloced;
init_net.ct.hash = hash;
- spin_unlock_bh(&nf_conntrack_lock);
- nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
+ write_seqcount_end(&init_net.ct.generation);
+ nf_conntrack_all_unlock();
+ local_bh_enable();
+
+ nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
@@ -1390,10 +1669,13 @@ void nf_ct_untracked_status_or(unsigned long bits)
}
EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
-static int nf_conntrack_init_init_net(void)
+int nf_conntrack_init_start(void)
{
int max_factor = 8;
- int ret, cpu;
+ int i, ret, cpu;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_init(&nf_conntrack_locks[i]);
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1418,19 +1700,47 @@ static int nf_conntrack_init_init_net(void)
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
- ret = nf_conntrack_proto_init();
+ ret = nf_conntrack_expect_init();
if (ret < 0)
- goto err_proto;
+ goto err_expect;
+
+ ret = nf_conntrack_acct_init();
+ if (ret < 0)
+ goto err_acct;
+
+ ret = nf_conntrack_tstamp_init();
+ if (ret < 0)
+ goto err_tstamp;
+
+ ret = nf_conntrack_ecache_init();
+ if (ret < 0)
+ goto err_ecache;
+
+ ret = nf_conntrack_timeout_init();
+ if (ret < 0)
+ goto err_timeout;
ret = nf_conntrack_helper_init();
if (ret < 0)
goto err_helper;
+ ret = nf_conntrack_labels_init();
+ if (ret < 0)
+ goto err_labels;
+
+ ret = nf_conntrack_seqadj_init();
+ if (ret < 0)
+ goto err_seqadj;
+
#ifdef CONFIG_NF_CONNTRACK_ZONES
ret = nf_ct_extend_register(&nf_ct_zone_extend);
if (ret < 0)
goto err_extend;
#endif
+ ret = nf_conntrack_proto_init();
+ if (ret < 0)
+ goto err_proto;
+
/* Set up fake conntrack: to never be deleted, not in any hashes */
for_each_possible_cpu(cpu) {
struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
@@ -1441,118 +1751,127 @@ static int nf_conntrack_init_init_net(void)
nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
return 0;
+err_proto:
#ifdef CONFIG_NF_CONNTRACK_ZONES
+ nf_ct_extend_unregister(&nf_ct_zone_extend);
err_extend:
- nf_conntrack_helper_fini();
#endif
+ nf_conntrack_seqadj_fini();
+err_seqadj:
+ nf_conntrack_labels_fini();
+err_labels:
+ nf_conntrack_helper_fini();
err_helper:
- nf_conntrack_proto_fini();
-err_proto:
+ nf_conntrack_timeout_fini();
+err_timeout:
+ nf_conntrack_ecache_fini();
+err_ecache:
+ nf_conntrack_tstamp_fini();
+err_tstamp:
+ nf_conntrack_acct_fini();
+err_acct:
+ nf_conntrack_expect_fini();
+err_expect:
return ret;
}
+void nf_conntrack_init_end(void)
+{
+ /* For use by REJECT target */
+ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
+ RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+}
+
/*
* We need to use special "null" values, not used in hash table
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
#define DYING_NULLS_VAL ((1<<30)+1)
+#define TEMPLATE_NULLS_VAL ((1<<30)+2)
-static int nf_conntrack_init_net(struct net *net)
+int nf_conntrack_init_net(struct net *net)
{
- int ret;
+ int ret = -ENOMEM;
+ int cpu;
atomic_set(&net->ct.count, 0);
- INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
- net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
- if (!net->ct.stat) {
- ret = -ENOMEM;
+ seqcount_init(&net->ct.generation);
+
+ net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
+ if (!net->ct.pcpu_lists)
goto err_stat;
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_init(&pcpu->lock);
+ INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
}
+ net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+ if (!net->ct.stat)
+ goto err_pcpu_lists;
+
net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
- if (!net->ct.slabname) {
- ret = -ENOMEM;
+ if (!net->ct.slabname)
goto err_slabname;
- }
net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
sizeof(struct nf_conn), 0,
SLAB_DESTROY_BY_RCU, NULL);
if (!net->ct.nf_conntrack_cachep) {
printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- ret = -ENOMEM;
goto err_cache;
}
net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
- &net->ct.hash_vmalloc, 1);
+ net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
if (!net->ct.hash) {
- ret = -ENOMEM;
printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
goto err_hash;
}
- ret = nf_conntrack_expect_init(net);
+ ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_init(net);
+ ret = nf_conntrack_acct_pernet_init(net);
if (ret < 0)
goto err_acct;
- ret = nf_conntrack_ecache_init(net);
+ ret = nf_conntrack_tstamp_pernet_init(net);
+ if (ret < 0)
+ goto err_tstamp;
+ ret = nf_conntrack_ecache_pernet_init(net);
if (ret < 0)
goto err_ecache;
-
+ ret = nf_conntrack_helper_pernet_init(net);
+ if (ret < 0)
+ goto err_helper;
+ ret = nf_conntrack_proto_pernet_init(net);
+ if (ret < 0)
+ goto err_proto;
return 0;
+err_proto:
+ nf_conntrack_helper_pernet_fini(net);
+err_helper:
+ nf_conntrack_ecache_pernet_fini(net);
err_ecache:
- nf_conntrack_acct_fini(net);
+ nf_conntrack_tstamp_pernet_fini(net);
+err_tstamp:
+ nf_conntrack_acct_pernet_fini(net);
err_acct:
- nf_conntrack_expect_fini(net);
+ nf_conntrack_expect_pernet_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
- net->ct.htable_size);
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
err_hash:
kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
kfree(net->ct.slabname);
err_slabname:
free_percpu(net->ct.stat);
+err_pcpu_lists:
+ free_percpu(net->ct.pcpu_lists);
err_stat:
return ret;
}
-
-s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- u32 seq);
-EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
-
-int nf_conntrack_init(struct net *net)
-{
- int ret;
-
- if (net_eq(net, &init_net)) {
- ret = nf_conntrack_init_init_net();
- if (ret < 0)
- goto out_init_net;
- }
- ret = nf_conntrack_init_net(net);
- if (ret < 0)
- goto out_net;
-
- if (net_eq(net, &init_net)) {
- /* For use by REJECT target */
- rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
- rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
-
- /* Howto get NAT offsets */
- rcu_assign_pointer(nf_ct_nat_offset, NULL);
- }
- return 0;
-
-out_net:
- if (net_eq(net, &init_net))
- nf_conntrack_cleanup_init_net();
-out_init_net:
- return ret;
-}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5702de35e2b..1df17614656 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -1,8 +1,10 @@
/* Event cache for netfilter. */
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+/*
+ * (C) 2005 Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 Patrick McHardy <kaber@trash.net>
+ * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -19,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
+#include <linux/export.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -26,22 +29,19 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
-
-struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
-EXPORT_SYMBOL_GPL(nf_expect_event_cb);
-
/* deliver cached events and clear cache entry - must be called with locally
* disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
- unsigned long events;
+ struct net *net = nf_ct_net(ct);
+ unsigned long events, missed;
struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ int ret;
rcu_read_lock();
- notify = rcu_dereference(nf_conntrack_event_cb);
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
if (notify == NULL)
goto out_unlock;
@@ -51,49 +51,53 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
events = xchg(&e->cache, 0);
- if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) {
- struct nf_ct_event item = {
- .ct = ct,
- .pid = 0,
- .report = 0
- };
- int ret;
- /* We make a copy of the missed event cache without taking
- * the lock, thus we may send missed events twice. However,
- * this does not harm and it happens very rarely. */
- unsigned long missed = e->missed;
-
- ret = notify->fcn(events | missed, &item);
- if (unlikely(ret < 0 || missed)) {
- spin_lock_bh(&ct->lock);
- if (ret < 0)
- e->missed |= events;
- else
- e->missed &= ~missed;
- spin_unlock_bh(&ct->lock);
- }
- }
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events)
+ goto out_unlock;
+
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely. */
+ missed = e->missed;
+
+ if (!((events | missed) & e->ctmask))
+ goto out_unlock;
+
+ item.ct = ct;
+ item.portid = 0;
+ item.report = 0;
+
+ ret = notify->fcn(events | missed, &item);
+
+ if (likely(ret >= 0 && !missed))
+ goto out_unlock;
+
+ spin_lock_bh(&ct->lock);
+ if (ret < 0)
+ e->missed |= events;
+ else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
out_unlock:
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
-int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
+int nf_conntrack_register_notifier(struct net *net,
+ struct nf_ct_event_notifier *new)
{
- int ret = 0;
+ int ret;
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(nf_conntrack_event_cb,
+ notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
if (notify != NULL) {
ret = -EBUSY;
goto out_unlock;
}
- rcu_assign_pointer(nf_conntrack_event_cb, new);
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
+ rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
+ ret = 0;
out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
@@ -101,34 +105,35 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
-void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
+void nf_conntrack_unregister_notifier(struct net *net,
+ struct nf_ct_event_notifier *new)
{
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(nf_conntrack_event_cb,
+ notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
BUG_ON(notify != new);
- rcu_assign_pointer(nf_conntrack_event_cb, NULL);
+ RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
-int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
+int nf_ct_expect_register_notifier(struct net *net,
+ struct nf_exp_event_notifier *new)
{
- int ret = 0;
+ int ret;
struct nf_exp_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(nf_expect_event_cb,
+ notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
if (notify != NULL) {
ret = -EBUSY;
goto out_unlock;
}
- rcu_assign_pointer(nf_expect_event_cb, new);
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
+ rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
+ ret = 0;
out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
@@ -136,15 +141,16 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
-void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
+void nf_ct_expect_unregister_notifier(struct net *net,
+ struct nf_exp_event_notifier *new)
{
struct nf_exp_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(nf_expect_event_cb,
+ notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
BUG_ON(notify != new);
- rcu_assign_pointer(nf_expect_event_cb, NULL);
+ RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
@@ -192,9 +198,12 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
table[0].data = &net->ct.sysctl_events;
table[1].data = &net->ct.sysctl_events_retry_timeout;
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
net->ct.event_sysctl_header =
- register_net_sysctl_table(net,
- nf_net_netfilter_sysctl_path, table);
+ register_net_sysctl(net, "net/netfilter", table);
if (!net->ct.event_sysctl_header) {
printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
goto out_register;
@@ -226,38 +235,27 @@ static void nf_conntrack_event_fini_sysctl(struct net *net)
}
#endif /* CONFIG_SYSCTL */
-int nf_conntrack_ecache_init(struct net *net)
+int nf_conntrack_ecache_pernet_init(struct net *net)
{
- int ret;
-
net->ct.sysctl_events = nf_ct_events;
net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
+ return nf_conntrack_event_init_sysctl(net);
+}
- if (net_eq(net, &init_net)) {
- ret = nf_ct_extend_register(&event_extend);
- if (ret < 0) {
- printk(KERN_ERR "nf_ct_event: Unable to register "
- "event extension.\n");
- goto out_extend_register;
- }
- }
+void nf_conntrack_ecache_pernet_fini(struct net *net)
+{
+ nf_conntrack_event_fini_sysctl(net);
+}
- ret = nf_conntrack_event_init_sysctl(net);
+int nf_conntrack_ecache_init(void)
+{
+ int ret = nf_ct_extend_register(&event_extend);
if (ret < 0)
- goto out_sysctl;
-
- return 0;
-
-out_sysctl:
- if (net_eq(net, &init_net))
- nf_ct_extend_unregister(&event_extend);
-out_extend_register:
+ pr_err("nf_ct_event: Unable to register event extension.\n");
return ret;
}
-void nf_conntrack_ecache_fini(struct net *net)
+void nf_conntrack_ecache_fini(void)
{
- nf_conntrack_event_fini_sysctl(net);
- if (net_eq(net, &init_net))
- nf_ct_extend_unregister(&event_extend);
+ nf_ct_extend_unregister(&event_extend);
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 46e8966912b..f87e8f68ad4 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (c) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,6 +21,8 @@
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
@@ -32,31 +35,27 @@
unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
-static unsigned int nf_ct_expect_hash_rnd __read_mostly;
unsigned int nf_ct_expect_max __read_mostly;
-static int nf_ct_expect_hash_rnd_initted __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
-static HLIST_HEAD(nf_ct_userspace_expect_list);
-
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
- u32 pid, int report)
+ u32 portid, int report)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
+ NF_CT_ASSERT(master_help);
NF_CT_ASSERT(!timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
net->ct.expect_count--;
hlist_del(&exp->lnode);
- if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
- master_help->expecting[exp->class]--;
+ master_help->expecting[exp->class]--;
- nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
nf_ct_expect_put(exp);
NF_CT_STAT_INC(net, expect_delete);
@@ -67,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
struct nf_conntrack_expect *exp = (void *)ul_expect;
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
nf_ct_unlink_expect(exp);
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
nf_ct_expect_put(exp);
}
@@ -77,15 +76,13 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple
{
unsigned int hash;
- if (unlikely(!nf_ct_expect_hash_rnd_initted)) {
- get_random_bytes(&nf_ct_expect_hash_rnd,
- sizeof(nf_ct_expect_hash_rnd));
- nf_ct_expect_hash_rnd_initted = 1;
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ init_nf_conntrack_hash_rnd();
}
hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
(((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd);
+ (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
return ((u64)hash * nf_ct_expect_hsize) >> 32;
}
@@ -94,14 +91,13 @@ __nf_ct_expect_find(struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i;
- struct hlist_node *n;
unsigned int h;
if (!net->ct.expect_count)
return NULL;
h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) {
+ hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
nf_ct_zone(i->master) == zone)
return i;
@@ -134,14 +130,13 @@ nf_ct_find_expectation(struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i, *exp = NULL;
- struct hlist_node *n;
unsigned int h;
if (!net->ct.expect_count)
return NULL;
h = nf_ct_expect_dst_hash(tuple);
- hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
+ hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
nf_ct_zone(i->master) == zone) {
@@ -160,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone,
if (!nf_ct_is_confirmed(exp->master))
return NULL;
+ /* Avoid race with other CPUs, that for exp->master ct, is
+ * about to invoke ->destroy(), or nf_ct_delete() via timeout
+ * or early_drop().
+ *
+ * The atomic_inc_not_zero() check tells: If that fails, we
+ * know that the ct is being destroyed. If it succeeds, we
+ * can be sure the ct cannot disappear underneath.
+ */
+ if (unlikely(nf_ct_is_dying(exp->master) ||
+ !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ return NULL;
+
if (exp->flags & NF_CT_EXPECT_PERMANENT) {
atomic_inc(&exp->use);
return exp;
@@ -167,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone,
nf_ct_unlink_expect(exp);
return exp;
}
+ /* Undo exp->master refcnt increase, if del_timer() failed */
+ nf_ct_put(exp->master);
return NULL;
}
@@ -176,18 +185,20 @@ void nf_ct_remove_expectations(struct nf_conn *ct)
{
struct nf_conn_help *help = nfct_help(ct);
struct nf_conntrack_expect *exp;
- struct hlist_node *n, *next;
+ struct hlist_node *next;
/* Optimization: most connection never expect any others. */
if (!help)
return;
- hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
}
}
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
@@ -222,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
/* Generally a bad idea to call this: could have matched already. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
@@ -298,6 +309,11 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
sizeof(exp->tuple.dst.u3) - len);
exp->tuple.dst.u.all = *dst;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+ memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
+ memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
+#endif
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
@@ -316,34 +332,34 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);
-static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
+ struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
- const struct nf_conntrack_expect_policy *p;
unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
- atomic_inc(&exp->use);
+ /* two references : one for hash insert, one for the timer */
+ atomic_add(2, &exp->use);
- if (master_help) {
- hlist_add_head(&exp->lnode, &master_help->expectations);
- master_help->expecting[exp->class]++;
- } else if (exp->flags & NF_CT_EXPECT_USERSPACE)
- hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
+ hlist_add_head(&exp->lnode, &master_help->expectations);
+ master_help->expecting[exp->class]++;
hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
- if (master_help) {
- p = &master_help->helper->expect_policy[exp->class];
- exp->timeout.expires = jiffies + p->timeout * HZ;
+ helper = rcu_dereference_protected(master_help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
+ if (helper) {
+ exp->timeout.expires = jiffies +
+ helper->expect_policy[exp->class].timeout * HZ;
}
add_timer(&exp->timeout);
- atomic_inc(&exp->use);
NF_CT_STAT_INC(net, expect_create);
+ return 0;
}
/* Race with expectations being used means we could have none to find; OK. */
@@ -352,9 +368,8 @@ static void evict_oldest_expect(struct nf_conn *master,
{
struct nf_conn_help *master_help = nfct_help(master);
struct nf_conntrack_expect *exp, *last = NULL;
- struct hlist_node *n;
- hlist_for_each_entry(exp, n, &master_help->expectations, lnode) {
+ hlist_for_each_entry(exp, &master_help->expectations, lnode) {
if (exp->class == new->class)
last = exp;
}
@@ -365,44 +380,29 @@ static void evict_oldest_expect(struct nf_conn *master,
}
}
-static inline int refresh_timer(struct nf_conntrack_expect *i)
-{
- struct nf_conn_help *master_help = nfct_help(i->master);
- const struct nf_conntrack_expect_policy *p;
-
- if (!del_timer(&i->timeout))
- return 0;
-
- p = &master_help->helper->expect_policy[i->class];
- i->timeout.expires = jiffies + p->timeout * HZ;
- add_timer(&i->timeout);
- return 1;
-}
-
static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
{
const struct nf_conntrack_expect_policy *p;
struct nf_conntrack_expect *i;
struct nf_conn *master = expect->master;
struct nf_conn_help *master_help = nfct_help(master);
+ struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(expect);
- struct hlist_node *n;
+ struct hlist_node *next;
unsigned int h;
int ret = 1;
- /* Don't allow expectations created from kernel-space with no helper */
- if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
- (!master_help || (master_help && !master_help->helper))) {
+ if (!master_help) {
ret = -ESHUTDOWN;
goto out;
}
h = nf_ct_expect_dst_hash(&expect->tuple);
- hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
+ hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
- /* Refresh timer: if it's dying, ignore.. */
- if (refresh_timer(i)) {
- ret = 0;
- goto out;
+ if (del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ nf_ct_expect_put(i);
+ break;
}
} else if (expect_clash(i, expect)) {
ret = -EBUSY;
@@ -410,8 +410,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
}
/* Will be over limit? */
- if (master_help) {
- p = &master_help->helper->expect_policy[expect->class];
+ helper = rcu_dereference_protected(master_help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
+ if (helper) {
+ p = &helper->expect_policy[expect->class];
if (p->max_expected &&
master_help->expecting[expect->class] >= p->max_expected) {
evict_oldest_expect(master, expect);
@@ -424,52 +426,36 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
if (net->ct.expect_count >= nf_ct_expect_max) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "nf_conntrack: expectation table full\n");
+ net_warn_ratelimited("nf_conntrack: expectation table full\n");
ret = -EMFILE;
}
out:
return ret;
}
-int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
- u32 pid, int report)
+int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+ u32 portid, int report)
{
int ret;
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
ret = __nf_ct_expect_check(expect);
if (ret <= 0)
goto out;
- ret = 0;
- nf_ct_expect_insert(expect);
- spin_unlock_bh(&nf_conntrack_lock);
- nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
+ ret = nf_ct_expect_insert(expect);
+ if (ret < 0)
+ goto out;
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
return ret;
out:
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
-void nf_ct_remove_userspace_expectations(void)
-{
- struct nf_conntrack_expect *exp;
- struct hlist_node *n, *next;
-
- hlist_for_each_entry_safe(exp, n, next,
- &nf_ct_userspace_expect_list, lnode) {
- if (del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
- nf_ct_expect_put(exp);
- }
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
-
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
struct ct_expect_iter_state {
struct seq_net_private p;
unsigned int bucket;
@@ -482,7 +468,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -495,11 +481,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
@@ -597,79 +583,74 @@ static const struct file_operations exp_file_ops = {
.llseek = seq_lseek,
.release = seq_release_net,
};
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
static int exp_proc_init(struct net *net)
{
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
struct proc_dir_entry *proc;
- proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops);
+ proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
+ &exp_file_ops);
if (!proc)
return -ENOMEM;
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
return 0;
}
static void exp_proc_remove(struct net *net)
{
-#ifdef CONFIG_PROC_FS
- proc_net_remove(net, "nf_conntrack_expect");
-#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+ remove_proc_entry("nf_conntrack_expect", net->proc_net);
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
}
module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
-int nf_conntrack_expect_init(struct net *net)
+int nf_conntrack_expect_pernet_init(struct net *net)
{
int err = -ENOMEM;
- if (net_eq(net, &init_net)) {
- if (!nf_ct_expect_hsize) {
- nf_ct_expect_hsize = net->ct.htable_size / 256;
- if (!nf_ct_expect_hsize)
- nf_ct_expect_hsize = 1;
- }
- nf_ct_expect_max = nf_ct_expect_hsize * 4;
- }
-
net->ct.expect_count = 0;
- net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
- &net->ct.expect_vmalloc, 0);
+ net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
if (net->ct.expect_hash == NULL)
goto err1;
- if (net_eq(net, &init_net)) {
- nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
- sizeof(struct nf_conntrack_expect),
- 0, 0, NULL);
- if (!nf_ct_expect_cachep)
- goto err2;
- }
-
err = exp_proc_init(net);
if (err < 0)
- goto err3;
+ goto err2;
return 0;
-
-err3:
- if (net_eq(net, &init_net))
- kmem_cache_destroy(nf_ct_expect_cachep);
err2:
- nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
- nf_ct_expect_hsize);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
return err;
}
-void nf_conntrack_expect_fini(struct net *net)
+void nf_conntrack_expect_pernet_fini(struct net *net)
{
exp_proc_remove(net);
- if (net_eq(net, &init_net)) {
- rcu_barrier(); /* Wait for call_rcu() before destroy */
- kmem_cache_destroy(nf_ct_expect_cachep);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+}
+
+int nf_conntrack_expect_init(void)
+{
+ if (!nf_ct_expect_hsize) {
+ nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
+ if (!nf_ct_expect_hsize)
+ nf_ct_expect_hsize = 1;
}
- nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
- nf_ct_expect_hsize);
+ nf_ct_expect_max = nf_ct_expect_hsize * 4;
+ nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+ sizeof(struct nf_conntrack_expect),
+ 0, 0, NULL);
+ if (!nf_ct_expect_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void nf_conntrack_expect_fini(void)
+{
+ rcu_barrier(); /* Wait for call_rcu() before destroy */
+ kmem_cache_destroy(nf_ct_expect_cachep);
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bd82450c193..1a9545965c0 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -44,7 +44,8 @@ void __nf_ct_ext_destroy(struct nf_conn *ct)
EXPORT_SYMBOL(__nf_ct_ext_destroy);
static void *
-nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
+nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
+ size_t var_alloc_len, gfp_t gfp)
{
unsigned int off, len;
struct nf_ct_ext_type *t;
@@ -54,8 +55,8 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
t = rcu_dereference(nf_ct_ext_types[id]);
BUG_ON(t == NULL);
off = ALIGN(sizeof(struct nf_ct_ext), t->align);
- len = off + t->len;
- alloc_size = t->alloc_size;
+ len = off + t->len + var_alloc_len;
+ alloc_size = t->alloc_size + var_alloc_len;
rcu_read_unlock();
*ext = kzalloc(alloc_size, gfp);
@@ -68,13 +69,8 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
return (void *)(*ext) + off;
}
-static void __nf_ct_ext_free_rcu(struct rcu_head *head)
-{
- struct nf_ct_ext *ext = container_of(head, struct nf_ct_ext, rcu);
- kfree(ext);
-}
-
-void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
+void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
+ size_t var_alloc_len, gfp_t gfp)
{
struct nf_ct_ext *old, *new;
int i, newlen, newoff;
@@ -85,7 +81,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
old = ct->ext;
if (!old)
- return nf_ct_ext_create(&ct->ext, id, gfp);
+ return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp);
if (__nf_ct_ext_exist(old, id))
return NULL;
@@ -95,7 +91,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
BUG_ON(t == NULL);
newoff = ALIGN(old->len, t->align);
- newlen = newoff + t->len;
+ newlen = newoff + t->len + var_alloc_len;
rcu_read_unlock();
new = __krealloc(old, newlen, gfp);
@@ -114,7 +110,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
(void *)old + old->offset[i]);
rcu_read_unlock();
}
- call_rcu(&old->rcu, __nf_ct_ext_free_rcu);
+ kfree_rcu(old, rcu);
ct->ext = new;
}
@@ -123,7 +119,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
memset((void *)new + newoff, 0, newlen - newoff);
return (void *)new + newoff;
}
-EXPORT_SYMBOL(__nf_ct_ext_add);
+EXPORT_SYMBOL(__nf_ct_ext_add_length);
static void update_alloc_size(struct nf_ct_ext_type *type)
{
@@ -140,15 +136,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type)
/* This assumes that extended areas in conntrack for the types
whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
for (i = min; i <= max; i++) {
- t1 = nf_ct_ext_types[i];
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (!t1)
continue;
- t1->alloc_size = sizeof(struct nf_ct_ext)
- + ALIGN(sizeof(struct nf_ct_ext), t1->align)
- + t1->len;
+ t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+ t1->len;
for (j = 0; j < NF_CT_EXT_NUM; j++) {
- t2 = nf_ct_ext_types[j];
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (t2 == NULL || t2 == t1 ||
(t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
continue;
@@ -186,7 +183,7 @@ EXPORT_SYMBOL_GPL(nf_ct_extend_register);
void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
{
mutex_lock(&nf_ct_ext_type_mutex);
- rcu_assign_pointer(nf_ct_ext_types[type->id], NULL);
+ RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
update_alloc_size(type);
mutex_unlock(&nf_ct_ext_type_mutex);
rcu_barrier(); /* Wait for completion of call_rcu()'s */
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e17cb7c7dd8..b8a0924064e 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -42,21 +43,26 @@ static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c;
module_param_array(ports, ushort, &ports_c, 0400);
-static int loose;
+static bool loose;
module_param(loose, bool, 0600);
unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
enum nf_ct_ftp_type type,
+ unsigned int protoff,
unsigned int matchoff,
unsigned int matchlen,
struct nf_conntrack_expect *exp);
EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
-static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
-static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_rfc959(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_eprt(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
- char);
+ char, unsigned int *);
static struct ftp_search {
const char *pattern;
@@ -64,7 +70,7 @@ static struct ftp_search {
char skip;
char term;
enum nf_ct_ftp_type ftptype;
- int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
+ int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *);
} search[IP_CT_DIR_MAX][2] = {
[IP_CT_DIR_ORIGINAL] = {
{
@@ -88,10 +94,8 @@ static struct ftp_search {
{
.pattern = "227 ",
.plen = sizeof("227 ") - 1,
- .skip = '(',
- .term = ')',
.ftptype = NF_CT_FTP_PASV,
- .getnum = try_rfc959,
+ .getnum = try_rfc1123,
},
{
.pattern = "229 ",
@@ -130,8 +134,9 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],
i++;
else {
/* Unexpected character; true if it's the
- terminator and we're finished. */
- if (*data == term && i == array_size - 1)
+ terminator (or we don't care about one)
+ and we're finished. */
+ if ((*data == term || !term) && i == array_size - 1)
return len;
pr_debug("Char %u (got %u nums) `%u' unexpected\n",
@@ -146,7 +151,8 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],
/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
static int try_rfc959(const char *data, size_t dlen,
- struct nf_conntrack_man *cmd, char term)
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
{
int length;
u_int32_t array[6];
@@ -161,6 +167,33 @@ static int try_rfc959(const char *data, size_t dlen,
return length;
}
+/*
+ * From RFC 1123:
+ * The format of the 227 reply to a PASV command is not
+ * well standardized. In particular, an FTP client cannot
+ * assume that the parentheses shown on page 40 of RFC-959
+ * will be present (and in fact, Figure 3 on page 43 omits
+ * them). Therefore, a User-FTP program that interprets
+ * the PASV reply must scan the reply for the first digit
+ * of the host and port numbers.
+ */
+static int try_rfc1123(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
+{
+ int i;
+ for (i = 0; i < dlen; i++)
+ if (isdigit(data[i]))
+ break;
+
+ if (i == dlen)
+ return 0;
+
+ *offset += i;
+
+ return try_rfc959(data + i, dlen - i, cmd, 0, offset);
+}
+
/* Grab port: number up to delimiter */
static int get_port(const char *data, int start, size_t dlen, char delim,
__be16 *port)
@@ -189,7 +222,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim,
/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
- char term)
+ char term, unsigned int *offset)
{
char delim;
int length;
@@ -237,7 +270,8 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
/* Returns 0, or length of numbers: |||6446| */
static int try_epsv_response(const char *data, size_t dlen,
- struct nf_conntrack_man *cmd, char term)
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
{
char delim;
@@ -259,9 +293,10 @@ static int find_pattern(const char *data, size_t dlen,
unsigned int *numlen,
struct nf_conntrack_man *cmd,
int (*getnum)(const char *, size_t,
- struct nf_conntrack_man *, char))
+ struct nf_conntrack_man *, char,
+ unsigned int *))
{
- size_t i;
+ size_t i = plen;
pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
if (dlen == 0)
@@ -291,16 +326,18 @@ static int find_pattern(const char *data, size_t dlen,
pr_debug("Pattern matches!\n");
/* Now we've found the constant string, try to skip
to the 'skip' character */
- for (i = plen; data[i] != skip; i++)
- if (i == dlen - 1) return -1;
+ if (skip) {
+ for (i = plen; data[i] != skip; i++)
+ if (i == dlen - 1) return -1;
- /* Skip over the last character */
- i++;
+ /* Skip over the last character */
+ i++;
+ }
pr_debug("Skipped up to `%c'!\n", skip);
*numoff = i;
- *numlen = getnum(data + i, dlen - i, cmd, term);
+ *numlen = getnum(data + i, dlen - i, cmd, term, numoff);
if (!*numlen)
return -1;
@@ -358,7 +395,7 @@ static int help(struct sk_buff *skb,
u32 seq;
int dir = CTINFO2DIR(ctinfo);
unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
- struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info;
+ struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
union nf_inet_addr *daddr;
struct nf_conntrack_man cmd = {};
@@ -368,7 +405,7 @@ static int help(struct sk_buff *skb,
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+ ctinfo != IP_CT_ESTABLISHED_REPLY) {
pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
return NF_ACCEPT;
}
@@ -395,6 +432,12 @@ static int help(struct sk_buff *skb,
/* Look up to see if we're just after a \n. */
if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+ /* We're picking up this, clear flags and let it continue */
+ if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) {
+ ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP;
+ goto skip_nl_seq;
+ }
+
/* Now if this ends in \n, update ftp info. */
pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
@@ -405,6 +448,7 @@ static int help(struct sk_buff *skb,
goto out_update_nl;
}
+skip_nl_seq:
/* Initialize IP/IPv6 addr to expected address (it's not mentioned
in EPSV responses) */
cmd.l3num = nf_ct_l3num(ct);
@@ -427,8 +471,8 @@ static int help(struct sk_buff *skb,
connection tracking, not packet filtering.
However, it is necessary for accurate tracking in
this case. */
- pr_debug("conntrack_ftp: partial %s %u+%u\n",
- search[dir][i].pattern, ntohl(th->seq), datalen);
+ nf_ct_helper_log(skb, ct, "partial matching of `%s'",
+ search[dir][i].pattern);
ret = NF_DROP;
goto out;
} else if (found == 0) { /* No match */
@@ -442,6 +486,7 @@ static int help(struct sk_buff *skb,
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
ret = NF_DROP;
goto out;
}
@@ -489,12 +534,13 @@ static int help(struct sk_buff *skb,
nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
- matchoff, matchlen, exp);
+ protoff, matchoff, matchlen, exp);
else {
/* Can't expect this? Best to drop packet now. */
- if (nf_ct_expect_related(exp) != 0)
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
- else
+ } else
ret = NF_ACCEPT;
}
@@ -511,8 +557,20 @@ out_update_nl:
return ret;
}
+static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
+{
+ struct nf_ct_ftp_master *ftp = nfct_help_data(ct);
+
+ /* This conntrack has been injected from user-space, always pick up
+ * sequence tracking. Otherwise, the first FTP command after the
+ * failover breaks.
+ */
+ ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP;
+ ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP;
+ return 0;
+}
+
static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
-static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")] __read_mostly;
static const struct nf_conntrack_expect_policy ftp_exp_policy = {
.max_expected = 1,
@@ -541,7 +599,6 @@ static void nf_conntrack_ftp_fini(void)
static int __init nf_conntrack_ftp_init(void)
{
int i, j = -1, ret = 0;
- char *tmpname;
ftp_buffer = kmalloc(65536, GFP_KERNEL);
if (!ftp_buffer)
@@ -556,17 +613,17 @@ static int __init nf_conntrack_ftp_init(void)
ftp[i][0].tuple.src.l3num = PF_INET;
ftp[i][1].tuple.src.l3num = PF_INET6;
for (j = 0; j < 2; j++) {
+ ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
ftp[i][j].expect_policy = &ftp_exp_policy;
ftp[i][j].me = THIS_MODULE;
ftp[i][j].help = help;
- tmpname = &ftp_names[i][j][0];
+ ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;
if (ports[i] == FTP_PORT)
- sprintf(tmpname, "ftp");
+ sprintf(ftp[i][j].name, "ftp");
else
- sprintf(tmpname, "ftp-%d", ports[i]);
- ftp[i][j].name = tmpname;
+ sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
pr_debug("nf_ct_ftp: registering helper for pf: %d "
"port: %d\n",
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 867882313e4..bcd5ed6b713 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -631,7 +631,7 @@ static int decode_seqof(bitstr_t *bs, const struct field_t *f,
CHECK_BOUND(bs, 2);
count = *bs->cur++;
count <<= 8;
- count = *bs->cur++;
+ count += *bs->cur++;
break;
case SEMI:
BYTE_ALIGN(bs);
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index b969025cf82..3a3a60b126e 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -2,6 +2,7 @@
* H.323 connection tracking helper
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This source code is licensed under General Public License version 2.
*
@@ -42,19 +43,19 @@ static int gkrouted_only __read_mostly = 1;
module_param(gkrouted_only, int, 0600);
MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
-static int callforward_filter __read_mostly = 1;
+static bool callforward_filter __read_mostly = true;
module_param(callforward_filter, bool, 0600);
MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
"if both endpoints are on different sides "
"(determined by routing information)");
/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff *skb,
+int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr,
union nf_inet_addr *addr, __be16 port)
__read_mostly;
-int (*set_h225_addr_hook) (struct sk_buff *skb,
+int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
unsigned char **data, int dataoff,
TransportAddress *taddr,
union nf_inet_addr *addr, __be16 port)
@@ -62,16 +63,17 @@ int (*set_h225_addr_hook) (struct sk_buff *skb,
int (*set_sig_addr_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data,
+ unsigned int protoff, unsigned char **data,
TransportAddress *taddr, int count) __read_mostly;
int (*set_ras_addr_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data,
+ unsigned int protoff, unsigned char **data,
TransportAddress *taddr, int count) __read_mostly;
int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr,
__be16 port, __be16 rtp_port,
@@ -80,24 +82,28 @@ int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
int (*nat_t120_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp) __read_mostly;
int (*nat_h245_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp) __read_mostly;
int (*nat_callforwarding_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp) __read_mostly;
int (*nat_q931_hook) (struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, TransportAddress *taddr, int idx,
__be16 port, struct nf_conntrack_expect *exp)
__read_mostly;
@@ -114,7 +120,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
struct nf_conn *ct, enum ip_conntrack_info ctinfo,
unsigned char **data, int *datalen, int *dataoff)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
const struct tcphdr *th;
struct tcphdr _tcph;
@@ -251,6 +257,7 @@ static int get_h245_addr(struct nf_conn *ct, const unsigned char *data,
/****************************************************************************/
static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
@@ -270,9 +277,8 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
return 0;
/* RTP port is even */
- port &= htons(~1);
- rtp_port = port;
- rtcp_port = htons(ntohs(port) + 1);
+ rtp_port = port & ~htons(1);
+ rtcp_port = port | htons(1);
/* Create expect for RTP */
if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL)
@@ -296,9 +302,10 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+ ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
taddr, port, rtp_port, rtp_exp, rtcp_exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(rtp_exp) == 0) {
@@ -325,6 +332,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
static int expect_t120(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
@@ -354,9 +362,10 @@ static int expect_t120(struct sk_buff *skb,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr,
+ ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp) == 0) {
@@ -375,6 +384,7 @@ static int expect_t120(struct sk_buff *skb,
static int process_h245_channel(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
H2250LogicalChannelParameters *channel)
{
@@ -382,7 +392,7 @@ static int process_h245_channel(struct sk_buff *skb,
if (channel->options & eH2250LogicalChannelParameters_mediaChannel) {
/* RTP */
- ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+ ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
&channel->mediaChannel);
if (ret < 0)
return -1;
@@ -391,7 +401,7 @@ static int process_h245_channel(struct sk_buff *skb,
if (channel->
options & eH2250LogicalChannelParameters_mediaControlChannel) {
/* RTCP */
- ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+ ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
&channel->mediaControlChannel);
if (ret < 0)
return -1;
@@ -403,6 +413,7 @@ static int process_h245_channel(struct sk_buff *skb,
/****************************************************************************/
static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
OpenLogicalChannel *olc)
{
@@ -413,7 +424,8 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
{
- ret = process_h245_channel(skb, ct, ctinfo, data, dataoff,
+ ret = process_h245_channel(skb, ct, ctinfo,
+ protoff, data, dataoff,
&olc->
forwardLogicalChannelParameters.
multiplexParameters.
@@ -431,7 +443,8 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
{
ret =
- process_h245_channel(skb, ct, ctinfo, data, dataoff,
+ process_h245_channel(skb, ct, ctinfo,
+ protoff, data, dataoff,
&olc->
reverseLogicalChannelParameters.
multiplexParameters.
@@ -449,7 +462,7 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
t120.choice == eDataProtocolCapability_separateLANStack &&
olc->separateStack.networkAddress.choice ==
eNetworkAccessParameters_networkAddress_localAreaAddress) {
- ret = expect_t120(skb, ct, ctinfo, data, dataoff,
+ ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff,
&olc->separateStack.networkAddress.
localAreaAddress);
if (ret < 0)
@@ -462,7 +475,7 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
+ unsigned int protoff, unsigned char **data, int dataoff,
OpenLogicalChannelAck *olca)
{
H2250LogicalChannelAckParameters *ack;
@@ -478,7 +491,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
choice ==
eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
{
- ret = process_h245_channel(skb, ct, ctinfo, data, dataoff,
+ ret = process_h245_channel(skb, ct, ctinfo,
+ protoff, data, dataoff,
&olca->
reverseLogicalChannelParameters.
multiplexParameters.
@@ -497,7 +511,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
if (ack->options &
eH2250LogicalChannelAckParameters_mediaChannel) {
/* RTP */
- ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+ ret = expect_rtp_rtcp(skb, ct, ctinfo,
+ protoff, data, dataoff,
&ack->mediaChannel);
if (ret < 0)
return -1;
@@ -506,7 +521,8 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
if (ack->options &
eH2250LogicalChannelAckParameters_mediaControlChannel) {
/* RTCP */
- ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+ ret = expect_rtp_rtcp(skb, ct, ctinfo,
+ protoff, data, dataoff,
&ack->mediaControlChannel);
if (ret < 0)
return -1;
@@ -516,7 +532,7 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
if ((olca->options & eOpenLogicalChannelAck_separateStack) &&
olca->separateStack.networkAddress.choice ==
eNetworkAccessParameters_networkAddress_localAreaAddress) {
- ret = expect_t120(skb, ct, ctinfo, data, dataoff,
+ ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff,
&olca->separateStack.networkAddress.
localAreaAddress);
if (ret < 0)
@@ -529,14 +545,15 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
+ unsigned int protoff, unsigned char **data, int dataoff,
MultimediaSystemControlMessage *mscm)
{
switch (mscm->choice) {
case eMultimediaSystemControlMessage_request:
if (mscm->request.choice ==
eRequestMessage_openLogicalChannel) {
- return process_olc(skb, ct, ctinfo, data, dataoff,
+ return process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&mscm->request.openLogicalChannel);
}
pr_debug("nf_ct_h323: H.245 Request %d\n",
@@ -545,7 +562,8 @@ static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
case eMultimediaSystemControlMessage_response:
if (mscm->response.choice ==
eResponseMessage_openLogicalChannelAck) {
- return process_olca(skb, ct, ctinfo, data, dataoff,
+ return process_olca(skb, ct, ctinfo,
+ protoff, data, dataoff,
&mscm->response.
openLogicalChannelAck);
}
@@ -571,10 +589,9 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,
int ret;
/* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
return NF_ACCEPT;
- }
+
pr_debug("nf_ct_h245: skblen = %u\n", skb->len);
spin_lock_bh(&nf_h323_lock);
@@ -597,7 +614,8 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,
}
/* Process H.245 signal */
- if (process_h245(skb, ct, ctinfo, &data, dataoff, &mscm) < 0)
+ if (process_h245(skb, ct, ctinfo, protoff,
+ &data, dataoff, &mscm) < 0)
goto drop;
}
@@ -606,8 +624,7 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff,
drop:
spin_unlock_bh(&nf_h323_lock);
- if (net_ratelimit())
- pr_info("nf_ct_h245: packet dropped\n");
+ nf_ct_helper_log(skb, ct, "cannot process H.245 message");
return NF_DROP;
}
@@ -620,6 +637,7 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = {
static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
.name = "H.245",
.me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
.tuple.src.l3num = AF_UNSPEC,
.tuple.dst.protonum = IPPROTO_UDP,
.help = h245_help,
@@ -661,7 +679,7 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
/****************************************************************************/
static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
+ unsigned int protoff, unsigned char **data, int dataoff,
TransportAddress *taddr)
{
int dir = CTINFO2DIR(ctinfo);
@@ -690,9 +708,10 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr,
+ ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp) == 0) {
@@ -714,7 +733,6 @@ static int callforward_do_filter(const union nf_inet_addr *src,
u_int8_t family)
{
const struct nf_afinfo *afinfo;
- struct flowi fl1, fl2;
int ret = 0;
/* rcu_read_lock()ed by nf_hook_slow() */
@@ -722,18 +740,22 @@ static int callforward_do_filter(const union nf_inet_addr *src,
if (!afinfo)
return 0;
- memset(&fl1, 0, sizeof(fl1));
- memset(&fl2, 0, sizeof(fl2));
-
switch (family) {
case AF_INET: {
+ struct flowi4 fl1, fl2;
struct rtable *rt1, *rt2;
- fl1.fl4_dst = src->ip;
- fl2.fl4_dst = dst->ip;
- if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
- if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
- if (rt1->rt_gateway == rt2->rt_gateway &&
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->ip;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->ip;
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt1,
+ flowi4_to_flowi(&fl1), false)) {
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
+ flowi4_to_flowi(&fl2), false)) {
+ if (rt_nexthop(rt1, fl1.daddr) ==
+ rt_nexthop(rt2, fl2.daddr) &&
rt1->dst.dev == rt2->dst.dev)
ret = 1;
dst_release(&rt2->dst);
@@ -742,17 +764,22 @@ static int callforward_do_filter(const union nf_inet_addr *src,
}
break;
}
-#if defined(CONFIG_NF_CONNTRACK_IPV6) || \
- defined(CONFIG_NF_CONNTRACK_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
case AF_INET6: {
+ struct flowi6 fl1, fl2;
struct rt6_info *rt1, *rt2;
- memcpy(&fl1.fl6_dst, src, sizeof(fl1.fl6_dst));
- memcpy(&fl2.fl6_dst, dst, sizeof(fl2.fl6_dst));
- if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
- if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
- if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
- sizeof(rt1->rt6i_gateway)) &&
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->in6;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->in6;
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt1,
+ flowi6_to_flowi(&fl1), false)) {
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
+ flowi6_to_flowi(&fl2), false)) {
+ if (ipv6_addr_equal(rt6_nexthop(rt1),
+ rt6_nexthop(rt2)) &&
rt1->dst.dev == rt2->dst.dev)
ret = 1;
dst_release(&rt2->dst);
@@ -771,6 +798,7 @@ static int callforward_do_filter(const union nf_inet_addr *src,
static int expect_callforwarding(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
TransportAddress *taddr)
{
@@ -806,9 +834,11 @@ static int expect_callforwarding(struct sk_buff *skb,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Need NAT */
- ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff,
+ ret = nat_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp) == 0) {
@@ -826,6 +856,7 @@ static int expect_callforwarding(struct sk_buff *skb,
/****************************************************************************/
static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
Setup_UUIE *setup)
{
@@ -839,7 +870,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: Setup\n");
if (setup->options & eSetup_UUIE_h245Address) {
- ret = expect_h245(skb, ct, ctinfo, data, dataoff,
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
&setup->h245Address);
if (ret < 0)
return -1;
@@ -847,14 +878,15 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
set_h225_addr = rcu_dereference(set_h225_addr_hook);
if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) && ct->status & IPS_NAT_MASK &&
+ (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->destCallSignalAddress,
&addr, &port) &&
memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) {
pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
- ret = set_h225_addr(skb, data, dataoff,
+ ret = set_h225_addr(skb, protoff, data, dataoff,
&setup->destCallSignalAddress,
&ct->tuplehash[!dir].tuple.src.u3,
ct->tuplehash[!dir].tuple.src.u.tcp.port);
@@ -863,14 +895,15 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
}
if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) && ct->status & IPS_NAT_MASK &&
+ (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
&addr, &port) &&
memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) {
pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
- ret = set_h225_addr(skb, data, dataoff,
+ ret = set_h225_addr(skb, protoff, data, dataoff,
&setup->sourceCallSignalAddress,
&ct->tuplehash[!dir].tuple.dst.u3,
ct->tuplehash[!dir].tuple.dst.u.tcp.port);
@@ -880,7 +913,8 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
if (setup->options & eSetup_UUIE_fastStart) {
for (i = 0; i < setup->fastStart.count; i++) {
- ret = process_olc(skb, ct, ctinfo, data, dataoff,
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&setup->fastStart.item[i]);
if (ret < 0)
return -1;
@@ -894,6 +928,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
static int process_callproceeding(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
CallProceeding_UUIE *callproc)
{
@@ -903,7 +938,7 @@ static int process_callproceeding(struct sk_buff *skb,
pr_debug("nf_ct_q931: CallProceeding\n");
if (callproc->options & eCallProceeding_UUIE_h245Address) {
- ret = expect_h245(skb, ct, ctinfo, data, dataoff,
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
&callproc->h245Address);
if (ret < 0)
return -1;
@@ -911,7 +946,8 @@ static int process_callproceeding(struct sk_buff *skb,
if (callproc->options & eCallProceeding_UUIE_fastStart) {
for (i = 0; i < callproc->fastStart.count; i++) {
- ret = process_olc(skb, ct, ctinfo, data, dataoff,
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&callproc->fastStart.item[i]);
if (ret < 0)
return -1;
@@ -924,6 +960,7 @@ static int process_callproceeding(struct sk_buff *skb,
/****************************************************************************/
static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
Connect_UUIE *connect)
{
@@ -933,7 +970,7 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: Connect\n");
if (connect->options & eConnect_UUIE_h245Address) {
- ret = expect_h245(skb, ct, ctinfo, data, dataoff,
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
&connect->h245Address);
if (ret < 0)
return -1;
@@ -941,7 +978,8 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
if (connect->options & eConnect_UUIE_fastStart) {
for (i = 0; i < connect->fastStart.count; i++) {
- ret = process_olc(skb, ct, ctinfo, data, dataoff,
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&connect->fastStart.item[i]);
if (ret < 0)
return -1;
@@ -954,6 +992,7 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
Alerting_UUIE *alert)
{
@@ -963,7 +1002,7 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: Alerting\n");
if (alert->options & eAlerting_UUIE_h245Address) {
- ret = expect_h245(skb, ct, ctinfo, data, dataoff,
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
&alert->h245Address);
if (ret < 0)
return -1;
@@ -971,7 +1010,8 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
if (alert->options & eAlerting_UUIE_fastStart) {
for (i = 0; i < alert->fastStart.count; i++) {
- ret = process_olc(skb, ct, ctinfo, data, dataoff,
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&alert->fastStart.item[i]);
if (ret < 0)
return -1;
@@ -984,6 +1024,7 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
Facility_UUIE *facility)
{
@@ -994,15 +1035,15 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
if (facility->reason.choice == eFacilityReason_callForwarded) {
if (facility->options & eFacility_UUIE_alternativeAddress)
- return expect_callforwarding(skb, ct, ctinfo, data,
- dataoff,
+ return expect_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
&facility->
alternativeAddress);
return 0;
}
if (facility->options & eFacility_UUIE_h245Address) {
- ret = expect_h245(skb, ct, ctinfo, data, dataoff,
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
&facility->h245Address);
if (ret < 0)
return -1;
@@ -1010,7 +1051,8 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
if (facility->options & eFacility_UUIE_fastStart) {
for (i = 0; i < facility->fastStart.count; i++) {
- ret = process_olc(skb, ct, ctinfo, data, dataoff,
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&facility->fastStart.item[i]);
if (ret < 0)
return -1;
@@ -1023,6 +1065,7 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
Progress_UUIE *progress)
{
@@ -1032,7 +1075,7 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: Progress\n");
if (progress->options & eProgress_UUIE_h245Address) {
- ret = expect_h245(skb, ct, ctinfo, data, dataoff,
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
&progress->h245Address);
if (ret < 0)
return -1;
@@ -1040,7 +1083,8 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
if (progress->options & eProgress_UUIE_fastStart) {
for (i = 0; i < progress->fastStart.count; i++) {
- ret = process_olc(skb, ct, ctinfo, data, dataoff,
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
&progress->fastStart.item[i]);
if (ret < 0)
return -1;
@@ -1053,7 +1097,8 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff, Q931 *q931)
+ unsigned int protoff, unsigned char **data, int dataoff,
+ Q931 *q931)
{
H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
int i;
@@ -1061,28 +1106,29 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
switch (pdu->h323_message_body.choice) {
case eH323_UU_PDU_h323_message_body_setup:
- ret = process_setup(skb, ct, ctinfo, data, dataoff,
+ ret = process_setup(skb, ct, ctinfo, protoff, data, dataoff,
&pdu->h323_message_body.setup);
break;
case eH323_UU_PDU_h323_message_body_callProceeding:
- ret = process_callproceeding(skb, ct, ctinfo, data, dataoff,
+ ret = process_callproceeding(skb, ct, ctinfo,
+ protoff, data, dataoff,
&pdu->h323_message_body.
callProceeding);
break;
case eH323_UU_PDU_h323_message_body_connect:
- ret = process_connect(skb, ct, ctinfo, data, dataoff,
+ ret = process_connect(skb, ct, ctinfo, protoff, data, dataoff,
&pdu->h323_message_body.connect);
break;
case eH323_UU_PDU_h323_message_body_alerting:
- ret = process_alerting(skb, ct, ctinfo, data, dataoff,
+ ret = process_alerting(skb, ct, ctinfo, protoff, data, dataoff,
&pdu->h323_message_body.alerting);
break;
case eH323_UU_PDU_h323_message_body_facility:
- ret = process_facility(skb, ct, ctinfo, data, dataoff,
+ ret = process_facility(skb, ct, ctinfo, protoff, data, dataoff,
&pdu->h323_message_body.facility);
break;
case eH323_UU_PDU_h323_message_body_progress:
- ret = process_progress(skb, ct, ctinfo, data, dataoff,
+ ret = process_progress(skb, ct, ctinfo, protoff, data, dataoff,
&pdu->h323_message_body.progress);
break;
default:
@@ -1096,7 +1142,8 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
if (pdu->options & eH323_UU_PDU_h245Control) {
for (i = 0; i < pdu->h245Control.count; i++) {
- ret = process_h245(skb, ct, ctinfo, data, dataoff,
+ ret = process_h245(skb, ct, ctinfo,
+ protoff, data, dataoff,
&pdu->h245Control.item[i]);
if (ret < 0)
return -1;
@@ -1117,10 +1164,9 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,
int ret;
/* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
return NF_ACCEPT;
- }
+
pr_debug("nf_ct_q931: skblen = %u\n", skb->len);
spin_lock_bh(&nf_h323_lock);
@@ -1142,7 +1188,8 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,
}
/* Process Q.931 signal */
- if (process_q931(skb, ct, ctinfo, &data, dataoff, &q931) < 0)
+ if (process_q931(skb, ct, ctinfo, protoff,
+ &data, dataoff, &q931) < 0)
goto drop;
}
@@ -1151,8 +1198,7 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff,
drop:
spin_unlock_bh(&nf_h323_lock);
- if (net_ratelimit())
- pr_info("nf_ct_q931: packet dropped\n");
+ nf_ct_helper_log(skb, ct, "cannot process Q.931 message");
return NF_DROP;
}
@@ -1167,6 +1213,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {
{
.name = "Q.931",
.me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
.tuple.src.l3num = AF_INET,
.tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT),
.tuple.dst.protonum = IPPROTO_TCP,
@@ -1225,7 +1272,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
/****************************************************************************/
static int set_expect_timeout(struct nf_conntrack_expect *exp,
- unsigned timeout)
+ unsigned int timeout)
{
if (!exp || !del_timer(&exp->timeout))
return 0;
@@ -1239,10 +1286,10 @@ static int set_expect_timeout(struct nf_conntrack_expect *exp,
/****************************************************************************/
static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data,
+ unsigned int protoff, unsigned char **data,
TransportAddress *taddr, int count)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
int i;
@@ -1274,8 +1321,10 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931 && ct->status & IPS_NAT_MASK) { /* Need NAT */
- ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp);
+ if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) { /* Need NAT */
+ ret = nat_q931(skb, ct, ctinfo, protoff, data,
+ taddr, i, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
@@ -1295,6 +1344,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, GatekeeperRequest *grq)
{
typeof(set_ras_addr_hook) set_ras_addr;
@@ -1302,8 +1352,9 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: GRQ\n");
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK) /* NATed */
- return set_ras_addr(skb, ct, ctinfo, data,
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) /* NATed */
+ return set_ras_addr(skb, ct, ctinfo, protoff, data,
&grq->rasAddress, 1);
return 0;
}
@@ -1311,6 +1362,7 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, GatekeeperConfirm *gcf)
{
int dir = CTINFO2DIR(ctinfo);
@@ -1355,23 +1407,25 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, RegistrationRequest *rrq)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int ret;
typeof(set_ras_addr_hook) set_ras_addr;
pr_debug("nf_ct_ras: RRQ\n");
- ret = expect_q931(skb, ct, ctinfo, data,
+ ret = expect_q931(skb, ct, ctinfo, protoff, data,
rrq->callSignalAddress.item,
rrq->callSignalAddress.count);
if (ret < 0)
return -1;
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, data,
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
rrq->rasAddress.item,
rrq->rasAddress.count);
if (ret < 0)
@@ -1390,9 +1444,10 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, RegistrationConfirm *rcf)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int ret;
struct nf_conntrack_expect *exp;
@@ -1401,8 +1456,9 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: RCF\n");
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, data,
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
rcf->callSignalAddress.item,
rcf->callSignalAddress.count);
if (ret < 0)
@@ -1420,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_refresh(ct, skb, info->timeout * HZ);
/* Set expect timeout */
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
info->sig_port[!dir]);
if (exp) {
@@ -1430,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_dump_tuple(&exp->tuple);
set_expect_timeout(exp, info->timeout);
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
return 0;
@@ -1439,9 +1495,10 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, UnregistrationRequest *urq)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int ret;
typeof(set_sig_addr_hook) set_sig_addr;
@@ -1449,8 +1506,9 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: URQ\n");
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, data,
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
urq->callSignalAddress.item,
urq->callSignalAddress.count);
if (ret < 0)
@@ -1471,9 +1529,10 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, AdmissionRequest *arq)
{
- const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
__be16 port;
union nf_inet_addr addr;
@@ -1487,9 +1546,10 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
port == info->sig_port[dir] &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
set_h225_addr && ct->status & IPS_NAT_MASK) {
/* Answering ARQ */
- return set_h225_addr(skb, data, 0,
+ return set_h225_addr(skb, protoff, data, 0,
&arq->destCallSignalAddress,
&ct->tuplehash[!dir].tuple.dst.u3,
info->sig_port[!dir]);
@@ -1499,9 +1559,10 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
- set_h225_addr && ct->status & IPS_NAT_MASK) {
+ set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
/* Calling ARQ */
- return set_h225_addr(skb, data, 0,
+ return set_h225_addr(skb, protoff, data, 0,
&arq->srcCallSignalAddress,
&ct->tuplehash[!dir].tuple.dst.u3,
port);
@@ -1513,6 +1574,7 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, AdmissionConfirm *acf)
{
int dir = CTINFO2DIR(ctinfo);
@@ -1531,8 +1593,9 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
/* Answering ACF */
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK)
- return set_sig_addr(skb, ct, ctinfo, data,
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
+ return set_sig_addr(skb, ct, ctinfo, protoff, data,
&acf->destCallSignalAddress, 1);
return 0;
}
@@ -1560,6 +1623,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, LocationRequest *lrq)
{
typeof(set_ras_addr_hook) set_ras_addr;
@@ -1567,8 +1631,9 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: LRQ\n");
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK)
- return set_ras_addr(skb, ct, ctinfo, data,
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
+ return set_ras_addr(skb, ct, ctinfo, protoff, data,
&lrq->replyAddress, 1);
return 0;
}
@@ -1576,6 +1641,7 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, LocationConfirm *lcf)
{
int dir = CTINFO2DIR(ctinfo);
@@ -1615,6 +1681,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, InfoRequestResponse *irr)
{
int ret;
@@ -1624,16 +1691,18 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: IRR\n");
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, data,
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
&irr->rasAddress, 1);
if (ret < 0)
return -1;
}
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, data,
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
irr->callSignalAddress.item,
irr->callSignalAddress.count);
if (ret < 0)
@@ -1646,38 +1715,39 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int process_ras(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, RasMessage *ras)
{
switch (ras->choice) {
case eRasMessage_gatekeeperRequest:
- return process_grq(skb, ct, ctinfo, data,
+ return process_grq(skb, ct, ctinfo, protoff, data,
&ras->gatekeeperRequest);
case eRasMessage_gatekeeperConfirm:
- return process_gcf(skb, ct, ctinfo, data,
+ return process_gcf(skb, ct, ctinfo, protoff, data,
&ras->gatekeeperConfirm);
case eRasMessage_registrationRequest:
- return process_rrq(skb, ct, ctinfo, data,
+ return process_rrq(skb, ct, ctinfo, protoff, data,
&ras->registrationRequest);
case eRasMessage_registrationConfirm:
- return process_rcf(skb, ct, ctinfo, data,
+ return process_rcf(skb, ct, ctinfo, protoff, data,
&ras->registrationConfirm);
case eRasMessage_unregistrationRequest:
- return process_urq(skb, ct, ctinfo, data,
+ return process_urq(skb, ct, ctinfo, protoff, data,
&ras->unregistrationRequest);
case eRasMessage_admissionRequest:
- return process_arq(skb, ct, ctinfo, data,
+ return process_arq(skb, ct, ctinfo, protoff, data,
&ras->admissionRequest);
case eRasMessage_admissionConfirm:
- return process_acf(skb, ct, ctinfo, data,
+ return process_acf(skb, ct, ctinfo, protoff, data,
&ras->admissionConfirm);
case eRasMessage_locationRequest:
- return process_lrq(skb, ct, ctinfo, data,
+ return process_lrq(skb, ct, ctinfo, protoff, data,
&ras->locationRequest);
case eRasMessage_locationConfirm:
- return process_lcf(skb, ct, ctinfo, data,
+ return process_lcf(skb, ct, ctinfo, protoff, data,
&ras->locationConfirm);
case eRasMessage_infoRequestResponse:
- return process_irr(skb, ct, ctinfo, data,
+ return process_irr(skb, ct, ctinfo, protoff, data,
&ras->infoRequestResponse);
default:
pr_debug("nf_ct_ras: RAS message %d\n", ras->choice);
@@ -1717,7 +1787,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff,
}
/* Process RAS message */
- if (process_ras(skb, ct, ctinfo, &data, &ras) < 0)
+ if (process_ras(skb, ct, ctinfo, protoff, &data, &ras) < 0)
goto drop;
accept:
@@ -1726,8 +1796,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff,
drop:
spin_unlock_bh(&nf_h323_lock);
- if (net_ratelimit())
- pr_info("nf_ct_ras: packet dropped\n");
+ nf_ct_helper_log(skb, ct, "cannot process RAS message");
return NF_DROP;
}
@@ -1741,6 +1810,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
{
.name = "RAS",
.me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
.tuple.src.l3num = AF_INET,
.tuple.src.u.udp.port = cpu_to_be16(RAS_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
@@ -1750,6 +1820,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
{
.name = "RAS",
.me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
.tuple.src.l3num = AF_INET6,
.tuple.src.u.udp.port = cpu_to_be16(RAS_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
@@ -1828,4 +1899,6 @@ MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_h323");
-MODULE_ALIAS_NFCT_HELPER("h323");
+MODULE_ALIAS_NFCT_HELPER("RAS");
+MODULE_ALIAS_NFCT_HELPER("Q.931");
+MODULE_ALIAS_NFCT_HELPER("H.245");
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 59e1a4cd4e8..5b3eae7d4c9 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -28,13 +29,80 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_log.h>
static DEFINE_MUTEX(nf_ct_helper_mutex);
-static struct hlist_head *nf_ct_helper_hash __read_mostly;
-static unsigned int nf_ct_helper_hsize __read_mostly;
+struct hlist_head *nf_ct_helper_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hash);
+unsigned int nf_ct_helper_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static int nf_ct_helper_vmalloc;
+static bool nf_ct_auto_assign_helper __read_mostly = true;
+module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
+MODULE_PARM_DESC(nf_conntrack_helper,
+ "Enable automatic conntrack helper assignment (default 1)");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table helper_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_helper",
+ .data = &init_net.ct.sysctl_auto_assign_helper,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_auto_assign_helper;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.helper_sysctl_header =
+ register_net_sysctl(net, "net/netfilter", table);
+
+ if (!net->ct.helper_sysctl_header) {
+ pr_err("nf_conntrack_helper: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.helper_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.helper_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
/* Stupid hash, but collision free for the default registrations of the
* helpers currently in the kernel. */
@@ -49,14 +117,13 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_helper *helper;
struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
- struct hlist_node *n;
unsigned int h;
if (!nf_ct_helper_count)
return NULL;
h = helper_hash(tuple);
- hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) {
+ hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
return helper;
}
@@ -67,11 +134,10 @@ struct nf_conntrack_helper *
__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
{
struct nf_conntrack_helper *h;
- struct hlist_node *n;
unsigned int i;
for (i = 0; i < nf_ct_helper_hsize; i++) {
- hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) {
+ hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) {
if (!strcmp(h->name, name) &&
h->tuple.src.l3num == l3num &&
h->tuple.dst.protonum == protonum)
@@ -101,11 +167,14 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
-struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
+struct nf_conn_help *
+nf_ct_helper_ext_add(struct nf_conn *ct,
+ struct nf_conntrack_helper *helper, gfp_t gfp)
{
struct nf_conn_help *help;
- help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp);
+ help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER,
+ helper->data_len, gfp);
if (help)
INIT_HLIST_HEAD(&help->expectations);
else
@@ -119,31 +188,60 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
{
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
+ struct net *net = nf_ct_net(ct);
int ret = 0;
+ /* We already got a helper explicitly attached. The function
+ * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
+ * the helper up again. Since now the user is in full control of
+ * making consistent helper configurations, skip this automatic
+ * re-lookup, otherwise we'll lose the helper.
+ */
+ if (test_bit(IPS_HELPER_BIT, &ct->status))
+ return 0;
+
if (tmpl != NULL) {
help = nfct_help(tmpl);
- if (help != NULL)
+ if (help != NULL) {
helper = help->helper;
+ set_bit(IPS_HELPER_BIT, &ct->status);
+ }
}
help = nfct_help(ct);
- if (helper == NULL)
+ if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
+ pr_info("nf_conntrack: automatic helper "
+ "assignment is deprecated and it will "
+ "be removed soon. Use the iptables CT target "
+ "to attach helpers instead.\n");
+ net->ct.auto_assign_helper_warned = true;
+ }
+ }
+
if (helper == NULL) {
if (help)
- rcu_assign_pointer(help->helper, NULL);
+ RCU_INIT_POINTER(help->helper, NULL);
goto out;
}
if (help == NULL) {
- help = nf_ct_helper_ext_add(ct, flags);
+ help = nf_ct_helper_ext_add(ct, helper, flags);
if (help == NULL) {
ret = -ENOMEM;
goto out;
}
} else {
- memset(&help->help, 0, sizeof(help->help));
+ /* We only allow helper re-assignment of the same sort since
+ * we cannot reallocate the helper extension area.
+ */
+ struct nf_conntrack_helper *tmp = rcu_dereference(help->helper);
+
+ if (tmp && tmp->help != helper->help) {
+ RCU_INIT_POINTER(help->helper, NULL);
+ goto out;
+ }
}
rcu_assign_pointer(help->helper, helper);
@@ -152,15 +250,16 @@ out:
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
+/* appropiate ct lock protecting must be taken by caller */
static inline int unhelp(struct nf_conntrack_tuple_hash *i,
const struct nf_conntrack_helper *me)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
struct nf_conn_help *help = nfct_help(ct);
- if (help && help->helper == me) {
+ if (help && rcu_dereference_raw(help->helper) == me) {
nf_conntrack_event(IPCT_HELPER, ct);
- rcu_assign_pointer(help->helper, NULL);
+ RCU_INIT_POINTER(help->helper, NULL);
}
return 0;
}
@@ -179,8 +278,91 @@ void nf_ct_helper_destroy(struct nf_conn *ct)
}
}
+static LIST_HEAD(nf_ct_helper_expectfn_list);
+
+void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ list_add_rcu(&n->head, &nf_ct_helper_expectfn_list);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register);
+
+void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ list_del_rcu(&n->head);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
+
+struct nf_ct_helper_expectfn *
+nf_ct_helper_expectfn_find_by_name(const char *name)
+{
+ struct nf_ct_helper_expectfn *cur;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
+ if (!strcmp(cur->name, name)) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return found ? cur : NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
+
+struct nf_ct_helper_expectfn *
+nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
+{
+ struct nf_ct_helper_expectfn *cur;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
+ if (cur->expectfn == symbol) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return found ? cur : NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
+
+__printf(3, 4)
+void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
+ const char *fmt, ...)
+{
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ /* Called from the helper function, this call never fails */
+ help = nfct_help(ct);
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+
+ nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
+ "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_log);
+
int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
+ int ret = 0;
+ struct nf_conntrack_helper *cur;
unsigned int h = helper_hash(&me->tuple);
BUG_ON(me->expect_policy == NULL);
@@ -188,11 +370,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
mutex_lock(&nf_ct_helper_mutex);
+ hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
+ if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
+ cur->tuple.src.l3num == me->tuple.src.l3num &&
+ cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
nf_ct_helper_count++;
+out:
mutex_unlock(&nf_ct_helper_mutex);
-
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_register);
@@ -201,30 +391,48 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
{
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_expect *exp;
- const struct hlist_node *n, *next;
+ const struct hlist_node *next;
const struct hlist_nulls_node *nn;
unsigned int i;
+ int cpu;
/* Get rid of expectations */
+ spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
- hlist_for_each_entry_safe(exp, n, next,
+ hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
- if ((help->helper == me || exp->helper == me) &&
+ if ((rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock)
+ ) == me || exp->helper == me) &&
del_timer(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
}
}
}
+ spin_unlock_bh(&nf_conntrack_expect_lock);
/* Get rid of expecteds, set helpers to NULL. */
- hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
- unhelp(h, me);
- for (i = 0; i < net->ct.htable_size; i++) {
- hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
unhelp(h, me);
+ spin_unlock_bh(&pcpu->lock);
+ }
+ local_bh_disable();
+ for (i = 0; i < net->ct.htable_size; i++) {
+ spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ if (i < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ unhelp(h, me);
+ }
+ spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
}
+ local_bh_enable();
}
void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
@@ -242,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
synchronize_rcu();
rtnl_lock();
- spin_lock_bh(&nf_conntrack_lock);
for_each_net(net)
__nf_conntrack_helper_unregister(me, net);
- spin_unlock_bh(&nf_conntrack_lock);
rtnl_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
@@ -256,31 +462,41 @@ static struct nf_ct_ext_type helper_extend __read_mostly = {
.id = NF_CT_EXT_HELPER,
};
-int nf_conntrack_helper_init(void)
+int nf_conntrack_helper_pernet_init(struct net *net)
{
- int err;
+ net->ct.auto_assign_helper_warned = false;
+ net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
+ return nf_conntrack_helper_init_sysctl(net);
+}
+
+void nf_conntrack_helper_pernet_fini(struct net *net)
+{
+ nf_conntrack_helper_fini_sysctl(net);
+}
+int nf_conntrack_helper_init(void)
+{
+ int ret;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
- nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize,
- &nf_ct_helper_vmalloc, 0);
+ nf_ct_helper_hash =
+ nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
- err = nf_ct_extend_register(&helper_extend);
- if (err < 0)
- goto err1;
+ ret = nf_ct_extend_register(&helper_extend);
+ if (ret < 0) {
+ pr_err("nf_ct_helper: Unable to register helper extension.\n");
+ goto out_extend;
+ }
return 0;
-
-err1:
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
- nf_ct_helper_hsize);
- return err;
+out_extend:
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ return ret;
}
void nf_conntrack_helper_fini(void)
{
nf_ct_extend_unregister(&helper_extend);
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
- nf_ct_helper_hsize);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index b394aa31877..0fd2976db7e 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -1,6 +1,7 @@
/* IRC extension for IP connection tracking, Version 1.21
* (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
* based on RR's ip_conntrack_ftp.c
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -33,6 +34,7 @@ static DEFINE_SPINLOCK(irc_buffer_lock);
unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned int matchoff,
unsigned int matchlen,
struct nf_conntrack_expect *exp) __read_mostly;
@@ -125,8 +127,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
return NF_ACCEPT;
/* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY)
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
return NF_ACCEPT;
/* Not a full tcp header? */
@@ -186,16 +187,16 @@ static int help(struct sk_buff *skb, unsigned int protoff,
tuple = &ct->tuplehash[dir].tuple;
if (tuple->src.u3.ip != dcc_ip &&
tuple->dst.u3.ip != dcc_ip) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "Forged DCC command from %pI4: %pI4:%u\n",
- &tuple->src.u3.ip,
- &dcc_ip, dcc_port);
+ net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
+ &tuple->src.u3.ip,
+ &dcc_ip, dcc_port);
continue;
}
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
+ nf_ct_helper_log(skb, ct,
+ "cannot alloc expectation");
ret = NF_DROP;
goto out;
}
@@ -208,12 +209,15 @@ static int help(struct sk_buff *skb, unsigned int protoff,
nf_nat_irc = rcu_dereference(nf_nat_irc_hook);
if (nf_nat_irc && ct->status & IPS_NAT_MASK)
- ret = nf_nat_irc(skb, ctinfo,
+ ret = nf_nat_irc(skb, ctinfo, protoff,
addr_beg_p - ib_ptr,
addr_end_p - addr_beg_p,
exp);
- else if (nf_ct_expect_related(exp) != 0)
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct,
+ "cannot add expectation");
ret = NF_DROP;
+ }
nf_ct_expect_put(exp);
goto out;
}
@@ -224,7 +228,6 @@ static int help(struct sk_buff *skb, unsigned int protoff,
}
static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
-static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly;
static struct nf_conntrack_expect_policy irc_exp_policy;
static void nf_conntrack_irc_fini(void);
@@ -232,7 +235,6 @@ static void nf_conntrack_irc_fini(void);
static int __init nf_conntrack_irc_init(void)
{
int i, ret;
- char *tmpname;
if (max_dcc_channels < 1) {
printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n");
@@ -258,12 +260,10 @@ static int __init nf_conntrack_irc_init(void)
irc[i].me = THIS_MODULE;
irc[i].help = help;
- tmpname = &irc_names[i][0];
if (ports[i] == IRC_PORT)
- sprintf(tmpname, "irc");
+ sprintf(irc[i].name, "irc");
else
- sprintf(tmpname, "irc-%u", i);
- irc[i].name = tmpname;
+ sprintf(irc[i].name, "irc-%u", i);
ret = nf_conntrack_helper_register(&irc[i]);
if (ret) {
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
new file mode 100644
index 00000000000..bb53f120e79
--- /dev/null
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -0,0 +1,108 @@
+/*
+ * test/set flag bits stored in conntrack extension area.
+ *
+ * (C) 2013 Astaro GmbH & Co KG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+static unsigned int label_bits(const struct nf_conn_labels *l)
+{
+ unsigned int longs = l->words;
+ return longs * BITS_PER_LONG;
+}
+
+bool nf_connlabel_match(const struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return false;
+
+ return bit < label_bits(labels) && test_bit(bit, labels->bits);
+}
+EXPORT_SYMBOL_GPL(nf_connlabel_match);
+
+int nf_connlabel_set(struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels || bit >= label_bits(labels))
+ return -ENOSPC;
+
+ if (test_bit(bit, labels->bits))
+ return 0;
+
+ if (!test_and_set_bit(bit, labels->bits))
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabel_set);
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static void replace_u32(u32 *address, u32 mask, u32 new)
+{
+ u32 old, tmp;
+
+ do {
+ old = *address;
+ tmp = (old & mask) ^ new;
+ } while (cmpxchg(address, old, tmp) != old);
+}
+
+int nf_connlabels_replace(struct nf_conn *ct,
+ const u32 *data,
+ const u32 *mask, unsigned int words32)
+{
+ struct nf_conn_labels *labels;
+ unsigned int size, i;
+ u32 *dst;
+
+ labels = nf_ct_labels_find(ct);
+ if (!labels)
+ return -ENOSPC;
+
+ size = labels->words * sizeof(long);
+ if (size < (words32 * sizeof(u32)))
+ words32 = size / sizeof(u32);
+
+ dst = (u32 *) labels->bits;
+ if (words32) {
+ for (i = 0; i < words32; i++)
+ replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
+ }
+
+ size /= sizeof(u32);
+ for (i = words32; i < size; i++) /* pad */
+ replace_u32(&dst[i], 0, 0);
+
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_replace);
+#endif
+
+static struct nf_ct_ext_type labels_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_labels),
+ .align = __alignof__(struct nf_conn_labels),
+ .id = NF_CT_EXT_LABELS,
+};
+
+int nf_conntrack_labels_init(void)
+{
+ return nf_ct_extend_register(&labels_extend);
+}
+
+void nf_conntrack_labels_fini(void)
+{
+ nf_ct_extend_unregister(&labels_extend);
+}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index aadde018a07..4c8f30a3d6d 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -18,14 +18,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_addr.h>
#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/route.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");
MODULE_ALIAS_NFCT_HELPER("netbios_ns");
static unsigned int timeout __read_mostly = 3;
-module_param(timeout, uint, 0400);
+module_param(timeout, uint, S_IRUSR);
MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
-static int help(struct sk_buff *skb, unsigned int protoff,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo)
-{
- struct nf_conntrack_expect *exp;
- struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt = skb_rtable(skb);
- struct in_device *in_dev;
- __be32 mask = 0;
-
- /* we're only interested in locally generated packets */
- if (skb->sk == NULL)
- goto out;
- if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
- goto out;
- if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- goto out;
-
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->dst.dev);
- if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
- if (ifa->ifa_broadcast == iph->daddr) {
- mask = ifa->ifa_mask;
- break;
- }
- } endfor_ifa(in_dev);
- }
- rcu_read_unlock();
-
- if (mask == 0)
- goto out;
-
- exp = nf_ct_expect_alloc(ct);
- if (exp == NULL)
- goto out;
-
- exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = htons(NMBD_PORT);
-
- exp->mask.src.u3.ip = mask;
- exp->mask.src.u.udp.port = htons(0xFFFF);
-
- exp->expectfn = NULL;
- exp->flags = NF_CT_EXPECT_PERMANENT;
- exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
- exp->helper = NULL;
-
- nf_ct_expect_related(exp);
- nf_ct_expect_put(exp);
-
- nf_ct_refresh(ct, skb, timeout * HZ);
-out:
- return NF_ACCEPT;
-}
-
static struct nf_conntrack_expect_policy exp_policy = {
.max_expected = 1,
};
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
static struct nf_conntrack_helper helper __read_mostly = {
.name = "netbios-ns",
- .tuple.src.l3num = AF_INET,
+ .tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
.me = THIS_MODULE,
- .help = help,
+ .help = netbios_ns_help,
.expect_policy = &exp_policy,
};
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index b729ace1dcc..300ed1eec72 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -4,7 +4,7 @@
* (C) 2001 by Jay Schulist <jschlst@samba.org>
* (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
* (C) 2003 by Patrick Mchardy <kaber@trash.net>
- * (C) 2005-2008 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>
*
* Initial connection tracking via netlink development funded and
* generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -37,14 +37,18 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_labels.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_helper.h>
#endif
#include <linux/netfilter/nfnetlink.h>
@@ -65,7 +69,8 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb,
nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
- NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum);
+ if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum))
+ goto nla_put_failure;
if (likely(l4proto->tuple_to_nlattr))
ret = l4proto->tuple_to_nlattr(skb, tuple);
@@ -109,22 +114,24 @@ ctnetlink_dump_tuples(struct sk_buff *skb,
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
+ rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
- if (unlikely(ret < 0))
- return ret;
-
- l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
- ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
-
+ if (ret >= 0) {
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
+ tuple->dst.protonum);
+ ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
+ }
+ rcu_read_unlock();
return ret;
}
static inline int
ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
- NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status));
+ if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -134,12 +141,13 @@ nla_put_failure:
static inline int
ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
{
- long timeout = (ct->timeout.expires - jiffies) / HZ;
+ long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
if (timeout < 0)
timeout = 0;
- NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout));
+ if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -188,7 +196,8 @@ ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);
if (!nest_helper)
goto nla_put_failure;
- NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name);
+ if (nla_put_string(skb, CTA_HELP_NAME, helper->name))
+ goto nla_put_failure;
if (helper->to_nlattr)
helper->to_nlattr(skb, ct);
@@ -202,26 +211,72 @@ nla_put_failure:
}
static int
-ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
- enum ip_conntrack_dir dir)
+dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
+ enum ip_conntrack_dir dir, int type)
{
- enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nf_conn_counter *counter = acct->counter;
struct nlattr *nest_count;
- const struct nf_conn_counter *acct;
+ u64 pkts, bytes;
+
+ if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&counter[dir].packets, 0);
+ bytes = atomic64_xchg(&counter[dir].bytes, 0);
+ } else {
+ pkts = atomic64_read(&counter[dir].packets);
+ bytes = atomic64_read(&counter[dir].bytes);
+ }
+
+ nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) ||
+ nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_count);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)
+{
+ struct nf_conn_acct *acct = nf_conn_acct_find(ct);
- acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
- nest_count = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0)
+ return -1;
+ if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_count;
+ const struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (!tstamp)
+ return 0;
+
+ nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
if (!nest_count)
goto nla_put_failure;
- NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS,
- cpu_to_be64(acct[dir].packets));
- NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES,
- cpu_to_be64(acct[dir].bytes));
-
+ if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) ||
+ (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP,
+ cpu_to_be64(tstamp->stop))))
+ goto nla_put_failure;
nla_nest_end(skb, nest_count);
return 0;
@@ -234,7 +289,8 @@ nla_put_failure:
static inline int
ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
{
- NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark));
+ if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -254,14 +310,15 @@ ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
if (ret)
- return ret;
+ return 0;
ret = -1;
nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
if (!nest_secctx)
goto nla_put_failure;
- NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+ if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ goto nla_put_failure;
nla_nest_end(skb, nest_secctx);
ret = 0;
@@ -273,6 +330,40 @@ nla_put_failure:
#define ctnetlink_dump_secctx(a, b) (0)
#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+static int ctnetlink_label_size(const struct nf_conn *ct)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return 0;
+ return nla_total_size(labels->words * sizeof(long));
+}
+
+static int
+ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+ unsigned int len, i;
+
+ if (!labels)
+ return 0;
+
+ len = labels->words * sizeof(long);
+ i = 0;
+ do {
+ if (labels->bits[i] != 0)
+ return nla_put(skb, CTA_LABELS, len, labels->bits);
+ i++;
+ } while (i < labels->words);
+
+ return 0;
+}
+#else
+#define ctnetlink_dump_labels(a, b) (0)
+#define ctnetlink_label_size(a) (0)
+#endif
+
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
static inline int
@@ -296,9 +387,8 @@ nla_put_failure:
return -1;
}
-#ifdef CONFIG_NF_NAT_NEEDED
static int
-dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
+dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
{
struct nlattr *nest_parms;
@@ -306,12 +396,13 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
if (!nest_parms)
goto nla_put_failure;
- NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS,
- htonl(natseq->correction_pos));
- NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE,
- htonl(natseq->offset_before));
- NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER,
- htonl(natseq->offset_after));
+ if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
+ htonl(seq->correction_pos)) ||
+ nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
+ htonl(seq->offset_before)) ||
+ nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
+ htonl(seq->offset_after)))
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
@@ -322,32 +413,30 @@ nla_put_failure:
}
static inline int
-ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
{
- struct nf_nat_seq *natseq;
- struct nf_conn_nat *nat = nfct_nat(ct);
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *seq;
- if (!(ct->status & IPS_SEQ_ADJUST) || !nat)
+ if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
return 0;
- natseq = &nat->seq[IP_CT_DIR_ORIGINAL];
- if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1)
+ seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
+ if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
return -1;
- natseq = &nat->seq[IP_CT_DIR_REPLY];
- if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1)
+ seq = &seqadj->seq[IP_CT_DIR_REPLY];
+ if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
return -1;
return 0;
}
-#else
-#define ctnetlink_dump_nat_seq_adj(a, b) (0)
-#endif
static inline int
ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
- NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct));
+ if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -357,7 +446,8 @@ nla_put_failure:
static inline int
ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
- NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)));
+ if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -365,16 +455,16 @@ nla_put_failure:
}
static int
-ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
- int event, struct nf_conn *ct)
+ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ struct nf_conn *ct)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
- unsigned int flags = pid ? NLM_F_MULTI : 0;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
- event |= NFNL_SUBSYS_CTNETLINK << 8;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags);
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
@@ -397,21 +487,23 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct))
- NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct)));
+ if (nf_ct_zone(ct) &&
+ nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0 ||
ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
ctnetlink_dump_secctx(skb, ct) < 0 ||
+ ctnetlink_dump_labels(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
ctnetlink_dump_master(skb, ct) < 0 ||
- ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -423,7 +515,6 @@ nla_put_failure:
return -1;
}
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
static inline size_t
ctnetlink_proto_size(const struct nf_conn *ct)
{
@@ -443,7 +534,7 @@ ctnetlink_proto_size(const struct nf_conn *ct)
}
static inline size_t
-ctnetlink_counters_size(const struct nf_conn *ct)
+ctnetlink_acct_size(const struct nf_conn *ct)
{
if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
return 0;
@@ -453,16 +544,34 @@ ctnetlink_counters_size(const struct nf_conn *ct)
;
}
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+static inline int
+ctnetlink_secctx_size(const struct nf_conn *ct)
{
- int len;
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ int len, ret;
- security_secid_to_secctx(ct->secmark, NULL, &len);
+ ret = security_secid_to_secctx(ct->secmark, NULL, &len);
+ if (ret)
+ return 0;
- return sizeof(char) * len;
+ return nla_total_size(0) /* CTA_SECCTX */
+ + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+#else
+ return 0;
+#endif
}
+
+static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ return 0;
+ return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#else
+ return 0;
#endif
+}
static inline size_t
ctnetlink_nlmsg_size(const struct nf_conn *ct)
@@ -474,15 +583,13 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
- + ctnetlink_counters_size(ct)
+ + ctnetlink_acct_size(ct)
+ + ctnetlink_timestamp_size(ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
- + nla_total_size(0) /* CTA_SECCTX */
- + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
-#endif
+ + ctnetlink_secctx_size(ct)
#ifdef CONFIG_NF_NAT_NEEDED
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
@@ -490,10 +597,15 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
#ifdef CONFIG_NF_CONNTRACK_MARK
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+ ctnetlink_proto_size(ct)
+ + ctnetlink_label_size(ct)
;
}
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
@@ -533,7 +645,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto errout;
type |= NFNL_SUBSYS_CTNETLINK << 8;
- nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags);
+ nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
@@ -557,8 +669,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct))
- NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct)));
+ if (nf_ct_zone(ct) &&
+ nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
if (ctnetlink_dump_id(skb, ct) < 0)
goto nla_put_failure;
@@ -567,8 +680,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
- if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
} else {
if (ctnetlink_dump_timeout(skb, ct) < 0)
@@ -587,13 +700,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
&& ctnetlink_dump_secctx(skb, ct) < 0)
goto nla_put_failure;
#endif
+ if (events & (1 << IPCT_LABEL) &&
+ ctnetlink_dump_labels(skb, ct) < 0)
+ goto nla_put_failure;
if (events & (1 << IPCT_RELATED) &&
ctnetlink_dump_master(skb, ct) < 0)
goto nla_put_failure;
- if (events & (1 << IPCT_NATSEQADJ) &&
- ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+ if (events & (1 << IPCT_SEQADJ) &&
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
goto nla_put_failure;
}
@@ -605,7 +721,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
rcu_read_unlock();
nlmsg_end(skb, nlh);
- err = nfnetlink_send(skb, net, item->pid, group, item->report,
+ err = nfnetlink_send(skb, net, item->portid, group, item->report,
GFP_ATOMIC);
if (err == -ENOBUFS || err == -EAGAIN)
return -ENOBUFS;
@@ -629,9 +745,18 @@ static int ctnetlink_done(struct netlink_callback *cb)
{
if (cb->args[1])
nf_ct_put((struct nf_conn *)cb->args[1]);
+ if (cb->data)
+ kfree(cb->data);
return 0;
}
+struct ctnetlink_dump_filter {
+ struct {
+ u_int32_t val;
+ u_int32_t mask;
+ } mark;
+};
+
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -641,53 +766,67 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
struct hlist_nulls_node *n;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
+ int res;
+ spinlock_t *lockp;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ const struct ctnetlink_dump_filter *filter = cb->data;
+#endif
- rcu_read_lock();
last = (struct nf_conn *)cb->args[1];
+
+ local_bh_disable();
for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
restart:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[cb->args[0]],
+ lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+ spin_lock(lockp);
+ if (cb->args[0] >= net->ct.htable_size) {
+ spin_unlock(lockp);
+ goto out;
+ }
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- continue;
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
if (l3proto && nf_ct_l3num(ct) != l3proto)
- goto releasect;
+ continue;
if (cb->args[1]) {
if (ct != last)
- goto releasect;
+ continue;
cb->args[1] = 0;
}
- if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW, ct) < 0) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (filter && !((ct->mark & filter->mark.mask) ==
+ filter->mark.val)) {
+ continue;
+ }
+#endif
+ rcu_read_lock();
+ res =
+ ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
+ spin_unlock(lockp);
goto out;
}
-
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) ==
- IPCTNL_MSG_CT_GET_CTRZERO) {
- struct nf_conn_counter *acct;
-
- acct = nf_conn_acct_find(ct);
- if (acct)
- memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]));
- }
-releasect:
- nf_ct_put(ct);
}
+ spin_unlock(lockp);
if (cb->args[1]) {
cb->args[1] = 0;
goto restart;
}
}
out:
- rcu_read_unlock();
+ local_bh_enable();
if (last)
nf_ct_put(last);
@@ -701,7 +840,9 @@ ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
struct nf_conntrack_l3proto *l3proto;
int ret = 0;
- nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
+ ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
+ if (ret < 0)
+ return ret;
rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
@@ -761,14 +902,16 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
struct nf_conntrack_tuple *tuple,
- enum ctattr_tuple type, u_int8_t l3num)
+ enum ctattr_type type, u_int8_t l3num)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
memset(tuple, 0, sizeof(*tuple));
- nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
+ err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
+ if (err < 0)
+ return err;
if (!tb[CTA_TUPLE_IP])
return -EINVAL;
@@ -811,21 +954,29 @@ ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
}
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
- [CTA_HELP_NAME] = { .type = NLA_NUL_STRING },
+ [CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN - 1 },
};
static inline int
-ctnetlink_parse_help(const struct nlattr *attr, char **helper_name)
+ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
+ struct nlattr **helpinfo)
{
+ int err;
struct nlattr *tb[CTA_HELP_MAX+1];
- nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
+ err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
+ if (err < 0)
+ return err;
if (!tb[CTA_HELP_NAME])
return -EINVAL;
*helper_name = nla_data(tb[CTA_HELP_NAME]);
+ if (tb[CTA_HELP_INFO])
+ *helpinfo = tb[CTA_HELP_INFO];
+
return 0;
}
@@ -841,7 +992,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
[CTA_ID] = { .type = NLA_U32 },
[CTA_NAT_DST] = { .type = NLA_NESTED },
[CTA_TUPLE_MASTER] = { .type = NLA_NESTED },
+ [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED },
+ [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
[CTA_ZONE] = { .type = NLA_U16 },
+ [CTA_MARK_MASK] = { .type = NLA_U32 },
+ [CTA_LABELS] = { .type = NLA_BINARY,
+ .len = NF_CT_LABELS_MAX_SIZE },
+ [CTA_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = NF_CT_LABELS_MAX_SIZE },
};
static int
@@ -869,7 +1027,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
else {
/* Flush the whole table */
nf_conntrack_flush_report(net,
- NETLINK_CB(skb).pid,
+ NETLINK_CB(skb).portid,
nlmsg_report(nlh));
return 0;
}
@@ -891,20 +1049,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
}
}
- if (nf_conntrack_event_report(IPCT_DESTROY, ct,
- NETLINK_CB(skb).pid,
- nlmsg_report(nlh)) < 0) {
- nf_ct_delete_from_lists(ct);
- /* we failed to report the event, try later */
- nf_ct_insert_dying_list(ct);
- nf_ct_put(ct);
- return 0;
- }
+ if (del_timer(&ct->timeout))
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
- /* death_by_timeout would report the event again */
- set_bit(IPS_DYING_BIT, &ct->status);
-
- nf_ct_kill(ct);
nf_ct_put(ct);
return 0;
@@ -925,9 +1072,28 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
u16 zone;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP)
- return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table,
- ctnetlink_done);
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_table,
+ .done = ctnetlink_done,
+ };
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
+ struct ctnetlink_dump_filter *filter;
+
+ filter = kzalloc(sizeof(struct ctnetlink_dump_filter),
+ GFP_ATOMIC);
+ if (filter == NULL)
+ return -ENOMEM;
+
+ filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
+ filter->mark.mask =
+ ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ c.data = filter;
+ }
+#endif
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
if (err < 0)
@@ -957,14 +1123,14 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
}
rcu_read_lock();
- err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW, ct);
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type), ct);
rcu_read_unlock();
nf_ct_put(ct);
if (err <= 0)
goto free;
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
if (err < 0)
goto out;
@@ -973,7 +1139,125 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
free:
kfree_skb(skb2);
out:
- return err;
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static int ctnetlink_done_list(struct netlink_callback *cb)
+{
+ if (cb->args[1])
+ nf_ct_put((struct nf_conn *)cb->args[1]);
+ return 0;
+}
+
+static int
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+{
+ struct nf_conn *ct, *last;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+ int res;
+ int cpu;
+ struct hlist_nulls_head *list;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct nf_conn *)cb->args[1];
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ struct ct_pcpu *pcpu;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+ spin_lock_bh(&pcpu->lock);
+ list = dying ? &pcpu->dying : &pcpu->unconfirmed;
+restart:
+ hlist_nulls_for_each_entry(h, n, list, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (ct != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ rcu_read_lock();
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+ if (!atomic_inc_not_zero(&ct->ct_general.use))
+ continue;
+ cb->args[0] = cpu;
+ cb->args[1] = (unsigned long)ct;
+ spin_unlock_bh(&pcpu->lock);
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ spin_unlock_bh(&pcpu->lock);
+ }
+ cb->args[2] = 1;
+out:
+ if (last)
+ nf_ct_put(last);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ctnetlink_dump_list(skb, cb, true);
+}
+
+static int
+ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_dying,
+ .done = ctnetlink_done_list,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ctnetlink_dump_list(skb, cb, false);
+}
+
+static int
+ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_unconfirmed,
+ .done = ctnetlink_done_list,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
}
#ifdef CONFIG_NF_NAT_NEEDED
@@ -983,21 +1267,19 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
const struct nlattr *attr)
{
typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+ int err;
parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
if (!parse_nat_setup) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
- spin_unlock_bh(&nf_conntrack_lock);
- nfnl_unlock();
- if (request_module("nf-nat-ipv4") < 0) {
- nfnl_lock();
- spin_lock_bh(&nf_conntrack_lock);
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ if (request_module("nf-nat") < 0) {
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
return -EOPNOTSUPP;
}
- nfnl_lock();
- spin_lock_bh(&nf_conntrack_lock);
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
if (nfnetlink_parse_nat_setup_hook)
return -EAGAIN;
@@ -1005,7 +1287,23 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
return -EOPNOTSUPP;
}
- return parse_nat_setup(ct, manip, attr);
+ err = parse_nat_setup(ct, manip, attr);
+ if (err == -EAGAIN) {
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ return -EOPNOTSUPP;
+ }
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+#else
+ err = -EOPNOTSUPP;
+#endif
+ }
+ return err;
}
#endif
@@ -1036,27 +1334,25 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
}
static int
-ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
+ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_NAT_NEEDED
int ret;
- if (cda[CTA_NAT_DST]) {
- ret = ctnetlink_parse_nat_setup(ct,
- IP_NAT_MANIP_DST,
- cda[CTA_NAT_DST]);
- if (ret < 0)
- return ret;
- }
- if (cda[CTA_NAT_SRC]) {
- ret = ctnetlink_parse_nat_setup(ct,
- IP_NAT_MANIP_SRC,
- cda[CTA_NAT_SRC]);
- if (ret < 0)
- return ret;
- }
- return 0;
+ if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+ return 0;
+
+ ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST,
+ cda[CTA_NAT_DST]);
+ if (ret < 0)
+ return ret;
+
+ ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC,
+ cda[CTA_NAT_SRC]);
+ return ret;
#else
+ if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+ return 0;
return -EOPNOTSUPP;
#endif
}
@@ -1067,13 +1363,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
struct nf_conntrack_helper *helper;
struct nf_conn_help *help = nfct_help(ct);
char *helpname = NULL;
+ struct nlattr *helpinfo = NULL;
int err;
/* don't change helper of sibling connections */
if (ct->master)
return -EBUSY;
- err = ctnetlink_parse_help(cda[CTA_HELP], &helpname);
+ err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
if (err < 0)
return err;
@@ -1081,7 +1378,7 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
if (help && help->helper) {
/* we had a helper before ... */
nf_ct_remove_expectations(ct);
- rcu_assign_pointer(help->helper, NULL);
+ RCU_INIT_POINTER(help->helper, NULL);
}
return 0;
@@ -1091,14 +1388,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
nf_ct_protonum(ct));
if (helper == NULL) {
#ifdef CONFIG_MODULES
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
if (request_module("nfct-helper-%s", helpname) < 0) {
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
return -EOPNOTSUPP;
}
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper)
@@ -1108,20 +1405,17 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
}
if (help) {
- if (help->helper == helper)
+ if (help->helper == helper) {
+ /* update private helper data if allowed. */
+ if (helper->from_nlattr)
+ helper->from_nlattr(helpinfo, ct);
return 0;
- if (help->helper)
+ } else
return -EBUSY;
- /* need to zero data of old helper */
- memset(&help->help, 0, sizeof(help->help));
- } else {
- /* we cannot set a helper for an existing conntrack */
- return -EOPNOTSUPP;
}
- rcu_assign_pointer(help->helper, helper);
-
- return 0;
+ /* we cannot set a helper for an existing conntrack */
+ return -EOPNOTSUPP;
}
static inline int
@@ -1152,7 +1446,9 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]
struct nf_conntrack_l4proto *l4proto;
int err = 0;
- nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
+ err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
+ if (err < 0)
+ return err;
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
@@ -1163,63 +1459,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]
return err;
}
-#ifdef CONFIG_NF_NAT_NEEDED
-static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = {
- [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 },
- [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 },
- [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 },
+static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
+ [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 },
+ [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 },
+ [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 },
};
static inline int
-change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr)
+change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
{
- struct nlattr *cda[CTA_NAT_SEQ_MAX+1];
+ int err;
+ struct nlattr *cda[CTA_SEQADJ_MAX+1];
- nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
+ err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy);
+ if (err < 0)
+ return err;
- if (!cda[CTA_NAT_SEQ_CORRECTION_POS])
+ if (!cda[CTA_SEQADJ_CORRECTION_POS])
return -EINVAL;
- natseq->correction_pos =
- ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS]));
+ seq->correction_pos =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));
- if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE])
+ if (!cda[CTA_SEQADJ_OFFSET_BEFORE])
return -EINVAL;
- natseq->offset_before =
- ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE]));
+ seq->offset_before =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));
- if (!cda[CTA_NAT_SEQ_OFFSET_AFTER])
+ if (!cda[CTA_SEQADJ_OFFSET_AFTER])
return -EINVAL;
- natseq->offset_after =
- ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER]));
+ seq->offset_after =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));
return 0;
}
static int
-ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
- const struct nlattr * const cda[])
+ctnetlink_change_seq_adj(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
int ret = 0;
- struct nf_conn_nat *nat = nfct_nat(ct);
- if (!nat)
+ if (!seqadj)
return 0;
- if (cda[CTA_NAT_SEQ_ADJ_ORIG]) {
- ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL],
- cda[CTA_NAT_SEQ_ADJ_ORIG]);
+ if (cda[CTA_SEQ_ADJ_ORIG]) {
+ ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
+ cda[CTA_SEQ_ADJ_ORIG]);
if (ret < 0)
return ret;
ct->status |= IPS_SEQ_ADJUST;
}
- if (cda[CTA_NAT_SEQ_ADJ_REPLY]) {
- ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY],
- cda[CTA_NAT_SEQ_ADJ_REPLY]);
+ if (cda[CTA_SEQ_ADJ_REPLY]) {
+ ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
+ cda[CTA_SEQ_ADJ_REPLY]);
if (ret < 0)
return ret;
@@ -1228,7 +1526,31 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
return 0;
}
+
+static int
+ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ size_t len = nla_len(cda[CTA_LABELS]);
+ const void *mask = cda[CTA_LABELS_MASK];
+
+ if (len & (sizeof(u32)-1)) /* must be multiple of u32 */
+ return -EINVAL;
+
+ if (mask) {
+ if (nla_len(cda[CTA_LABELS_MASK]) == 0 ||
+ nla_len(cda[CTA_LABELS_MASK]) != len)
+ return -EINVAL;
+ mask = nla_data(cda[CTA_LABELS_MASK]);
+ }
+
+ len /= sizeof(u32);
+
+ return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len);
+#else
+ return -EOPNOTSUPP;
#endif
+}
static int
ctnetlink_change_conntrack(struct nf_conn *ct,
@@ -1269,13 +1591,17 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
#endif
-#ifdef CONFIG_NF_NAT_NEEDED
- if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
- err = ctnetlink_change_nat_seq_adj(ct, cda);
+ if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+ err = ctnetlink_change_seq_adj(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_LABELS]) {
+ err = ctnetlink_attach_labels(ct, cda);
if (err < 0)
return err;
}
-#endif
return 0;
}
@@ -1290,6 +1616,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
struct nf_conn *ct;
int err = -EINVAL;
struct nf_conntrack_helper *helper;
+ struct nf_conn_tstamp *tstamp;
ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
if (IS_ERR(ct))
@@ -1304,8 +1631,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
rcu_read_lock();
if (cda[CTA_HELP]) {
char *helpname = NULL;
-
- err = ctnetlink_parse_help(cda[CTA_HELP], &helpname);
+ struct nlattr *helpinfo = NULL;
+
+ err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
if (err < 0)
goto err2;
@@ -1334,14 +1662,17 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
} else {
struct nf_conn_help *help;
- help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
if (help == NULL) {
err = -ENOMEM;
goto err2;
}
+ /* set private helper data if allowed. */
+ if (helper->from_nlattr)
+ helper->from_nlattr(helpinfo, ct);
/* not in hash table yet so not strictly necessary */
- rcu_assign_pointer(help->helper, helper);
+ RCU_INIT_POINTER(help->helper, helper);
}
} else {
/* try an implicit helper assignation */
@@ -1350,14 +1681,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
goto err2;
}
- if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
- err = ctnetlink_change_nat(ct, cda);
- if (err < 0)
- goto err2;
- }
+ err = ctnetlink_setup_nat(ct, cda);
+ if (err < 0)
+ goto err2;
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
+ nf_ct_labels_ext_add(ct);
+
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
@@ -1367,14 +1699,13 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
goto err2;
}
-#ifdef CONFIG_NF_NAT_NEEDED
- if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
- err = ctnetlink_change_nat_seq_adj(ct, cda);
+ if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+ err = ctnetlink_change_seq_adj(ct, cda);
if (err < 0)
goto err2;
}
-#endif
+ memset(&ct->proto, 0, sizeof(ct->proto));
if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
@@ -1405,9 +1736,14 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
__set_bit(IPS_EXPECTED_BIT, &ct->status);
ct->master = master_ct;
}
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp)
+ tstamp->start = ktime_to_ns(ktime_get_real());
+
+ err = nf_conntrack_hash_check_insert(ct);
+ if (err < 0)
+ goto err2;
- add_timer(&ct->timeout);
- nf_conntrack_hash_insert(ct);
rcu_read_unlock();
return ct;
@@ -1428,6 +1764,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nf_conn *ct;
u_int8_t u3 = nfmsg->nfgen_family;
u16 zone;
int err;
@@ -1448,78 +1785,460 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
}
- spin_lock_bh(&nf_conntrack_lock);
if (cda[CTA_TUPLE_ORIG])
- h = __nf_conntrack_find(net, zone, &otuple);
+ h = nf_conntrack_find_get(net, zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
- h = __nf_conntrack_find(net, zone, &rtuple);
+ h = nf_conntrack_find_get(net, zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE) {
- struct nf_conn *ct;
enum ip_conntrack_events events;
+ if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
+ return -EINVAL;
+
ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
&rtuple, u3);
- if (IS_ERR(ct)) {
- err = PTR_ERR(ct);
- goto out_unlock;
- }
+ if (IS_ERR(ct))
+ return PTR_ERR(ct);
+
err = 0;
- nf_conntrack_get(&ct->ct_general);
- spin_unlock_bh(&nf_conntrack_lock);
if (test_bit(IPS_EXPECTED_BIT, &ct->status))
events = IPCT_RELATED;
else
events = IPCT_NEW;
+ if (cda[CTA_LABELS] &&
+ ctnetlink_attach_labels(ct, cda) == 0)
+ events |= (1 << IPCT_LABEL);
+
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
(1 << IPCT_ASSURED) |
(1 << IPCT_HELPER) |
(1 << IPCT_PROTOINFO) |
- (1 << IPCT_NATSEQADJ) |
+ (1 << IPCT_SEQADJ) |
(1 << IPCT_MARK) | events,
- ct, NETLINK_CB(skb).pid,
+ ct, NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_put(ct);
- } else
- spin_unlock_bh(&nf_conntrack_lock);
+ }
return err;
}
/* implicit 'else' */
- /* We manipulate the conntrack inside the global conntrack table lock,
- * so there's no need to increase the refcount */
err = -EEXIST;
+ ct = nf_ct_tuplehash_to_ctrack(h);
if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
+ spin_lock_bh(&nf_conntrack_expect_lock);
err = ctnetlink_change_conntrack(ct, cda);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
if (err == 0) {
- nf_conntrack_get(&ct->ct_general);
- spin_unlock_bh(&nf_conntrack_lock);
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
(1 << IPCT_ASSURED) |
(1 << IPCT_HELPER) |
+ (1 << IPCT_LABEL) |
(1 << IPCT_PROTOINFO) |
- (1 << IPCT_NATSEQADJ) |
+ (1 << IPCT_SEQADJ) |
(1 << IPCT_MARK),
- ct, NETLINK_CB(skb).pid,
+ ct, NETLINK_CB(skb).portid,
nlmsg_report(nlh));
- nf_ct_put(ct);
- } else
- spin_unlock_bh(&nf_conntrack_lock);
+ }
+ }
+
+ nf_ct_put(ct);
+ return err;
+}
+
+static int
+ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
+ __u16 cpu, const struct ip_conntrack_stat *st)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(cpu);
+
+ if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
+ nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
+ nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+ nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
+ nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
+ nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
+ nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
+ nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
+ nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
+ htonl(st->insert_failed)) ||
+ nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) ||
+ nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
+ nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
+ nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
+ htonl(st->search_restart)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int cpu;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[0] == nr_cpu_ids)
+ return 0;
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ const struct ip_conntrack_stat *st;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ st = per_cpu_ptr(net->ct.stat, cpu);
+ if (ctnetlink_ct_stat_cpu_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ cpu, st) < 0)
+ break;
+ }
+ cb->args[0] = cpu;
+
+ return skb->len;
+}
+
+static int
+ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_ct_stat_cpu_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return 0;
+}
+
+static int
+ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ struct net *net)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct sk_buff *skb2;
+ int err;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ sock_net(skb->sk));
+ if (err <= 0)
+ goto free;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ return 0;
+
+free:
+ kfree_skb(skb2);
+out:
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
+ [CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
+ [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
+ [CTA_EXPECT_MASK] = { .type = NLA_NESTED },
+ [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
+ [CTA_EXPECT_ID] = { .type = NLA_U32 },
+ [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN - 1 },
+ [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
+ [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
+ [CTA_EXPECT_CLASS] = { .type = NLA_U32 },
+ [CTA_EXPECT_NAT] = { .type = NLA_NESTED },
+ [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING },
+};
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
+ struct nf_conntrack_helper *helper,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask);
+
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+static size_t
+ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+{
+ return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ + nla_total_size(0) /* CTA_PROTOINFO */
+ + nla_total_size(0) /* CTA_HELP */
+ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ + ctnetlink_secctx_size(ct)
+#ifdef CONFIG_NF_NAT_NEEDED
+ + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+ + ctnetlink_proto_size(ct)
+ ;
+}
+
+static int
+ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ rcu_read_lock();
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct)) {
+ if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
+ }
+
+ if (ctnetlink_dump_id(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_status(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (ct->master && ctnetlink_dump_master(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if ((ct->status & IPS_SEQ_ADJUST) &&
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (ctnetlink_dump_labels(skb, ct) < 0)
+ goto nla_put_failure;
+ rcu_read_unlock();
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ return -ENOSPC;
+}
+
+static int
+ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
+{
+ int err;
+
+ if (cda[CTA_TIMEOUT]) {
+ err = ctnetlink_change_timeout(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_STATUS]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_HELP]) {
+ err = ctnetlink_change_helper(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_LABELS]) {
+ err = ctnetlink_attach_labels(ct, cda);
+ if (err < 0)
+ return err;
+ }
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK]) {
+ u32 mask = 0, mark, newmark;
+ if (cda[CTA_MARK_MASK])
+ mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+
+ mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+ newmark = (ct->mark & mask) ^ mark;
+ if (newmark != ct->mark)
+ ct->mark = newmark;
+ }
+#endif
+ return 0;
+}
+
+static int
+ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
+{
+ struct nlattr *cda[CTA_MAX+1];
+ int ret;
+
+ ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ return ret;
+}
+
+static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
+ const struct nf_conn *ct,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
+{
+ int err;
+
+ err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
+ nf_ct_l3num(ct));
+ if (err < 0)
+ return err;
+
+ return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
+ nf_ct_l3num(ct));
+}
+
+static int
+ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+ u32 portid, u32 report)
+{
+ struct nlattr *cda[CTA_EXPECT_MAX+1];
+ struct nf_conntrack_tuple tuple, mask;
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conntrack_expect *exp;
+ int err;
+
+ err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy);
+ if (err < 0)
+ return err;
+
+ err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
+ ct, &tuple, &mask);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL)
+ return -EOPNOTSUPP;
+ }
+ exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
+ helper, &tuple, &mask);
+ if (IS_ERR(exp))
+ return PTR_ERR(exp);
+
+ err = nf_ct_expect_related_report(exp, portid, report);
+ if (err < 0) {
+ nf_ct_expect_put(exp);
return err;
}
-out_unlock:
- spin_unlock_bh(&nf_conntrack_lock);
- return err;
+ return 0;
}
+static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
+ .build_size = ctnetlink_nfqueue_build_size,
+ .build = ctnetlink_nfqueue_build,
+ .parse = ctnetlink_nfqueue_parse,
+ .attach_expect = ctnetlink_nfqueue_attach_expect,
+ .seq_adjust = nf_ct_tcp_seqadj_set,
+};
+#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
+
/***********************************************************************
* EXPECT
***********************************************************************/
@@ -1564,14 +2283,16 @@ ctnetlink_exp_dump_mask(struct sk_buff *skb,
if (!nest_parms)
goto nla_put_failure;
+ rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
-
- if (unlikely(ret < 0))
- goto nla_put_failure;
-
- l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+ if (ret >= 0) {
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
+ tuple->dst.protonum);
ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
+ }
+ rcu_read_unlock();
+
if (unlikely(ret < 0))
goto nla_put_failure;
@@ -1583,13 +2304,20 @@ nla_put_failure:
return -1;
}
+static const union nf_inet_addr any_addr;
+
static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp)
{
struct nf_conn *master = exp->master;
- long timeout = (exp->timeout.expires - jiffies) / HZ;
+ long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
struct nf_conn_help *help;
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nlattr *nest_parms;
+ struct nf_conntrack_tuple nat_tuple = {};
+#endif
+ struct nf_ct_helper_expectfn *expfn;
if (timeout < 0)
timeout = 0;
@@ -1603,17 +2331,45 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
CTA_EXPECT_MASTER) < 0)
goto nla_put_failure;
- NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
- NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
- NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
+ exp->saved_proto.all) {
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)))
+ goto nla_put_failure;
+
+ nat_tuple.src.l3num = nf_ct_l3num(master);
+ nat_tuple.src.u3 = exp->saved_addr;
+ nat_tuple.dst.protonum = nf_ct_protonum(master);
+ nat_tuple.src.u = exp->saved_proto;
+
+ if (ctnetlink_exp_dump_tuple(skb, &nat_tuple,
+ CTA_EXPECT_NAT_TUPLE) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+ }
+#endif
+ if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
+ nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
+ nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
+ nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
+ goto nla_put_failure;
help = nfct_help(master);
if (help) {
struct nf_conntrack_helper *helper;
helper = rcu_dereference(help->helper);
- if (helper)
- NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ if (helper &&
+ nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name))
+ goto nla_put_failure;
}
+ expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn);
+ if (expfn != NULL &&
+ nla_put_string(skb, CTA_EXPECT_FN, expfn->name))
+ goto nla_put_failure;
return 0;
@@ -1622,15 +2378,15 @@ nla_put_failure:
}
static int
-ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
int event, const struct nf_conntrack_expect *exp)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- unsigned int flags = pid ? NLM_F_MULTI : 0;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
@@ -1681,7 +2437,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
goto errout;
type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
- nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags);
+ nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
@@ -1696,7 +2452,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
rcu_read_unlock();
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
+ nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC);
return 0;
nla_put_failure:
@@ -1722,14 +2478,13 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
struct net *net = sock_net(skb->sk);
struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- struct hlist_node *n;
u_int8_t l3proto = nfmsg->nfgen_family;
rcu_read_lock();
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
- hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]],
+ hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
@@ -1739,7 +2494,7 @@ restart:
cb->args[1] = 0;
}
if (ctnetlink_exp_fill_info(skb,
- NETLINK_CB(cb->skb).pid,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
@@ -1762,16 +2517,91 @@ out:
return skb->len;
}
-static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
- [CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
- [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
- [CTA_EXPECT_MASK] = { .type = NLA_NESTED },
- [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
- [CTA_EXPECT_ID] = { .type = NLA_U32 },
- [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING },
- [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
- [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
-};
+static int
+ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nf_conn *ct = cb->data;
+ struct nf_conn_help *help = nfct_help(ct);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+
+ if (cb->args[0])
+ return 0;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_expect *)cb->args[1];
+restart:
+ hlist_for_each_entry(exp, &help->expectations, lnode) {
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ cb->args[0] = 1;
+out:
+ rcu_read_unlock();
+ if (last)
+ nf_ct_expect_put(last);
+
+ return skb->len;
+}
+
+static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ int err;
+ struct net *net = sock_net(ctnl);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ u16 zone = 0;
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_ct_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+ }
+
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return -ENOENT;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ c.data = ct;
+
+ err = netlink_dump_start(ctnl, skb, nlh, &c);
+ nf_ct_put(ct);
+
+ return err;
+}
static int
ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
@@ -1788,16 +2618,24 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- return netlink_dump_start(ctnl, skb, nlh,
- ctnetlink_exp_dump_table,
- ctnetlink_exp_done);
+ if (cda[CTA_EXPECT_MASTER])
+ return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
+ else {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
}
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
if (err < 0)
return err;
- if (cda[CTA_EXPECT_MASTER])
+ if (cda[CTA_EXPECT_TUPLE])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ else if (cda[CTA_EXPECT_MASTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
else
return -EINVAL;
@@ -1819,25 +2657,30 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (skb2 == NULL) {
+ nf_ct_expect_put(exp);
goto out;
+ }
rcu_read_lock();
- err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
+ err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
rcu_read_unlock();
+ nf_ct_expect_put(exp);
if (err <= 0)
goto free;
- nf_ct_expect_put(exp);
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
- return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ return 0;
free:
kfree_skb(skb2);
out:
- nf_ct_expect_put(exp);
- return err;
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
}
static int
@@ -1849,7 +2692,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- struct hlist_node *n, *next;
+ struct hlist_node *next;
u_int8_t u3 = nfmsg->nfgen_family;
unsigned int i;
u16 zone;
@@ -1879,13 +2722,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
}
/* after list removal, usage count == 1 */
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
if (del_timer(&exp->timeout)) {
- nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+ nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
/* have to put what we 'get' above.
* after this line usage count == 0 */
nf_ct_expect_put(exp);
@@ -1894,38 +2737,38 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
struct nf_conn_help *m_help;
/* delete all expectations for this helper */
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
- hlist_for_each_entry_safe(exp, n, next,
+ hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i],
hnode) {
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
- NETLINK_CB(skb).pid,
+ NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
} else {
/* This basically means we have to flush everything*/
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
- hlist_for_each_entry_safe(exp, n, next,
+ hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i],
hnode) {
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
- NETLINK_CB(skb).pid,
+ NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
return 0;
@@ -1934,48 +2777,79 @@ static int
ctnetlink_change_expect(struct nf_conntrack_expect *x,
const struct nlattr * const cda[])
{
- return -EOPNOTSUPP;
+ if (cda[CTA_EXPECT_TIMEOUT]) {
+ if (!del_timer(&x->timeout))
+ return -ETIME;
+
+ x->timeout.expires = jiffies +
+ ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+ add_timer(&x->timeout);
+ }
+ return 0;
}
+static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
+ [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
+ [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
+};
+
static int
-ctnetlink_create_expect(struct net *net, u16 zone,
- const struct nlattr * const cda[],
- u_int8_t u3,
- u32 pid, int report)
+ctnetlink_parse_expect_nat(const struct nlattr *attr,
+ struct nf_conntrack_expect *exp,
+ u_int8_t u3)
{
- struct nf_conntrack_tuple tuple, mask, master_tuple;
- struct nf_conntrack_tuple_hash *h = NULL;
- struct nf_conntrack_expect *exp;
- struct nf_conn *ct;
- struct nf_conn_help *help;
- int err = 0;
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nlattr *tb[CTA_EXPECT_NAT_MAX+1];
+ struct nf_conntrack_tuple nat_tuple = {};
+ int err;
- /* caller guarantees that those three CTA_EXPECT_* exist */
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
- if (err < 0)
- return err;
- err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+
+ if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
+ &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
if (err < 0)
return err;
- /* Look for master conntrack of this expectation */
- h = nf_conntrack_find_get(net, zone, &master_tuple);
- if (!h)
- return -ENOENT;
- ct = nf_ct_tuplehash_to_ctrack(h);
- exp = nf_ct_expect_alloc(ct);
- if (!exp) {
- err = -ENOMEM;
- goto out;
+ exp->saved_addr = nat_tuple.src.u3;
+ exp->saved_proto = nat_tuple.src.u;
+ exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR]));
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
+ struct nf_conntrack_helper *helper,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
+{
+ u_int32_t class = 0;
+ struct nf_conntrack_expect *exp;
+ struct nf_conn_help *help;
+ int err;
+
+ if (cda[CTA_EXPECT_CLASS] && helper) {
+ class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
+ if (class > helper->expect_class_max)
+ return ERR_PTR(-EINVAL);
}
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return ERR_PTR(-ENOMEM);
+
help = nfct_help(ct);
if (!help) {
if (!cda[CTA_EXPECT_TIMEOUT]) {
err = -EINVAL;
- goto out;
+ goto err_out;
}
exp->timeout.expires =
jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
@@ -1992,20 +2866,105 @@ ctnetlink_create_expect(struct net *net, u16 zone,
} else
exp->flags = 0;
}
+ if (cda[CTA_EXPECT_FN]) {
+ const char *name = nla_data(cda[CTA_EXPECT_FN]);
+ struct nf_ct_helper_expectfn *expfn;
- exp->class = 0;
- exp->expectfn = NULL;
- exp->master = ct;
- exp->helper = NULL;
- memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
- memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3));
- exp->mask.src.u.all = mask.src.u.all;
+ expfn = nf_ct_helper_expectfn_find_by_name(name);
+ if (expfn == NULL) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ exp->expectfn = expfn->expectfn;
+ } else
+ exp->expectfn = NULL;
- err = nf_ct_expect_related_report(exp, pid, report);
+ exp->class = class;
+ exp->master = ct;
+ exp->helper = helper;
+ exp->tuple = *tuple;
+ exp->mask.src.u3 = mask->src.u3;
+ exp->mask.src.u.all = mask->src.u.all;
+
+ if (cda[CTA_EXPECT_NAT]) {
+ err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
+ exp, nf_ct_l3num(ct));
+ if (err < 0)
+ goto err_out;
+ }
+ return exp;
+err_out:
nf_ct_expect_put(exp);
+ return ERR_PTR(err);
+}
-out:
- nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
+static int
+ctnetlink_create_expect(struct net *net, u16 zone,
+ const struct nlattr * const cda[],
+ u_int8_t u3, u32 portid, int report)
+{
+ struct nf_conntrack_tuple tuple, mask, master_tuple;
+ struct nf_conntrack_tuple_hash *h = NULL;
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conntrack_expect *exp;
+ struct nf_conn *ct;
+ int err;
+
+ /* caller guarantees that those three CTA_EXPECT_* exist */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+ if (err < 0)
+ return err;
+
+ /* Look for master conntrack of this expectation */
+ h = nf_conntrack_find_get(net, zone, &master_tuple);
+ if (!h)
+ return -ENOENT;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ helper = __nf_conntrack_helper_find(helpname, u3,
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+#ifdef CONFIG_MODULES
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ err = -EOPNOTSUPP;
+ goto err_ct;
+ }
+ helper = __nf_conntrack_helper_find(helpname, u3,
+ nf_ct_protonum(ct));
+ if (helper) {
+ err = -EAGAIN;
+ goto err_ct;
+ }
+#endif
+ err = -EOPNOTSUPP;
+ goto err_ct;
+ }
+ }
+
+ exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
+ if (IS_ERR(exp)) {
+ err = PTR_ERR(exp);
+ goto err_ct;
+ }
+
+ err = nf_ct_expect_related_report(exp, portid, report);
+ if (err < 0)
+ goto err_exp;
+
+ return 0;
+err_exp:
+ nf_ct_expect_put(exp);
+err_ct:
+ nf_ct_put(ct);
return err;
}
@@ -2035,16 +2994,16 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- spin_lock_bh(&nf_conntrack_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
exp = __nf_ct_expect_find(net, zone, &tuple);
if (!exp) {
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE) {
err = ctnetlink_create_expect(net, zone, cda,
u3,
- NETLINK_CB(skb).pid,
+ NETLINK_CB(skb).portid,
nlmsg_report(nlh));
}
return err;
@@ -2053,11 +3012,84 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
err = -EEXIST;
if (!(nlh->nlmsg_flags & NLM_F_EXCL))
err = ctnetlink_change_expect(exp, cda);
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
return err;
}
+static int
+ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
+ const struct ip_conntrack_stat *st)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(cpu);
+
+ if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
+ nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
+ nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int cpu;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[0] == nr_cpu_ids)
+ return 0;
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ const struct ip_conntrack_stat *st;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ st = per_cpu_ptr(net->ct.stat, cpu);
+ if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ cpu, st) < 0)
+ break;
+ }
+ cb->args[0] = cpu;
+
+ return skb->len;
+}
+
+static int
+ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_stat_cpu_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static struct nf_ct_event_notifier ctnl_notifier = {
.fcn = ctnetlink_conntrack_event,
@@ -2081,6 +3113,10 @@ static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
.attr_count = CTA_MAX,
.policy = ct_nla_policy },
+ [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
+ [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
+ [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
+ [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
};
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
@@ -2093,6 +3129,7 @@ static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
[IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
.attr_count = CTA_EXPECT_MAX,
.policy = exp_nla_policy },
+ [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
};
static const struct nfnetlink_subsystem ctnl_subsys = {
@@ -2113,6 +3150,54 @@ MODULE_ALIAS("ip_conntrack_netlink");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
+static int __net_init ctnetlink_net_init(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int ret;
+
+ ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot register notifier.\n");
+ goto err_out;
+ }
+
+ ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot expect register notifier.\n");
+ goto err_unreg_notifier;
+ }
+#endif
+ return 0;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+ nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+err_out:
+ return ret;
+#endif
+}
+
+static void ctnetlink_net_exit(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
+ nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+#endif
+}
+
+static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ctnetlink_net_exit(net);
+}
+
+static struct pernet_operations ctnetlink_net_ops = {
+ .init = ctnetlink_net_init,
+ .exit_batch = ctnetlink_net_exit_batch,
+};
+
static int __init ctnetlink_init(void)
{
int ret;
@@ -2130,28 +3215,19 @@ static int __init ctnetlink_init(void)
goto err_unreg_subsys;
}
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- ret = nf_conntrack_register_notifier(&ctnl_notifier);
+ ret = register_pernet_subsys(&ctnetlink_net_ops);
if (ret < 0) {
- pr_err("ctnetlink_init: cannot register notifier.\n");
+ pr_err("ctnetlink_init: cannot register pernet operations\n");
goto err_unreg_exp_subsys;
}
-
- ret = nf_ct_expect_register_notifier(&ctnl_notifier_exp);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot expect register notifier.\n");
- goto err_unreg_notifier;
- }
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+ /* setup interaction between nf_queue and nf_conntrack_netlink. */
+ RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
#endif
-
return 0;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
- nf_conntrack_unregister_notifier(&ctnl_notifier);
err_unreg_exp_subsys:
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
-#endif
err_unreg_subsys:
nfnetlink_subsys_unregister(&ctnl_subsys);
err_out:
@@ -2162,14 +3238,12 @@ static void __exit ctnetlink_exit(void)
{
pr_info("ctnetlink: unregistering from nfnetlink.\n");
- nf_ct_remove_userspace_expectations();
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
- nf_conntrack_unregister_notifier(&ctnl_notifier);
-#endif
-
+ unregister_pernet_subsys(&ctnetlink_net_ops);
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
nfnetlink_subsys_unregister(&ctnl_subsys);
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+ RCU_INIT_POINTER(nfq_ct_hook, NULL);
+#endif
}
module_init(ctnetlink_init);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 088944824e1..825c3e3f830 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -11,10 +11,12 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
* Limitations:
* - We blindly assume that control connections are always
* established in PNS->PAC direction. This is a violation
- * of RFFC2673
+ * of RFC 2637
* - We can only support one single call within each session
* TODO:
* - testing of incoming PPTP calls
@@ -45,14 +47,14 @@ static DEFINE_SPINLOCK(nf_pptp_lock);
int
(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- struct PptpControlHeader *ctlh,
+ unsigned int protoff, struct PptpControlHeader *ctlh,
union pptp_ctrl_union *pptpReq) __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
int
(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- struct PptpControlHeader *ctlh,
+ unsigned int protoff, struct PptpControlHeader *ctlh,
union pptp_ctrl_union *pptpReq) __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
@@ -174,7 +176,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
static void pptp_destroy_siblings(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
- const struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
struct nf_conntrack_tuple t;
nf_ct_gre_keymap_destroy(ct);
@@ -182,16 +184,16 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
/* try original (pns->pac) tuple */
memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
t.dst.protonum = IPPROTO_GRE;
- t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id;
- t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id;
+ t.src.u.gre.key = ct_pptp_info->pns_call_id;
+ t.dst.u.gre.key = ct_pptp_info->pac_call_id;
if (!destroy_sibling_or_exp(net, ct, &t))
pr_debug("failed to timeout original pns->pac ct/exp\n");
/* try reply (pac->pns) tuple */
memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
t.dst.protonum = IPPROTO_GRE;
- t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id;
- t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id;
+ t.src.u.gre.key = ct_pptp_info->pac_call_id;
+ t.dst.u.gre.key = ct_pptp_info->pns_call_id;
if (!destroy_sibling_or_exp(net, ct, &t))
pr_debug("failed to timeout reply pac->pns ct/exp\n");
}
@@ -262,14 +264,14 @@ out_unexpect_orig:
}
static inline int
-pptp_inbound_pkt(struct sk_buff *skb,
+pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
struct PptpControlHeader *ctlh,
union pptp_ctrl_union *pptpReq,
unsigned int reqlen,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+ struct nf_ct_pptp_master *info = nfct_help_data(ct);
u_int16_t msg;
__be16 cid = 0, pcid = 0;
typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
@@ -364,6 +366,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
break;
case PPTP_WAN_ERROR_NOTIFY:
+ case PPTP_SET_LINK_INFO:
case PPTP_ECHO_REQUEST:
case PPTP_ECHO_REPLY:
/* I don't have to explain these ;) */
@@ -375,7 +378,8 @@ pptp_inbound_pkt(struct sk_buff *skb,
nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_inbound(skb, ct, ctinfo, ctlh, pptpReq);
+ return nf_nat_pptp_inbound(skb, ct, ctinfo,
+ protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
@@ -388,14 +392,14 @@ invalid:
}
static inline int
-pptp_outbound_pkt(struct sk_buff *skb,
+pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
struct PptpControlHeader *ctlh,
union pptp_ctrl_union *pptpReq,
unsigned int reqlen,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+ struct nf_ct_pptp_master *info = nfct_help_data(ct);
u_int16_t msg;
__be16 cid = 0, pcid = 0;
typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
@@ -470,7 +474,8 @@ pptp_outbound_pkt(struct sk_buff *skb,
nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_outbound(skb, ct, ctinfo, ctlh, pptpReq);
+ return nf_nat_pptp_outbound(skb, ct, ctinfo,
+ protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
@@ -505,7 +510,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
{
int dir = CTINFO2DIR(ctinfo);
- const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+ const struct nf_ct_pptp_master *info = nfct_help_data(ct);
const struct tcphdr *tcph;
struct tcphdr _tcph;
const struct pptp_pkt_hdr *pptph;
@@ -519,8 +524,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
u_int16_t msg;
/* don't do any tracking before tcp handshake complete */
- if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY)
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
return NF_ACCEPT;
nexthdr_off = protoff;
@@ -570,11 +574,11 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
* established from PNS->PAC. However, RFC makes no guarantee */
if (dir == IP_CT_DIR_ORIGINAL)
/* client -> server (PNS -> PAC) */
- ret = pptp_outbound_pkt(skb, ctlh, pptpReq, reqlen, ct,
+ ret = pptp_outbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,
ctinfo);
else
/* server -> client (PAC -> PNS) */
- ret = pptp_inbound_pkt(skb, ctlh, pptpReq, reqlen, ct,
+ ret = pptp_inbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,
ctinfo);
pr_debug("sstate: %d->%d, cstate: %d->%d\n",
oldsstate, info->sstate, oldcstate, info->cstate);
@@ -592,6 +596,7 @@ static const struct nf_conntrack_expect_policy pptp_exp_policy = {
static struct nf_conntrack_helper pptp __read_mostly = {
.name = "pptp",
.me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_pptp_master),
.tuple.src.l3num = AF_INET,
.tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT),
.tuple.dst.protonum = IPPROTO_TCP,
@@ -600,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = {
.expect_policy = &pptp_exp_policy,
};
-static void nf_conntrack_pptp_net_exit(struct net *net)
-{
- nf_ct_gre_keymap_flush(net);
-}
-
-static struct pernet_operations nf_conntrack_pptp_net_ops = {
- .exit = nf_conntrack_pptp_net_exit,
-};
-
static int __init nf_conntrack_pptp_init(void)
{
- int rv;
-
- rv = nf_conntrack_helper_register(&pptp);
- if (rv < 0)
- return rv;
- rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops);
- if (rv < 0)
- nf_conntrack_helper_unregister(&pptp);
- return rv;
+ return nf_conntrack_helper_register(&pptp);
}
static void __exit nf_conntrack_pptp_fini(void)
{
nf_conntrack_helper_unregister(&pptp);
- unregister_pernet_subsys(&nf_conntrack_pptp_net_ops);
}
module_init(nf_conntrack_pptp_init);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index dc7bb74110d..b65d5864b6d 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -21,7 +22,6 @@
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
-#include <linux/rtnetlink.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
@@ -36,28 +36,32 @@ static DEFINE_MUTEX(nf_ct_proto_mutex);
#ifdef CONFIG_SYSCTL
static int
-nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path,
- struct ctl_table *table, unsigned int *users)
+nf_ct_register_sysctl(struct net *net,
+ struct ctl_table_header **header,
+ const char *path,
+ struct ctl_table *table)
{
if (*header == NULL) {
- *header = register_sysctl_paths(path, table);
+ *header = register_net_sysctl(net, path, table);
if (*header == NULL)
return -ENOMEM;
}
- if (users != NULL)
- (*users)++;
+
return 0;
}
static void
nf_ct_unregister_sysctl(struct ctl_table_header **header,
- struct ctl_table *table, unsigned int *users)
+ struct ctl_table **table,
+ unsigned int users)
{
- if (users != NULL && --*users > 0)
+ if (users > 0)
return;
- unregister_sysctl_table(*header);
+ unregister_net_sysctl_table(*header);
+ kfree(*table);
*header = NULL;
+ *table = NULL;
}
#endif
@@ -88,12 +92,6 @@ nf_ct_l3proto_find_get(u_int16_t l3proto)
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
-void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
-{
- module_put(p->me);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_put);
-
int
nf_ct_l3proto_try_module_get(unsigned short l3proto)
{
@@ -127,6 +125,27 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+struct nf_conntrack_l4proto *
+nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
+{
+ struct nf_conntrack_l4proto *p;
+
+ rcu_read_lock();
+ p = __nf_ct_l4proto_find(l3num, l4num);
+ if (!try_module_get(p->me))
+ p = &nf_conntrack_l4proto_generic;
+ rcu_read_unlock();
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get);
+
+void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p)
+{
+ module_put(p->me);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
+
static int kill_l3proto(struct nf_conn *i, void *data)
{
return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto;
@@ -140,32 +159,58 @@ static int kill_l4proto(struct nf_conn *i, void *data)
nf_ct_l3num(i) == l4proto->l3proto;
}
-static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto)
+static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
+ struct nf_conntrack_l3proto *l3proto)
{
- int err = 0;
+ if (l3proto->l3proto == PF_INET)
+ return &net->ct.nf_ct_proto;
+ else
+ return NULL;
+}
-#ifdef CONFIG_SYSCTL
- if (l3proto->ctl_table != NULL) {
- err = nf_ct_register_sysctl(&l3proto->ctl_table_header,
+static int nf_ct_l3proto_register_sysctl(struct net *net,
+ struct nf_conntrack_l3proto *l3proto)
+{
+ int err = 0;
+ struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
+ /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
+ if (in == NULL)
+ return 0;
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ if (in->ctl_table != NULL) {
+ err = nf_ct_register_sysctl(net,
+ &in->ctl_table_header,
l3proto->ctl_table_path,
- l3proto->ctl_table, NULL);
+ in->ctl_table);
+ if (err < 0) {
+ kfree(in->ctl_table);
+ in->ctl_table = NULL;
+ }
}
#endif
return err;
}
-static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto)
+static void nf_ct_l3proto_unregister_sysctl(struct net *net,
+ struct nf_conntrack_l3proto *l3proto)
{
-#ifdef CONFIG_SYSCTL
- if (l3proto->ctl_table_header != NULL)
- nf_ct_unregister_sysctl(&l3proto->ctl_table_header,
- l3proto->ctl_table, NULL);
+ struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
+
+ if (in == NULL)
+ return;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ if (in->ctl_table_header != NULL)
+ nf_ct_unregister_sysctl(&in->ctl_table_header,
+ &in->ctl_table,
+ 0);
#endif
}
-int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
{
int ret = 0;
+ struct nf_conntrack_l3proto *old;
if (proto->l3proto >= AF_MAX)
return -EBUSY;
@@ -174,15 +219,13 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
- if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) {
+ old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex));
+ if (old != &nf_conntrack_l3proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
- ret = nf_ct_l3proto_register_sysctl(proto);
- if (ret < 0)
- goto out_unlock;
-
if (proto->nlattr_tuple_size)
proto->nla_size = 3 * proto->nlattr_tuple_size();
@@ -191,81 +234,131 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
return ret;
+
}
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register);
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
-void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+int nf_ct_l3proto_pernet_register(struct net *net,
+ struct nf_conntrack_l3proto *proto)
{
- struct net *net;
+ int ret = 0;
+
+ if (proto->init_net) {
+ ret = proto->init_net(net);
+ if (ret < 0)
+ return ret;
+ }
+ return nf_ct_l3proto_register_sysctl(net, proto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
+
+void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+{
BUG_ON(proto->l3proto >= AF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_l3protos[proto->l3proto] != proto);
+ BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != proto);
rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
&nf_conntrack_l3proto_generic);
- nf_ct_l3proto_unregister_sysctl(proto);
mutex_unlock(&nf_ct_proto_mutex);
synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
+
+void nf_ct_l3proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l3proto *proto)
+{
+ nf_ct_l3proto_unregister_sysctl(net, proto);
/* Remove all contrack entries for this protocol */
- rtnl_lock();
- for_each_net(net)
- nf_ct_iterate_cleanup(net, kill_l3proto, proto);
- rtnl_unlock();
+ nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister);
-static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto)
+static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ if (l4proto->get_net_proto) {
+ /* statically built-in protocols use static per-net */
+ return l4proto->get_net_proto(net);
+ } else if (l4proto->net_id) {
+ /* ... and loadable protocols use dynamic per-net */
+ return net_generic(net, *l4proto->net_id);
+ }
+ return NULL;
+}
+
+static
+int nf_ct_l4proto_register_sysctl(struct net *net,
+ struct nf_proto_net *pn,
+ struct nf_conntrack_l4proto *l4proto)
{
int err = 0;
#ifdef CONFIG_SYSCTL
- if (l4proto->ctl_table != NULL) {
- err = nf_ct_register_sysctl(l4proto->ctl_table_header,
- nf_net_netfilter_sysctl_path,
- l4proto->ctl_table,
- l4proto->ctl_table_users);
- if (err < 0)
- goto out;
+ if (pn->ctl_table != NULL) {
+ err = nf_ct_register_sysctl(net,
+ &pn->ctl_table_header,
+ "net/netfilter",
+ pn->ctl_table);
+ if (err < 0) {
+ if (!pn->users) {
+ kfree(pn->ctl_table);
+ pn->ctl_table = NULL;
+ }
+ }
}
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- if (l4proto->ctl_compat_table != NULL) {
- err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header,
- nf_net_ipv4_netfilter_sysctl_path,
- l4proto->ctl_compat_table, NULL);
+ if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
+ if (err < 0) {
+ nf_ct_kfree_compat_sysctl_table(pn);
+ goto out;
+ }
+ err = nf_ct_register_sysctl(net,
+ &pn->ctl_compat_header,
+ "net/ipv4/netfilter",
+ pn->ctl_compat_table);
if (err == 0)
goto out;
- nf_ct_unregister_sysctl(l4proto->ctl_table_header,
- l4proto->ctl_table,
- l4proto->ctl_table_users);
+
+ nf_ct_kfree_compat_sysctl_table(pn);
+ nf_ct_unregister_sysctl(&pn->ctl_table_header,
+ &pn->ctl_table,
+ pn->users);
}
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
out:
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
return err;
}
-static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto)
+static
+void nf_ct_l4proto_unregister_sysctl(struct net *net,
+ struct nf_proto_net *pn,
+ struct nf_conntrack_l4proto *l4proto)
{
#ifdef CONFIG_SYSCTL
- if (l4proto->ctl_table_header != NULL &&
- *l4proto->ctl_table_header != NULL)
- nf_ct_unregister_sysctl(l4proto->ctl_table_header,
- l4proto->ctl_table,
- l4proto->ctl_table_users);
+ if (pn->ctl_table_header != NULL)
+ nf_ct_unregister_sysctl(&pn->ctl_table_header,
+ &pn->ctl_table,
+ pn->users);
+
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- if (l4proto->ctl_compat_table_header != NULL)
- nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header,
- l4proto->ctl_compat_table, NULL);
+ if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
+ nf_ct_unregister_sysctl(&pn->ctl_compat_header,
+ &pn->ctl_compat_table,
+ 0);
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
}
/* FIXME: Allow NULL functions and sub in pointers to generic for
them. --RR */
-int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
@@ -279,7 +372,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
mutex_lock(&nf_ct_proto_mutex);
if (!nf_ct_protos[l4proto->l3proto]) {
/* l3proto may be loaded latter. */
- struct nf_conntrack_l4proto **proto_array;
+ struct nf_conntrack_l4proto __rcu **proto_array;
int i;
proto_array = kmalloc(MAX_NF_CT_PROTO *
@@ -291,7 +384,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
}
for (i = 0; i < MAX_NF_CT_PROTO; i++)
- proto_array[i] = &nf_conntrack_l4proto_generic;
+ RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
/* Before making proto_array visible to lockless readers,
* we must make sure its content is committed to memory.
@@ -299,16 +392,14 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
smp_wmb();
nf_ct_protos[l4proto->l3proto] = proto_array;
- } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] !=
- &nf_conntrack_l4proto_generic) {
+ } else if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != &nf_conntrack_l4proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
- ret = nf_ct_l4proto_register_sysctl(l4proto);
- if (ret < 0)
- goto out_unlock;
-
l4proto->nla_size = 0;
if (l4proto->nlattr_size)
l4proto->nla_size += l4proto->nlattr_size();
@@ -317,45 +408,106 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
l4proto);
-
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
-void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_pernet_register(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
{
- struct net *net;
+ int ret = 0;
+ struct nf_proto_net *pn = NULL;
+ if (l4proto->init_net) {
+ ret = l4proto->init_net(net, l4proto->l3proto);
+ if (ret < 0)
+ goto out;
+ }
+
+ pn = nf_ct_l4proto_net(net, l4proto);
+ if (pn == NULL)
+ goto out;
+
+ ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto);
+ if (ret < 0)
+ goto out;
+
+ pn->users++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+{
BUG_ON(l4proto->l3proto >= PF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto);
+ BUG_ON(rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != l4proto);
rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
&nf_conntrack_l4proto_generic);
- nf_ct_l4proto_unregister_sysctl(l4proto);
mutex_unlock(&nf_ct_proto_mutex);
synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+
+void nf_ct_l4proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ struct nf_proto_net *pn = NULL;
+
+ pn = nf_ct_l4proto_net(net, l4proto);
+ if (pn == NULL)
+ return;
+
+ pn->users--;
+ nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
/* Remove all contrack entries for this protocol */
- rtnl_lock();
- for_each_net(net)
- nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
- rtnl_unlock();
+ nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
-int nf_conntrack_proto_init(void)
+int nf_conntrack_proto_pernet_init(struct net *net)
{
- unsigned int i;
int err;
+ struct nf_proto_net *pn = nf_ct_l4proto_net(net,
+ &nf_conntrack_l4proto_generic);
- err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic);
+ err = nf_conntrack_l4proto_generic.init_net(net,
+ nf_conntrack_l4proto_generic.l3proto);
if (err < 0)
return err;
+ err = nf_ct_l4proto_register_sysctl(net,
+ pn,
+ &nf_conntrack_l4proto_generic);
+ if (err < 0)
+ return err;
+
+ pn->users++;
+ return 0;
+}
+
+void nf_conntrack_proto_pernet_fini(struct net *net)
+{
+ struct nf_proto_net *pn = nf_ct_l4proto_net(net,
+ &nf_conntrack_l4proto_generic);
+
+ pn->users--;
+ nf_ct_l4proto_unregister_sysctl(net,
+ pn,
+ &nf_conntrack_l4proto_generic);
+}
+int nf_conntrack_proto_init(void)
+{
+ unsigned int i;
for (i = 0; i < AF_MAX; i++)
rcu_assign_pointer(nf_ct_l3protos[i],
&nf_conntrack_l3proto_generic);
@@ -365,9 +517,6 @@ int nf_conntrack_proto_init(void)
void nf_conntrack_proto_fini(void)
{
unsigned int i;
-
- nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic);
-
/* free l3proto protocol tables */
for (i = 0; i < PF_MAX; i++)
kfree(nf_ct_protos[i]);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 5292560d6d4..cb372f96f10 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -98,7 +98,7 @@ static const char * const dccp_state_names[] = {
#define sIV CT_DCCP_INVALID
/*
- * DCCP state transistion table
+ * DCCP state transition table
*
* The assumption is the same as for TCP tracking:
*
@@ -387,12 +387,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
/* this module per-net specifics */
static int dccp_net_id __read_mostly;
struct dccp_net {
+ struct nf_proto_net pn;
int dccp_loose;
unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-#ifdef CONFIG_SYSCTL
- struct ctl_table_header *sysctl_header;
- struct ctl_table *sysctl_table;
-#endif
};
static inline struct dccp_net *dccp_pernet(struct net *net)
@@ -423,7 +420,7 @@ static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
}
static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
struct net *net = nf_ct_net(ct);
struct dccp_net *dn;
@@ -431,7 +428,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
const char *msg;
u_int8_t state;
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh);
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
BUG_ON(dh == NULL);
state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
@@ -452,11 +449,15 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
ct->proto.dccp.state = CT_DCCP_NONE;
+ ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+ ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+ ct->proto.dccp.handshake_seq = 0;
return true;
out_invalid:
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
+ nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+ NULL, "%s", msg);
return false;
}
@@ -469,18 +470,23 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
ntohl(dhack->dccph_ack_nr_low);
}
+static unsigned int *dccp_get_timeouts(struct net *net)
+{
+ return dccp_pernet(net)->dccp_timeout;
+}
+
static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, enum ip_conntrack_info ctinfo,
- u_int8_t pf, unsigned int hooknum)
+ u_int8_t pf, unsigned int hooknum,
+ unsigned int *timeouts)
{
struct net *net = nf_ct_net(ct);
- struct dccp_net *dn;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
struct dccp_hdr _dh, *dh;
u_int8_t type, old_state, new_state;
enum ct_dccp_roles role;
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh);
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
BUG_ON(dh == NULL);
type = dh->dccph_type;
@@ -537,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_dccp: invalid packet ignored ");
return NF_ACCEPT;
case CT_DCCP_INVALID:
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_dccp: invalid state transition ");
return -NF_ACCEPT;
}
@@ -556,8 +562,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
if (new_state != old_state)
nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
- dn = dccp_pernet(net);
- nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
return NF_ACCEPT;
}
@@ -572,7 +577,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
unsigned int cscov;
const char *msg;
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh);
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
if (dh == NULL) {
msg = "nf_ct_dccp: short packet ";
goto out_invalid;
@@ -609,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
out_invalid:
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg);
return -NF_ACCEPT;
}
@@ -626,7 +631,7 @@ static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
struct nf_conn *ct)
{
@@ -636,11 +641,12 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
- NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state);
- NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE,
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]);
- NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
- cpu_to_be64(ct->proto.dccp.handshake_seq));
+ if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) ||
+ nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
+ ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
+ nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
+ cpu_to_be64(ct->proto.dccp.handshake_seq)))
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
spin_unlock_bh(&ct->lock);
return 0;
@@ -699,8 +705,62 @@ static int dccp_nlattr_size(void)
return nla_total_size(0) /* CTA_PROTOINFO_DCCP */
+ nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1);
}
+
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ struct dccp_net *dn = dccp_pernet(net);
+ unsigned int *timeouts = data;
+ int i;
+
+ /* set default DCCP timeouts. */
+ for (i=0; i<CT_DCCP_MAX; i++)
+ timeouts[i] = dn->dccp_timeout[i];
+
+ /* there's a 1:1 mapping between attributes and protocol states. */
+ for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
+ if (tb[i]) {
+ timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
+ }
+ }
+ return 0;
+}
+
+static int
+dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+ int i;
+
+ for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
+ if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
+ goto nla_put_failure;
+ }
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
+ [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
/* template, data assigned later */
static struct ctl_table dccp_sysctl_table[] = {
@@ -756,6 +816,55 @@ static struct ctl_table dccp_sysctl_table[] = {
};
#endif /* CONFIG_SYSCTL */
+static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
+ struct dccp_net *dn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(dccp_sysctl_table,
+ sizeof(dccp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
+ pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
+ pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
+ pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
+ pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
+ pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
+ pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
+ pn->ctl_table[7].data = &dn->dccp_loose;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ pn->ctl_table[0].procname = NULL;
+#endif
+ return 0;
+}
+
+static int dccp_init_net(struct net *net, u_int16_t proto)
+{
+ struct dccp_net *dn = dccp_pernet(net);
+ struct nf_proto_net *pn = &dn->pn;
+
+ if (!pn->users) {
+ /* default values */
+ dn->dccp_loose = 1;
+ dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL;
+ dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL;
+ dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL;
+ dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ;
+ dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
+ dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
+ dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
+ }
+
+ return dccp_kmemdup_sysctl_table(net, pn, dn);
+}
+
static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
.l3proto = AF_INET,
.l4proto = IPPROTO_DCCP,
@@ -764,10 +873,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
.invert_tuple = dccp_invert_tuple,
.new = dccp_new,
.packet = dccp_packet,
+ .get_timeouts = dccp_get_timeouts,
.error = dccp_error,
.print_tuple = dccp_print_tuple,
.print_conntrack = dccp_print_conntrack,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = dccp_to_nlattr,
.nlattr_size = dccp_nlattr_size,
.from_nlattr = nlattr_to_dccp,
@@ -776,6 +886,17 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
+ .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
+ .nla_policy = dccp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &dccp_net_id,
+ .init_net = dccp_init_net,
};
static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
@@ -786,10 +907,11 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
.invert_tuple = dccp_invert_tuple,
.new = dccp_new,
.packet = dccp_packet,
+ .get_timeouts = dccp_get_timeouts,
.error = dccp_error,
.print_tuple = dccp_print_tuple,
.print_conntrack = dccp_print_conntrack,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = dccp_to_nlattr,
.nlattr_size = dccp_nlattr_size,
.from_nlattr = nlattr_to_dccp,
@@ -798,55 +920,43 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
+ .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
+ .nla_policy = dccp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &dccp_net_id,
+ .init_net = dccp_init_net,
};
static __net_init int dccp_net_init(struct net *net)
{
- struct dccp_net *dn = dccp_pernet(net);
-
- /* default values */
- dn->dccp_loose = 1;
- dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
-
-#ifdef CONFIG_SYSCTL
- dn->sysctl_table = kmemdup(dccp_sysctl_table,
- sizeof(dccp_sysctl_table), GFP_KERNEL);
- if (!dn->sysctl_table)
- return -ENOMEM;
-
- dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
- dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
- dn->sysctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
- dn->sysctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
- dn->sysctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
- dn->sysctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
- dn->sysctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
- dn->sysctl_table[7].data = &dn->dccp_loose;
-
- dn->sysctl_header = register_net_sysctl_table(net,
- nf_net_netfilter_sysctl_path, dn->sysctl_table);
- if (!dn->sysctl_header) {
- kfree(dn->sysctl_table);
- return -ENOMEM;
+ int ret = 0;
+ ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_dccp4: pernet registration failed.\n");
+ goto out;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_dccp6: pernet registration failed.\n");
+ goto cleanup_dccp4;
}
-#endif
-
return 0;
+cleanup_dccp4:
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
+out:
+ return ret;
}
static __net_exit void dccp_net_exit(struct net *net)
{
- struct dccp_net *dn = dccp_pernet(net);
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(dn->sysctl_header);
- kfree(dn->sysctl_table);
-#endif
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto6);
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
}
static struct pernet_operations dccp_net_ops = {
@@ -858,34 +968,34 @@ static struct pernet_operations dccp_net_ops = {
static int __init nf_conntrack_proto_dccp_init(void)
{
- int err;
+ int ret;
- err = register_pernet_subsys(&dccp_net_ops);
- if (err < 0)
- goto err1;
+ ret = register_pernet_subsys(&dccp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
- err = nf_conntrack_l4proto_register(&dccp_proto4);
- if (err < 0)
- goto err2;
+ ret = nf_ct_l4proto_register(&dccp_proto4);
+ if (ret < 0)
+ goto out_dccp4;
- err = nf_conntrack_l4proto_register(&dccp_proto6);
- if (err < 0)
- goto err3;
- return 0;
+ ret = nf_ct_l4proto_register(&dccp_proto6);
+ if (ret < 0)
+ goto out_dccp6;
-err3:
- nf_conntrack_l4proto_unregister(&dccp_proto4);
-err2:
+ return 0;
+out_dccp6:
+ nf_ct_l4proto_unregister(&dccp_proto4);
+out_dccp4:
unregister_pernet_subsys(&dccp_net_ops);
-err1:
- return err;
+out_pernet:
+ return ret;
}
static void __exit nf_conntrack_proto_dccp_fini(void)
{
+ nf_ct_l4proto_unregister(&dccp_proto6);
+ nf_ct_l4proto_unregister(&dccp_proto4);
unregister_pernet_subsys(&dccp_net_ops);
- nf_conntrack_l4proto_unregister(&dccp_proto6);
- nf_conntrack_l4proto_unregister(&dccp_proto4);
}
module_init(nf_conntrack_proto_dccp_init);
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index e2091d0c7a2..d25f2937764 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -14,6 +14,11 @@
static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ;
+static inline struct nf_generic_net *generic_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.generic;
+}
+
static bool generic_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
struct nf_conntrack_tuple *tuple)
@@ -40,31 +45,77 @@ static int generic_print_tuple(struct seq_file *s,
return 0;
}
+static unsigned int *generic_get_timeouts(struct net *net)
+{
+ return &(generic_pernet(net)->timeout);
+}
+
/* Returns verdict for packet, or -1 for invalid. */
-static int packet(struct nf_conn *ct,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- u_int8_t pf,
- unsigned int hooknum)
+static int generic_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeout)
{
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout);
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
-static bool new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
{
return true;
}
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeout = data;
+ struct nf_generic_net *gn = generic_pernet(net);
+
+ if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ;
+ else {
+ /* Set default generic timeout. */
+ *timeout = gn->timeout;
+ }
+
+ return 0;
+}
+
+static int
+generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = {
+ [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *generic_sysctl_header;
static struct ctl_table generic_sysctl_table[] = {
{
.procname = "nf_conntrack_generic_timeout",
- .data = &nf_ct_generic_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -75,7 +126,6 @@ static struct ctl_table generic_sysctl_table[] = {
static struct ctl_table generic_compat_sysctl_table[] = {
{
.procname = "ip_conntrack_generic_timeout",
- .data = &nf_ct_generic_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -85,6 +135,62 @@ static struct ctl_table generic_compat_sysctl_table[] = {
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
+static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_generic_net *gn)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(generic_sysctl_table,
+ sizeof(generic_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &gn->timeout;
+#endif
+ return 0;
+}
+
+static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_generic_net *gn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
+ sizeof(generic_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &gn->timeout;
+#endif
+#endif
+ return 0;
+}
+
+static int generic_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_generic_net *gn = generic_pernet(net);
+ struct nf_proto_net *pn = &gn->pn;
+
+ gn->timeout = nf_ct_generic_timeout;
+
+ ret = generic_kmemdup_compat_sysctl_table(pn, gn);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_kmemdup_sysctl_table(pn, gn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+
+ return ret;
+}
+
+static struct nf_proto_net *generic_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.generic.pn;
+}
+
struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
{
.l3proto = PF_UNSPEC,
@@ -93,13 +199,18 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
.pkt_to_tuple = generic_pkt_to_tuple,
.invert_tuple = generic_invert_tuple,
.print_tuple = generic_print_tuple,
- .packet = packet,
- .new = new,
-#ifdef CONFIG_SYSCTL
- .ctl_table_header = &generic_sysctl_header,
- .ctl_table = generic_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = generic_compat_sysctl_table,
-#endif
-#endif
+ .packet = generic_packet,
+ .get_timeouts = generic_get_timeouts,
+ .new = generic_new,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = generic_timeout_nlattr_to_obj,
+ .obj_to_nlattr = generic_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GENERIC_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = generic_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = generic_init_net,
+ .get_net_proto = generic_get_net_proto,
};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index cf616e55ca4..d5665739e3b 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -21,6 +21,7 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*/
#include <linux/module.h>
@@ -41,18 +42,33 @@
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
-#define GRE_TIMEOUT (30 * HZ)
-#define GRE_STREAM_TIMEOUT (180 * HZ)
+enum grep_conntrack {
+ GRE_CT_UNREPLIED,
+ GRE_CT_REPLIED,
+ GRE_CT_MAX
+};
+
+static unsigned int gre_timeouts[GRE_CT_MAX] = {
+ [GRE_CT_UNREPLIED] = 30*HZ,
+ [GRE_CT_REPLIED] = 180*HZ,
+};
static int proto_gre_net_id __read_mostly;
struct netns_proto_gre {
+ struct nf_proto_net nf;
rwlock_t keymap_lock;
struct list_head keymap_list;
+ unsigned int gre_timeouts[GRE_CT_MAX];
};
-void nf_ct_gre_keymap_flush(struct net *net)
+static inline struct netns_proto_gre *gre_pernet(struct net *net)
+{
+ return net_generic(net, proto_gre_net_id);
+}
+
+static void nf_ct_gre_keymap_flush(struct net *net)
{
- struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
+ struct netns_proto_gre *net_gre = gre_pernet(net);
struct nf_ct_gre_keymap *km, *tmp;
write_lock_bh(&net_gre->keymap_lock);
@@ -62,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net)
}
write_unlock_bh(&net_gre->keymap_lock);
}
-EXPORT_SYMBOL(nf_ct_gre_keymap_flush);
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
const struct nf_conntrack_tuple *t)
@@ -77,7 +92,7 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
/* look up the source key for a given tuple */
static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
{
- struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
+ struct netns_proto_gre *net_gre = gre_pernet(net);
struct nf_ct_gre_keymap *km;
__be16 key = 0;
@@ -101,11 +116,11 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
struct nf_conntrack_tuple *t)
{
struct net *net = nf_ct_net(ct);
- struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
- struct nf_conn_help *help = nfct_help(ct);
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
struct nf_ct_gre_keymap **kmp, *km;
- kmp = &help->help.ct_pptp_info.keymap[dir];
+ kmp = &ct_pptp_info->keymap[dir];
if (*kmp) {
/* check whether it's a retransmission */
read_lock_bh(&net_gre->keymap_lock);
@@ -142,20 +157,20 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
- struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
- struct nf_conn_help *help = nfct_help(ct);
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
enum ip_conntrack_dir dir;
pr_debug("entering for ct %p\n", ct);
write_lock_bh(&net_gre->keymap_lock);
for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
- if (help->help.ct_pptp_info.keymap[dir]) {
+ if (ct_pptp_info->keymap[dir]) {
pr_debug("removing %p from list\n",
- help->help.ct_pptp_info.keymap[dir]);
- list_del(&help->help.ct_pptp_info.keymap[dir]->list);
- kfree(help->help.ct_pptp_info.keymap[dir]);
- help->help.ct_pptp_info.keymap[dir] = NULL;
+ ct_pptp_info->keymap[dir]);
+ list_del(&ct_pptp_info->keymap[dir]->list);
+ kfree(ct_pptp_info->keymap[dir]);
+ ct_pptp_info->keymap[dir] = NULL;
}
}
write_unlock_bh(&net_gre->keymap_lock);
@@ -227,13 +242,19 @@ static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
(ct->proto.gre.stream_timeout / HZ));
}
+static unsigned int *gre_get_timeouts(struct net *net)
+{
+ return gre_pernet(net)->gre_timeouts;
+}
+
/* Returns verdict for packet, and may modify conntrack */
static int gre_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
/* If we've seen traffic both ways, this is a GRE connection.
* Extend timeout. */
@@ -241,8 +262,8 @@ static int gre_packet(struct nf_conn *ct,
nf_ct_refresh_acct(ct, ctinfo, skb,
ct->proto.gre.stream_timeout);
/* Also, more likely to be important, and not a probe. */
- set_bit(IPS_ASSURED_BIT, &ct->status);
- nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else
nf_ct_refresh_acct(ct, ctinfo, skb,
ct->proto.gre.timeout);
@@ -252,15 +273,15 @@ static int gre_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
pr_debug(": ");
nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
/* initialize to sane value. Ideally a conntrack helper
* (e.g. in case of pptp) is increasing them */
- ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT;
- ct->proto.gre.timeout = GRE_TIMEOUT;
+ ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
+ ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
return true;
}
@@ -278,6 +299,68 @@ static void gre_destroy(struct nf_conn *ct)
nf_ct_gre_keymap_destroy(master);
}
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+
+ /* set default timeouts for GRE. */
+ timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
+ timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) {
+ timeouts[GRE_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_GRE_REPLIED]) {
+ timeouts[GRE_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ;
+ }
+ return 0;
+}
+
+static int
+gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED,
+ htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED,
+ htonl(timeouts[GRE_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
+ [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+static int gre_init_net(struct net *net, u_int16_t proto)
+{
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ int i;
+
+ rwlock_init(&net_gre->keymap_lock);
+ INIT_LIST_HEAD(&net_gre->keymap_list);
+ for (i = 0; i < GRE_CT_MAX; i++)
+ net_gre->gre_timeouts[i] = gre_timeouts[i];
+
+ return 0;
+}
+
/* protocol helper struct */
static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
.l3proto = AF_INET,
@@ -287,30 +370,42 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
.invert_tuple = gre_invert_tuple,
.print_tuple = gre_print_tuple,
.print_conntrack = gre_print_conntrack,
+ .get_timeouts = gre_get_timeouts,
.packet = gre_packet,
.new = gre_new,
.destroy = gre_destroy,
.me = THIS_MODULE,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = gre_timeout_nlattr_to_obj,
+ .obj_to_nlattr = gre_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GRE_MAX,
+ .obj_size = sizeof(unsigned int) * GRE_CT_MAX,
+ .nla_policy = gre_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &proto_gre_net_id,
+ .init_net = gre_init_net,
};
static int proto_gre_net_init(struct net *net)
{
- struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id);
-
- rwlock_init(&net_gre->keymap_lock);
- INIT_LIST_HEAD(&net_gre->keymap_list);
-
- return 0;
+ int ret = 0;
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4);
+ if (ret < 0)
+ pr_err("nf_conntrack_gre4: pernet registration failed.\n");
+ return ret;
}
static void proto_gre_net_exit(struct net *net)
{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
nf_ct_gre_keymap_flush(net);
}
@@ -323,20 +418,26 @@ static struct pernet_operations proto_gre_net_ops = {
static int __init nf_ct_proto_gre_init(void)
{
- int rv;
-
- rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4);
- if (rv < 0)
- return rv;
- rv = register_pernet_subsys(&proto_gre_net_ops);
- if (rv < 0)
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
- return rv;
+ int ret;
+
+ ret = register_pernet_subsys(&proto_gre_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
+ if (ret < 0)
+ goto out_gre4;
+
+ return 0;
+out_gre4:
+ unregister_pernet_subsys(&proto_gre_net_ops);
+out_pernet:
+ return ret;
}
static void __exit nf_ct_proto_gre_fini(void)
{
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
unregister_pernet_subsys(&proto_gre_net_ops);
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index c6049c2d5ea..1314d33f6bc 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -1,6 +1,9 @@
/*
* Connection tracking protocol helper module for SCTP.
*
+ * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
+ * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
+ *
* SCTP is defined in RFC 2960. References to various sections in this code
* are to this RFC.
*
@@ -107,9 +110,9 @@ static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
},
{
@@ -121,12 +124,23 @@ static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
}
};
+static int sctp_net_id __read_mostly;
+struct sctp_net {
+ struct nf_proto_net pn;
+ unsigned int timeouts[SCTP_CONNTRACK_MAX];
+};
+
+static inline struct sctp_net *sctp_pernet(struct net *net)
+{
+ return net_generic(net, sctp_net_id);
+}
+
static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct nf_conntrack_tuple *tuple)
{
@@ -279,13 +293,19 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
return sctp_conntracks[dir][i][cur_state];
}
+static unsigned int *sctp_get_timeouts(struct net *net)
+{
+ return sctp_pernet(net)->timeouts;
+}
+
/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
static int sctp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
enum sctp_conntrack new_state, old_state;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -370,7 +390,7 @@ static int sctp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
- nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
dir == IP_CT_DIR_REPLY &&
@@ -390,7 +410,7 @@ out:
/* Called when a new connection for this protocol found. */
static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
enum sctp_conntrack new_state;
const struct sctphdr *sh;
@@ -413,6 +433,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
test_bit(SCTP_CID_COOKIE_ACK, map))
return false;
+ memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
new_state = SCTP_CONNTRACK_MAX;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Don't need lock here: this conntrack not in circulation yet */
@@ -460,7 +481,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -475,15 +496,12 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- NLA_PUT_U8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state);
-
- NLA_PUT_BE32(skb,
- CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
- ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]);
-
- NLA_PUT_BE32(skb,
- CTA_PROTOINFO_SCTP_VTAG_REPLY,
- ct->proto.sctp.vtag[IP_CT_DIR_REPLY]);
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
+ nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
+ nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
+ goto nla_put_failure;
spin_unlock_bh(&ct->lock);
@@ -542,55 +560,100 @@ static int sctp_nlattr_size(void)
}
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct sctp_net *sn = sctp_pernet(net);
+ int i;
+
+ /* set default SCTP timeouts. */
+ for (i=0; i<SCTP_CONNTRACK_MAX; i++)
+ timeouts[i] = sn->timeouts[i];
+
+ /* there's a 1:1 mapping between attributes and protocol states. */
+ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
+ if (tb[i]) {
+ timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
+ }
+ }
+ return 0;
+}
+
+static int
+sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+ int i;
+
+ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
+ if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
+ goto nla_put_failure;
+ }
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
+ [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+
#ifdef CONFIG_SYSCTL
-static unsigned int sctp_sysctl_table_users;
-static struct ctl_table_header *sctp_sysctl_header;
static struct ctl_table sctp_sysctl_table[] = {
{
.procname = "nf_conntrack_sctp_timeout_closed",
- .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_sctp_timeout_cookie_wait",
- .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_sctp_timeout_cookie_echoed",
- .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_sctp_timeout_established",
- .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_sctp_timeout_shutdown_sent",
- .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_sctp_timeout_shutdown_recd",
- .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
- .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -602,49 +665,42 @@ static struct ctl_table sctp_sysctl_table[] = {
static struct ctl_table sctp_compat_sysctl_table[] = {
{
.procname = "ip_conntrack_sctp_timeout_closed",
- .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_sctp_timeout_cookie_wait",
- .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_sctp_timeout_cookie_echoed",
- .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_sctp_timeout_established",
- .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_sctp_timeout_shutdown_sent",
- .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_sctp_timeout_shutdown_recd",
- .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
- .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -654,6 +710,80 @@ static struct ctl_table sctp_compat_sysctl_table[] = {
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif
+static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct sctp_net *sn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(sctp_sysctl_table,
+ sizeof(sctp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
+ pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
+ pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
+ pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
+ pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
+ pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
+ pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+#endif
+ return 0;
+}
+
+static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct sctp_net *sn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
+ sizeof(sctp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
+ pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
+ pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
+ pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
+ pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
+ pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
+ pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+#endif
+#endif
+ return 0;
+}
+
+static int sctp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct sctp_net *sn = sctp_pernet(net);
+ struct nf_proto_net *pn = &sn->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
+ sn->timeouts[i] = sctp_timeouts[i];
+ }
+
+ if (proto == AF_INET) {
+ ret = sctp_kmemdup_compat_sysctl_table(pn, sn);
+ if (ret < 0)
+ return ret;
+
+ ret = sctp_kmemdup_sysctl_table(pn, sn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = sctp_kmemdup_sysctl_table(pn, sn);
+
+ return ret;
+}
+
static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.l3proto = PF_INET,
.l4proto = IPPROTO_SCTP,
@@ -663,9 +793,10 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.print_tuple = sctp_print_tuple,
.print_conntrack = sctp_print_conntrack,
.packet = sctp_packet,
+ .get_timeouts = sctp_get_timeouts,
.new = sctp_new,
.me = THIS_MODULE,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = sctp_to_nlattr,
.nlattr_size = sctp_nlattr_size,
.from_nlattr = nlattr_to_sctp,
@@ -674,14 +805,17 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &sctp_sysctl_table_users,
- .ctl_table_header = &sctp_sysctl_header,
- .ctl_table = sctp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = sctp_compat_sysctl_table,
-#endif
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
+ .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
+ .nla_policy = sctp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &sctp_net_id,
+ .init_net = sctp_init_net,
};
static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
@@ -693,9 +827,10 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
.print_tuple = sctp_print_tuple,
.print_conntrack = sctp_print_conntrack,
.packet = sctp_packet,
+ .get_timeouts = sctp_get_timeouts,
.new = sctp_new,
.me = THIS_MODULE,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = sctp_to_nlattr,
.nlattr_size = sctp_nlattr_size,
.from_nlattr = nlattr_to_sctp,
@@ -703,41 +838,85 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
+ .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
+ .nla_policy = sctp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &sctp_sysctl_table_users,
- .ctl_table_header = &sctp_sysctl_header,
- .ctl_table = sctp_sysctl_table,
-#endif
+ .net_id = &sctp_net_id,
+ .init_net = sctp_init_net,
};
-static int __init nf_conntrack_proto_sctp_init(void)
+static int sctp_net_init(struct net *net)
{
- int ret;
+ int ret = 0;
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4);
- if (ret) {
- pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n");
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_sctp4: pernet registration failed.\n");
goto out;
}
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6);
- if (ret) {
- pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n");
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_sctp6: pernet registration failed.\n");
goto cleanup_sctp4;
}
+ return 0;
+cleanup_sctp4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
+out:
return ret;
+}
+
+static void sctp_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
+}
+
+static struct pernet_operations sctp_net_ops = {
+ .init = sctp_net_init,
+ .exit = sctp_net_exit,
+ .id = &sctp_net_id,
+ .size = sizeof(struct sctp_net),
+};
- cleanup_sctp4:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
- out:
+static int __init nf_conntrack_proto_sctp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&sctp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
+ if (ret < 0)
+ goto out_sctp4;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6);
+ if (ret < 0)
+ goto out_sctp6;
+
+ return 0;
+out_sctp6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+out_sctp4:
+ unregister_pernet_subsys(&sctp_net_ops);
+out_pernet:
return ret;
}
static void __exit nf_conntrack_proto_sctp_fini(void)
{
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+ unregister_pernet_subsys(&sctp_net_ops);
}
module_init(nf_conntrack_proto_sctp_init);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3fb2b73b24d..44d1ea32570 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,5 +1,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -25,6 +27,8 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -64,13 +68,7 @@ static const char *const tcp_conntrack_names[] = {
#define HOURS * 60 MINS
#define DAYS * 24 HOURS
-/* RFC1122 says the R2 limit should be at least 100 seconds.
- Linux uses 15 packets as limit, which corresponds
- to ~13-30min depending on RTO. */
-static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS;
-static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly = 5 MINS;
-
-static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = {
+static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {
[TCP_CONNTRACK_SYN_SENT] = 2 MINS,
[TCP_CONNTRACK_SYN_RECV] = 60 SECS,
[TCP_CONNTRACK_ESTABLISHED] = 5 DAYS,
@@ -80,6 +78,11 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = {
[TCP_CONNTRACK_TIME_WAIT] = 2 MINS,
[TCP_CONNTRACK_CLOSE] = 10 SECS,
[TCP_CONNTRACK_SYN_SENT2] = 2 MINS,
+/* RFC1122 says the R2 limit should be at least 100 seconds.
+ Linux uses 15 packets as limit, which corresponds
+ to ~13-30min depending on RTO. */
+ [TCP_CONNTRACK_RETRANS] = 5 MINS,
+ [TCP_CONNTRACK_UNACK] = 5 MINS,
};
#define sNO TCP_CONNTRACK_NONE
@@ -159,21 +162,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sCL -> sSS
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
-/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
* sNO -> sIV Too late and no reason to do anything
* sSS -> sIV Client can't send SYN and then SYN/ACK
* sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open
- * sSR -> sIG
- * sES -> sIG Error: SYNs in window outside the SYN_SENT state
- * are errors. Receiver will reply with RST
- * and close the connection.
- * Or we are not in sync and hold a dead connection.
- * sFW -> sIG
- * sCW -> sIG
- * sLA -> sIG
- * sTW -> sIG
- * sCL -> sIG
+ * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open
+ * sES -> sIV Invalid SYN/ACK packets sent by the client
+ * sFW -> sIV
+ * sCW -> sIV
+ * sLA -> sIV
+ * sTW -> sIV
+ * sCL -> sIV
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
@@ -227,11 +227,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sCL -> sIV
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
-/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
* sSS -> sSR Standard open.
* sS2 -> sSR Simultaneous open
- * sSR -> sSR Retransmitted SYN/ACK.
+ * sSR -> sIG Retransmitted SYN/ACK, ignore it.
* sES -> sIG Late retransmitted SYN/ACK?
* sFW -> sIG Might be SYN/ACK answering ignored SYN
* sCW -> sIG
@@ -271,6 +271,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
}
};
+static inline struct nf_tcp_net *tcp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.tcp;
+}
+
static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct nf_conntrack_tuple *tuple)
{
@@ -409,7 +414,7 @@ static void tcp_options(const struct sk_buff *skb,
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
- break; /* don't parse partial options */
+ return; /* don't parse partial options */
if (opcode == TCPOPT_SACK_PERM
&& opsize == TCPOLEN_SACK_PERM)
@@ -447,7 +452,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
BUG_ON(ptr == NULL);
/* Fast path for timestamp-only option */
- if (length == TCPOLEN_TSTAMP_ALIGNED*4
+ if (length == TCPOLEN_TSTAMP_ALIGNED
&& *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
| (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8)
@@ -469,7 +474,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
- break; /* don't parse partial options */
+ return; /* don't parse partial options */
if (opcode == TCPOPT_SACK
&& opsize >= (TCPOLEN_SACK_BASE
@@ -492,21 +497,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
}
}
-#ifdef CONFIG_NF_NAT_NEEDED
-static inline s16 nat_offset(const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- u32 seq)
-{
- typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset);
-
- return get_offset != NULL ? get_offset(ct, dir, seq) : 0;
-}
-#define NAT_OFFSET(pf, ct, dir, seq) \
- (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0)
-#else
-#define NAT_OFFSET(pf, ct, dir, seq) 0
-#endif
-
static bool tcp_in_window(const struct nf_conn *ct,
struct ip_ct_tcp *state,
enum ip_conntrack_dir dir,
@@ -517,12 +507,13 @@ static bool tcp_in_window(const struct nf_conn *ct,
u_int8_t pf)
{
struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
__u32 seq, ack, sack, end, win, swin;
- s16 receiver_offset;
- bool res;
+ s32 receiver_offset;
+ bool res, in_recv_win;
/*
* Get the required data from the packet.
@@ -536,7 +527,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
tcp_sack(skb, dataoff, tcph, &sack);
/* Take into account NAT sequence number mangling */
- receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1);
+ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
ack -= receiver_offset;
sack -= receiver_offset;
@@ -585,8 +576,8 @@ static bool tcp_in_window(const struct nf_conn *ct,
* Let's try to use the data from the packet.
*/
sender->td_end = end;
- win <<= sender->td_scale;
- sender->td_maxwin = (win == 0 ? 1 : win);
+ swin = win << sender->td_scale;
+ sender->td_maxwin = (swin == 0 ? 1 : swin);
sender->td_maxend = end + sender->td_maxwin;
/*
* We haven't seen traffic in the other direction yet
@@ -628,15 +619,9 @@ static bool tcp_in_window(const struct nf_conn *ct,
ack = sack = receiver->td_end;
}
- if (seq == end
- && (!tcph->rst
- || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+ if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
/*
- * Packets contains no data: we assume it is valid
- * and check the ack value only.
- * However RST segments are always validated by their
- * SEQ number, except when seq == 0 (reset sent answering
- * SYN.
+ * RST sent answering SYN.
*/
seq = end = sender->td_end;
@@ -651,14 +636,18 @@ static bool tcp_in_window(const struct nf_conn *ct,
receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
receiver->td_scale);
+ /* Is the ending sequence in the receive window (if available)? */
+ in_recv_win = !receiver->td_maxwin ||
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
before(seq, sender->td_maxend + 1),
- after(end, sender->td_end - receiver->td_maxwin - 1),
+ (in_recv_win ? 1 : 0),
before(sack, receiver->td_end + 1),
after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
if (before(seq, sender->td_maxend + 1) &&
- after(end, sender->td_end - receiver->td_maxwin - 1) &&
+ in_recv_win &&
before(sack, receiver->td_end + 1) &&
after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
/*
@@ -721,13 +710,13 @@ static bool tcp_in_window(const struct nf_conn *ct,
} else {
res = false;
if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
- nf_ct_tcp_be_liberal)
+ tn->tcp_be_liberal)
res = true;
if (!res && LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
- after(end, sender->td_end - receiver->td_maxwin - 1) ?
+ in_recv_win ?
before(sack, receiver->td_end + 1) ?
after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
: "ACK is under the lower bound (possible overly delayed ACK)"
@@ -776,7 +765,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: short packet ");
return -NF_ACCEPT;
}
@@ -784,7 +773,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -797,7 +786,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
@@ -806,7 +795,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -814,15 +803,22 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
return NF_ACCEPT;
}
+static unsigned int *tcp_get_timeouts(struct net *net)
+{
+ return tcp_pernet(net)->timeouts;
+}
+
/* Returns verdict for packet, or -1 for invalid. */
static int tcp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
enum ip_conntrack_dir dir;
@@ -946,16 +942,32 @@ static int tcp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_tcp: invalid packet ignored ");
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid packet ignored in "
+ "state %s ", tcp_conntrack_names[old_state]);
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
+ /* Special case for SYN proxy: when the SYN to the server or
+ * the SYN/ACK from the server is lost, the client may transmit
+ * a keep-alive packet while in SYN_SENT state. This needs to
+ * be associated with the original conntrack entry in order to
+ * generate a new SYN with the correct sequence number.
+ */
+ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
+ index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
+ ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
+ ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
+ pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ }
+
/* Invalid packet */
pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_CLOSE:
@@ -965,8 +977,8 @@ static int tcp_packet(struct nf_conn *ct,
/* Invalid RST */
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_tcp: invalid RST ");
+ nf_log_packet(net, pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid RST ");
return -NF_ACCEPT;
}
if (index == TCP_RST_SET
@@ -1014,15 +1026,15 @@ static int tcp_packet(struct nf_conn *ct,
&& new_state == TCP_CONNTRACK_FIN_WAIT)
ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
- if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans &&
- tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans)
- timeout = nf_ct_tcp_timeout_max_retrans;
+ if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
+ timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
+ timeout = timeouts[TCP_CONNTRACK_RETRANS];
else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
- tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged)
- timeout = nf_ct_tcp_timeout_unacknowledged;
+ timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
+ timeout = timeouts[TCP_CONNTRACK_UNACK];
else
- timeout = tcp_timeouts[new_state];
+ timeout = timeouts[new_state];
spin_unlock_bh(&ct->lock);
if (new_state != old_state)
@@ -1037,6 +1049,12 @@ static int tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
+ /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
+ * pickup with loose=1. Avoid large ESTABLISHED timeout.
+ */
+ if (new_state == TCP_CONNTRACK_ESTABLISHED &&
+ timeout > timeouts[TCP_CONNTRACK_UNACK])
+ timeout = timeouts[TCP_CONNTRACK_UNACK];
} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
@@ -1054,11 +1072,13 @@ static int tcp_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
enum tcp_conntrack new_state;
const struct tcphdr *th;
struct tcphdr _tcph;
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
@@ -1066,9 +1086,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
BUG_ON(th == NULL);
/* Don't need lock here: this conntrack not in circulation yet */
- new_state
- = tcp_conntracks[0][get_conntrack_index(th)]
- [TCP_CONNTRACK_NONE];
+ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
@@ -1077,6 +1095,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
}
if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/* SYN packet */
ct->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
@@ -1088,11 +1107,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.tcp.seen[0].td_end;
tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
- ct->proto.tcp.seen[1].flags = 0;
- } else if (nf_ct_tcp_loose == 0) {
+ } else if (tn->tcp_loose == 0) {
/* Don't try to pick up connections. */
return false;
} else {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/*
* We are in the middle of a connection,
* its history is lost for us.
@@ -1107,7 +1126,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.tcp.seen[0].td_maxend =
ct->proto.tcp.seen[0].td_end +
ct->proto.tcp.seen[0].td_maxwin;
- ct->proto.tcp.seen[0].td_scale = 0;
/* We assume SACK and liberal window checking to handle
* window scaling */
@@ -1116,13 +1134,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
IP_CT_TCP_FLAG_BE_LIBERAL;
}
- ct->proto.tcp.seen[1].td_end = 0;
- ct->proto.tcp.seen[1].td_maxend = 0;
- ct->proto.tcp.seen[1].td_maxwin = 0;
- ct->proto.tcp.seen[1].td_scale = 0;
-
/* tcp_packet will set them */
- ct->proto.tcp.state = TCP_CONNTRACK_NONE;
ct->proto.tcp.last_index = TCP_NONE_SET;
pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
@@ -1134,7 +1146,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -1150,21 +1162,22 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state);
-
- NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
- ct->proto.tcp.seen[0].td_scale);
-
- NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
- ct->proto.tcp.seen[1].td_scale);
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
+ nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ ct->proto.tcp.seen[0].td_scale) ||
+ nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
+ ct->proto.tcp.seen[1].td_scale))
+ goto nla_put_failure;
tmp.flags = ct->proto.tcp.seen[0].flags;
- NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
- sizeof(struct nf_ct_tcp_flags), &tmp);
+ if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
+ sizeof(struct nf_ct_tcp_flags), &tmp))
+ goto nla_put_failure;
tmp.flags = ct->proto.tcp.seen[1].flags;
- NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
- sizeof(struct nf_ct_tcp_flags), &tmp);
+ if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
+ sizeof(struct nf_ct_tcp_flags), &tmp))
+ goto nla_put_failure;
spin_unlock_bh(&ct->lock);
nla_nest_end(skb, nest_parms);
@@ -1247,97 +1260,194 @@ static int tcp_nlattr_tuple_size(void)
}
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ int i;
+
+ /* set default TCP timeouts. */
+ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ timeouts[i] = tn->timeouts[i];
+
+ if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
+ timeouts[TCP_CONNTRACK_SYN_SENT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
+ timeouts[TCP_CONNTRACK_SYN_RECV] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
+ timeouts[TCP_CONNTRACK_ESTABLISHED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
+ timeouts[TCP_CONNTRACK_FIN_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
+ timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
+ timeouts[TCP_CONNTRACK_LAST_ACK] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
+ timeouts[TCP_CONNTRACK_TIME_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
+ timeouts[TCP_CONNTRACK_CLOSE] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
+ timeouts[TCP_CONNTRACK_SYN_SENT2] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
+ timeouts[TCP_CONNTRACK_RETRANS] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_UNACK]) {
+ timeouts[TCP_CONNTRACK_UNACK] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
+ }
+ return 0;
+}
+
+static int
+tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
+ htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
+ htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
+ htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
+ htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
+ htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
+ htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
+ htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
+ htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
+ [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
-static unsigned int tcp_sysctl_table_users;
-static struct ctl_table_header *tcp_sysctl_header;
static struct ctl_table tcp_sysctl_table[] = {
{
.procname = "nf_conntrack_tcp_timeout_syn_sent",
- .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_syn_recv",
- .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_established",
- .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_fin_wait",
- .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_close_wait",
- .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_last_ack",
- .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_time_wait",
- .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_close",
- .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_max_retrans",
- .data = &nf_ct_tcp_timeout_max_retrans,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_timeout_unacknowledged",
- .data = &nf_ct_tcp_timeout_unacknowledged,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_tcp_loose",
- .data = &nf_ct_tcp_loose,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "nf_conntrack_tcp_be_liberal",
- .data = &nf_ct_tcp_be_liberal,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "nf_conntrack_tcp_max_retrans",
- .data = &nf_ct_tcp_max_retrans,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -1349,91 +1459,78 @@ static struct ctl_table tcp_sysctl_table[] = {
static struct ctl_table tcp_compat_sysctl_table[] = {
{
.procname = "ip_conntrack_tcp_timeout_syn_sent",
- .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_syn_sent2",
- .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_syn_recv",
- .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_established",
- .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_fin_wait",
- .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_close_wait",
- .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_last_ack",
- .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_time_wait",
- .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_close",
- .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE],
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_timeout_max_retrans",
- .data = &nf_ct_tcp_timeout_max_retrans,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_tcp_loose",
- .data = &nf_ct_tcp_loose,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_tcp_be_liberal",
- .data = &nf_ct_tcp_be_liberal,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_tcp_max_retrans",
- .data = &nf_ct_tcp_max_retrans,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -1443,6 +1540,101 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
+static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_tcp_net *tn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(tcp_sysctl_table,
+ sizeof(tcp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
+ pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
+ pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
+ pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
+ pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
+ pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
+ pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
+ pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
+ pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK];
+ pn->ctl_table[10].data = &tn->tcp_loose;
+ pn->ctl_table[11].data = &tn->tcp_be_liberal;
+ pn->ctl_table[12].data = &tn->tcp_max_retrans;
+#endif
+ return 0;
+}
+
+static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_tcp_net *tn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
+ sizeof(tcp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
+ pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
+ pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
+ pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
+ pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
+ pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
+ pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
+ pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
+ pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
+ pn->ctl_compat_table[10].data = &tn->tcp_loose;
+ pn->ctl_compat_table[11].data = &tn->tcp_be_liberal;
+ pn->ctl_compat_table[12].data = &tn->tcp_max_retrans;
+#endif
+#endif
+ return 0;
+}
+
+static int tcp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ struct nf_proto_net *pn = &tn->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ tn->timeouts[i] = tcp_timeouts[i];
+
+ tn->tcp_loose = nf_ct_tcp_loose;
+ tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
+ tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
+ }
+
+ if (proto == AF_INET) {
+ ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
+ if (ret < 0)
+ return ret;
+
+ ret = tcp_kmemdup_sysctl_table(pn, tn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = tcp_kmemdup_sysctl_table(pn, tn);
+
+ return ret;
+}
+
+static struct nf_proto_net *tcp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.tcp.pn;
+}
+
struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
{
.l3proto = PF_INET,
@@ -1453,9 +1645,10 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
.print_tuple = tcp_print_tuple,
.print_conntrack = tcp_print_conntrack,
.packet = tcp_packet,
+ .get_timeouts = tcp_get_timeouts,
.new = tcp_new,
.error = tcp_error,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = tcp_to_nlattr,
.nlattr_size = tcp_nlattr_size,
.from_nlattr = nlattr_to_tcp,
@@ -1464,14 +1657,18 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
.nlattr_tuple_size = tcp_nlattr_tuple_size,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &tcp_sysctl_table_users,
- .ctl_table_header = &tcp_sysctl_header,
- .ctl_table = tcp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = tcp_compat_sysctl_table,
-#endif
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_TCP_MAX,
+ .obj_size = sizeof(unsigned int) *
+ TCP_CONNTRACK_TIMEOUT_MAX,
+ .nla_policy = tcp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = tcp_init_net,
+ .get_net_proto = tcp_get_net_proto,
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
@@ -1485,9 +1682,10 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
.print_tuple = tcp_print_tuple,
.print_conntrack = tcp_print_conntrack,
.packet = tcp_packet,
+ .get_timeouts = tcp_get_timeouts,
.new = tcp_new,
.error = tcp_error,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = tcp_to_nlattr,
.nlattr_size = tcp_nlattr_size,
.from_nlattr = nlattr_to_tcp,
@@ -1496,10 +1694,17 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
.nlattr_tuple_size = tcp_nlattr_tuple_size,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &tcp_sysctl_table_users,
- .ctl_table_header = &tcp_sysctl_header,
- .ctl_table = tcp_sysctl_table,
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_TCP_MAX,
+ .obj_size = sizeof(unsigned int) *
+ TCP_CONNTRACK_TIMEOUT_MAX,
+ .nla_policy = tcp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = tcp_init_net,
+ .get_net_proto = tcp_get_net_proto,
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 8289088b821..9d7721cbce4 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -1,5 +1,6 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -25,8 +26,15 @@
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ;
-static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ;
+static unsigned int udp_timeouts[UDP_CT_MAX] = {
+ [UDP_CT_UNREPLIED] = 30*HZ,
+ [UDP_CT_REPLIED] = 180*HZ,
+};
+
+static inline struct nf_udp_net *udp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.udp;
+}
static bool udp_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
@@ -63,30 +71,38 @@ static int udp_print_tuple(struct seq_file *s,
ntohs(tuple->dst.u.udp.port));
}
+static unsigned int *udp_get_timeouts(struct net *net)
+{
+ return udp_pernet(net)->timeouts;
+}
+
/* Returns verdict for packet, and may modify conntracktype */
static int udp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
/* If we've seen traffic both ways, this is some kind of UDP
stream. Extend timeout. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream);
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_REPLIED]);
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
- } else
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout);
-
+ } else {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_UNREPLIED]);
+ }
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
return true;
}
@@ -104,7 +120,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: short packet ");
return -NF_ACCEPT;
}
@@ -112,7 +128,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -128,7 +144,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
@@ -136,20 +152,65 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return NF_ACCEPT;
}
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct nf_udp_net *un = udp_pernet(net);
+
+ /* set default timeouts for UDP. */
+ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
+ timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) {
+ timeouts[UDP_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_UDP_REPLIED]) {
+ timeouts[UDP_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ;
+ }
+ return 0;
+}
+
+static int
+udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED,
+ htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED,
+ htonl(timeouts[UDP_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
+ [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
-static unsigned int udp_sysctl_table_users;
-static struct ctl_table_header *udp_sysctl_header;
static struct ctl_table udp_sysctl_table[] = {
{
.procname = "nf_conntrack_udp_timeout",
- .data = &nf_ct_udp_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_udp_timeout_stream",
- .data = &nf_ct_udp_timeout_stream,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -160,14 +221,12 @@ static struct ctl_table udp_sysctl_table[] = {
static struct ctl_table udp_compat_sysctl_table[] = {
{
.procname = "ip_conntrack_udp_timeout",
- .data = &nf_ct_udp_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ip_conntrack_udp_timeout_stream",
- .data = &nf_ct_udp_timeout_stream,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -177,6 +236,73 @@ static struct ctl_table udp_compat_sysctl_table[] = {
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
+static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_udp_net *un)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+ pn->ctl_table = kmemdup(udp_sysctl_table,
+ sizeof(udp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+ pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
+ pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED];
+#endif
+ return 0;
+}
+
+static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_udp_net *un)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
+ sizeof(udp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
+ pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED];
+#endif
+#endif
+ return 0;
+}
+
+static int udp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_udp_net *un = udp_pernet(net);
+ struct nf_proto_net *pn = &un->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < UDP_CT_MAX; i++)
+ un->timeouts[i] = udp_timeouts[i];
+ }
+
+ if (proto == AF_INET) {
+ ret = udp_kmemdup_compat_sysctl_table(pn, un);
+ if (ret < 0)
+ return ret;
+
+ ret = udp_kmemdup_sysctl_table(pn, un);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = udp_kmemdup_sysctl_table(pn, un);
+
+ return ret;
+}
+
+static struct nf_proto_net *udp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.udp.pn;
+}
+
struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
{
.l3proto = PF_INET,
@@ -186,22 +312,26 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
.packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
.new = udp_new,
.error = udp_error,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &udp_sysctl_table_users,
- .ctl_table_header = &udp_sysctl_header,
- .ctl_table = udp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = udp_compat_sysctl_table,
-#endif
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
@@ -214,18 +344,25 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
.packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
.new = udp_new,
.error = udp_error,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &udp_sysctl_table_users,
- .ctl_table_header = &udp_sysctl_header,
- .ctl_table = udp_sysctl_table,
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 263b5a72588..2750e6c69f8 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -24,8 +24,27 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_log.h>
-static unsigned int nf_ct_udplite_timeout __read_mostly = 30*HZ;
-static unsigned int nf_ct_udplite_timeout_stream __read_mostly = 180*HZ;
+enum udplite_conntrack {
+ UDPLITE_CT_UNREPLIED,
+ UDPLITE_CT_REPLIED,
+ UDPLITE_CT_MAX
+};
+
+static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
+ [UDPLITE_CT_UNREPLIED] = 30*HZ,
+ [UDPLITE_CT_REPLIED] = 180*HZ,
+};
+
+static int udplite_net_id __read_mostly;
+struct udplite_net {
+ struct nf_proto_net pn;
+ unsigned int timeouts[UDPLITE_CT_MAX];
+};
+
+static inline struct udplite_net *udplite_pernet(struct net *net)
+{
+ return net_generic(net, udplite_net_id);
+}
static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
@@ -60,31 +79,38 @@ static int udplite_print_tuple(struct seq_file *s,
ntohs(tuple->dst.u.udp.port));
}
+static unsigned int *udplite_get_timeouts(struct net *net)
+{
+ return udplite_pernet(net)->timeouts;
+}
+
/* Returns verdict for packet, and may modify conntracktype */
static int udplite_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
/* If we've seen traffic both ways, this is some kind of UDP
stream. Extend timeout. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
nf_ct_refresh_acct(ct, ctinfo, skb,
- nf_ct_udplite_timeout_stream);
+ timeouts[UDPLITE_CT_REPLIED]);
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
- } else
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout);
-
+ } else {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDPLITE_CT_UNREPLIED]);
+ }
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
return true;
}
@@ -105,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: short packet ");
return -NF_ACCEPT;
}
@@ -115,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
cscov = udplen;
else if (cscov < sizeof(*hdr) || cscov > udplen) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: invalid checksum coverage ");
return -NF_ACCEPT;
}
@@ -123,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
/* UDPLITE mandates checksums */
if (!hdr->check) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: checksum missing ");
return -NF_ACCEPT;
}
@@ -133,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
pf)) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: bad UDPLite checksum ");
return -NF_ACCEPT;
}
@@ -141,20 +167,65 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
return NF_ACCEPT;
}
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct udplite_net *un = udplite_pernet(net);
+
+ /* set default timeouts for UDPlite. */
+ timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
+ timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
+ timeouts[UDPLITE_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
+ timeouts[UDPLITE_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
+ }
+ return 0;
+}
+
+static int
+udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
+ htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
+ htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
+ [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
-static unsigned int udplite_sysctl_table_users;
-static struct ctl_table_header *udplite_sysctl_header;
static struct ctl_table udplite_sysctl_table[] = {
{
.procname = "nf_conntrack_udplite_timeout",
- .data = &nf_ct_udplite_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_udplite_timeout_stream",
- .data = &nf_ct_udplite_timeout_stream,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -163,6 +234,40 @@ static struct ctl_table udplite_sysctl_table[] = {
};
#endif /* CONFIG_SYSCTL */
+static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct udplite_net *un)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(udplite_sysctl_table,
+ sizeof(udplite_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
+ pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
+#endif
+ return 0;
+}
+
+static int udplite_init_net(struct net *net, u_int16_t proto)
+{
+ struct udplite_net *un = udplite_pernet(net);
+ struct nf_proto_net *pn = &un->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0 ; i < UDPLITE_CT_MAX; i++)
+ un->timeouts[i] = udplite_timeouts[i];
+ }
+
+ return udplite_kmemdup_sysctl_table(pn, un);
+}
+
static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
{
.l3proto = PF_INET,
@@ -172,19 +277,27 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
.packet = udplite_packet,
+ .get_timeouts = udplite_get_timeouts,
.new = udplite_new,
.error = udplite_error,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &udplite_sysctl_table_users,
- .ctl_table_header = &udplite_sysctl_header,
- .ctl_table = udplite_sysctl_table,
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
+ .obj_size = sizeof(unsigned int) *
+ CTA_TIMEOUT_UDPLITE_MAX,
+ .nla_policy = udplite_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &udplite_net_id,
+ .init_net = udplite_init_net,
};
static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
@@ -196,42 +309,94 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
.packet = udplite_packet,
+ .get_timeouts = udplite_get_timeouts,
.new = udplite_new,
.error = udplite_error,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_users = &udplite_sysctl_table_users,
- .ctl_table_header = &udplite_sysctl_header,
- .ctl_table = udplite_sysctl_table,
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
+ .obj_size = sizeof(unsigned int) *
+ CTA_TIMEOUT_UDPLITE_MAX,
+ .nla_policy = udplite_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &udplite_net_id,
+ .init_net = udplite_init_net,
+};
+
+static int udplite_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udplite4: pernet registration failed.\n");
+ goto out;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udplite6: pernet registration failed.\n");
+ goto cleanup_udplite4;
+ }
+ return 0;
+
+cleanup_udplite4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
+out:
+ return ret;
+}
+
+static void udplite_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
+}
+
+static struct pernet_operations udplite_net_ops = {
+ .init = udplite_net_init,
+ .exit = udplite_net_exit,
+ .id = &udplite_net_id,
+ .size = sizeof(struct udplite_net),
};
static int __init nf_conntrack_proto_udplite_init(void)
{
- int err;
+ int ret;
+
+ ret = register_pernet_subsys(&udplite_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
+ if (ret < 0)
+ goto out_udplite4;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6);
+ if (ret < 0)
+ goto out_udplite6;
- err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4);
- if (err < 0)
- goto err1;
- err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6);
- if (err < 0)
- goto err2;
return 0;
-err2:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
-err1:
- return err;
+out_udplite6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+out_udplite4:
+ unregister_pernet_subsys(&udplite_net_ops);
+out_pernet:
+ return ret;
}
static void __exit nf_conntrack_proto_udplite_exit(void)
{
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+ unregister_pernet_subsys(&udplite_net_ops);
}
module_init(nf_conntrack_proto_udplite_init);
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index d9e27734b2a..4a2134fd3fc 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -69,16 +69,15 @@ static int help(struct sk_buff *skb,
void *sb_ptr;
int ret = NF_ACCEPT;
int dir = CTINFO2DIR(ctinfo);
- struct nf_ct_sane_master *ct_sane_info;
+ struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple *tuple;
struct sane_request *req;
struct sane_reply_net_start *reply;
- ct_sane_info = &nfct_help(ct)->help.ct_sane_info;
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY)
+ ctinfo != IP_CT_ESTABLISHED_REPLY)
return NF_ACCEPT;
/* Not a full tcp header? */
@@ -139,6 +138,7 @@ static int help(struct sk_buff *skb,
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
ret = NF_DROP;
goto out;
}
@@ -152,8 +152,10 @@ static int help(struct sk_buff *skb,
nf_ct_dump_tuple(&exp->tuple);
/* Can't expect this? Best to drop packet now. */
- if (nf_ct_expect_related(exp) != 0)
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
+ }
nf_ct_expect_put(exp);
@@ -163,7 +165,6 @@ out:
}
static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly;
-static char sane_names[MAX_PORTS][2][sizeof("sane-65535")] __read_mostly;
static const struct nf_conntrack_expect_policy sane_exp_policy = {
.max_expected = 1,
@@ -190,7 +191,6 @@ static void nf_conntrack_sane_fini(void)
static int __init nf_conntrack_sane_init(void)
{
int i, j = -1, ret = 0;
- char *tmpname;
sane_buffer = kmalloc(65536, GFP_KERNEL);
if (!sane_buffer)
@@ -205,17 +205,16 @@ static int __init nf_conntrack_sane_init(void)
sane[i][0].tuple.src.l3num = PF_INET;
sane[i][1].tuple.src.l3num = PF_INET6;
for (j = 0; j < 2; j++) {
+ sane[i][j].data_len = sizeof(struct nf_ct_sane_master);
sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
sane[i][j].expect_policy = &sane_exp_policy;
sane[i][j].me = THIS_MODULE;
sane[i][j].help = help;
- tmpname = &sane_names[i][j][0];
if (ports[i] == SANE_PORT)
- sprintf(tmpname, "sane");
+ sprintf(sane[i][j].name, "sane");
else
- sprintf(tmpname, "sane-%d", ports[i]);
- sane[i][j].name = tmpname;
+ sprintf(sane[i][j].name, "sane-%d", ports[i]);
pr_debug("nf_ct_sane: registering helper for pf: %d "
"port: %d\n",
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
new file mode 100644
index 00000000000..f6e2ae91a80
--- /dev/null
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -0,0 +1,243 @@
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+
+int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ s32 off)
+{
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_conn_seqadj *seqadj;
+ struct nf_ct_seqadj *this_way;
+
+ if (off == 0)
+ return 0;
+
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+ seqadj = nfct_seqadj(ct);
+ this_way = &seqadj->seq[dir];
+ this_way->offset_before = off;
+ this_way->offset_after = off;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);
+
+int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ __be32 seq, s32 off)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_seqadj *this_way;
+
+ if (off == 0)
+ return 0;
+
+ if (unlikely(!seqadj)) {
+ WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
+ return 0;
+ }
+
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+ spin_lock_bh(&ct->lock);
+ this_way = &seqadj->seq[dir];
+ if (this_way->offset_before == this_way->offset_after ||
+ before(this_way->correction_pos, ntohl(seq))) {
+ this_way->correction_pos = ntohl(seq);
+ this_way->offset_before = this_way->offset_after;
+ this_way->offset_after += off;
+ }
+ spin_unlock_bh(&ct->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);
+
+void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ s32 off)
+{
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP)
+ return;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
+ nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);
+
+/* Adjust one found SACK option including checksum correction */
+static void nf_ct_sack_block_adjust(struct sk_buff *skb,
+ struct tcphdr *tcph,
+ unsigned int sackoff,
+ unsigned int sackend,
+ struct nf_ct_seqadj *seq)
+{
+ while (sackoff < sackend) {
+ struct tcp_sack_block_wire *sack;
+ __be32 new_start_seq, new_end_seq;
+
+ sack = (void *)skb->data + sackoff;
+ if (after(ntohl(sack->start_seq) - seq->offset_before,
+ seq->correction_pos))
+ new_start_seq = htonl(ntohl(sack->start_seq) -
+ seq->offset_after);
+ else
+ new_start_seq = htonl(ntohl(sack->start_seq) -
+ seq->offset_before);
+
+ if (after(ntohl(sack->end_seq) - seq->offset_before,
+ seq->correction_pos))
+ new_end_seq = htonl(ntohl(sack->end_seq) -
+ seq->offset_after);
+ else
+ new_end_seq = htonl(ntohl(sack->end_seq) -
+ seq->offset_before);
+
+ pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
+ ntohl(sack->start_seq), new_start_seq,
+ ntohl(sack->end_seq), new_end_seq);
+
+ inet_proto_csum_replace4(&tcph->check, skb,
+ sack->start_seq, new_start_seq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb,
+ sack->end_seq, new_end_seq, 0);
+ sack->start_seq = new_start_seq;
+ sack->end_seq = new_end_seq;
+ sackoff += sizeof(*sack);
+ }
+}
+
+/* TCP SACK sequence number adjustment */
+static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
+ unsigned int protoff,
+ struct tcphdr *tcph,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dir, optoff, optend;
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+
+ optoff = protoff + sizeof(struct tcphdr);
+ optend = protoff + tcph->doff * 4;
+
+ if (!skb_make_writable(skb, optend))
+ return 0;
+
+ dir = CTINFO2DIR(ctinfo);
+
+ while (optoff < optend) {
+ /* Usually: option, length. */
+ unsigned char *op = skb->data + optoff;
+
+ switch (op[0]) {
+ case TCPOPT_EOL:
+ return 1;
+ case TCPOPT_NOP:
+ optoff++;
+ continue;
+ default:
+ /* no partial options */
+ if (optoff + 1 == optend ||
+ optoff + op[1] > optend ||
+ op[1] < 2)
+ return 0;
+ if (op[0] == TCPOPT_SACK &&
+ op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
+ ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+ nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
+ optoff+op[1],
+ &seqadj->seq[!dir]);
+ optoff += op[1];
+ }
+ }
+ return 1;
+}
+
+/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
+int nf_ct_seq_adjust(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff)
+{
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct tcphdr *tcph;
+ __be32 newseq, newack;
+ s32 seqoff, ackoff;
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *this_way, *other_way;
+ int res;
+
+ this_way = &seqadj->seq[dir];
+ other_way = &seqadj->seq[!dir];
+
+ if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+ return 0;
+
+ tcph = (void *)skb->data + protoff;
+ spin_lock_bh(&ct->lock);
+ if (after(ntohl(tcph->seq), this_way->correction_pos))
+ seqoff = this_way->offset_after;
+ else
+ seqoff = this_way->offset_before;
+
+ if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+ other_way->correction_pos))
+ ackoff = other_way->offset_after;
+ else
+ ackoff = other_way->offset_before;
+
+ newseq = htonl(ntohl(tcph->seq) + seqoff);
+ newack = htonl(ntohl(tcph->ack_seq) - ackoff);
+
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+
+ pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+ ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+ ntohl(newack));
+
+ tcph->seq = newseq;
+ tcph->ack_seq = newack;
+
+ res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+ spin_unlock_bh(&ct->lock);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);
+
+s32 nf_ct_seq_offset(const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ u32 seq)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *this_way;
+
+ if (!seqadj)
+ return 0;
+
+ this_way = &seqadj->seq[dir];
+ return after(seq, this_way->correction_pos) ?
+ this_way->offset_after : this_way->offset_before;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
+
+static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_seqadj),
+ .align = __alignof__(struct nf_conn_seqadj),
+ .id = NF_CT_EXT_SEQADJ,
+};
+
+int nf_conntrack_seqadj_init(void)
+{
+ return nf_ct_extend_register(&nf_ct_seqadj_extend);
+}
+
+void nf_conntrack_seqadj_fini(void)
+{
+ nf_ct_extend_unregister(&nf_ct_seqadj_extend);
+}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index bcf47eb518e..4c3ba1c8d68 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -52,60 +52,8 @@ module_param(sip_direct_media, int, 0600);
MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling "
"endpoints only (default 1)");
-unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr,
- unsigned int *datalen) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sip_hook);
-
-void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, s16 off) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook);
-
-unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb,
- unsigned int dataoff,
- const char **dptr,
- unsigned int *datalen,
- struct nf_conntrack_expect *exp,
- unsigned int matchoff,
- unsigned int matchlen) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook);
-
-unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr,
- unsigned int *datalen,
- unsigned int sdpoff,
- enum sdp_header_types type,
- enum sdp_header_types term,
- const union nf_inet_addr *addr)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook);
-
-unsigned int (*nf_nat_sdp_port_hook)(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr,
- unsigned int *datalen,
- unsigned int matchoff,
- unsigned int matchlen,
- u_int16_t port) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook);
-
-unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb,
- unsigned int dataoff,
- const char **dptr,
- unsigned int *datalen,
- unsigned int sdpoff,
- const union nf_inet_addr *addr)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook);
-
-unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr,
- unsigned int *datalen,
- struct nf_conntrack_expect *rtp_exp,
- struct nf_conntrack_expect *rtcp_exp,
- unsigned int mediaoff,
- unsigned int medialen,
- union nf_inet_addr *rtp_addr)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook);
+const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
static int string_len(const struct nf_conn *ct, const char *dptr,
const char *limit, int *shift)
@@ -183,12 +131,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr,
return len + digits_len(ct, dptr, limit, shift);
}
-static int parse_addr(const struct nf_conn *ct, const char *cp,
- const char **endp, union nf_inet_addr *addr,
- const char *limit)
+static int sip_parse_addr(const struct nf_conn *ct, const char *cp,
+ const char **endp, union nf_inet_addr *addr,
+ const char *limit, bool delim)
{
const char *end;
- int ret = 0;
+ int ret;
if (!ct)
return 0;
@@ -197,16 +145,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
switch (nf_ct_l3num(ct)) {
case AF_INET:
ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
+ if (ret == 0)
+ return 0;
break;
case AF_INET6:
+ if (cp < limit && *cp == '[')
+ cp++;
+ else if (delim)
+ return 0;
+
ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
+ if (ret == 0)
+ return 0;
+
+ if (end < limit && *end == ']')
+ end++;
+ else if (delim)
+ return 0;
break;
default:
BUG();
}
- if (ret == 0 || end == cp)
- return 0;
if (endp)
*endp = end;
return 1;
@@ -219,7 +179,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr,
union nf_inet_addr addr;
const char *aux = dptr;
- if (!parse_addr(ct, dptr, &dptr, &addr, limit)) {
+ if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) {
pr_debug("ip: %s parse failed.!\n", dptr);
return 0;
}
@@ -296,7 +256,7 @@ int ct_sip_parse_request(const struct nf_conn *ct,
return 0;
dptr += shift;
- if (!parse_addr(ct, dptr, &end, addr, limit))
+ if (!sip_parse_addr(ct, dptr, &end, addr, limit, true))
return -1;
if (end < limit && *end == ':') {
end++;
@@ -550,7 +510,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
if (ret == 0)
return ret;
- if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit))
+ if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true))
return -1;
if (*c == ':') {
c++;
@@ -599,7 +559,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
unsigned int dataoff, unsigned int datalen,
const char *name,
unsigned int *matchoff, unsigned int *matchlen,
- union nf_inet_addr *addr)
+ union nf_inet_addr *addr, bool delim)
{
const char *limit = dptr + datalen;
const char *start, *end;
@@ -613,7 +573,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
return 0;
start += strlen(name);
- if (!parse_addr(ct, start, &end, addr, limit))
+ if (!sip_parse_addr(ct, start, &end, addr, limit, delim))
return 0;
*matchoff = start - dptr;
*matchlen = end - start;
@@ -675,6 +635,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
return 1;
}
+static int sdp_parse_addr(const struct nf_conn *ct, const char *cp,
+ const char **endp, union nf_inet_addr *addr,
+ const char *limit)
+{
+ const char *end;
+ int ret;
+
+ memset(addr, 0, sizeof(*addr));
+ switch (nf_ct_l3num(ct)) {
+ case AF_INET:
+ ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
+ break;
+ case AF_INET6:
+ ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
+ break;
+ default:
+ BUG();
+ }
+
+ if (ret == 0)
+ return 0;
+ if (endp)
+ *endp = end;
+ return 1;
+}
+
+/* skip ip address. returns its length. */
+static int sdp_addr_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ union nf_inet_addr addr;
+ const char *aux = dptr;
+
+ if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) {
+ pr_debug("ip: %s parse failed.!\n", dptr);
+ return 0;
+ }
+
+ return dptr - aux;
+}
+
/* SDP header parsing: a SDP session description contains an ordered set of
* headers, starting with a section containing general session parameters,
* optionally followed by multiple media descriptions.
@@ -684,13 +685,18 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
* be tolerant and also accept records terminated with a single newline
* character". We handle both cases.
*/
-static const struct sip_header ct_sdp_hdrs[] = {
- [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
- [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len),
- [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len),
- [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len),
- [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len),
- [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
+static const struct sip_header ct_sdp_hdrs_v4[] = {
+ [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
+ [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len),
+ [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len),
+ [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
+};
+
+static const struct sip_header ct_sdp_hdrs_v6[] = {
+ [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
+ [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len),
+ [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len),
+ [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
};
/* Linear string search within SDP header values */
@@ -707,7 +713,7 @@ static const char *ct_sdp_header_search(const char *dptr, const char *limit,
}
/* Locate a SDP header (optionally a substring within the header value),
- * optionally stopping at the first occurence of the term header, parse
+ * optionally stopping at the first occurrence of the term header, parse
* it and return the offset and length of the data we're interested in.
*/
int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr,
@@ -716,11 +722,14 @@ int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr,
enum sdp_header_types term,
unsigned int *matchoff, unsigned int *matchlen)
{
- const struct sip_header *hdr = &ct_sdp_hdrs[type];
- const struct sip_header *thdr = &ct_sdp_hdrs[term];
+ const struct sip_header *hdrs, *hdr, *thdr;
const char *start = dptr, *limit = dptr + datalen;
int shift = 0;
+ hdrs = nf_ct_l3num(ct) == NFPROTO_IPV4 ? ct_sdp_hdrs_v4 : ct_sdp_hdrs_v6;
+ hdr = &hdrs[type];
+ thdr = &hdrs[term];
+
for (dptr += dataoff; dptr < limit; dptr++) {
/* Find beginning of line */
if (*dptr != '\r' && *dptr != '\n')
@@ -775,8 +784,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr,
if (ret <= 0)
return ret;
- if (!parse_addr(ct, dptr + *matchoff, NULL, addr,
- dptr + *matchoff + *matchlen))
+ if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr,
+ dptr + *matchoff + *matchlen))
return -1;
return 1;
}
@@ -788,11 +797,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
{
struct nf_conn_help *help = nfct_help(ct);
struct nf_conntrack_expect *exp;
- struct hlist_node *n, *next;
+ struct hlist_node *next;
int found = 0;
- spin_lock_bh(&nf_conntrack_lock);
- hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
if (exp->class != SIP_EXPECT_SIGNALLING ||
!nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) ||
exp->tuple.dst.protonum != proto ||
@@ -806,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
found = 1;
break;
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
return found;
}
@@ -814,10 +823,10 @@ static void flush_expectations(struct nf_conn *ct, bool media)
{
struct nf_conn_help *help = nfct_help(ct);
struct nf_conntrack_expect *exp;
- struct hlist_node *n, *next;
+ struct hlist_node *next;
- spin_lock_bh(&nf_conntrack_lock);
- hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)
continue;
if (!del_timer(&exp->timeout))
@@ -827,10 +836,11 @@ static void flush_expectations(struct nf_conn *ct, bool media)
if (!media)
break;
}
- spin_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
-static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
+static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
union nf_inet_addr *daddr, __be16 port,
enum sip_expectation_classes class,
@@ -846,8 +856,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
int direct_rtp = 0, skip_expect = 0, ret = NF_DROP;
u_int16_t base_port;
__be16 rtp_port, rtcp_port;
- typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port;
- typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media;
+ const struct nf_nat_sip_hooks *hooks;
saddr = NULL;
if (sip_direct_media) {
@@ -886,34 +895,35 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
exp->class != class)
break;
#ifdef CONFIG_NF_NAT_NEEDED
- if (exp->tuple.src.l3num == AF_INET && !direct_rtp &&
- (exp->saved_ip != exp->tuple.dst.u3.ip ||
+ if (!direct_rtp &&
+ (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) ||
exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&
ct->status & IPS_NAT_MASK) {
- daddr->ip = exp->saved_ip;
- tuple.dst.u3.ip = exp->saved_ip;
+ *daddr = exp->saved_addr;
+ tuple.dst.u3 = exp->saved_addr;
tuple.dst.u.udp.port = exp->saved_proto.udp.port;
direct_rtp = 1;
} else
#endif
skip_expect = 1;
} while (!skip_expect);
- rcu_read_unlock();
base_port = ntohs(tuple.dst.u.udp.port) & ~1;
rtp_port = htons(base_port);
rtcp_port = htons(base_port + 1);
if (direct_rtp) {
- nf_nat_sdp_port = rcu_dereference(nf_nat_sdp_port_hook);
- if (nf_nat_sdp_port &&
- !nf_nat_sdp_port(skb, dataoff, dptr, datalen,
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks &&
+ !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen,
mediaoff, medialen, ntohs(rtp_port)))
goto err1;
}
- if (skip_expect)
+ if (skip_expect) {
+ rcu_read_unlock();
return NF_ACCEPT;
+ }
rtp_exp = nf_ct_expect_alloc(ct);
if (rtp_exp == NULL)
@@ -927,10 +937,10 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr,
IPPROTO_UDP, NULL, &rtcp_port);
- nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook);
- if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp)
- ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen,
- rtp_exp, rtcp_exp,
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp)
+ ret = hooks->sdp_media(skb, protoff, dataoff, dptr,
+ datalen, rtp_exp, rtcp_exp,
mediaoff, medialen, daddr);
else {
if (nf_ct_expect_related(rtp_exp) == 0) {
@@ -944,6 +954,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
err2:
nf_ct_expect_put(rtp_exp);
err1:
+ rcu_read_unlock();
return ret;
}
@@ -970,7 +981,8 @@ static const struct sdp_media_type *sdp_media_type(const char *dptr,
return NULL;
}
-static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
+static int process_sdp(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq)
{
@@ -982,16 +994,12 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
unsigned int caddr_len, maddr_len;
unsigned int i;
union nf_inet_addr caddr, maddr, rtp_addr;
+ const struct nf_nat_sip_hooks *hooks;
unsigned int port;
- enum sdp_header_types c_hdr;
const struct sdp_media_type *t;
int ret = NF_ACCEPT;
- typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr;
- typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session;
- nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook);
- c_hdr = nf_ct_l3num(ct) == AF_INET ? SDP_HDR_CONNECTION_IP4 :
- SDP_HDR_CONNECTION_IP6;
+ hooks = rcu_dereference(nf_nat_sip_hooks);
/* Find beginning of session description */
if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
@@ -1005,7 +1013,7 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
* the end of the session description. */
caddr_len = 0;
if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen,
- c_hdr, SDP_HDR_MEDIA,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
&matchoff, &matchlen, &caddr) > 0)
caddr_len = matchlen;
@@ -1029,111 +1037,129 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
port = simple_strtoul(*dptr + mediaoff, NULL, 10);
if (port == 0)
continue;
- if (port < 1024 || port > 65535)
+ if (port < 1024 || port > 65535) {
+ nf_ct_helper_log(skb, ct, "wrong port %u", port);
return NF_DROP;
+ }
/* The media description overrides the session description. */
maddr_len = 0;
if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen,
- c_hdr, SDP_HDR_MEDIA,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
&matchoff, &matchlen, &maddr) > 0) {
maddr_len = matchlen;
memcpy(&rtp_addr, &maddr, sizeof(rtp_addr));
} else if (caddr_len)
memcpy(&rtp_addr, &caddr, sizeof(rtp_addr));
- else
+ else {
+ nf_ct_helper_log(skb, ct, "cannot parse SDP message");
return NF_DROP;
+ }
- ret = set_expected_rtp_rtcp(skb, dataoff, dptr, datalen,
+ ret = set_expected_rtp_rtcp(skb, protoff, dataoff,
+ dptr, datalen,
&rtp_addr, htons(port), t->class,
mediaoff, medialen);
- if (ret != NF_ACCEPT)
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct,
+ "cannot add expectation for voice");
return ret;
+ }
/* Update media connection address if present */
- if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) {
- ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen,
- mediaoff, c_hdr, SDP_HDR_MEDIA,
+ if (maddr_len && hooks && ct->status & IPS_NAT_MASK) {
+ ret = hooks->sdp_addr(skb, protoff, dataoff,
+ dptr, datalen, mediaoff,
+ SDP_HDR_CONNECTION,
+ SDP_HDR_MEDIA,
&rtp_addr);
- if (ret != NF_ACCEPT)
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SDP");
return ret;
+ }
}
i++;
}
/* Update session connection and owner addresses */
- nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook);
- if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK)
- ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff,
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK)
+ ret = hooks->sdp_session(skb, protoff, dataoff,
+ dptr, datalen, sdpoff,
&rtp_addr);
return ret;
}
-static int process_invite_response(struct sk_buff *skb, unsigned int dataoff,
+static int process_invite_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq, unsigned int code)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_help *help = nfct_help(ct);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
if ((code >= 100 && code <= 199) ||
(code >= 200 && code <= 299))
- return process_sdp(skb, dataoff, dptr, datalen, cseq);
- else if (help->help.ct_sip_info.invite_cseq == cseq)
+ return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ else if (ct_sip_info->invite_cseq == cseq)
flush_expectations(ct, true);
return NF_ACCEPT;
}
-static int process_update_response(struct sk_buff *skb, unsigned int dataoff,
+static int process_update_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq, unsigned int code)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_help *help = nfct_help(ct);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
if ((code >= 100 && code <= 199) ||
(code >= 200 && code <= 299))
- return process_sdp(skb, dataoff, dptr, datalen, cseq);
- else if (help->help.ct_sip_info.invite_cseq == cseq)
+ return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ else if (ct_sip_info->invite_cseq == cseq)
flush_expectations(ct, true);
return NF_ACCEPT;
}
-static int process_prack_response(struct sk_buff *skb, unsigned int dataoff,
+static int process_prack_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq, unsigned int code)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_help *help = nfct_help(ct);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
if ((code >= 100 && code <= 199) ||
(code >= 200 && code <= 299))
- return process_sdp(skb, dataoff, dptr, datalen, cseq);
- else if (help->help.ct_sip_info.invite_cseq == cseq)
+ return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ else if (ct_sip_info->invite_cseq == cseq)
flush_expectations(ct, true);
return NF_ACCEPT;
}
-static int process_invite_request(struct sk_buff *skb, unsigned int dataoff,
+static int process_invite_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_help *help = nfct_help(ct);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
unsigned int ret;
flush_expectations(ct, true);
- ret = process_sdp(skb, dataoff, dptr, datalen, cseq);
+ ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
if (ret == NF_ACCEPT)
- help->help.ct_sip_info.invite_cseq = cseq;
+ ct_sip_info->invite_cseq = cseq;
return ret;
}
-static int process_bye_request(struct sk_buff *skb, unsigned int dataoff,
+static int process_bye_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq)
{
@@ -1148,22 +1174,23 @@ static int process_bye_request(struct sk_buff *skb, unsigned int dataoff,
* signalling connections. The expectation is marked inactive and is activated
* when receiving a response indicating success from the registrar.
*/
-static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
+static int process_register_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_help *help = nfct_help(ct);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned int matchoff, matchlen;
struct nf_conntrack_expect *exp;
union nf_inet_addr *saddr, daddr;
+ const struct nf_nat_sip_hooks *hooks;
__be16 port;
u8 proto;
unsigned int expires = 0;
int ret;
- typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect;
/* Expected connections can not register again. */
if (ct->status & IPS_EXPECTED)
@@ -1184,9 +1211,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
SIP_HDR_CONTACT, NULL,
&matchoff, &matchlen, &daddr, &port);
- if (ret < 0)
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse contact");
return NF_DROP;
- else if (ret == 0)
+ } else if (ret == 0)
return NF_ACCEPT;
/* We don't support third-party registrations */
@@ -1199,8 +1227,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
if (ct_sip_parse_numerical_param(ct, *dptr,
matchoff + matchlen, *datalen,
- "expires=", NULL, NULL, &expires) < 0)
+ "expires=", NULL, NULL, &expires) < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse expires");
return NF_DROP;
+ }
if (expires == 0) {
ret = NF_ACCEPT;
@@ -1208,8 +1238,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
}
exp = nf_ct_expect_alloc(ct);
- if (!exp)
+ if (!exp) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
return NF_DROP;
+ }
saddr = NULL;
if (sip_direct_signalling)
@@ -1221,31 +1253,33 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
exp->helper = nfct_help(ct)->helper;
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
- nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook);
- if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK)
- ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp,
- matchoff, matchlen);
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK)
+ ret = hooks->expect(skb, protoff, dataoff, dptr, datalen,
+ exp, matchoff, matchlen);
else {
- if (nf_ct_expect_related(exp) != 0)
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
- else
+ } else
ret = NF_ACCEPT;
}
nf_ct_expect_put(exp);
store_cseq:
if (ret == NF_ACCEPT)
- help->help.ct_sip_info.register_cseq = cseq;
+ ct_sip_info->register_cseq = cseq;
return ret;
}
-static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
+static int process_register_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int cseq, unsigned int code)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_help *help = nfct_help(ct);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
union nf_inet_addr addr;
__be16 port;
@@ -1262,7 +1296,7 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
* responses, so we store the sequence number of the last valid
* request and compare it here.
*/
- if (help->help.ct_sip_info.register_cseq != cseq)
+ if (ct_sip_info->register_cseq != cseq)
return NF_ACCEPT;
if (code >= 100 && code <= 199)
@@ -1281,9 +1315,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
SIP_HDR_CONTACT, &in_contact,
&matchoff, &matchlen,
&addr, &port);
- if (ret < 0)
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse contact");
return NF_DROP;
- else if (ret == 0)
+ } else if (ret == 0)
break;
/* We don't support third-party registrations */
@@ -1298,8 +1333,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
matchoff + matchlen,
*datalen, "expires=",
NULL, NULL, &c_expires);
- if (ret < 0)
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse expires");
return NF_DROP;
+ }
if (c_expires == 0)
break;
if (refresh_signalling_expectation(ct, &addr, proto, port,
@@ -1321,7 +1358,8 @@ static const struct sip_handler sip_handlers[] = {
SIP_HANDLER("REGISTER", process_register_request, process_register_response),
};
-static int process_sip_response(struct sk_buff *skb, unsigned int dataoff,
+static int process_sip_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen)
{
enum ip_conntrack_info ctinfo;
@@ -1332,15 +1370,21 @@ static int process_sip_response(struct sk_buff *skb, unsigned int dataoff,
if (*datalen < strlen("SIP/2.0 200"))
return NF_ACCEPT;
code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10);
- if (!code)
+ if (!code) {
+ nf_ct_helper_log(skb, ct, "cannot get code");
return NF_DROP;
+ }
if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
- &matchoff, &matchlen) <= 0)
+ &matchoff, &matchlen) <= 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse cseq");
return NF_DROP;
+ }
cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
- if (!cseq)
+ if (!cseq) {
+ nf_ct_helper_log(skb, ct, "cannot get cseq");
return NF_DROP;
+ }
matchend = matchoff + matchlen + 1;
for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
@@ -1352,19 +1396,37 @@ static int process_sip_response(struct sk_buff *skb, unsigned int dataoff,
if (*datalen < matchend + handler->len ||
strnicmp(*dptr + matchend, handler->method, handler->len))
continue;
- return handler->response(skb, dataoff, dptr, datalen,
+ return handler->response(skb, protoff, dataoff, dptr, datalen,
cseq, code);
}
return NF_ACCEPT;
}
-static int process_sip_request(struct sk_buff *skb, unsigned int dataoff,
+static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
const char **dptr, unsigned int *datalen)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned int matchoff, matchlen;
unsigned int cseq, i;
+ union nf_inet_addr addr;
+ __be16 port;
+
+ /* Many Cisco IP phones use a high source port for SIP requests, but
+ * listen for the response on port 5060. If we are the local
+ * router for one of these phones, save the port number from the
+ * Via: header so that nf_nat_sip can redirect the responses to
+ * the correct port.
+ */
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ SIP_HDR_VIA_UDP, NULL, &matchoff,
+ &matchlen, &addr, &port) > 0 &&
+ port != ct->tuplehash[dir].tuple.src.u.udp.port &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3))
+ ct_sip_info->forced_dport = port;
for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
const struct sip_handler *handler;
@@ -1377,33 +1439,41 @@ static int process_sip_request(struct sk_buff *skb, unsigned int dataoff,
continue;
if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
- &matchoff, &matchlen) <= 0)
+ &matchoff, &matchlen) <= 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse cseq");
return NF_DROP;
+ }
cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
- if (!cseq)
+ if (!cseq) {
+ nf_ct_helper_log(skb, ct, "cannot get cseq");
return NF_DROP;
+ }
- return handler->request(skb, dataoff, dptr, datalen, cseq);
+ return handler->request(skb, protoff, dataoff, dptr, datalen,
+ cseq);
}
return NF_ACCEPT;
}
static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,
- unsigned int dataoff, const char **dptr,
- unsigned int *datalen)
+ unsigned int protoff, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
{
- typeof(nf_nat_sip_hook) nf_nat_sip;
+ const struct nf_nat_sip_hooks *hooks;
int ret;
if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0)
- ret = process_sip_request(skb, dataoff, dptr, datalen);
+ ret = process_sip_request(skb, protoff, dataoff, dptr, datalen);
else
- ret = process_sip_response(skb, dataoff, dptr, datalen);
+ ret = process_sip_response(skb, protoff, dataoff, dptr, datalen);
if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
- nf_nat_sip = rcu_dereference(nf_nat_sip_hook);
- if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen))
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && !hooks->msg(skb, protoff, dataoff,
+ dptr, datalen)) {
+ nf_ct_helper_log(skb, ct, "cannot NAT SIP message");
ret = NF_DROP;
+ }
}
return ret;
@@ -1419,10 +1489,10 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
const char *dptr, *end;
s16 diff, tdiff = 0;
int ret = NF_ACCEPT;
- typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
+ bool term;
if (ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY)
+ ctinfo != IP_CT_ESTABLISHED_REPLY)
return NF_ACCEPT;
/* No Data ? */
@@ -1453,16 +1523,25 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
if (dptr + matchoff == end)
break;
- if (end + strlen("\r\n\r\n") > dptr + datalen)
- break;
- if (end[0] != '\r' || end[1] != '\n' ||
- end[2] != '\r' || end[3] != '\n')
+ term = false;
+ for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) {
+ if (end[0] == '\r' && end[1] == '\n' &&
+ end[2] == '\r' && end[3] == '\n') {
+ term = true;
+ break;
+ }
+ }
+ if (!term)
break;
end += strlen("\r\n\r\n") + clen;
msglen = origlen = end - dptr;
+ if (msglen > datalen)
+ return NF_ACCEPT;
- ret = process_sip_msg(skb, ct, dataoff, &dptr, &msglen);
+ ret = process_sip_msg(skb, ct, protoff, dataoff,
+ &dptr, &msglen);
+ /* process_sip_* functions report why this packet is dropped */
if (ret != NF_ACCEPT)
break;
diff = msglen - origlen;
@@ -1474,9 +1553,11 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
}
if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
- nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook);
- if (nf_nat_sip_seq_adjust)
- nf_nat_sip_seq_adjust(skb, tdiff);
+ const struct nf_nat_sip_hooks *hooks;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks)
+ hooks->seq_adjust(skb, protoff, tdiff);
}
return ret;
@@ -1503,11 +1584,10 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
if (datalen < strlen("SIP/2.0 200"))
return NF_ACCEPT;
- return process_sip_msg(skb, ct, dataoff, &dptr, &datalen);
+ return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen);
}
static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
-static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly;
static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
[SIP_EXPECT_SIGNALLING] = {
@@ -1548,7 +1628,6 @@ static void nf_conntrack_sip_fini(void)
static int __init nf_conntrack_sip_init(void)
{
int i, j, ret;
- char *tmpname;
if (ports_c == 0)
ports[ports_c++] = SIP_PORT;
@@ -1571,17 +1650,16 @@ static int __init nf_conntrack_sip_init(void)
sip[i][3].help = sip_help_tcp;
for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+ sip[i][j].data_len = sizeof(struct nf_ct_sip_master);
sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
sip[i][j].expect_policy = sip_exp_policy;
sip[i][j].expect_class_max = SIP_EXPECT_MAX;
sip[i][j].me = THIS_MODULE;
- tmpname = &sip_names[i][j][0];
if (ports[i] == SIP_PORT)
- sprintf(tmpname, "sip");
+ sprintf(sip[i][j].name, "sip");
else
- sprintf(tmpname, "sip-%u", i);
- sip[i][j].name = tmpname;
+ sprintf(sip[i][j].name, "sip-%u", i);
pr_debug("port #%u: %u\n", i, ports[i]);
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 00000000000..87b95a2c270
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,78 @@
+/*
+ * SNMP service broadcast connection tracking helper
+ *
+ * (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+
+#define SNMP_PORT 161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ typeof(nf_nat_snmp_hook) nf_nat_snmp;
+
+ nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+ nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
+ if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
+ return nf_nat_snmp(skb, protoff, ct, ctinfo);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "snmp",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = snmp_conntrack_help,
+ .expect_policy = &exp_policy,
+};
+
+static int __init nf_conntrack_snmp_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_snmp_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0fb65705b44..f641751dba9 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,5 +1,6 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -29,10 +30,12 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <linux/rculist_nulls.h>
MODULE_LICENSE("GPL");
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
int
print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
@@ -45,6 +48,7 @@ EXPORT_SYMBOL_GPL(print_tuple);
struct ct_iter_state {
struct seq_net_private p;
unsigned int bucket;
+ u_int64_t time_now;
};
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
@@ -56,7 +60,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -69,13 +73,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(
+ &net->ct.hash[st->bucket]));
}
return head;
}
@@ -93,6 +99,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
+ struct ct_iter_state *st = seq->private;
+
+ st->time_now = ktime_to_ns(ktime_get_real());
rcu_read_lock();
return ct_get_idx(seq, *pos);
}
@@ -118,7 +127,7 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
if (ret)
- return ret;
+ return 0;
ret = seq_printf(s, "secctx=%s ", secctx);
@@ -132,6 +141,34 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ struct ct_iter_state *st = s->private;
+ struct nf_conn_tstamp *tstamp;
+ s64 delta_time;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ delta_time = st->time_now - tstamp->start;
+ if (delta_time > 0)
+ delta_time = div_s64(delta_time, NSEC_PER_SEC);
+ else
+ delta_time = 0;
+
+ return seq_printf(s, "delta-time=%llu ",
+ (unsigned long long)delta_time);
+ }
+ return 0;
+}
+#else
+static inline int
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
@@ -200,13 +237,16 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
+ if (ct_show_delta_time(s, ct))
+ goto release;
+
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
goto release;
ret = 0;
release:
nf_ct_put(ct);
- return 0;
+ return ret;
}
static const struct seq_operations ct_seq_ops = {
@@ -327,7 +367,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops);
+ pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
if (!pde)
goto out_nf_conntrack;
@@ -338,7 +378,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
return 0;
out_stat_nf_conntrack:
- proc_net_remove(net, "nf_conntrack");
+ remove_proc_entry("nf_conntrack", net->proc_net);
out_nf_conntrack:
return -ENOMEM;
}
@@ -346,7 +386,7 @@ out_nf_conntrack:
static void nf_conntrack_standalone_fini_proc(struct net *net)
{
remove_proc_entry("nf_conntrack", net->proc_net_stat);
- proc_net_remove(net, "nf_conntrack");
+ remove_proc_entry("nf_conntrack", net->proc_net);
}
#else
static int nf_conntrack_standalone_init_proc(struct net *net)
@@ -357,7 +397,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
static void nf_conntrack_standalone_fini_proc(struct net *net)
{
}
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
/* Sysctl support */
@@ -368,7 +408,7 @@ static int log_invalid_proto_max = 255;
static struct ctl_table_header *nf_ct_netfilter_header;
-static ctl_table nf_ct_sysctl_table[] = {
+static struct ctl_table nf_ct_sysctl_table[] = {
{
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
@@ -418,7 +458,7 @@ static ctl_table nf_ct_sysctl_table[] = {
#define NET_NF_CONNTRACK_MAX 2089
-static ctl_table nf_ct_netfilter_table[] = {
+static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
@@ -429,22 +469,10 @@ static ctl_table nf_ct_netfilter_table[] = {
{ }
};
-static struct ctl_path nf_ct_path[] = {
- { .procname = "net", },
- { }
-};
-
static int nf_conntrack_standalone_init_sysctl(struct net *net)
{
struct ctl_table *table;
- if (net_eq(net, &init_net)) {
- nf_ct_netfilter_header =
- register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table);
- if (!nf_ct_netfilter_header)
- goto out;
- }
-
table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),
GFP_KERNEL);
if (!table)
@@ -455,8 +483,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[3].data = &net->ct.sysctl_checksum;
table[4].data = &net->ct.sysctl_log_invalid;
- net->ct.sysctl_header = register_net_sysctl_table(net,
- nf_net_netfilter_sysctl_path, table);
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
if (!net->ct.sysctl_header)
goto out_unregister_netfilter;
@@ -465,10 +496,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
out_unregister_netfilter:
kfree(table);
out_kmemdup:
- if (net_eq(net, &init_net))
- unregister_sysctl_table(nf_ct_netfilter_header);
-out:
- printk(KERN_ERR "nf_conntrack: can't register to sysctl.\n");
return -ENOMEM;
}
@@ -476,8 +503,6 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
struct ctl_table *table;
- if (net_eq(net, &init_net))
- unregister_sysctl_table(nf_ct_netfilter_header);
table = net->ct.sysctl_header->ctl_table_arg;
unregister_net_sysctl_table(net->ct.sysctl_header);
kfree(table);
@@ -493,51 +518,91 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)
}
#endif /* CONFIG_SYSCTL */
-static int nf_conntrack_net_init(struct net *net)
+static int nf_conntrack_pernet_init(struct net *net)
{
int ret;
- ret = nf_conntrack_init(net);
+ ret = nf_conntrack_init_net(net);
if (ret < 0)
goto out_init;
+
ret = nf_conntrack_standalone_init_proc(net);
if (ret < 0)
goto out_proc;
+
net->ct.sysctl_checksum = 1;
net->ct.sysctl_log_invalid = 0;
ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
goto out_sysctl;
+
return 0;
out_sysctl:
nf_conntrack_standalone_fini_proc(net);
out_proc:
- nf_conntrack_cleanup(net);
+ nf_conntrack_cleanup_net(net);
out_init:
return ret;
}
-static void nf_conntrack_net_exit(struct net *net)
+static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
{
- nf_conntrack_standalone_fini_sysctl(net);
- nf_conntrack_standalone_fini_proc(net);
- nf_conntrack_cleanup(net);
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_conntrack_standalone_fini_sysctl(net);
+ nf_conntrack_standalone_fini_proc(net);
+ }
+ nf_conntrack_cleanup_net_list(net_exit_list);
}
static struct pernet_operations nf_conntrack_net_ops = {
- .init = nf_conntrack_net_init,
- .exit = nf_conntrack_net_exit,
+ .init = nf_conntrack_pernet_init,
+ .exit_batch = nf_conntrack_pernet_exit,
};
static int __init nf_conntrack_standalone_init(void)
{
- return register_pernet_subsys(&nf_conntrack_net_ops);
+ int ret = nf_conntrack_init_start();
+ if (ret < 0)
+ goto out_start;
+
+#ifdef CONFIG_SYSCTL
+ nf_ct_netfilter_header =
+ register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
+ if (!nf_ct_netfilter_header) {
+ pr_err("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto out_sysctl;
+ }
+#endif
+
+ ret = register_pernet_subsys(&nf_conntrack_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ nf_conntrack_init_end();
+ return 0;
+
+out_pernet:
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(nf_ct_netfilter_header);
+out_sysctl:
+#endif
+ nf_conntrack_cleanup_end();
+out_start:
+ return ret;
}
static void __exit nf_conntrack_standalone_fini(void)
{
+ nf_conntrack_cleanup_start();
unregister_pernet_subsys(&nf_conntrack_net_ops);
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(nf_ct_netfilter_header);
+#endif
+ nf_conntrack_cleanup_end();
}
module_init(nf_conntrack_standalone_init);
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 75466fd72f4..e68ab4fbd71 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -1,5 +1,5 @@
/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
@@ -60,8 +60,10 @@ static int tftp_help(struct sk_buff *skb,
nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
exp = nf_ct_expect_alloc(ct);
- if (exp == NULL)
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
return NF_DROP;
+ }
tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
nf_ct_l3num(ct),
@@ -74,8 +76,10 @@ static int tftp_help(struct sk_buff *skb,
nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
ret = nf_nat_tftp(skb, ctinfo, exp);
- else if (nf_ct_expect_related(exp) != 0)
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
+ }
nf_ct_expect_put(exp);
break;
case TFTP_OPCODE_DATA:
@@ -92,7 +96,6 @@ static int tftp_help(struct sk_buff *skb,
}
static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly;
-static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly;
static const struct nf_conntrack_expect_policy tftp_exp_policy = {
.max_expected = 1,
@@ -112,7 +115,6 @@ static void nf_conntrack_tftp_fini(void)
static int __init nf_conntrack_tftp_init(void)
{
int i, j, ret;
- char *tmpname;
if (ports_c == 0)
ports[ports_c++] = TFTP_PORT;
@@ -129,12 +131,10 @@ static int __init nf_conntrack_tftp_init(void)
tftp[i][j].me = THIS_MODULE;
tftp[i][j].help = tftp_help;
- tmpname = &tftp_names[i][j][0];
if (ports[i] == TFTP_PORT)
- sprintf(tmpname, "tftp");
+ sprintf(tftp[i][j].name, "tftp");
else
- sprintf(tmpname, "tftp-%u", i);
- tftp[i][j].name = tmpname;
+ sprintf(tftp[i][j].name, "tftp-%u", i);
ret = nf_conntrack_helper_register(&tftp[i][j]);
if (ret) {
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
new file mode 100644
index 00000000000..93da609d9d2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -0,0 +1,51 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+struct ctnl_timeout *
+(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
+
+void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+
+static struct nf_ct_ext_type timeout_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_timeout),
+ .align = __alignof__(struct nf_conn_timeout),
+ .id = NF_CT_EXT_TIMEOUT,
+};
+
+int nf_conntrack_timeout_init(void)
+{
+ int ret = nf_ct_extend_register(&timeout_extend);
+ if (ret < 0)
+ pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
+ return ret;
+}
+
+void nf_conntrack_timeout_fini(void)
+{
+ nf_ct_extend_unregister(&timeout_extend);
+}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 00000000000..7a394df0deb
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,114 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+static bool nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tstamp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_timestamp",
+ .data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_tstamp),
+ .align = __alignof__(struct nf_conn_tstamp),
+ .id = NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_tstamp;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter",
+ table);
+ if (!net->ct.tstamp_sysctl_header) {
+ printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_tstamp_pernet_init(struct net *net)
+{
+ net->ct.sysctl_tstamp = nf_ct_tstamp;
+ return nf_conntrack_tstamp_init_sysctl(net);
+}
+
+void nf_conntrack_tstamp_pernet_fini(struct net *net)
+{
+ nf_conntrack_tstamp_fini_sysctl(net);
+}
+
+int nf_conntrack_tstamp_init(void)
+{
+ int ret;
+ ret = nf_ct_extend_register(&tstamp_extend);
+ if (ret < 0)
+ pr_err("nf_ct_tstamp: Unable to register extension\n");
+ return ret;
+}
+
+void nf_conntrack_tstamp_fini(void)
+{
+ nf_ct_extend_unregister(&tstamp_extend);
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 770f76432ad..61a3c927e63 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -13,26 +13,20 @@
/* core.c */
-extern unsigned int nf_iterate(struct list_head *head,
- struct sk_buff *skb,
- unsigned int hook,
- const struct net_device *indev,
- const struct net_device *outdev,
- struct list_head **i,
- int (*okfn)(struct sk_buff *),
- int hook_thresh);
+unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
+ unsigned int hook, const struct net_device *indev,
+ const struct net_device *outdev,
+ struct nf_hook_ops **elemp,
+ int (*okfn)(struct sk_buff *), int hook_thresh);
/* nf_queue.c */
-extern int nf_queue(struct sk_buff *skb,
- struct list_head *elem,
- u_int8_t pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- unsigned int queuenum);
-extern int __init netfilter_queue_init(void);
+int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf,
+ unsigned int hook, struct net_device *indev,
+ struct net_device *outdev, int (*okfn)(struct sk_buff *),
+ unsigned int queuenum);
+int __init netfilter_queue_init(void);
/* nf_log.c */
-extern int __init netfilter_log_init(void);
+int __init netfilter_log_init(void);
#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index b07393eab88..85296d4eac0 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,6 @@
#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
-static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
@@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
return NULL;
}
-/* return EEXIST if the same logger is registred, 0 on success. */
+void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+{
+ const struct nf_logger *log;
+
+ if (pf == NFPROTO_UNSPEC)
+ return;
+
+ mutex_lock(&nf_log_mutex);
+ log = rcu_dereference_protected(net->nf.nf_loggers[pf],
+ lockdep_is_held(&nf_log_mutex));
+ if (log == NULL)
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_set);
+
+void nf_log_unset(struct net *net, const struct nf_logger *logger)
+{
+ int i;
+ const struct nf_logger *log;
+
+ mutex_lock(&nf_log_mutex);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ log = rcu_dereference_protected(net->nf.nf_loggers[i],
+ lockdep_is_held(&nf_log_mutex));
+ if (log == logger)
+ RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL);
+ }
+ mutex_unlock(&nf_log_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unset);
+
+/* return EEXIST if the same logger is registered, 0 on success. */
int nf_log_register(u_int8_t pf, struct nf_logger *logger)
{
- const struct nf_logger *llog;
int i;
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(logger->list); i++)
@@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
} else {
/* register at end of list to honor first register win */
list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
- llog = rcu_dereference_protected(nf_loggers[pf],
- lockdep_is_held(&nf_log_mutex));
- if (llog == NULL)
- rcu_assign_pointer(nf_loggers[pf], logger);
}
mutex_unlock(&nf_log_mutex);
@@ -66,45 +94,43 @@ EXPORT_SYMBOL(nf_log_register);
void nf_log_unregister(struct nf_logger *logger)
{
- const struct nf_logger *c_logger;
int i;
mutex_lock(&nf_log_mutex);
- for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
- c_logger = rcu_dereference_protected(nf_loggers[i],
- lockdep_is_held(&nf_log_mutex));
- if (c_logger == logger)
- rcu_assign_pointer(nf_loggers[i], NULL);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
list_del(&logger->list[i]);
- }
mutex_unlock(&nf_log_mutex);
-
- synchronize_rcu();
}
EXPORT_SYMBOL(nf_log_unregister);
-int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+ const struct nf_logger *logger)
{
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return -EINVAL;
mutex_lock(&nf_log_mutex);
if (__find_logger(pf, logger->name) == NULL) {
mutex_unlock(&nf_log_mutex);
return -ENOENT;
}
- rcu_assign_pointer(nf_loggers[pf], logger);
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
mutex_unlock(&nf_log_mutex);
return 0;
}
EXPORT_SYMBOL(nf_log_bind_pf);
-void nf_log_unbind_pf(u_int8_t pf)
+void nf_log_unbind_pf(struct net *net, u_int8_t pf)
{
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return;
mutex_lock(&nf_log_mutex);
- rcu_assign_pointer(nf_loggers[pf], NULL);
+ RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
mutex_unlock(&nf_log_mutex);
}
EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_log_packet(u_int8_t pf,
+void nf_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -117,12 +143,12 @@ void nf_log_packet(u_int8_t pf,
const struct nf_logger *logger;
rcu_read_lock();
- logger = rcu_dereference(nf_loggers[pf]);
+ logger = rcu_dereference(net->nf.nf_loggers[pf]);
if (logger) {
va_start(args, fmt);
vsnprintf(prefix, sizeof(prefix), fmt, args);
va_end(args);
- logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
+ logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
}
rcu_read_unlock();
}
@@ -131,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet);
#ifdef CONFIG_PROC_FS
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
+
mutex_lock(&nf_log_mutex);
- if (*pos >= ARRAY_SIZE(nf_loggers))
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -141,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- if (*pos >= ARRAY_SIZE(nf_loggers))
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -160,8 +190,10 @@ static int seq_show(struct seq_file *s, void *v)
const struct nf_logger *logger;
struct nf_logger *t;
int ret;
+ struct net *net = seq_file_net(s);
- logger = nf_loggers[*pos];
+ logger = rcu_dereference_protected(net->nf.nf_loggers[*pos],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
ret = seq_printf(s, "%2lld NONE (", *pos);
@@ -194,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = {
static int nflog_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &nflog_seq_ops);
+ return seq_open_net(inode, file, &nflog_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations nflog_file_ops = {
@@ -202,25 +235,17 @@ static const struct file_operations nflog_file_ops = {
.open = nflog_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
#ifdef CONFIG_SYSCTL
-static struct ctl_path nf_log_sysctl_path[] = {
- { .procname = "net", },
- { .procname = "netfilter", },
- { .procname = "nf_log", },
- { }
-};
-
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
-static struct ctl_table_header *nf_log_dir_header;
-static int nf_log_proc_dostring(ctl_table *table, int write,
+static int nf_log_proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
const struct nf_logger *logger;
@@ -228,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
size_t size = *lenp;
int r = 0;
int tindex = (unsigned long)table->extra1;
+ struct net *net = current->nsproxy->net_ns;
if (write) {
if (size > sizeof(buf))
@@ -236,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
return -EFAULT;
if (!strcmp(buf, "NONE")) {
- nf_log_unbind_pf(tindex);
+ nf_log_unbind_pf(net, tindex);
return 0;
}
mutex_lock(&nf_log_mutex);
@@ -245,11 +271,12 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
mutex_unlock(&nf_log_mutex);
return -ENOENT;
}
- rcu_assign_pointer(nf_loggers[tindex], logger);
+ rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
mutex_unlock(&nf_log_mutex);
} else {
mutex_lock(&nf_log_mutex);
- logger = nf_loggers[tindex];
+ logger = rcu_dereference_protected(net->nf.nf_loggers[tindex],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
table->data = "NONE";
else
@@ -261,49 +288,112 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
return r;
}
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
{
int i;
-
- for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
- snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i);
- nf_log_sysctl_table[i].procname =
- nf_log_sysctl_fnames[i-NFPROTO_UNSPEC];
- nf_log_sysctl_table[i].data = NULL;
- nf_log_sysctl_table[i].maxlen =
- NFLOGGER_NAME_LEN * sizeof(char);
- nf_log_sysctl_table[i].mode = 0644;
- nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring;
- nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i;
+ struct ctl_table *table;
+
+ table = nf_log_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_log_sysctl_table,
+ sizeof(nf_log_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ } else {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+ snprintf(nf_log_sysctl_fnames[i],
+ 3, "%d", i);
+ nf_log_sysctl_table[i].procname =
+ nf_log_sysctl_fnames[i];
+ nf_log_sysctl_table[i].data = NULL;
+ nf_log_sysctl_table[i].maxlen =
+ NFLOGGER_NAME_LEN * sizeof(char);
+ nf_log_sysctl_table[i].mode = 0644;
+ nf_log_sysctl_table[i].proc_handler =
+ nf_log_proc_dostring;
+ nf_log_sysctl_table[i].extra1 =
+ (void *)(unsigned long) i;
+ }
}
- nf_log_dir_header = register_sysctl_paths(nf_log_sysctl_path,
- nf_log_sysctl_table);
- if (!nf_log_dir_header)
- return -ENOMEM;
+ net->nf.nf_log_dir_header = register_net_sysctl(net,
+ "net/netfilter/nf_log",
+ table);
+ if (!net->nf.nf_log_dir_header)
+ goto err_reg;
return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->nf.nf_log_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_log_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
}
#else
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
{
return 0;
}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+}
#endif /* CONFIG_SYSCTL */
-int __init netfilter_log_init(void)
+static int __net_init nf_log_net_init(struct net *net)
{
- int i, r;
+ int ret = -ENOMEM;
+
#ifdef CONFIG_PROC_FS
if (!proc_create("nf_log", S_IRUGO,
- proc_net_netfilter, &nflog_file_ops))
- return -1;
+ net->nf.proc_netfilter, &nflog_file_ops))
+ return ret;
#endif
+ ret = netfilter_log_sysctl_init(net);
+ if (ret < 0)
+ goto out_sysctl;
- /* Errors will trigger panic, unroll on error is unnecessary. */
- r = netfilter_log_sysctl_init();
- if (r < 0)
- return r;
+ return 0;
+
+out_sysctl:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+#endif
+ return ret;
+}
+
+static void __net_exit nf_log_net_exit(struct net *net)
+{
+ netfilter_log_sysctl_exit(net);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+#endif
+}
+
+static struct pernet_operations nf_log_net_ops = {
+ .init = nf_log_net_init,
+ .exit = nf_log_net_exit,
+};
+
+int __init netfilter_log_init(void)
+{
+ int i, ret;
+
+ ret = register_pernet_subsys(&nf_log_net_ops);
+ if (ret < 0)
+ return ret;
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
INIT_LIST_HEAD(&(nf_loggers_l[i]));
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
new file mode 100644
index 00000000000..eb772380a20
--- /dev/null
+++ b/net/netfilter/nf_nat_amanda.c
@@ -0,0 +1,90 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_amanda");
+
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+{
+ char buffer[sizeof("65535")];
+ u_int16_t port;
+ unsigned int ret;
+
+ /* Connection comes from client. */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_ORIGINAL;
+
+ /* When you see the packet, we need to NAT it the same as the
+ * this one (ie. same IP: it will be TCP and master is UDP). */
+ exp->expectfn = nf_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int res;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ res = nf_ct_expect_related(exp);
+ if (res == 0)
+ break;
+ else if (res != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, exp->master, "all ports in use");
+ return NF_DROP;
+ }
+
+ sprintf(buffer, "%u", port);
+ ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, strlen(buffer));
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, exp->master, "cannot mangle packet");
+ nf_ct_unexpect_related(exp);
+ }
+ return ret;
+}
+
+static void __exit nf_nat_amanda_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_amanda_init(void)
+{
+ BUG_ON(nf_nat_amanda_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_amanda_hook, help);
+ return 0;
+}
+
+module_init(nf_nat_amanda_init);
+module_exit(nf_nat_amanda_fini);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
new file mode 100644
index 00000000000..a49907b1dab
--- /dev/null
+++ b/net/netfilter/nf_nat_core.c
@@ -0,0 +1,898 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <net/xfrm.h>
+#include <linux/jhash.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_nat.h>
+
+static DEFINE_SPINLOCK(nf_nat_lock);
+
+static DEFINE_MUTEX(nf_nat_proto_mutex);
+static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
+ __read_mostly;
+static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
+ __read_mostly;
+
+
+inline const struct nf_nat_l3proto *
+__nf_nat_l3proto_find(u8 family)
+{
+ return rcu_dereference(nf_nat_l3protos[family]);
+}
+
+inline const struct nf_nat_l4proto *
+__nf_nat_l4proto_find(u8 family, u8 protonum)
+{
+ return rcu_dereference(nf_nat_l4protos[family][protonum]);
+}
+EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);
+
+#ifdef CONFIG_XFRM
+static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ unsigned long statusbit;
+ u8 family;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return;
+
+ family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ rcu_read_lock();
+ l3proto = __nf_nat_l3proto_find(family);
+ if (l3proto == NULL)
+ goto out;
+
+ dir = CTINFO2DIR(ctinfo);
+ if (dir == IP_CT_DIR_ORIGINAL)
+ statusbit = IPS_DST_NAT;
+ else
+ statusbit = IPS_SRC_NAT;
+
+ l3proto->decode_session(skb, ct, dir, statusbit, fl);
+out:
+ rcu_read_unlock();
+}
+
+int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
+{
+ struct flowi fl;
+ unsigned int hh_len;
+ struct dst_entry *dst;
+ int err;
+
+ err = xfrm_decode_session(skb, &fl, family);
+ if (err < 0)
+ return err;
+
+ dst = skb_dst(skb);
+ if (dst->xfrm)
+ dst = ((struct xfrm_dst *)dst)->route;
+ dst_hold(dst);
+
+ dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(nf_xfrm_me_harder);
+#endif /* CONFIG_XFRM */
+
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ unsigned int hash;
+
+ /* Original src, to ensure we map it consistently if poss. */
+ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+ tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd);
+ return ((u64)hash * net->ct.nat_htable_size) >> 32;
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ /* Conntrack tracking doesn't keep track of outgoing tuples; only
+ * incoming ones. NAT means they don't have a fixed mapping,
+ * so we invert the tuple and look for the incoming reply.
+ *
+ * We could keep a separate hash if this proves too slow.
+ */
+ struct nf_conntrack_tuple reply;
+
+ nf_ct_invert_tuplepr(&reply, tuple);
+ return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+EXPORT_SYMBOL(nf_nat_used_tuple);
+
+/* If we source map this tuple so reply looks like reply_tuple, will
+ * that meet the constraints of range.
+ */
+static int in_range(const struct nf_nat_l3proto *l3proto,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range)
+{
+ /* If we are supposed to map IPs, then we must be in the
+ * range specified, otherwise let this drag us onto a new src IP.
+ */
+ if (range->flags & NF_NAT_RANGE_MAP_IPS &&
+ !l3proto->in_range(tuple, range))
+ return 0;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
+ l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
+ &range->min_proto, &range->max_proto))
+ return 1;
+
+ return 0;
+}
+
+static inline int
+same_src(const struct nf_conn *ct,
+ const struct nf_conntrack_tuple *tuple)
+{
+ const struct nf_conntrack_tuple *t;
+
+ t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ return (t->dst.protonum == tuple->dst.protonum &&
+ nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
+ t->src.u.all == tuple->src.u.all);
+}
+
+/* Only called for SRC manip */
+static int
+find_appropriate_src(struct net *net, u16 zone,
+ const struct nf_nat_l3proto *l3proto,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *result,
+ const struct nf_nat_range *range)
+{
+ unsigned int h = hash_by_src(net, zone, tuple);
+ const struct nf_conn_nat *nat;
+ const struct nf_conn *ct;
+
+ hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
+ ct = nat->ct;
+ if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
+ /* Copy source part from reply tuple. */
+ nf_ct_invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
+
+ if (in_range(l3proto, l4proto, result, range))
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+ * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
+ * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+ * 1-65535, we don't do pro-rata allocation based on ports; we choose
+ * the ip with the lowest src-ip/dst-ip/proto usage.
+ */
+static void
+find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ const struct nf_conn *ct,
+ enum nf_nat_manip_type maniptype)
+{
+ union nf_inet_addr *var_ipp;
+ unsigned int i, max;
+ /* Host order */
+ u32 minip, maxip, j, dist;
+ bool full_range;
+
+ /* No IP mapping? Do nothing. */
+ if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
+ return;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ var_ipp = &tuple->src.u3;
+ else
+ var_ipp = &tuple->dst.u3;
+
+ /* Fast path: only one choice. */
+ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
+ *var_ipp = range->min_addr;
+ return;
+ }
+
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
+ else
+ max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;
+
+ /* Hashing source and destination IPs gives a fairly even
+ * spread in practice (if there are a small number of IPs
+ * involved, there usually aren't that many connections
+ * anyway). The consistency means that servers see the same
+ * client coming from the same IP (some Internet Banking sites
+ * like this), even across reboots.
+ */
+ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
+ range->flags & NF_NAT_RANGE_PERSISTENT ?
+ 0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
+
+ full_range = false;
+ for (i = 0; i <= max; i++) {
+ /* If first bytes of the address are at the maximum, use the
+ * distance. Otherwise use the full range.
+ */
+ if (!full_range) {
+ minip = ntohl((__force __be32)range->min_addr.all[i]);
+ maxip = ntohl((__force __be32)range->max_addr.all[i]);
+ dist = maxip - minip + 1;
+ } else {
+ minip = 0;
+ dist = ~0;
+ }
+
+ var_ipp->all[i] = (__force __u32)
+ htonl(minip + (((u64)j * dist) >> 32));
+ if (var_ipp->all[i] != range->max_addr.all[i])
+ full_range = true;
+
+ if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
+ j ^= (__force u32)tuple->dst.u3.all[i];
+ }
+}
+
+/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
+ * we change the source to map into the range. For NF_INET_PRE_ROUTING
+ * and NF_INET_LOCAL_OUT, we change the destination to map into the
+ * range. It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet. */
+static void
+get_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig_tuple,
+ const struct nf_nat_range *range,
+ struct nf_conn *ct,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_nat_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
+ u16 zone = nf_ct_zone(ct);
+
+ rcu_read_lock();
+ l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
+ l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
+ orig_tuple->dst.protonum);
+
+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
+ * and that same mapping gives a unique tuple within the given
+ * range, use that.
+ *
+ * This is only required for source (ie. NAT/masq) mappings.
+ * So far, we don't do local source mappings, so multiple
+ * manips not an issue.
+ */
+ if (maniptype == NF_NAT_MANIP_SRC &&
+ !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ /* try the original tuple first */
+ if (in_range(l3proto, l4proto, orig_tuple, range)) {
+ if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ *tuple = *orig_tuple;
+ goto out;
+ }
+ } else if (find_appropriate_src(net, zone, l3proto, l4proto,
+ orig_tuple, tuple, range)) {
+ pr_debug("get_unique_tuple: Found current src map\n");
+ if (!nf_nat_used_tuple(tuple, ct))
+ goto out;
+ }
+ }
+
+ /* 2) Select the least-used IP/proto combination in the given range */
+ *tuple = *orig_tuple;
+ find_best_ips_proto(zone, tuple, range, ct, maniptype);
+
+ /* 3) The per-protocol part of the manip is made to map into
+ * the range to make a unique tuple.
+ */
+
+ /* Only bother mapping if it's not already in range and unique */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ if (l4proto->in_range(tuple, maniptype,
+ &range->min_proto,
+ &range->max_proto) &&
+ (range->min_proto.all == range->max_proto.all ||
+ !nf_nat_used_tuple(tuple, ct)))
+ goto out;
+ } else if (!nf_nat_used_tuple(tuple, ct)) {
+ goto out;
+ }
+ }
+
+ /* Last change: get protocol to try to obtain unique tuple. */
+ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
+out:
+ rcu_read_unlock();
+}
+
+struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
+{
+ struct nf_conn_nat *nat = nfct_nat(ct);
+ if (nat)
+ return nat;
+
+ if (!nf_ct_is_confirmed(ct))
+ nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+
+ return nat;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
+
+unsigned int
+nf_nat_setup_info(struct nf_conn *ct,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_tuple curr_tuple, new_tuple;
+ struct nf_conn_nat *nat;
+
+ /* nat helper or nfctnetlink also setup binding */
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
+ maniptype == NF_NAT_MANIP_DST);
+ BUG_ON(nf_nat_initialized(ct, maniptype));
+
+ /* What we've got will look like inverse of reply. Normally
+ * this is what is in the conntrack, except for prior
+ * manipulations (future optimization: if num_manips == 0,
+ * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+ */
+ nf_ct_invert_tuplepr(&curr_tuple,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+
+ if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+ struct nf_conntrack_tuple reply;
+
+ /* Alter conntrack table so will recognize replies. */
+ nf_ct_invert_tuplepr(&reply, &new_tuple);
+ nf_conntrack_alter_reply(ct, &reply);
+
+ /* Non-atomic: we own this at the moment. */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ ct->status |= IPS_SRC_NAT;
+ else
+ ct->status |= IPS_DST_NAT;
+
+ if (nfct_help(ct))
+ nfct_seqadj_ext_add(ct);
+ }
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ unsigned int srchash;
+
+ srchash = hash_by_src(net, nf_ct_zone(ct),
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_lock);
+ /* nf_conntrack_alter_reply might re-allocate extension aera */
+ nat = nfct_nat(ct);
+ nat->ct = ct;
+ hlist_add_head_rcu(&nat->bysource,
+ &net->ct.nat_bysource[srchash]);
+ spin_unlock_bh(&nf_nat_lock);
+ }
+
+ /* It's done. */
+ if (maniptype == NF_NAT_MANIP_DST)
+ ct->status |= IPS_DST_NAT_DONE;
+ else
+ ct->status |= IPS_SRC_NAT_DONE;
+
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL(nf_nat_setup_info);
+
+static unsigned int
+__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
+{
+ /* Force range to this IP; let proto decide mapping for
+ * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+ * Use reply in case it's already been mangled (eg local packet).
+ */
+ union nf_inet_addr ip =
+ (manip == NF_NAT_MANIP_SRC ?
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
+ struct nf_nat_range range = {
+ .flags = NF_NAT_RANGE_MAP_IPS,
+ .min_addr = ip,
+ .max_addr = ip,
+ };
+ return nf_nat_setup_info(ct, &range, manip);
+}
+
+unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+ return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
+}
+EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+
+/* Do packet manipulations according to nf_nat_setup_info. */
+unsigned int nf_nat_packet(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_nat_l4proto *l4proto;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned long statusbit;
+ enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+ if (mtype == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply dir. */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ /* Non-atomic: these bits don't change. */
+ if (ct->status & statusbit) {
+ struct nf_conntrack_tuple target;
+
+ /* We are aiming to look like inverse of other direction. */
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+ l3proto = __nf_nat_l3proto_find(target.src.l3num);
+ l4proto = __nf_nat_l4proto_find(target.src.l3num,
+ target.dst.protonum);
+ if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+struct nf_nat_proto_clean {
+ u8 l3proto;
+ u8 l4proto;
+};
+
+/* kill conntracks with affected NAT section */
+static int nf_nat_proto_remove(struct nf_conn *i, void *data)
+{
+ const struct nf_nat_proto_clean *clean = data;
+ struct nf_conn_nat *nat = nfct_nat(i);
+
+ if (!nat)
+ return 0;
+
+ if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
+ (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
+ return 0;
+
+ return i->status & IPS_NAT_MASK ? 1 : 0;
+}
+
+static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
+{
+ struct nf_conn_nat *nat = nfct_nat(ct);
+
+ if (nf_nat_proto_remove(ct, data))
+ return 1;
+
+ if (!nat || !nat->ct)
+ return 0;
+
+ /* This netns is being destroyed, and conntrack has nat null binding.
+ * Remove it from bysource hash, as the table will be freed soon.
+ *
+ * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
+ * will delete entry from already-freed table.
+ */
+ if (!del_timer(&ct->timeout))
+ return 1;
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&nat->bysource);
+ ct->status &= ~IPS_NAT_DONE_MASK;
+ nat->ct = NULL;
+ spin_unlock_bh(&nf_nat_lock);
+
+ add_timer(&ct->timeout);
+
+ /* don't delete conntrack. Although that would make things a lot
+ * simpler, we'd end up flushing all conntracks on nat rmmod.
+ */
+ return 0;
+}
+
+static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
+{
+ struct nf_nat_proto_clean clean = {
+ .l3proto = l3proto,
+ .l4proto = l4proto,
+ };
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
+ rtnl_unlock();
+}
+
+static void nf_nat_l3proto_clean(u8 l3proto)
+{
+ struct nf_nat_proto_clean clean = {
+ .l3proto = l3proto,
+ };
+ struct net *net;
+
+ rtnl_lock();
+
+ for_each_net(net)
+ nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
+ rtnl_unlock();
+}
+
+/* Protocol registration. */
+int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+ const struct nf_nat_l4proto **l4protos;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ if (nf_nat_l4protos[l3proto] == NULL) {
+ l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
+ GFP_KERNEL);
+ if (l4protos == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < IPPROTO_MAX; i++)
+ RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);
+
+ /* Before making proto_array visible to lockless readers,
+ * we must make sure its content is committed to memory.
+ */
+ smp_wmb();
+
+ nf_nat_l4protos[l3proto] = l4protos;
+ }
+
+ if (rcu_dereference_protected(
+ nf_nat_l4protos[l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_nat_proto_mutex)
+ ) != &nf_nat_l4proto_unknown) {
+ ret = -EBUSY;
+ goto out;
+ }
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
+ out:
+ mutex_unlock(&nf_nat_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
+
+/* No one stores the protocol anywhere; simply delete it. */
+void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
+ &nf_nat_l4proto_unknown);
+ mutex_unlock(&nf_nat_proto_mutex);
+ synchronize_rcu();
+
+ nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
+
+int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
+{
+ int err;
+
+ err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
+ if (err < 0)
+ return err;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
+ &nf_nat_l4proto_tcp);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
+ &nf_nat_l4proto_udp);
+ mutex_unlock(&nf_nat_proto_mutex);
+
+ RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);
+
+void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
+{
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
+ mutex_unlock(&nf_nat_proto_mutex);
+ synchronize_rcu();
+
+ nf_nat_l3proto_clean(l3proto->l3proto);
+ nf_ct_l3proto_module_put(l3proto->l3proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
+
+/* No one using conntrack by the time this called. */
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
+{
+ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
+
+ if (nat == NULL || nat->ct == NULL)
+ return;
+
+ NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&nat->bysource);
+ spin_unlock_bh(&nf_nat_lock);
+}
+
+static void nf_nat_move_storage(void *new, void *old)
+{
+ struct nf_conn_nat *new_nat = new;
+ struct nf_conn_nat *old_nat = old;
+ struct nf_conn *ct = old_nat->ct;
+
+ if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
+ return;
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
+ spin_unlock_bh(&nf_nat_lock);
+}
+
+static struct nf_ct_ext_type nat_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_nat),
+ .align = __alignof__(struct nf_conn_nat),
+ .destroy = nf_nat_cleanup_conntrack,
+ .move = nf_nat_move_storage,
+ .id = NF_CT_EXT_NAT,
+ .flags = NF_CT_EXT_F_PREALLOC,
+};
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
+ [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
+ [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+static int nfnetlink_parse_nat_proto(struct nlattr *attr,
+ const struct nf_conn *ct,
+ struct nf_nat_range *range)
+{
+ struct nlattr *tb[CTA_PROTONAT_MAX+1];
+ const struct nf_nat_l4proto *l4proto;
+ int err;
+
+ err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
+ if (err < 0)
+ return err;
+
+ l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->nlattr_to_range)
+ err = l4proto->nlattr_to_range(tb, range);
+
+ return err;
+}
+
+static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
+ [CTA_NAT_V4_MINIP] = { .type = NLA_U32 },
+ [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 },
+ [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_PROTO] = { .type = NLA_NESTED },
+};
+
+static int
+nfnetlink_parse_nat(const struct nlattr *nat,
+ const struct nf_conn *ct, struct nf_nat_range *range,
+ const struct nf_nat_l3proto *l3proto)
+{
+ struct nlattr *tb[CTA_NAT_MAX+1];
+ int err;
+
+ memset(range, 0, sizeof(*range));
+
+ err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
+ if (err < 0)
+ return err;
+
+ err = l3proto->nlattr_to_range(tb, range);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_NAT_PROTO])
+ return 0;
+
+ return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
+}
+
+/* This function is called under rcu_read_lock() */
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ struct nf_nat_range range;
+ const struct nf_nat_l3proto *l3proto;
+ int err;
+
+ /* Should not happen, restricted to creating new conntracks
+ * via ctnetlink.
+ */
+ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
+ return -EEXIST;
+
+ /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to
+ * attach the null binding, otherwise this may oops.
+ */
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ if (l3proto == NULL)
+ return -EAGAIN;
+
+ /* No NAT information has been passed, allocate the null-binding */
+ if (attr == NULL)
+ return __nf_nat_alloc_null_binding(ct, manip);
+
+ err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
+ if (err < 0)
+ return err;
+
+ return nf_nat_setup_info(ct, &range, manip);
+}
+#else
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static int __net_init nf_nat_net_init(struct net *net)
+{
+ /* Leave them the same for the moment. */
+ net->ct.nat_htable_size = net->ct.htable_size;
+ net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
+ if (!net->ct.nat_bysource)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit nf_nat_net_exit(struct net *net)
+{
+ struct nf_nat_proto_clean clean = {};
+
+ nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
+ synchronize_rcu();
+ nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
+}
+
+static struct pernet_operations nf_nat_net_ops = {
+ .init = nf_nat_net_init,
+ .exit = nf_nat_net_exit,
+};
+
+static struct nf_ct_helper_expectfn follow_master_nat = {
+ .name = "nat-follow-master",
+ .expectfn = nf_nat_follow_master,
+};
+
+static int __init nf_nat_init(void)
+{
+ int ret;
+
+ ret = nf_ct_extend_register(&nat_extend);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&nf_nat_net_ops);
+ if (ret < 0)
+ goto cleanup_extend;
+
+ nf_ct_helper_expectfn_register(&follow_master_nat);
+
+ /* Initialize fake conntrack so that NAT will skip it */
+ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
+
+ BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
+ RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
+ nfnetlink_parse_nat_setup);
+#ifdef CONFIG_XFRM
+ BUG_ON(nf_nat_decode_session_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
+#endif
+ return 0;
+
+ cleanup_extend:
+ nf_ct_extend_unregister(&nat_extend);
+ return ret;
+}
+
+static void __exit nf_nat_cleanup(void)
+{
+ unsigned int i;
+
+ unregister_pernet_subsys(&nf_nat_net_ops);
+ nf_ct_extend_unregister(&nat_extend);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
+#ifdef CONFIG_XFRM
+ RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
+#endif
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ kfree(nf_nat_l4protos[i]);
+ synchronize_net();
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(nf_nat_init);
+module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
new file mode 100644
index 00000000000..e84a578dbe3
--- /dev/null
+++ b/net/netfilter/nf_nat_ftp.c
@@ -0,0 +1,146 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+MODULE_ALIAS("ip_nat_ftp");
+
+/* FIXME: Time out? --RR */
+
+static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type,
+ char *buffer, size_t buflen,
+ union nf_inet_addr *addr, u16 port)
+{
+ switch (type) {
+ case NF_CT_FTP_PORT:
+ case NF_CT_FTP_PASV:
+ return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+ ((unsigned char *)&addr->ip)[0],
+ ((unsigned char *)&addr->ip)[1],
+ ((unsigned char *)&addr->ip)[2],
+ ((unsigned char *)&addr->ip)[3],
+ port >> 8,
+ port & 0xFF);
+ case NF_CT_FTP_EPRT:
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return snprintf(buffer, buflen, "|1|%pI4|%u|",
+ &addr->ip, port);
+ else
+ return snprintf(buffer, buflen, "|2|%pI6|%u|",
+ &addr->ip6, port);
+ case NF_CT_FTP_EPSV:
+ return snprintf(buffer, buflen, "|||%u|", port);
+ }
+
+ return 0;
+}
+
+/* So, this packet has hit the connection tracking matching code.
+ Mangle it, and change the expectation to match the new version. */
+static unsigned int nf_nat_ftp(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ enum nf_ct_ftp_type type,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+{
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_conn *ct = exp->master;
+ char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+ /* Connection will come from wherever this packet goes, hence !dir */
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = !dir;
+
+ /* When you see the packet, we need to NAT it the same as the
+ * this one. */
+ exp->expectfn = nf_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use");
+ return NF_DROP;
+ }
+
+ buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer),
+ &newaddr, port);
+ if (!buflen)
+ goto out;
+
+ pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+ matchlen, buffer, buflen))
+ goto out;
+
+ return NF_ACCEPT;
+
+out:
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ nf_ct_unexpect_related(exp);
+ return NF_DROP;
+}
+
+static void __exit nf_nat_ftp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_ftp_init(void)
+{
+ BUG_ON(nf_nat_ftp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_ftp_init);
+module_exit(nf_nat_ftp_fini);
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
new file mode 100644
index 00000000000..2840abb5bb9
--- /dev/null
+++ b/net/netfilter/nf_nat_helper.c
@@ -0,0 +1,212 @@
+/* nf_nat_helper.c - generic support functions for NAT helpers
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+
+/* Frobs data inside this packet, which is linear. */
+static void mangle_contents(struct sk_buff *skb,
+ unsigned int dataoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ unsigned char *data;
+
+ BUG_ON(skb_is_nonlinear(skb));
+ data = skb_network_header(skb) + dataoff;
+
+ /* move post-replacement */
+ memmove(data + match_offset + rep_len,
+ data + match_offset + match_len,
+ skb_tail_pointer(skb) - (skb_network_header(skb) + dataoff +
+ match_offset + match_len));
+
+ /* insert data from buffer */
+ memcpy(data + match_offset, rep_buffer, rep_len);
+
+ /* update skb info */
+ if (rep_len > match_len) {
+ pr_debug("nf_nat_mangle_packet: Extending packet by "
+ "%u from %u bytes\n", rep_len - match_len, skb->len);
+ skb_put(skb, rep_len - match_len);
+ } else {
+ pr_debug("nf_nat_mangle_packet: Shrinking packet from "
+ "%u from %u bytes\n", match_len - rep_len, skb->len);
+ __skb_trim(skb, skb->len + rep_len - match_len);
+ }
+
+ if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+ /* fix IP hdr checksum information */
+ ip_hdr(skb)->tot_len = htons(skb->len);
+ ip_send_check(ip_hdr(skb));
+ } else
+ ipv6_hdr(skb)->payload_len =
+ htons(skb->len - sizeof(struct ipv6hdr));
+}
+
+/* Unusual, but possible case. */
+static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
+{
+ if (skb->len + extra > 65535)
+ return 0;
+
+ if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
+ return 0;
+
+ return 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * */
+int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len, bool adjust)
+{
+ const struct nf_nat_l3proto *l3proto;
+ struct tcphdr *tcph;
+ int oldlen, datalen;
+
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ if (rep_len > match_len &&
+ rep_len - match_len > skb_tailroom(skb) &&
+ !enlarge_skb(skb, rep_len - match_len))
+ return 0;
+
+ SKB_LINEAR_ASSERT(skb);
+
+ tcph = (void *)skb->data + protoff;
+
+ oldlen = skb->len - protoff;
+ mangle_contents(skb, protoff + tcph->doff*4,
+ match_offset, match_len, rep_buffer, rep_len);
+
+ datalen = skb->len - protoff;
+
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check,
+ datalen, oldlen);
+
+ if (adjust && rep_len != match_len)
+ nf_ct_seqadj_set(ct, ctinfo, tcph->seq,
+ (int)rep_len - (int)match_len);
+
+ return 1;
+}
+EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol)
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
+ * should be fairly easy to do.
+ */
+int
+nf_nat_mangle_udp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ const struct nf_nat_l3proto *l3proto;
+ struct udphdr *udph;
+ int datalen, oldlen;
+
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ if (rep_len > match_len &&
+ rep_len - match_len > skb_tailroom(skb) &&
+ !enlarge_skb(skb, rep_len - match_len))
+ return 0;
+
+ udph = (void *)skb->data + protoff;
+
+ oldlen = skb->len - protoff;
+ mangle_contents(skb, protoff + sizeof(*udph),
+ match_offset, match_len, rep_buffer, rep_len);
+
+ /* update the length of the UDP packet */
+ datalen = skb->len - protoff;
+ udph->len = htons(datalen);
+
+ /* fix udp checksum if udp checksum was previously calculated */
+ if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
+ return 1;
+
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check,
+ datalen, oldlen);
+
+ return 1;
+}
+EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
+
+/* Setup NAT on this expected conntrack so it follows master. */
+/* If we fail to get a free NAT slot, we'll get dropped on confirm */
+void nf_nat_follow_master(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = exp->saved_proto;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.src.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
new file mode 100644
index 00000000000..1fb2258c353
--- /dev/null
+++ b/net/netfilter/nf_nat_irc.c
@@ -0,0 +1,119 @@
+/* IRC extension for TCP NAT alteration.
+ *
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_irc");
+
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+{
+ char buffer[sizeof("4294967296 65635")];
+ struct nf_conn *ct = exp->master;
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ unsigned int ret;
+
+ /* Reply comes from server. */
+ newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3;
+
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+ exp->expectfn = nf_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use");
+ return NF_DROP;
+ }
+
+ /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+ * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+ * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+ *
+ * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+ * 255.255.255.255==4294967296, 10 digits)
+ * P: bound port (min 1 d, max 5d (65635))
+ * F: filename (min 1 d )
+ * S: size (min 1 d )
+ * 0x01, \n: terminators
+ */
+ /* AAA = "us", ie. where server normally talks to. */
+ snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port);
+ pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
+ buffer, &newaddr.ip, port);
+
+ ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+ matchlen, buffer, strlen(buffer));
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ nf_ct_unexpect_related(exp);
+ }
+
+ return ret;
+}
+
+static void __exit nf_nat_irc_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_irc_init(void)
+{
+ BUG_ON(nf_nat_irc_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_irc_hook, help);
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_irc_init);
+module_exit(nf_nat_irc_fini);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
new file mode 100644
index 00000000000..83a72a235ca
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -0,0 +1,114 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/netfilter.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ __be16 port;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ port = tuple->src.u.all;
+ else
+ port = tuple->dst.u.all;
+
+ return ntohs(port) >= ntohs(min->all) &&
+ ntohs(port) <= ntohs(max->all);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
+
+void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct,
+ u16 *rover)
+{
+ unsigned int range_size, min, i;
+ __be16 *portptr;
+ u_int16_t off;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.all;
+ else
+ portptr = &tuple->dst.u.all;
+
+ /* If no range specified... */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == NF_NAT_MANIP_DST)
+ return;
+
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr) < 512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min_proto.all);
+ range_size = ntohs(range->max_proto.all) - min + 1;
+ }
+
+ if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
+ off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
+ ? tuple->dst.u.all
+ : tuple->src.u.all);
+ } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
+ off = prandom_u32();
+ } else {
+ off = *rover;
+ }
+
+ for (i = 0; ; ++off) {
+ *portptr = htons(min + off % range_size);
+ if (++i != range_size && nf_nat_used_tuple(tuple, ct))
+ continue;
+ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+ *rover = off;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_PROTONAT_PORT_MIN]) {
+ range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
+ range->max_proto.all = range->min_proto.all;
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ if (tb[CTA_PROTONAT_PORT_MAX]) {
+ range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range);
+#endif
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
new file mode 100644
index 00000000000..c8be2cdac0b
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -0,0 +1,116 @@
+/*
+ * DCCP NAT protocol helper
+ *
+ * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/dccp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u_int16_t dccp_port_rover;
+
+static void
+dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &dccp_port_rover);
+}
+
+static bool
+dccp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct dccp_hdr *hdr;
+ __be16 *portptr, oldport, newport;
+ int hdrsize = 8; /* DCCP connection tracking guarantees this much */
+
+ if (skb->len >= hdroff + sizeof(struct dccp_hdr))
+ hdrsize = sizeof(struct dccp_hdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct dccp_hdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ newport = tuple->src.u.dccp.port;
+ portptr = &hdr->dccph_sport;
+ } else {
+ newport = tuple->dst.u.dccp.port;
+ portptr = &hdr->dccph_dport;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
+ 0);
+ return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
+ .l4proto = IPPROTO_DCCP,
+ .manip_pkt = dccp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = dccp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_dccp_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_proto_dccp_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+
+}
+
+module_init(nf_nat_proto_dccp_init);
+module_exit(nf_nat_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP NAT protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
new file mode 100644
index 00000000000..754536f2c67
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sctp.h>
+#include <linux/module.h>
+#include <net/sctp/checksum.h>
+
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u_int16_t nf_sctp_port_rover;
+
+static void
+sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &nf_sctp_port_rover);
+}
+
+static bool
+sctp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ sctp_sctphdr_t *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct sctphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ hdr->source = tuple->src.u.sctp.port;
+ } else {
+ /* Get rid of dst port */
+ hdr->dest = tuple->dst.u.sctp.port;
+ }
+
+ hdr->checksum = sctp_compute_cksum(skb, hdroff);
+
+ return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
+ .l4proto = IPPROTO_SCTP,
+ .manip_pkt = sctp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = sctp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_sctp_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_proto_sctp_exit(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+}
+
+module_init(nf_nat_proto_sctp_init);
+module_exit(nf_nat_proto_sctp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SCTP NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
new file mode 100644
index 00000000000..83ec8a6e4c3
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -0,0 +1,85 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+
+static u16 tcp_port_rover;
+
+static void
+tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &tcp_port_rover);
+}
+
+static bool
+tcp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct tcphdr *hdr;
+ __be16 *portptr, newport, oldport;
+ int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+ /* this could be a inner header returned in icmp packet; in such
+ cases we cannot update the checksum field since it is outside of
+ the 8 bytes of transport layer headers we are guaranteed */
+ if (skb->len >= hdroff + sizeof(struct tcphdr))
+ hdrsize = sizeof(struct tcphdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct tcphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ newport = tuple->src.u.tcp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.tcp.port;
+ portptr = &hdr->dest;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_tcp = {
+ .l4proto = IPPROTO_TCP,
+ .manip_pkt = tcp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = tcp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
new file mode 100644
index 00000000000..7df613fb34a
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -0,0 +1,76 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u16 udp_port_rover;
+
+static void
+udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &udp_port_rover);
+}
+
+static bool
+udp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+ __be16 *portptr, newport;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+ hdr = (struct udphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+ if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ l3proto->csum_update(skb, iphdroff, &hdr->check,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
+ 0);
+ if (!hdr->check)
+ hdr->check = CSUM_MANGLED_0;
+ }
+ *portptr = newport;
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_udp = {
+ .l4proto = IPPROTO_UDP,
+ .manip_pkt = udp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = udp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
new file mode 100644
index 00000000000..776a0d1317b
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -0,0 +1,106 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u16 udplite_port_rover;
+
+static void
+udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &udplite_port_rover);
+}
+
+static bool
+udplite_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+ __be16 *portptr, newport;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct udphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of source port */
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+
+ l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
+ if (!hdr->check)
+ hdr->check = CSUM_MANGLED_0;
+
+ *portptr = newport;
+ return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+ .l4proto = IPPROTO_UDPLITE,
+ .manip_pkt = udplite_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = udplite_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_udplite_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_proto_udplite_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+}
+
+module_init(nf_nat_proto_udplite_init);
+module_exit(nf_nat_proto_udplite_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
new file mode 100644
index 00000000000..6e494d58441
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -0,0 +1,54 @@
+/* The "unknown" protocol. This is what is used for protocols we
+ * don't understand. It's returned by ip_ct_find_proto().
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type manip_type,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return true;
+}
+
+static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ /* Sorry: we can't help you; if it's not unique, we can't frob
+ * anything.
+ */
+ return;
+}
+
+static bool
+unknown_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_unknown = {
+ .manip_pkt = unknown_manip_pkt,
+ .in_range = unknown_in_range,
+ .unique_tuple = unknown_unique_tuple,
+};
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
new file mode 100644
index 00000000000..b4d691db955
--- /dev/null
+++ b/net/netfilter/nf_nat_sip.c
@@ -0,0 +1,653 @@
+/* SIP extension for NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+MODULE_ALIAS("ip_nat_sip");
+
+
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff, unsigned int matchlen,
+ const char *buffer, unsigned int buflen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct tcphdr *th;
+ unsigned int baseoff;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP) {
+ th = (struct tcphdr *)(skb->data + protoff);
+ baseoff = protoff + th->doff * 4;
+ matchoff += dataoff - baseoff;
+
+ if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, buflen, false))
+ return 0;
+ } else {
+ baseoff = protoff + sizeof(struct udphdr);
+ matchoff += dataoff - baseoff;
+
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, buflen))
+ return 0;
+ }
+
+ /* Reload data pointer and adjust datalen value */
+ *dptr = skb->data + dataoff;
+ *datalen += buflen - matchlen;
+ return 1;
+}
+
+static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer,
+ const union nf_inet_addr *addr, bool delim)
+{
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return sprintf(buffer, "%pI4", &addr->ip);
+ else {
+ if (delim)
+ return sprintf(buffer, "[%pI6c]", &addr->ip6);
+ else
+ return sprintf(buffer, "%pI6c", &addr->ip6);
+ }
+}
+
+static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer,
+ const union nf_inet_addr *addr, u16 port)
+{
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return sprintf(buffer, "%pI4:%u", &addr->ip, port);
+ else
+ return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port);
+}
+
+static int map_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff, unsigned int matchlen,
+ union nf_inet_addr *addr, __be16 port)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+ unsigned int buflen;
+ union nf_inet_addr newaddr;
+ __be16 newport;
+
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) &&
+ ct->tuplehash[dir].tuple.src.u.udp.port == port) {
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+ newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+ } else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) &&
+ ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
+ newaddr = ct->tuplehash[!dir].tuple.src.u3;
+ newport = ct_sip_info->forced_dport ? :
+ ct->tuplehash[!dir].tuple.src.u.udp.port;
+ } else
+ return 1;
+
+ if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
+ return 1;
+
+ buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen);
+}
+
+static int map_sip_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ enum sip_header_types type)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchlen, matchoff;
+ union nf_inet_addr addr;
+ __be16 port;
+
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
+ &matchoff, &matchlen, &addr, &port) <= 0)
+ return 1;
+ return map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port);
+}
+
+static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ unsigned int coff, matchoff, matchlen;
+ enum sip_header_types hdr;
+ union nf_inet_addr addr;
+ __be16 port;
+ int request, in_header;
+
+ /* Basic rules: requests and responses. */
+ if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
+ if (ct_sip_parse_request(ct, *dptr, *datalen,
+ &matchoff, &matchlen,
+ &addr, &port) > 0 &&
+ !map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SIP message");
+ return NF_DROP;
+ }
+ request = 1;
+ } else
+ request = 0;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP)
+ hdr = SIP_HDR_VIA_TCP;
+ else
+ hdr = SIP_HDR_VIA_UDP;
+
+ /* Translate topmost Via header and parameters */
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ hdr, NULL, &matchoff, &matchlen,
+ &addr, &port) > 0) {
+ unsigned int olen, matchend, poff, plen, buflen, n;
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+
+ /* We're only interested in headers related to this
+ * connection */
+ if (request) {
+ if (!nf_inet_addr_cmp(&addr,
+ &ct->tuplehash[dir].tuple.src.u3) ||
+ port != ct->tuplehash[dir].tuple.src.u.udp.port)
+ goto next;
+ } else {
+ if (!nf_inet_addr_cmp(&addr,
+ &ct->tuplehash[dir].tuple.dst.u3) ||
+ port != ct->tuplehash[dir].tuple.dst.u.udp.port)
+ goto next;
+ }
+
+ olen = *datalen;
+ if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle Via header");
+ return NF_DROP;
+ }
+
+ matchend = matchoff + matchlen + *datalen - olen;
+
+ /* The maddr= parameter (RFC 2361) specifies where to send
+ * the reply. */
+ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+ "maddr=", &poff, &plen,
+ &addr, true) > 0 &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
+ !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
+ buflen = sip_sprintf_addr(ct, buffer,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ true);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle maddr");
+ return NF_DROP;
+ }
+ }
+
+ /* The received= parameter (RFC 2361) contains the address
+ * from which the server received the request. */
+ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+ "received=", &poff, &plen,
+ &addr, false) > 0 &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
+ !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
+ buflen = sip_sprintf_addr(ct, buffer,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ false);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle received");
+ return NF_DROP;
+ }
+ }
+
+ /* The rport= parameter (RFC 3581) contains the port number
+ * from which the server received the request. */
+ if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
+ "rport=", &poff, &plen,
+ &n) > 0 &&
+ htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
+ htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
+ __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
+ buflen = sprintf(buffer, "%u", ntohs(p));
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle rport");
+ return NF_DROP;
+ }
+ }
+ }
+
+next:
+ /* Translate Contact headers */
+ coff = 0;
+ in_header = 0;
+ while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+ SIP_HDR_CONTACT, &in_header,
+ &matchoff, &matchlen,
+ &addr, &port) > 0) {
+ if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen,
+ &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle contact");
+ return NF_DROP;
+ }
+ }
+
+ if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+ !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to");
+ return NF_DROP;
+ }
+
+ /* Mangle destination port for Cisco phones, then fix up checksums */
+ if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
+ struct udphdr *uh;
+
+ if (!skb_make_writable(skb, skb->len)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+
+ uh = (void *)skb->data + protoff;
+ uh->dest = ct_sip_info->forced_dport;
+
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff,
+ 0, 0, NULL, 0)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+ }
+
+ return NF_ACCEPT;
+}
+
+static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
+ s16 off)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+ return;
+
+ th = (struct tcphdr *)(skb->data + protoff);
+ nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+
+/* Handles expected signalling connections and media streams */
+static void nf_nat_sip_expected(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = exp->saved_proto;
+ range.min_addr = range.max_addr = exp->saved_addr;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+
+ /* Change src to where master sends to, but only if the connection
+ * actually came from the same source. */
+ if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
+ &ct->master->tuplehash[exp->dir].tuple.src.u3)) {
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ }
+}
+
+static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ struct nf_conntrack_expect *exp,
+ unsigned int matchoff,
+ unsigned int matchlen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ __be16 srcport;
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+ unsigned int buflen;
+
+ /* Connection will come from reply */
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3))
+ newaddr = exp->tuple.dst.u3;
+ else
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+
+ /* If the signalling port matches the connection's source port in the
+ * original direction, try to use the destination port in the opposite
+ * direction. */
+ srcport = ct_sip_info->forced_dport ? :
+ ct->tuplehash[dir].tuple.src.u.udp.port;
+ if (exp->tuple.dst.u.udp.port == srcport)
+ port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
+ else
+ port = ntohs(exp->tuple.dst.u.udp.port);
+
+ exp->saved_addr = exp->tuple.dst.u3;
+ exp->tuple.dst.u3 = newaddr;
+ exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+ exp->dir = !dir;
+ exp->expectfn = nf_nat_sip_expected;
+
+ for (; port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.udp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use for SIP");
+ return NF_DROP;
+ }
+
+ if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) ||
+ exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
+ buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ goto err;
+ }
+ }
+ return NF_ACCEPT;
+
+err:
+ nf_ct_unexpect_related(exp);
+ return NF_DROP;
+}
+
+static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchoff, matchlen;
+ char buffer[sizeof("65536")];
+ int buflen, c_len;
+
+ /* Get actual SDP length */
+ if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+ SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+ &matchoff, &matchlen) <= 0)
+ return 0;
+ c_len = *datalen - matchoff + strlen("v=");
+
+ /* Now, update SDP length */
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
+ &matchoff, &matchlen) <= 0)
+ return 0;
+
+ buflen = sprintf(buffer, "%u", c_len);
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen);
+}
+
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ char *buffer, int buflen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchlen, matchoff;
+
+ if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
+ &matchoff, &matchlen) <= 0)
+ return -ENOENT;
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL;
+}
+
+static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ const union nf_inet_addr *addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
+ sdpoff, type, term, buffer, buflen))
+ return 0;
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ u_int16_t port)
+{
+ char buffer[sizeof("nnnnn")];
+ unsigned int buflen;
+
+ buflen = sprintf(buffer, "%u", port);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen))
+ return 0;
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ const union nf_inet_addr *addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ /* Mangle session description owner and contact addresses */
+ buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
+ SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen))
+ return 0;
+
+ switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ buffer, buflen)) {
+ case 0:
+ /*
+ * RFC 2327:
+ *
+ * Session description
+ *
+ * c=* (connection information - not required if included in all media)
+ */
+ case -ENOENT:
+ break;
+ default:
+ return 0;
+ }
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+/* So, this packet has hit the connection tracking matching code.
+ Mangle it, and change the expectation to match the new version. */
+static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp,
+ unsigned int mediaoff,
+ unsigned int medialen,
+ union nf_inet_addr *rtp_addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ u_int16_t port;
+
+ /* Connection will come from reply */
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3))
+ *rtp_addr = rtp_exp->tuple.dst.u3;
+ else
+ *rtp_addr = ct->tuplehash[!dir].tuple.dst.u3;
+
+ rtp_exp->saved_addr = rtp_exp->tuple.dst.u3;
+ rtp_exp->tuple.dst.u3 = *rtp_addr;
+ rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+ rtp_exp->dir = !dir;
+ rtp_exp->expectfn = nf_nat_sip_expected;
+
+ rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3;
+ rtcp_exp->tuple.dst.u3 = *rtp_addr;
+ rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+ rtcp_exp->dir = !dir;
+ rtcp_exp->expectfn = nf_nat_sip_expected;
+
+ /* Try to get same pair of ports: if not, try to change them. */
+ for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+ port != 0; port += 2) {
+ int ret;
+
+ rtp_exp->tuple.dst.u.udp.port = htons(port);
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == -EBUSY)
+ continue;
+ else if (ret < 0) {
+ port = 0;
+ break;
+ }
+ rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
+ nf_ct_unexpect_related(rtp_exp);
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use for SDP media");
+ goto err1;
+ }
+
+ /* Update media port. */
+ if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
+ !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen,
+ mediaoff, medialen, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SDP message");
+ goto err2;
+ }
+
+ return NF_ACCEPT;
+
+err2:
+ nf_ct_unexpect_related(rtp_exp);
+ nf_ct_unexpect_related(rtcp_exp);
+err1:
+ return NF_DROP;
+}
+
+static struct nf_ct_helper_expectfn sip_nat = {
+ .name = "sip",
+ .expectfn = nf_nat_sip_expected,
+};
+
+static void __exit nf_nat_sip_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_sip_hooks, NULL);
+
+ nf_ct_helper_expectfn_unregister(&sip_nat);
+ synchronize_rcu();
+}
+
+static const struct nf_nat_sip_hooks sip_hooks = {
+ .msg = nf_nat_sip,
+ .seq_adjust = nf_nat_sip_seq_adjust,
+ .expect = nf_nat_sip_expect,
+ .sdp_addr = nf_nat_sdp_addr,
+ .sdp_port = nf_nat_sdp_port,
+ .sdp_session = nf_nat_sdp_session,
+ .sdp_media = nf_nat_sdp_media,
+};
+
+static int __init nf_nat_sip_init(void)
+{
+ BUG_ON(nf_nat_sip_hooks != NULL);
+ RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks);
+ nf_ct_helper_expectfn_register(&sip_nat);
+ return 0;
+}
+
+module_init(nf_nat_sip_init);
+module_exit(nf_nat_sip_fini);
diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c
new file mode 100644
index 00000000000..7f67e1d5310
--- /dev/null
+++ b/net/netfilter/nf_nat_tftp.c
@@ -0,0 +1,52 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_tftp");
+
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_expect *exp)
+{
+ const struct nf_conn *ct = exp->master;
+
+ exp->saved_proto.udp.port
+ = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+ exp->expectfn = nf_nat_follow_master;
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, exp->master, "cannot add expectation");
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+static void __exit nf_nat_tftp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_tftp_init(void)
+{
+ BUG_ON(nf_nat_tftp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_tftp_hook, help);
+ return 0;
+}
+
+module_init(nf_nat_tftp_init);
+module_exit(nf_nat_tftp_fini);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 74aebed5bd2..5d24b1fdb59 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -1,3 +1,8 @@
+/*
+ * Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
+ */
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -14,75 +19,33 @@
#include "nf_internals.h"
/*
- * A queue handler may be registered for each protocol. Each is protected by
- * long term mutex. The handler must provide an an outfn() to accept packets
- * for queueing and must reinject all packets it receives, no matter what.
+ * Hook for nfnetlink_queue to register its queue handler.
+ * We do this so that most of the NFQUEUE code can be modular.
+ *
+ * Once the queue is registered it must reinject all packets it
+ * receives, no matter what.
*/
-static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
-
-static DEFINE_MUTEX(queue_handler_mutex);
+static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
/* return EBUSY when somebody else is registered, return EEXIST if the
* same handler is registered, return 0 in case of success. */
-int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
{
- int ret;
-
- if (pf >= ARRAY_SIZE(queue_handler))
- return -EINVAL;
-
- mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] == qh)
- ret = -EEXIST;
- else if (queue_handler[pf])
- ret = -EBUSY;
- else {
- rcu_assign_pointer(queue_handler[pf], qh);
- ret = 0;
- }
- mutex_unlock(&queue_handler_mutex);
-
- return ret;
+ /* should never happen, we only have one queueing backend in kernel */
+ WARN_ON(rcu_access_pointer(queue_handler));
+ rcu_assign_pointer(queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
+void nf_unregister_queue_handler(void)
{
- if (pf >= ARRAY_SIZE(queue_handler))
- return -EINVAL;
-
- mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] && queue_handler[pf] != qh) {
- mutex_unlock(&queue_handler_mutex);
- return -EINVAL;
- }
-
- rcu_assign_pointer(queue_handler[pf], NULL);
- mutex_unlock(&queue_handler_mutex);
-
+ RCU_INIT_POINTER(queue_handler, NULL);
synchronize_rcu();
-
- return 0;
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
-{
- u_int8_t pf;
-
- mutex_lock(&queue_handler_mutex);
- for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) {
- if (queue_handler[pf] == qh)
- rcu_assign_pointer(queue_handler[pf], NULL);
- }
- mutex_unlock(&queue_handler_mutex);
-
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
-
-static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
/* Release those devices we held, or Alexey will kill me. */
if (entry->indev)
@@ -102,75 +65,87 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
/* Drop reference to owner of hook which queued us. */
module_put(entry->elem->owner);
}
+EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
+
+/* Bump dev refs so they don't vanish while packet is out */
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+{
+ if (!try_module_get(entry->elem->owner))
+ return false;
+
+ if (entry->indev)
+ dev_hold(entry->indev);
+ if (entry->outdev)
+ dev_hold(entry->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (entry->skb->nf_bridge) {
+ struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge;
+ struct net_device *physdev;
+
+ physdev = nf_bridge->physindev;
+ if (physdev)
+ dev_hold(physdev);
+ physdev = nf_bridge->physoutdev;
+ if (physdev)
+ dev_hold(physdev);
+ }
+#endif
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
/*
* Any packet that leaves via this function must come back
* through nf_reinject().
*/
-static int __nf_queue(struct sk_buff *skb,
- struct list_head *elem,
+int nf_queue(struct sk_buff *skb,
+ struct nf_hook_ops *elem,
u_int8_t pf, unsigned int hook,
struct net_device *indev,
struct net_device *outdev,
int (*okfn)(struct sk_buff *),
unsigned int queuenum)
{
- int status;
+ int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
-#ifdef CONFIG_BRIDGE_NETFILTER
- struct net_device *physindev;
- struct net_device *physoutdev;
-#endif
const struct nf_afinfo *afinfo;
const struct nf_queue_handler *qh;
- /* QUEUE == DROP if noone is waiting, to be safe. */
+ /* QUEUE == DROP if no one is waiting, to be safe. */
rcu_read_lock();
- qh = rcu_dereference(queue_handler[pf]);
- if (!qh)
+ qh = rcu_dereference(queue_handler);
+ if (!qh) {
+ status = -ESRCH;
goto err_unlock;
+ }
afinfo = nf_get_afinfo(pf);
if (!afinfo)
goto err_unlock;
entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
- if (!entry)
+ if (!entry) {
+ status = -ENOMEM;
goto err_unlock;
+ }
*entry = (struct nf_queue_entry) {
.skb = skb,
- .elem = list_entry(elem, struct nf_hook_ops, list),
+ .elem = elem,
.pf = pf,
.hook = hook,
.indev = indev,
.outdev = outdev,
.okfn = okfn,
+ .size = sizeof(*entry) + afinfo->route_key_size,
};
- /* If it's going away, ignore hook. */
- if (!try_module_get(entry->elem->owner)) {
- rcu_read_unlock();
- kfree(entry);
- return 0;
- }
-
- /* Bump dev refs so they don't vanish while packet is out */
- if (indev)
- dev_hold(indev);
- if (outdev)
- dev_hold(outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- physindev = skb->nf_bridge->physindev;
- if (physindev)
- dev_hold(physindev);
- physoutdev = skb->nf_bridge->physoutdev;
- if (physoutdev)
- dev_hold(physoutdev);
+ if (!nf_queue_entry_get_refs(entry)) {
+ status = -ECANCELED;
+ goto err_unlock;
}
-#endif
skb_dst_force(skb);
afinfo->saveroute(skb, entry);
status = qh->outfn(entry, queuenum);
@@ -182,61 +157,21 @@ static int __nf_queue(struct sk_buff *skb,
goto err;
}
- return 1;
+ return 0;
err_unlock:
rcu_read_unlock();
err:
- kfree_skb(skb);
kfree(entry);
- return 1;
-}
-
-int nf_queue(struct sk_buff *skb,
- struct list_head *elem,
- u_int8_t pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- unsigned int queuenum)
-{
- struct sk_buff *segs;
-
- if (!skb_is_gso(skb))
- return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
- queuenum);
-
- switch (pf) {
- case NFPROTO_IPV4:
- skb->protocol = htons(ETH_P_IP);
- break;
- case NFPROTO_IPV6:
- skb->protocol = htons(ETH_P_IPV6);
- break;
- }
-
- segs = skb_gso_segment(skb, 0);
- kfree_skb(skb);
- if (IS_ERR(segs))
- return 1;
-
- do {
- struct sk_buff *nskb = segs->next;
-
- segs->next = NULL;
- if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn,
- queuenum))
- kfree_skb(segs);
- segs = nskb;
- } while (segs);
- return 1;
+ return status;
}
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
struct sk_buff *skb = entry->skb;
- struct list_head *elem = &entry->elem->list;
+ struct nf_hook_ops *elem = entry->elem;
const struct nf_afinfo *afinfo;
+ int err;
rcu_read_lock();
@@ -244,7 +179,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
/* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT) {
- elem = elem->prev;
+ elem = list_entry(elem->list.prev, struct nf_hook_ops, list);
verdict = NF_ACCEPT;
}
@@ -270,12 +205,20 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
local_bh_enable();
break;
case NF_QUEUE:
- if (!__nf_queue(skb, elem, entry->pf, entry->hook,
+ err = nf_queue(skb, elem, entry->pf, entry->hook,
entry->indev, entry->outdev, entry->okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
break;
case NF_STOLEN:
+ break;
default:
kfree_skb(skb);
}
@@ -283,77 +226,3 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
kfree(entry);
}
EXPORT_SYMBOL(nf_reinject);
-
-#ifdef CONFIG_PROC_FS
-static void *seq_start(struct seq_file *seq, loff_t *pos)
-{
- if (*pos >= ARRAY_SIZE(queue_handler))
- return NULL;
-
- return pos;
-}
-
-static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
-
- if (*pos >= ARRAY_SIZE(queue_handler))
- return NULL;
-
- return pos;
-}
-
-static void seq_stop(struct seq_file *s, void *v)
-{
-
-}
-
-static int seq_show(struct seq_file *s, void *v)
-{
- int ret;
- loff_t *pos = v;
- const struct nf_queue_handler *qh;
-
- rcu_read_lock();
- qh = rcu_dereference(queue_handler[*pos]);
- if (!qh)
- ret = seq_printf(s, "%2lld NONE\n", *pos);
- else
- ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
- rcu_read_unlock();
-
- return ret;
-}
-
-static const struct seq_operations nfqueue_seq_ops = {
- .start = seq_start,
- .next = seq_next,
- .stop = seq_stop,
- .show = seq_show,
-};
-
-static int nfqueue_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &nfqueue_seq_ops);
-}
-
-static const struct file_operations nfqueue_file_ops = {
- .owner = THIS_MODULE,
- .open = nfqueue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-#endif /* PROC_FS */
-
-
-int __init netfilter_queue_init(void)
-{
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nf_queue", S_IRUGO,
- proc_net_netfilter, &nfqueue_file_ops))
- return -1;
-#endif
- return 0;
-}
-
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
new file mode 100644
index 00000000000..52e20c9a46a
--- /dev/null
+++ b/net/netfilter/nf_synproxy_core.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+int synproxy_net_id;
+EXPORT_SYMBOL_GPL(synproxy_net_id);
+
+bool
+synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+ const struct tcphdr *th, struct synproxy_options *opts)
+{
+ int length = (th->doff * 4) - sizeof(*th);
+ u8 buf[40], *ptr;
+
+ ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
+ if (ptr == NULL)
+ return false;
+
+ opts->options = 0;
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return true;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2)
+ return true;
+ if (opsize > length)
+ return true;
+
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS) {
+ opts->mss = get_unaligned_be16(ptr);
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW) {
+ opts->wscale = *ptr;
+ if (opts->wscale > 14)
+ opts->wscale = 14;
+ opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opsize == TCPOLEN_TIMESTAMP) {
+ opts->tsval = get_unaligned_be32(ptr);
+ opts->tsecr = get_unaligned_be32(ptr + 4);
+ opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM)
+ opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+ break;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_parse_options);
+
+unsigned int synproxy_options_size(const struct synproxy_options *opts)
+{
+ unsigned int size = 0;
+
+ if (opts->options & XT_SYNPROXY_OPT_MSS)
+ size += TCPOLEN_MSS_ALIGNED;
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ size += TCPOLEN_SACKPERM_ALIGNED;
+ if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+ size += TCPOLEN_WSCALE_ALIGNED;
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(synproxy_options_size);
+
+void
+synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
+{
+ __be32 *ptr = (__be32 *)(th + 1);
+ u8 options = opts->options;
+
+ if (options & XT_SYNPROXY_OPT_MSS)
+ *ptr++ = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ opts->mss);
+
+ if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
+ if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+ (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ else
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+
+ *ptr++ = htonl(opts->tsval);
+ *ptr++ = htonl(opts->tsecr);
+ } else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) |
+ TCPOLEN_SACK_PERM);
+
+ if (options & XT_SYNPROXY_OPT_WSCALE)
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_WINDOW << 16) |
+ (TCPOLEN_WINDOW << 8) |
+ opts->wscale);
+}
+EXPORT_SYMBOL_GPL(synproxy_build_options);
+
+void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+ struct synproxy_options *opts)
+{
+ opts->tsecr = opts->tsval;
+ opts->tsval = tcp_time_stamp & ~0x3f;
+
+ if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+ opts->tsval |= opts->wscale;
+ opts->wscale = info->wscale;
+ } else
+ opts->tsval |= 0xf;
+
+ if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ opts->tsval |= 1 << 4;
+
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ opts->tsval |= 1 << 5;
+}
+EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
+
+void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+{
+ opts->wscale = opts->tsecr & 0xf;
+ if (opts->wscale != 0xf)
+ opts->options |= XT_SYNPROXY_OPT_WSCALE;
+
+ opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+
+ opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+}
+EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
+
+unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+ unsigned int protoff,
+ struct tcphdr *th,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_conn_synproxy *synproxy)
+{
+ unsigned int optoff, optend;
+ u32 *ptr, old;
+
+ if (synproxy->tsoff == 0)
+ return 1;
+
+ optoff = protoff + sizeof(struct tcphdr);
+ optend = protoff + th->doff * 4;
+
+ if (!skb_make_writable(skb, optend))
+ return 0;
+
+ while (optoff < optend) {
+ unsigned char *op = skb->data + optoff;
+
+ switch (op[0]) {
+ case TCPOPT_EOL:
+ return 1;
+ case TCPOPT_NOP:
+ optoff++;
+ continue;
+ default:
+ if (optoff + 1 == optend ||
+ optoff + op[1] > optend ||
+ op[1] < 2)
+ return 0;
+ if (op[0] == TCPOPT_TIMESTAMP &&
+ op[1] == TCPOLEN_TIMESTAMP) {
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ ptr = (u32 *)&op[2];
+ old = *ptr;
+ *ptr = htonl(ntohl(*ptr) -
+ synproxy->tsoff);
+ } else {
+ ptr = (u32 *)&op[6];
+ old = *ptr;
+ *ptr = htonl(ntohl(*ptr) +
+ synproxy->tsoff);
+ }
+ inet_proto_csum_replace4(&th->check, skb,
+ old, *ptr, 0);
+ return 1;
+ }
+ optoff += op[1];
+ }
+ }
+ return 1;
+}
+EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
+
+static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_synproxy),
+ .align = __alignof__(struct nf_conn_synproxy),
+ .id = NF_CT_EXT_SYNPROXY,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(snet->stats, cpu);
+ }
+
+ return NULL;
+}
+
+static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(snet->stats, cpu);
+ }
+
+ return NULL;
+}
+
+static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct synproxy_stats *stats = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries\t\tsyn_received\t"
+ "cookie_invalid\tcookie_valid\t"
+ "cookie_retrans\tconn_reopened\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
+ stats->syn_received,
+ stats->cookie_invalid,
+ stats->cookie_valid,
+ stats->cookie_retrans,
+ stats->conn_reopened);
+
+ return 0;
+}
+
+static const struct seq_operations synproxy_cpu_seq_ops = {
+ .start = synproxy_cpu_seq_start,
+ .next = synproxy_cpu_seq_next,
+ .stop = synproxy_cpu_seq_stop,
+ .show = synproxy_cpu_seq_show,
+};
+
+static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations synproxy_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = synproxy_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init synproxy_proc_init(struct net *net)
+{
+ if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+ &synproxy_cpu_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+ remove_proc_entry("synproxy", net->proc_net_stat);
+}
+#else
+static int __net_init synproxy_proc_init(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+ return;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init synproxy_net_init(struct net *net)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct nf_conntrack_tuple t;
+ struct nf_conn *ct;
+ int err = -ENOMEM;
+
+ memset(&t, 0, sizeof(t));
+ ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+ if (IS_ERR(ct)) {
+ err = PTR_ERR(ct);
+ goto err1;
+ }
+
+ if (!nfct_seqadj_ext_add(ct))
+ goto err2;
+ if (!nfct_synproxy_ext_add(ct))
+ goto err2;
+
+ nf_conntrack_tmpl_insert(net, ct);
+ snet->tmpl = ct;
+
+ snet->stats = alloc_percpu(struct synproxy_stats);
+ if (snet->stats == NULL)
+ goto err2;
+
+ err = synproxy_proc_init(net);
+ if (err < 0)
+ goto err3;
+
+ return 0;
+
+err3:
+ free_percpu(snet->stats);
+err2:
+ nf_conntrack_free(ct);
+err1:
+ return err;
+}
+
+static void __net_exit synproxy_net_exit(struct net *net)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+
+ nf_ct_put(snet->tmpl);
+ synproxy_proc_exit(net);
+ free_percpu(snet->stats);
+}
+
+static struct pernet_operations synproxy_net_ops = {
+ .init = synproxy_net_init,
+ .exit = synproxy_net_exit,
+ .id = &synproxy_net_id,
+ .size = sizeof(struct synproxy_net),
+};
+
+static int __init synproxy_core_init(void)
+{
+ int err;
+
+ err = nf_ct_extend_register(&nf_ct_synproxy_extend);
+ if (err < 0)
+ goto err1;
+
+ err = register_pernet_subsys(&synproxy_net_ops);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+err1:
+ return err;
+}
+
+static void __exit synproxy_core_exit(void)
+{
+ unregister_pernet_subsys(&synproxy_net_ops);
+ nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+}
+
+module_init(synproxy_core_init);
+module_exit(synproxy_core_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
new file mode 100644
index 00000000000..8746ff9a835
--- /dev/null
+++ b/net/netfilter/nf_tables_api.c
@@ -0,0 +1,4041 @@
+/*
+ * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+static LIST_HEAD(nf_tables_expressions);
+
+/**
+ * nft_register_afinfo - register nf_tables address family info
+ *
+ * @afi: address family info to register
+ *
+ * Register the address family for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
+{
+ INIT_LIST_HEAD(&afi->tables);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_tail_rcu(&afi->list, &net->nft.af_info);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_afinfo);
+
+/**
+ * nft_unregister_afinfo - unregister nf_tables address family info
+ *
+ * @afi: address family info to unregister
+ *
+ * Unregister the address family for use with nf_tables.
+ */
+void nft_unregister_afinfo(struct nft_af_info *afi)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&afi->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
+
+static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
+{
+ struct nft_af_info *afi;
+
+ list_for_each_entry(afi, &net->nft.af_info, list) {
+ if (afi->family == family)
+ return afi;
+ }
+ return NULL;
+}
+
+static struct nft_af_info *
+nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
+{
+ struct nft_af_info *afi;
+
+ afi = nft_afinfo_lookup(net, family);
+ if (afi != NULL)
+ return afi;
+#ifdef CONFIG_MODULES
+ if (autoload) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-afinfo-%u", family);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ afi = nft_afinfo_lookup(net, family);
+ if (afi != NULL)
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
+static void nft_ctx_init(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nft_af_info *afi,
+ struct nft_table *table,
+ struct nft_chain *chain,
+ const struct nlattr * const *nla)
+{
+ ctx->net = sock_net(skb->sk);
+ ctx->afi = afi;
+ ctx->table = table;
+ ctx->chain = chain;
+ ctx->nla = nla;
+ ctx->portid = NETLINK_CB(skb).portid;
+ ctx->report = nlmsg_report(nlh);
+ ctx->seq = nlh->nlmsg_seq;
+}
+
+static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
+ u32 size)
+{
+ struct nft_trans *trans;
+
+ trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL);
+ if (trans == NULL)
+ return NULL;
+
+ trans->msg_type = msg_type;
+ trans->ctx = *ctx;
+
+ return trans;
+}
+
+static void nft_trans_destroy(struct nft_trans *trans)
+{
+ list_del(&trans->list);
+ kfree(trans);
+}
+
+/*
+ * Tables
+ */
+
+static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla)
+{
+ struct nft_table *table;
+
+ list_for_each_entry(table, &afi->tables, list) {
+ if (!nla_strcmp(nla, table->name))
+ return table;
+ }
+ return NULL;
+}
+
+static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla)
+{
+ struct nft_table *table;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ table = nft_table_lookup(afi, nla);
+ if (table != NULL)
+ return table;
+
+ return ERR_PTR(-ENOENT);
+}
+
+static inline u64 nf_tables_alloc_handle(struct nft_table *table)
+{
+ return ++table->hgenerator;
+}
+
+static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+
+static const struct nf_chain_type *
+__nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+{
+ int i;
+
+ for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
+ if (chain_type[family][i] != NULL &&
+ !nla_strcmp(nla, chain_type[family][i]->name))
+ return chain_type[family][i];
+ }
+ return NULL;
+}
+
+static const struct nf_chain_type *
+nf_tables_chain_type_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla,
+ bool autoload)
+{
+ const struct nf_chain_type *type;
+
+ type = __nf_tables_chain_type_lookup(afi->family, nla);
+ if (type != NULL)
+ return type;
+#ifdef CONFIG_MODULES
+ if (autoload) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-chain-%u-%.*s", afi->family,
+ nla_len(nla), (const char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ type = __nf_tables_chain_type_lookup(afi->family, nla);
+ if (type != NULL)
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
+ [NFTA_TABLE_NAME] = { .type = NLA_STRING },
+ [NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, u32 flags, int family,
+ const struct nft_table *table)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
+ nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
+ nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0,
+ ctx->afi->family, ctx->table);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_dump_tables(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_table_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWTABLE,
+ NLM_F_MULTI,
+ afi->family, table) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+/* Internal table flags */
+#define NFT_TABLE_INACTIVE (1 << 15)
+
+static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_tables,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
+ family, table);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static int nf_tables_table_enable(const struct nft_af_info *afi,
+ struct nft_table *table)
+{
+ struct nft_chain *chain;
+ int err, i = 0;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ if (err < 0)
+ goto err;
+
+ i++;
+ }
+ return 0;
+err:
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ if (i-- <= 0)
+ break;
+
+ nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops);
+ }
+ return err;
+}
+
+static void nf_tables_table_disable(const struct nft_af_info *afi,
+ struct nft_table *table)
+{
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (chain->flags & NFT_BASE_CHAIN)
+ nf_unregister_hooks(nft_base_chain(chain)->ops,
+ afi->nops);
+ }
+}
+
+static int nf_tables_updtable(struct nft_ctx *ctx)
+{
+ struct nft_trans *trans;
+ u32 flags;
+ int ret = 0;
+
+ if (!ctx->nla[NFTA_TABLE_FLAGS])
+ return 0;
+
+ flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
+ if (flags & ~NFT_TABLE_F_DORMANT)
+ return -EINVAL;
+
+ if (flags == ctx->table->flags)
+ return 0;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
+ sizeof(struct nft_trans_table));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if ((flags & NFT_TABLE_F_DORMANT) &&
+ !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_trans_table_enable(trans) = false;
+ } else if (!(flags & NFT_TABLE_F_DORMANT) &&
+ ctx->table->flags & NFT_TABLE_F_DORMANT) {
+ ret = nf_tables_table_enable(ctx->afi, ctx->table);
+ if (ret >= 0) {
+ ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+ nft_trans_table_enable(trans) = true;
+ }
+ }
+ if (ret < 0)
+ goto err;
+
+ nft_trans_table_update(trans) = true;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+err:
+ nft_trans_destroy(trans);
+ return ret;
+}
+
+static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWTABLE)
+ ctx->table->flags |= NFT_TABLE_INACTIVE;
+
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+}
+
+static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nlattr *name;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ u32 flags = 0;
+ struct nft_ctx ctx;
+ int err;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ name = nla[NFTA_TABLE_NAME];
+ table = nf_tables_table_lookup(afi, name);
+ if (IS_ERR(table)) {
+ if (PTR_ERR(table) != -ENOENT)
+ return PTR_ERR(table);
+ table = NULL;
+ }
+
+ if (table != NULL) {
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ return nf_tables_updtable(&ctx);
+ }
+
+ if (nla[NFTA_TABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
+ if (flags & ~NFT_TABLE_F_DORMANT)
+ return -EINVAL;
+ }
+
+ if (!try_module_get(afi->owner))
+ return -EAFNOSUPPORT;
+
+ table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL);
+ if (table == NULL) {
+ module_put(afi->owner);
+ return -ENOMEM;
+ }
+
+ nla_strlcpy(table->name, name, nla_len(name));
+ INIT_LIST_HEAD(&table->chains);
+ INIT_LIST_HEAD(&table->sets);
+ table->flags = flags;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
+ if (err < 0) {
+ kfree(table);
+ module_put(afi->owner);
+ return err;
+ }
+ list_add_tail_rcu(&table->list, &afi->tables);
+ return 0;
+}
+
+static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family, err;
+ struct nft_ctx ctx;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ if (table->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE);
+ if (err < 0)
+ return err;
+
+ list_del_rcu(&table->list);
+ return 0;
+}
+
+static void nf_tables_table_destroy(struct nft_ctx *ctx)
+{
+ BUG_ON(ctx->table->use > 0);
+
+ kfree(ctx->table);
+ module_put(ctx->afi->owner);
+}
+
+int nft_register_chain_type(const struct nf_chain_type *ctype)
+{
+ int err = 0;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (chain_type[ctype->family][ctype->type] != NULL) {
+ err = -EBUSY;
+ goto out;
+ }
+ chain_type[ctype->family][ctype->type] = ctype;
+out:
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_register_chain_type);
+
+void nft_unregister_chain_type(const struct nf_chain_type *ctype)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ chain_type[ctype->family][ctype->type] = NULL;
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
+
+/*
+ * Chains
+ */
+
+static struct nft_chain *
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+{
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (chain->handle == handle)
+ return chain;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
+ const struct nlattr *nla)
+{
+ struct nft_chain *chain;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nla_strcmp(nla, chain->name))
+ return chain;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
+ [NFTA_CHAIN_TABLE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
+ [NFTA_CHAIN_NAME] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED },
+ [NFTA_CHAIN_POLICY] = { .type = NLA_U32 },
+ [NFTA_CHAIN_TYPE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
+ [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 },
+ [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 },
+};
+
+static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
+{
+ struct nft_stats *cpu_stats, total;
+ struct nlattr *nest;
+ unsigned int seq;
+ u64 pkts, bytes;
+ int cpu;
+
+ memset(&total, 0, sizeof(total));
+ for_each_possible_cpu(cpu) {
+ cpu_stats = per_cpu_ptr(stats, cpu);
+ do {
+ seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ pkts = cpu_stats->pkts;
+ bytes = cpu_stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ total.pkts += pkts;
+ total.bytes += bytes;
+ }
+ nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) ||
+ nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, u32 flags, int family,
+ const struct nft_table *table,
+ const struct nft_chain *chain)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
+ goto nla_put_failure;
+
+ if (chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain = nft_base_chain(chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
+ htonl(basechain->policy)))
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
+ goto nla_put_failure;
+
+ if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0,
+ ctx->afi->family, ctx->table,
+ ctx->chain);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_dump_chains(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(chain, &table->chains, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWCHAIN,
+ NLM_F_MULTI,
+ afi->family, table, chain) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_chains,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
+ family, table, chain);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+ [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
+ [NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
+};
+
+static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
+{
+ struct nlattr *tb[NFTA_COUNTER_MAX+1];
+ struct nft_stats __percpu *newstats;
+ struct nft_stats *stats;
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
+ return ERR_PTR(-EINVAL);
+
+ newstats = netdev_alloc_pcpu_stats(struct nft_stats);
+ if (newstats == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ /* Restore old counters on this cpu, no problem. Per-cpu statistics
+ * are not exposed to userspace.
+ */
+ stats = this_cpu_ptr(newstats);
+ stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+
+ return newstats;
+}
+
+static void nft_chain_stats_replace(struct nft_base_chain *chain,
+ struct nft_stats __percpu *newstats)
+{
+ if (chain->stats) {
+ struct nft_stats __percpu *oldstats =
+ nft_dereference(chain->stats);
+
+ rcu_assign_pointer(chain->stats, newstats);
+ synchronize_rcu();
+ free_percpu(oldstats);
+ } else
+ rcu_assign_pointer(chain->stats, newstats);
+}
+
+static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWCHAIN)
+ ctx->chain->flags |= NFT_CHAIN_INACTIVE;
+
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+}
+
+static void nf_tables_chain_destroy(struct nft_chain *chain)
+{
+ BUG_ON(chain->use > 0);
+
+ if (chain->flags & NFT_BASE_CHAIN) {
+ module_put(nft_base_chain(chain)->type->owner);
+ free_percpu(nft_base_chain(chain)->stats);
+ kfree(nft_base_chain(chain));
+ } else {
+ kfree(chain);
+ }
+}
+
+static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nlattr * uninitialized_var(name);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_base_chain *basechain = NULL;
+ struct nlattr *ha[NFTA_HOOK_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ u8 policy = NF_ACCEPT;
+ u64 handle = 0;
+ unsigned int i;
+ struct nft_stats __percpu *stats;
+ int err;
+ bool create;
+ struct nft_ctx ctx;
+
+ create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ chain = NULL;
+ name = nla[NFTA_CHAIN_NAME];
+
+ if (nla[NFTA_CHAIN_HANDLE]) {
+ handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+ chain = nf_tables_chain_lookup_byhandle(table, handle);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ } else {
+ chain = nf_tables_chain_lookup(table, name);
+ if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) != -ENOENT)
+ return PTR_ERR(chain);
+ chain = NULL;
+ }
+ }
+
+ if (nla[NFTA_CHAIN_POLICY]) {
+ if ((chain != NULL &&
+ !(chain->flags & NFT_BASE_CHAIN)) ||
+ nla[NFTA_CHAIN_HOOK] == NULL)
+ return -EOPNOTSUPP;
+
+ policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
+ switch (policy) {
+ case NF_DROP:
+ case NF_ACCEPT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (chain != NULL) {
+ struct nft_stats *stats = NULL;
+ struct nft_trans *trans;
+
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (nla[NFTA_CHAIN_HANDLE] && name &&
+ !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
+ return -EEXIST;
+
+ if (nla[NFTA_CHAIN_COUNTERS]) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ return -EOPNOTSUPP;
+
+ stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
+ if (IS_ERR(stats))
+ return PTR_ERR(stats);
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
+ sizeof(struct nft_trans_chain));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ nft_trans_chain_stats(trans) = stats;
+ nft_trans_chain_update(trans) = true;
+
+ if (nla[NFTA_CHAIN_POLICY])
+ nft_trans_chain_policy(trans) = policy;
+ else
+ nft_trans_chain_policy(trans) = -1;
+
+ if (nla[NFTA_CHAIN_HANDLE] && name) {
+ nla_strlcpy(nft_trans_chain_name(trans), name,
+ NFT_CHAIN_MAXNAMELEN);
+ }
+ list_add_tail(&trans->list, &net->nft.commit_list);
+ return 0;
+ }
+
+ if (table->use == UINT_MAX)
+ return -EOVERFLOW;
+
+ if (nla[NFTA_CHAIN_HOOK]) {
+ const struct nf_chain_type *type;
+ struct nf_hook_ops *ops;
+ nf_hookfn *hookfn;
+ u32 hooknum, priority;
+
+ type = chain_type[family][NFT_CHAIN_T_DEFAULT];
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = nf_tables_chain_type_lookup(afi,
+ nla[NFTA_CHAIN_TYPE],
+ create);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+ }
+
+ err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+ nft_hook_policy);
+ if (err < 0)
+ return err;
+ if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+ ha[NFTA_HOOK_PRIORITY] == NULL)
+ return -EINVAL;
+
+ hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ if (hooknum >= afi->nhooks)
+ return -EINVAL;
+ priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+
+ if (!(type->hook_mask & (1 << hooknum)))
+ return -EOPNOTSUPP;
+ if (!try_module_get(type->owner))
+ return -ENOENT;
+ hookfn = type->hooks[hooknum];
+
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ if (basechain == NULL)
+ return -ENOMEM;
+
+ if (nla[NFTA_CHAIN_COUNTERS]) {
+ stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
+ if (IS_ERR(stats)) {
+ module_put(type->owner);
+ kfree(basechain);
+ return PTR_ERR(stats);
+ }
+ basechain->stats = stats;
+ } else {
+ stats = netdev_alloc_pcpu_stats(struct nft_stats);
+ if (IS_ERR(stats)) {
+ module_put(type->owner);
+ kfree(basechain);
+ return PTR_ERR(stats);
+ }
+ rcu_assign_pointer(basechain->stats, stats);
+ }
+
+ basechain->type = type;
+ chain = &basechain->chain;
+
+ for (i = 0; i < afi->nops; i++) {
+ ops = &basechain->ops[i];
+ ops->pf = family;
+ ops->owner = afi->owner;
+ ops->hooknum = hooknum;
+ ops->priority = priority;
+ ops->priv = chain;
+ ops->hook = afi->hooks[ops->hooknum];
+ if (hookfn)
+ ops->hook = hookfn;
+ if (afi->hook_ops_init)
+ afi->hook_ops_init(ops, i);
+ }
+
+ chain->flags |= NFT_BASE_CHAIN;
+ basechain->policy = policy;
+ } else {
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (chain == NULL)
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&chain->rules);
+ chain->handle = nf_tables_alloc_handle(table);
+ chain->net = net;
+ chain->table = table;
+ nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ chain->flags & NFT_BASE_CHAIN) {
+ err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ if (err < 0)
+ goto err1;
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
+ if (err < 0)
+ goto err2;
+
+ table->use++;
+ list_add_tail_rcu(&chain->list, &table->chains);
+ return 0;
+err2:
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ chain->flags & NFT_BASE_CHAIN) {
+ nf_unregister_hooks(nft_base_chain(chain)->ops,
+ afi->nops);
+ }
+err1:
+ nf_tables_chain_destroy(chain);
+ return err;
+}
+
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_ctx ctx;
+ int err;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+ if (chain->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN);
+ if (err < 0)
+ return err;
+
+ table->use--;
+ list_del_rcu(&chain->list);
+ return 0;
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ * nft_register_expr - register nf_tables expr type
+ * @ops: expr type
+ *
+ * Registers the expr type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_type *type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (type->family == NFPROTO_UNSPEC)
+ list_add_tail_rcu(&type->list, &nf_tables_expressions);
+ else
+ list_add_rcu(&type->list, &nf_tables_expressions);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ * nft_unregister_expr - unregister nf_tables expr type
+ * @ops: expr type
+ *
+ * Unregisters the expr typefor use with nf_tables.
+ */
+void nft_unregister_expr(struct nft_expr_type *type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&type->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_expr);
+
+static const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ struct nlattr *nla)
+{
+ const struct nft_expr_type *type;
+
+ list_for_each_entry(type, &nf_tables_expressions, list) {
+ if (!nla_strcmp(nla, type->name) &&
+ (!type->family || type->family == family))
+ return type;
+ }
+ return NULL;
+}
+
+static const struct nft_expr_type *nft_expr_type_get(u8 family,
+ struct nlattr *nla)
+{
+ const struct nft_expr_type *type;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ type = __nft_expr_type_get(family, nla);
+ if (type != NULL && try_module_get(type->owner))
+ return type;
+
+#ifdef CONFIG_MODULES
+ if (type == NULL) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_expr_type_get(family, nla))
+ return ERR_PTR(-EAGAIN);
+
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-expr-%.*s",
+ nla_len(nla), (char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_expr_type_get(family, nla))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
+ [NFTA_EXPR_NAME] = { .type = NLA_STRING },
+ [NFTA_EXPR_DATA] = { .type = NLA_NESTED },
+};
+
+static int nf_tables_fill_expr_info(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
+ goto nla_put_failure;
+
+ if (expr->ops->dump) {
+ struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+ if (data == NULL)
+ goto nla_put_failure;
+ if (expr->ops->dump(skb, expr) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, data);
+ }
+
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+};
+
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+static int nf_tables_expr_parse(const struct nft_ctx *ctx,
+ const struct nlattr *nla,
+ struct nft_expr_info *info)
+{
+ const struct nft_expr_type *type;
+ const struct nft_expr_ops *ops;
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+ if (err < 0)
+ return err;
+
+ type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ if (tb[NFTA_EXPR_DATA]) {
+ err = nla_parse_nested(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA], type->policy);
+ if (err < 0)
+ goto err1;
+ } else
+ memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
+
+ if (type->select_ops != NULL) {
+ ops = type->select_ops(ctx,
+ (const struct nlattr * const *)info->tb);
+ if (IS_ERR(ops)) {
+ err = PTR_ERR(ops);
+ goto err1;
+ }
+ } else
+ ops = type->ops;
+
+ info->ops = ops;
+ return 0;
+
+err1:
+ module_put(type->owner);
+ return err;
+}
+
+static int nf_tables_newexpr(const struct nft_ctx *ctx,
+ const struct nft_expr_info *info,
+ struct nft_expr *expr)
+{
+ const struct nft_expr_ops *ops = info->ops;
+ int err;
+
+ expr->ops = ops;
+ if (ops->init) {
+ err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+ if (err < 0)
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ expr->ops = NULL;
+ return err;
+}
+
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
+{
+ if (expr->ops->destroy)
+ expr->ops->destroy(ctx, expr);
+ module_put(expr->ops->type->owner);
+}
+
+/*
+ * Rules
+ */
+
+static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
+ u64 handle)
+{
+ struct nft_rule *rule;
+
+ // FIXME: this sucks
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (handle == rule->handle)
+ return rule;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
+ const struct nlattr *nla)
+{
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+}
+
+static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
+ [NFTA_RULE_TABLE] = { .type = NLA_STRING },
+ [NFTA_RULE_CHAIN] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_RULE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_RULE_COMPAT] = { .type = NLA_NESTED },
+ [NFTA_RULE_POSITION] = { .type = NLA_U64 },
+ [NFTA_RULE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
+};
+
+static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, u32 flags, int family,
+ const struct nft_table *table,
+ const struct nft_chain *chain,
+ const struct nft_rule *rule)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ const struct nft_expr *expr, *next;
+ struct nlattr *list;
+ const struct nft_rule *prule;
+ int type = event | NFNL_SUBSYS_NFTABLES << 8;
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg),
+ flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+ goto nla_put_failure;
+
+ if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
+ prule = list_entry(rule->list.prev, struct nft_rule, list);
+ if (nla_put_be64(skb, NFTA_RULE_POSITION,
+ cpu_to_be64(prule->handle)))
+ goto nla_put_failure;
+ }
+
+ list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+ if (list == NULL)
+ goto nla_put_failure;
+ nft_rule_for_each_expr(expr, next, rule) {
+ struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM);
+ if (elem == NULL)
+ goto nla_put_failure;
+ if (nf_tables_fill_expr_info(skb, expr) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, elem);
+ }
+ nla_nest_end(skb, list);
+
+ if (rule->ulen &&
+ nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule)))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_rule_notify(const struct nft_ctx *ctx,
+ const struct nft_rule *rule,
+ int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0,
+ ctx->afi->family, ctx->table,
+ ctx->chain, rule);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static inline bool
+nft_rule_is_active(struct net *net, const struct nft_rule *rule)
+{
+ return (rule->genmask & (1 << net->nft.gencursor)) == 0;
+}
+
+static inline int gencursor_next(struct net *net)
+{
+ return net->nft.gencursor+1 == 1 ? 1 : 0;
+}
+
+static inline int
+nft_rule_is_active_next(struct net *net, const struct nft_rule *rule)
+{
+ return (rule->genmask & (1 << gencursor_next(net))) == 0;
+}
+
+static inline void
+nft_rule_activate_next(struct net *net, struct nft_rule *rule)
+{
+ /* Now inactive, will be active in the future */
+ rule->genmask = (1 << net->nft.gencursor);
+}
+
+static inline void
+nft_rule_disactivate_next(struct net *net, struct nft_rule *rule)
+{
+ rule->genmask = (1 << gencursor_next(net));
+}
+
+static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
+{
+ rule->genmask = 0;
+}
+
+static int nf_tables_dump_rules(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(chain, &table->chains, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list) {
+ if (!nft_rule_is_active(net, rule))
+ goto cont;
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWRULE,
+ NLM_F_MULTI | NLM_F_APPEND,
+ afi->family, table, chain, rule) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_rules,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+
+ rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+ family, table, chain, rule);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
+ struct nft_rule *rule)
+{
+ struct nft_expr *expr;
+
+ /*
+ * Careful: some expressions might not be initialized in case this
+ * is called on error from nf_tables_newrule().
+ */
+ expr = nft_expr_first(rule);
+ while (expr->ops && expr != nft_expr_last(rule)) {
+ nf_tables_expr_destroy(ctx, expr);
+ expr = nft_expr_next(expr);
+ }
+ kfree(rule);
+}
+
+static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_rule *rule)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule));
+ if (trans == NULL)
+ return NULL;
+
+ nft_trans_rule(trans) = rule;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return trans;
+}
+
+#define NFT_RULE_MAXEXPRS 128
+
+static struct nft_expr_info *info;
+
+static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_rule *rule, *old_rule = NULL;
+ struct nft_trans *trans = NULL;
+ struct nft_expr *expr;
+ struct nft_ctx ctx;
+ struct nlattr *tmp;
+ unsigned int size, i, n, ulen = 0;
+ int err, rem;
+ bool create;
+ u64 handle, pos_handle;
+
+ create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+
+ if (nla[NFTA_RULE_HANDLE]) {
+ handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
+ rule = __nf_tables_rule_lookup(chain, handle);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ old_rule = rule;
+ else
+ return -EOPNOTSUPP;
+ } else {
+ if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EINVAL;
+ handle = nf_tables_alloc_handle(table);
+
+ if (chain->use == UINT_MAX)
+ return -EOVERFLOW;
+ }
+
+ if (nla[NFTA_RULE_POSITION]) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -EOPNOTSUPP;
+
+ pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
+ old_rule = __nf_tables_rule_lookup(chain, pos_handle);
+ if (IS_ERR(old_rule))
+ return PTR_ERR(old_rule);
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+ n = 0;
+ size = 0;
+ if (nla[NFTA_RULE_EXPRESSIONS]) {
+ nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
+ err = -EINVAL;
+ if (nla_type(tmp) != NFTA_LIST_ELEM)
+ goto err1;
+ if (n == NFT_RULE_MAXEXPRS)
+ goto err1;
+ err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
+ if (err < 0)
+ goto err1;
+ size += info[n].ops->size;
+ n++;
+ }
+ }
+
+ if (nla[NFTA_RULE_USERDATA])
+ ulen = nla_len(nla[NFTA_RULE_USERDATA]);
+
+ err = -ENOMEM;
+ rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL);
+ if (rule == NULL)
+ goto err1;
+
+ nft_rule_activate_next(net, rule);
+
+ rule->handle = handle;
+ rule->dlen = size;
+ rule->ulen = ulen;
+
+ if (ulen)
+ nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen);
+
+ expr = nft_expr_first(rule);
+ for (i = 0; i < n; i++) {
+ err = nf_tables_newexpr(&ctx, &info[i], expr);
+ if (err < 0)
+ goto err2;
+ info[i].ops = NULL;
+ expr = nft_expr_next(expr);
+ }
+
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (nft_rule_is_active_next(net, old_rule)) {
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
+ old_rule);
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+ nft_rule_disactivate_next(net, old_rule);
+ chain->use--;
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ } else {
+ err = -ENOENT;
+ goto err2;
+ }
+ } else if (nlh->nlmsg_flags & NLM_F_APPEND)
+ if (old_rule)
+ list_add_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_tail_rcu(&rule->list, &chain->rules);
+ else {
+ if (old_rule)
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_rcu(&rule->list, &chain->rules);
+ }
+
+ if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ err = -ENOMEM;
+ goto err3;
+ }
+ chain->use++;
+ return 0;
+
+err3:
+ list_del_rcu(&rule->list);
+ if (trans) {
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ nft_rule_clear(net, nft_trans_rule(trans));
+ nft_trans_destroy(trans);
+ chain->use++;
+ }
+err2:
+ nf_tables_rule_destroy(&ctx, rule);
+err1:
+ for (i = 0; i < n; i++) {
+ if (info[i].ops != NULL)
+ module_put(info[i].ops->type->owner);
+ }
+ return err;
+}
+
+static int
+nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule)
+{
+ /* You cannot delete the same rule twice */
+ if (nft_rule_is_active_next(ctx->net, rule)) {
+ if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL)
+ return -ENOMEM;
+ nft_rule_disactivate_next(ctx->net, rule);
+ ctx->chain->use--;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static int nf_table_delrule_by_chain(struct nft_ctx *ctx)
+{
+ struct nft_rule *rule;
+ int err;
+
+ list_for_each_entry(rule, &ctx->chain->rules, list) {
+ err = nf_tables_delrule_one(ctx, rule);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
+static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_chain *chain = NULL;
+ struct nft_rule *rule;
+ int family = nfmsg->nfgen_family, err = 0;
+ struct nft_ctx ctx;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ if (nla[NFTA_RULE_CHAIN]) {
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+ if (chain) {
+ if (nla[NFTA_RULE_HANDLE]) {
+ rule = nf_tables_rule_lookup(chain,
+ nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ err = nf_tables_delrule_one(&ctx, rule);
+ } else {
+ err = nf_table_delrule_by_chain(&ctx);
+ }
+ } else {
+ list_for_each_entry(chain, &table->chains, list) {
+ ctx.chain = chain;
+ err = nf_table_delrule_by_chain(&ctx);
+ if (err < 0)
+ break;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Sets
+ */
+
+static LIST_HEAD(nf_tables_set_ops);
+
+int nft_register_set(struct nft_set_ops *ops)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_tail_rcu(&ops->list, &nf_tables_set_ops);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_set);
+
+void nft_unregister_set(struct nft_set_ops *ops)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&ops->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_set);
+
+/*
+ * Select a set implementation based on the data characteristics and the
+ * given policy. The total memory use might not be known if no size is
+ * given, in that case the amount of memory per element is used.
+ */
+static const struct nft_set_ops *
+nft_select_set_ops(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc,
+ enum nft_set_policies policy)
+{
+ const struct nft_set_ops *ops, *bops;
+ struct nft_set_estimate est, best;
+ u32 features;
+
+#ifdef CONFIG_MODULES
+ if (list_empty(&nf_tables_set_ops)) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-set");
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (!list_empty(&nf_tables_set_ops))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ features = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ features &= NFT_SET_INTERVAL | NFT_SET_MAP;
+ }
+
+ bops = NULL;
+ best.size = ~0;
+ best.class = ~0;
+
+ list_for_each_entry(ops, &nf_tables_set_ops, list) {
+ if ((ops->features & features) != features)
+ continue;
+ if (!ops->estimate(desc, features, &est))
+ continue;
+
+ switch (policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ if (est.class < best.class)
+ break;
+ if (est.class == best.class && est.size < best.size)
+ break;
+ continue;
+ case NFT_SET_POL_MEMORY:
+ if (est.size < best.size)
+ break;
+ if (est.size == best.size && est.class < best.class)
+ break;
+ continue;
+ default:
+ break;
+ }
+
+ if (!try_module_get(ops->owner))
+ continue;
+ if (bops != NULL)
+ module_put(bops->owner);
+
+ bops = ops;
+ best = est;
+ }
+
+ if (bops != NULL)
+ return bops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+ [NFTA_SET_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_NAME] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFTA_SET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_POLICY] = { .type = NLA_U32 },
+ [NFTA_SET_DESC] = { .type = NLA_NESTED },
+ [NFTA_SET_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
+ [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi = NULL;
+ struct nft_table *table = NULL;
+
+ if (nfmsg->nfgen_family != NFPROTO_UNSPEC) {
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+ }
+
+ if (nla[NFTA_SET_TABLE] != NULL) {
+ if (afi == NULL)
+ return -EAFNOSUPPORT;
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ }
+
+ nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ return 0;
+}
+
+struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+ const struct nlattr *nla)
+{
+ struct nft_set *set;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(set, &table->sets, list) {
+ if (!nla_strcmp(nla, set->name))
+ return set;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
+ const struct nlattr *nla)
+{
+ struct nft_trans *trans;
+ u32 id = ntohl(nla_get_be32(nla));
+
+ list_for_each_entry(trans, &net->nft.commit_list, list) {
+ if (trans->msg_type == NFT_MSG_NEWSET &&
+ id == nft_trans_set_id(trans))
+ return nft_trans_set(trans);
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+ const char *name)
+{
+ const struct nft_set *i;
+ const char *p;
+ unsigned long *inuse;
+ unsigned int n = 0, min = 0;
+
+ p = strnchr(name, IFNAMSIZ, '%');
+ if (p != NULL) {
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
+ inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (inuse == NULL)
+ return -ENOMEM;
+cont:
+ list_for_each_entry(i, &ctx->table->sets, list) {
+ int tmp;
+
+ if (!sscanf(i->name, name, &tmp))
+ continue;
+ if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
+ continue;
+
+ set_bit(tmp - min, inuse);
+ }
+
+ n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
+ if (n >= BITS_PER_BYTE * PAGE_SIZE) {
+ min += BITS_PER_BYTE * PAGE_SIZE;
+ memset(inuse, 0, PAGE_SIZE);
+ goto cont;
+ }
+ free_page((unsigned long)inuse);
+ }
+
+ snprintf(set->name, sizeof(set->name), name, min + n);
+ list_for_each_entry(i, &ctx->table->sets, list) {
+ if (!strcmp(set->name, i->name))
+ return -ENFILE;
+ }
+ return 0;
+}
+
+static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
+ const struct nft_set *set, u16 event, u16 flags)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *desc;
+ u32 portid = ctx->portid;
+ u32 seq = ctx->seq;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx->afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_NAME, set->name))
+ goto nla_put_failure;
+ if (set->flags != 0)
+ if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen)))
+ goto nla_put_failure;
+ if (set->flags & NFT_SET_MAP) {
+ if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
+ goto nla_put_failure;
+ }
+
+ desc = nla_nest_start(skb, NFTA_SET_DESC);
+ if (desc == NULL)
+ goto nla_put_failure;
+ if (set->size &&
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ goto nla_put_failure;
+ nla_nest_end(skb, desc);
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_set_notify(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ int event, gfp_t gfp_flags)
+{
+ struct sk_buff *skb;
+ u32 portid = ctx->portid;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_set(skb, ctx, set, event, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES,
+ ctx->report, gfp_flags);
+err:
+ if (err < 0)
+ nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
+ return err;
+}
+
+static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ unsigned int idx = 0, s_idx = cb->args[0];
+
+ if (cb->args[1])
+ return skb->len;
+
+ rcu_read_lock();
+ cb->seq = ctx->net->nft.base_seq;
+
+ list_for_each_entry_rcu(set, &ctx->table->sets, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET,
+ NLM_F_MULTI) < 0) {
+ cb->args[0] = idx;
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ cb->args[1] = 1;
+done:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ unsigned int idx, s_idx = cb->args[0];
+ struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
+
+ if (cb->args[1])
+ return skb->len;
+
+ rcu_read_lock();
+ cb->seq = ctx->net->nft.base_seq;
+
+ list_for_each_entry_rcu(table, &ctx->afi->tables, list) {
+ if (cur_table) {
+ if (cur_table != table)
+ continue;
+
+ cur_table = NULL;
+ }
+ ctx->table = table;
+ idx = 0;
+ list_for_each_entry_rcu(set, &ctx->table->sets, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET,
+ NLM_F_MULTI) < 0) {
+ cb->args[0] = idx;
+ cb->args[2] = (unsigned long) table;
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ cb->args[1] = 1;
+done:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ unsigned int idx, s_idx = cb->args[0];
+ struct nft_af_info *afi;
+ struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
+ struct net *net = sock_net(skb->sk);
+ int cur_family = cb->args[3];
+
+ if (cb->args[1])
+ return skb->len;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (cur_family) {
+ if (afi->family != cur_family)
+ continue;
+
+ cur_family = 0;
+ }
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (cur_table) {
+ if (cur_table != table)
+ continue;
+
+ cur_table = NULL;
+ }
+
+ ctx->table = table;
+ ctx->afi = afi;
+ idx = 0;
+ list_for_each_entry_rcu(set, &ctx->table->sets, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (nf_tables_fill_set(skb, ctx, set,
+ NFT_MSG_NEWSET,
+ NLM_F_MULTI) < 0) {
+ cb->args[0] = idx;
+ cb->args[2] = (unsigned long) table;
+ cb->args[3] = afi->family;
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ if (s_idx)
+ s_idx = 0;
+ }
+ }
+ cb->args[1] = 1;
+done:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nlattr *nla[NFTA_SET_MAX + 1];
+ struct nft_ctx ctx;
+ int err, ret;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX,
+ nft_set_policy);
+ if (err < 0)
+ return err;
+
+ err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla);
+ if (err < 0)
+ return err;
+
+ if (ctx.table == NULL) {
+ if (ctx.afi == NULL)
+ ret = nf_tables_dump_sets_all(&ctx, skb, cb);
+ else
+ ret = nf_tables_dump_sets_family(&ctx, skb, cb);
+ } else
+ ret = nf_tables_dump_sets_table(&ctx, skb, cb);
+
+ return ret;
+}
+
+#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */
+
+static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+ struct sk_buff *skb2;
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ int err;
+
+ /* Verify existance before starting dump */
+ err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ if (err < 0)
+ return err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_sets,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ /* Only accept unspec with dump */
+ if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ return -EAFNOSUPPORT;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
+ struct nft_set_desc *desc,
+ const struct nlattr *nla)
+{
+ struct nlattr *da[NFTA_SET_DESC_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
+ if (err < 0)
+ return err;
+
+ if (da[NFTA_SET_DESC_SIZE] != NULL)
+ desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+
+ return 0;
+}
+
+static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
+ nft_trans_set_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
+ set->flags |= NFT_SET_INACTIVE;
+ }
+ nft_trans_set(trans) = set;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+}
+
+static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_set_ops *ops;
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ char name[IFNAMSIZ];
+ unsigned int size;
+ bool create;
+ u32 ktype, dtype, flags, policy;
+ struct nft_set_desc desc;
+ int err;
+
+ if (nla[NFTA_SET_TABLE] == NULL ||
+ nla[NFTA_SET_NAME] == NULL ||
+ nla[NFTA_SET_KEY_LEN] == NULL ||
+ nla[NFTA_SET_ID] == NULL)
+ return -EINVAL;
+
+ memset(&desc, 0, sizeof(desc));
+
+ ktype = NFT_DATA_VALUE;
+ if (nla[NFTA_SET_KEY_TYPE] != NULL) {
+ ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+ if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+ return -EINVAL;
+ }
+
+ desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+ if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data))
+ return -EINVAL;
+
+ flags = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
+ NFT_SET_INTERVAL | NFT_SET_MAP))
+ return -EINVAL;
+ }
+
+ dtype = 0;
+ if (nla[NFTA_SET_DATA_TYPE] != NULL) {
+ if (!(flags & NFT_SET_MAP))
+ return -EINVAL;
+
+ dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+ if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+ dtype != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ if (dtype != NFT_DATA_VERDICT) {
+ if (nla[NFTA_SET_DATA_LEN] == NULL)
+ return -EINVAL;
+ desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
+ if (desc.dlen == 0 ||
+ desc.dlen > FIELD_SIZEOF(struct nft_data, data))
+ return -EINVAL;
+ } else
+ desc.dlen = sizeof(struct nft_data);
+ } else if (flags & NFT_SET_MAP)
+ return -EINVAL;
+
+ policy = NFT_SET_POL_PERFORMANCE;
+ if (nla[NFTA_SET_POLICY] != NULL)
+ policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+
+ if (nla[NFTA_SET_DESC] != NULL) {
+ err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
+ if (err < 0)
+ return err;
+ }
+
+ create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+
+ set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set)) {
+ if (PTR_ERR(set) != -ENOENT)
+ return PTR_ERR(set);
+ set = NULL;
+ }
+
+ if (set != NULL) {
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+ return 0;
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -ENOENT;
+
+ ops = nft_select_set_ops(nla, &desc, policy);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ size = 0;
+ if (ops->privsize != NULL)
+ size = ops->privsize(nla);
+
+ err = -ENOMEM;
+ set = kzalloc(sizeof(*set) + size, GFP_KERNEL);
+ if (set == NULL)
+ goto err1;
+
+ nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name));
+ err = nf_tables_set_alloc_name(&ctx, set, name);
+ if (err < 0)
+ goto err2;
+
+ INIT_LIST_HEAD(&set->bindings);
+ set->ops = ops;
+ set->ktype = ktype;
+ set->klen = desc.klen;
+ set->dtype = dtype;
+ set->dlen = desc.dlen;
+ set->flags = flags;
+ set->size = desc.size;
+
+ err = ops->init(set, &desc, nla);
+ if (err < 0)
+ goto err2;
+
+ err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
+ if (err < 0)
+ goto err2;
+
+ list_add_tail_rcu(&set->list, &table->sets);
+ table->use++;
+ return 0;
+
+err2:
+ kfree(set);
+err1:
+ module_put(ops->owner);
+ return err;
+}
+
+static void nft_set_destroy(struct nft_set *set)
+{
+ set->ops->destroy(set);
+ module_put(set->ops->owner);
+ kfree(set);
+}
+
+static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ list_del_rcu(&set->list);
+ nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+ nft_set_destroy(set);
+}
+
+static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ int err;
+
+ if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ return -EAFNOSUPPORT;
+ if (nla[NFTA_SET_TABLE] == NULL)
+ return -EINVAL;
+
+ err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+ if (!list_empty(&set->bindings))
+ return -EBUSY;
+
+ err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set);
+ if (err < 0)
+ return err;
+
+ list_del_rcu(&set->list);
+ ctx.table->use--;
+ return 0;
+}
+
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ enum nft_registers dreg;
+
+ dreg = nft_type_to_reg(set->dtype);
+ return nft_validate_data_load(ctx, dreg, &elem->data,
+ set->dtype == NFT_DATA_VERDICT ?
+ NFT_DATA_VERDICT : NFT_DATA_VALUE);
+}
+
+int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ struct nft_set_binding *i;
+ struct nft_set_iter iter;
+
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+ return -EBUSY;
+
+ if (set->flags & NFT_SET_MAP) {
+ /* If the set is already bound to the same chain all
+ * jumps are already validated for that chain.
+ */
+ list_for_each_entry(i, &set->bindings, list) {
+ if (i->chain == binding->chain)
+ goto bind;
+ }
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_bind_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0) {
+ /* Destroy anonymous sets if binding fails */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ nf_tables_set_destroy(ctx, set);
+
+ return iter.err;
+ }
+ }
+bind:
+ binding->chain = ctx->chain;
+ list_add_tail_rcu(&binding->list, &set->bindings);
+ return 0;
+}
+
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ list_del_rcu(&binding->list);
+
+ if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS &&
+ !(set->flags & NFT_SET_INACTIVE))
+ nf_tables_set_destroy(ctx, set);
+}
+
+/*
+ * Set elements
+ */
+
+static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
+ [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
+ [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[],
+ bool trans)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (!trans && (table->flags & NFT_TABLE_INACTIVE))
+ return -ENOENT;
+
+ nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ return 0;
+}
+
+static int nf_tables_fill_setelem(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE,
+ set->klen) < 0)
+ goto nla_put_failure;
+
+ if (set->flags & NFT_SET_MAP &&
+ !(elem->flags & NFT_SET_ELEM_INTERVAL_END) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data,
+ set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
+ set->dlen) < 0)
+ goto nla_put_failure;
+
+ if (elem->flags != 0)
+ if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+struct nft_set_dump_args {
+ const struct netlink_callback *cb;
+ struct nft_set_iter iter;
+ struct sk_buff *skb;
+};
+
+static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ struct nft_set_dump_args *args;
+
+ args = container_of(iter, struct nft_set_dump_args, iter);
+ return nf_tables_fill_setelem(args->skb, set, elem);
+}
+
+static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ struct nft_set_dump_args args;
+ struct nft_ctx ctx;
+ struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ u32 portid, seq;
+ int event, err;
+
+ err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla,
+ NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy);
+ if (err < 0)
+ return err;
+
+ err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
+ false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ event = NFT_MSG_NEWSETELEM;
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ NLM_F_MULTI);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx.afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ args.cb = cb;
+ args.skb = skb;
+ args.iter.skip = cb->args[0];
+ args.iter.count = 0;
+ args.iter.err = 0;
+ args.iter.fn = nf_tables_dump_setelem;
+ set->ops->walk(&ctx, set, &args.iter);
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+
+ if (args.iter.err && args.iter.err != -EMSGSIZE)
+ return args.iter.err;
+ if (args.iter.count == cb->args[0])
+ return 0;
+
+ cb->args[0] = args.iter.count;
+ return skb->len;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+ int err;
+
+ err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_set,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+ return -EOPNOTSUPP;
+}
+
+static int nf_tables_fill_setelem_info(struct sk_buff *skb,
+ const struct nft_ctx *ctx, u32 seq,
+ u32 portid, int event, u16 flags,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ int err;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx->afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_NAME, set->name))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ err = nf_tables_fill_setelem(skb, set, elem);
+ if (err < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ int event, u16 flags)
+{
+ struct net *net = ctx->net;
+ u32 portid = ctx->portid;
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
+ set, elem);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
+ GFP_KERNEL);
+err:
+ if (err < 0)
+ nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+ return err;
+}
+
+static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+ int msg_type,
+ struct nft_set *set)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+ if (trans == NULL)
+ return NULL;
+
+ nft_trans_elem_set(trans) = set;
+ return trans;
+}
+
+static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_data_desc d1, d2;
+ struct nft_set_elem elem;
+ struct nft_set_binding *binding;
+ enum nft_registers dreg;
+ struct nft_trans *trans;
+ int err;
+
+ if (set->size && set->nelems == set->size)
+ return -ENFILE;
+
+ err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy);
+ if (err < 0)
+ return err;
+
+ if (nla[NFTA_SET_ELEM_KEY] == NULL)
+ return -EINVAL;
+
+ elem.flags = 0;
+ if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
+ elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
+ if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ }
+
+ if (set->flags & NFT_SET_MAP) {
+ if (nla[NFTA_SET_ELEM_DATA] == NULL &&
+ !(elem.flags & NFT_SET_ELEM_INTERVAL_END))
+ return -EINVAL;
+ if (nla[NFTA_SET_ELEM_DATA] != NULL &&
+ elem.flags & NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ } else {
+ if (nla[NFTA_SET_ELEM_DATA] != NULL)
+ return -EINVAL;
+ }
+
+ err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err1;
+ err = -EINVAL;
+ if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
+ goto err2;
+
+ err = -EEXIST;
+ if (set->ops->get(set, &elem) == 0)
+ goto err2;
+
+ if (nla[NFTA_SET_ELEM_DATA] != NULL) {
+ err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]);
+ if (err < 0)
+ goto err2;
+
+ err = -EINVAL;
+ if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
+ goto err3;
+
+ dreg = nft_type_to_reg(set->dtype);
+ list_for_each_entry(binding, &set->bindings, list) {
+ struct nft_ctx bind_ctx = {
+ .afi = ctx->afi,
+ .table = ctx->table,
+ .chain = (struct nft_chain *)binding->chain,
+ };
+
+ err = nft_validate_data_load(&bind_ctx, dreg,
+ &elem.data, d2.type);
+ if (err < 0)
+ goto err3;
+ }
+ }
+
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
+ if (trans == NULL)
+ goto err3;
+
+ err = set->ops->insert(set, &elem);
+ if (err < 0)
+ goto err4;
+
+ nft_trans_elem(trans) = elem;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+
+err4:
+ kfree(trans);
+err3:
+ if (nla[NFTA_SET_ELEM_DATA] != NULL)
+ nft_data_uninit(&elem.data, d2.type);
+err2:
+ nft_data_uninit(&elem.key, d1.type);
+err1:
+ return err;
+}
+
+static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nlattr *attr;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ int rem, err = 0;
+
+ err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set)) {
+ if (nla[NFTA_SET_ELEM_LIST_SET_ID]) {
+ set = nf_tables_set_lookup_byid(net,
+ nla[NFTA_SET_ELEM_LIST_SET_ID]);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ return -EBUSY;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ err = nft_add_set_elem(&ctx, set, attr);
+ if (err < 0)
+ break;
+
+ set->nelems++;
+ }
+ return err;
+}
+
+static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_data_desc desc;
+ struct nft_set_elem elem;
+ struct nft_trans *trans;
+ int err;
+
+ err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy);
+ if (err < 0)
+ goto err1;
+
+ err = -EINVAL;
+ if (nla[NFTA_SET_ELEM_KEY] == NULL)
+ goto err1;
+
+ err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err1;
+
+ err = -EINVAL;
+ if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
+ goto err2;
+
+ err = set->ops->get(set, &elem);
+ if (err < 0)
+ goto err2;
+
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
+ if (trans == NULL)
+ goto err2;
+
+ nft_trans_elem(trans) = elem;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ nft_data_uninit(&elem.key, NFT_DATA_VALUE);
+ if (set->flags & NFT_SET_MAP)
+ nft_data_uninit(&elem.data, set->dtype);
+
+err2:
+ nft_data_uninit(&elem.key, desc.type);
+err1:
+ return err;
+}
+
+static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nlattr *attr;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ int rem, err = 0;
+
+ err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ return -EBUSY;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ err = nft_del_setelem(&ctx, set, attr);
+ if (err < 0)
+ break;
+
+ set->nelems--;
+ }
+ return err;
+}
+
+static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
+ [NFT_MSG_NEWTABLE] = {
+ .call_batch = nf_tables_newtable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_GETTABLE] = {
+ .call = nf_tables_gettable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_DELTABLE] = {
+ .call_batch = nf_tables_deltable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_NEWCHAIN] = {
+ .call_batch = nf_tables_newchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_GETCHAIN] = {
+ .call = nf_tables_getchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_DELCHAIN] = {
+ .call_batch = nf_tables_delchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_NEWRULE] = {
+ .call_batch = nf_tables_newrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_GETRULE] = {
+ .call = nf_tables_getrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_DELRULE] = {
+ .call_batch = nf_tables_delrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_NEWSET] = {
+ .call_batch = nf_tables_newset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_GETSET] = {
+ .call = nf_tables_getset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_DELSET] = {
+ .call_batch = nf_tables_delset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_NEWSETELEM] = {
+ .call_batch = nf_tables_newsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_GETSETELEM] = {
+ .call = nf_tables_getsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_DELSETELEM] = {
+ .call_batch = nf_tables_delsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+};
+
+static void nft_chain_commit_update(struct nft_trans *trans)
+{
+ struct nft_base_chain *basechain;
+
+ if (nft_trans_chain_name(trans)[0])
+ strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+
+ if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN))
+ return;
+
+ basechain = nft_base_chain(trans->ctx.chain);
+ nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+
+ switch (nft_trans_chain_policy(trans)) {
+ case NF_DROP:
+ case NF_ACCEPT:
+ basechain->policy = nft_trans_chain_policy(trans);
+ break;
+ }
+}
+
+/* Schedule objects for release via rcu to make sure no packets are accesing
+ * removed rules.
+ */
+static void nf_tables_commit_release_rcu(struct rcu_head *rt)
+{
+ struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+ switch (trans->msg_type) {
+ case NFT_MSG_DELTABLE:
+ nf_tables_table_destroy(&trans->ctx);
+ break;
+ case NFT_MSG_DELCHAIN:
+ nf_tables_chain_destroy(trans->ctx.chain);
+ break;
+ case NFT_MSG_DELRULE:
+ nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ break;
+ case NFT_MSG_DELSET:
+ nft_set_destroy(nft_trans_set(trans));
+ break;
+ }
+ kfree(trans);
+}
+
+static int nf_tables_commit(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nft_trans *trans, *next;
+ struct nft_set *set;
+
+ /* Bump generation counter, invalidate any dump in progress */
+ while (++net->nft.base_seq == 0);
+
+ /* A new generation has just started */
+ net->nft.gencursor = gencursor_next(net);
+
+ /* Make sure all packets have left the previous generation before
+ * purging old rules.
+ */
+ synchronize_rcu();
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ if (nft_trans_table_update(trans)) {
+ if (!nft_trans_table_enable(trans)) {
+ nf_tables_table_disable(trans->ctx.afi,
+ trans->ctx.table);
+ trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ }
+ } else {
+ trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+ }
+ nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELTABLE:
+ nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain_update(trans))
+ nft_chain_commit_update(trans);
+ else
+ trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELCHAIN:
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
+ if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+ trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+ nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+ trans->ctx.afi->nops);
+ }
+ break;
+ case NFT_MSG_NEWRULE:
+ nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nf_tables_rule_notify(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_MSG_NEWRULE);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELRULE:
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ nf_tables_rule_notify(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_MSG_DELRULE);
+ break;
+ case NFT_MSG_NEWSET:
+ nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+ /* This avoids hitting -EBUSY when deleting the table
+ * from the transaction.
+ */
+ if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS &&
+ !list_empty(&nft_trans_set(trans)->bindings))
+ trans->ctx.table->use--;
+
+ nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ NFT_MSG_NEWSET, GFP_KERNEL);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSET:
+ nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ NFT_MSG_DELSET, GFP_KERNEL);
+ break;
+ case NFT_MSG_NEWSETELEM:
+ nf_tables_setelem_notify(&trans->ctx,
+ nft_trans_elem_set(trans),
+ &nft_trans_elem(trans),
+ NFT_MSG_NEWSETELEM, 0);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSETELEM:
+ nf_tables_setelem_notify(&trans->ctx,
+ nft_trans_elem_set(trans),
+ &nft_trans_elem(trans),
+ NFT_MSG_DELSETELEM, 0);
+ set = nft_trans_elem_set(trans);
+ set->ops->get(set, &nft_trans_elem(trans));
+ set->ops->remove(set, &nft_trans_elem(trans));
+ nft_trans_destroy(trans);
+ break;
+ }
+ }
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_del(&trans->list);
+ trans->ctx.nla = NULL;
+ call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu);
+ }
+
+ return 0;
+}
+
+/* Schedule objects for release via rcu to make sure no packets are accesing
+ * aborted rules.
+ */
+static void nf_tables_abort_release_rcu(struct rcu_head *rt)
+{
+ struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ nf_tables_table_destroy(&trans->ctx);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ nf_tables_chain_destroy(trans->ctx.chain);
+ break;
+ case NFT_MSG_NEWRULE:
+ nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ break;
+ case NFT_MSG_NEWSET:
+ nft_set_destroy(nft_trans_set(trans));
+ break;
+ }
+ kfree(trans);
+}
+
+static int nf_tables_abort(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nft_trans *trans, *next;
+ struct nft_set *set;
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ if (nft_trans_table_update(trans)) {
+ if (nft_trans_table_enable(trans)) {
+ nf_tables_table_disable(trans->ctx.afi,
+ trans->ctx.table);
+ trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ }
+ nft_trans_destroy(trans);
+ } else {
+ list_del_rcu(&trans->ctx.table->list);
+ }
+ break;
+ case NFT_MSG_DELTABLE:
+ list_add_tail_rcu(&trans->ctx.table->list,
+ &trans->ctx.afi->tables);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ if (nft_trans_chain_stats(trans))
+ free_percpu(nft_trans_chain_stats(trans));
+
+ nft_trans_destroy(trans);
+ } else {
+ trans->ctx.table->use--;
+ list_del_rcu(&trans->ctx.chain->list);
+ if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+ trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+ nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+ trans->ctx.afi->nops);
+ }
+ }
+ break;
+ case NFT_MSG_DELCHAIN:
+ trans->ctx.table->use++;
+ list_add_tail_rcu(&trans->ctx.chain->list,
+ &trans->ctx.table->chains);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWRULE:
+ trans->ctx.chain->use--;
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ break;
+ case NFT_MSG_DELRULE:
+ trans->ctx.chain->use++;
+ nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWSET:
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_set(trans)->list);
+ break;
+ case NFT_MSG_DELSET:
+ trans->ctx.table->use++;
+ list_add_tail_rcu(&nft_trans_set(trans)->list,
+ &trans->ctx.table->sets);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWSETELEM:
+ nft_trans_elem_set(trans)->nelems--;
+ set = nft_trans_elem_set(trans);
+ set->ops->get(set, &nft_trans_elem(trans));
+ set->ops->remove(set, &nft_trans_elem(trans));
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSETELEM:
+ nft_trans_elem_set(trans)->nelems++;
+ nft_trans_destroy(trans);
+ break;
+ }
+ }
+
+ list_for_each_entry_safe_reverse(trans, next,
+ &net->nft.commit_list, list) {
+ list_del(&trans->list);
+ trans->ctx.nla = NULL;
+ call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu);
+ }
+
+ return 0;
+}
+
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+ .name = "nf_tables",
+ .subsys_id = NFNL_SUBSYS_NFTABLES,
+ .cb_count = NFT_MSG_MAX,
+ .cb = nf_tables_cb,
+ .commit = nf_tables_commit,
+ .abort = nf_tables_abort,
+};
+
+/*
+ * Loop detection - walk through the ruleset beginning at the destination chain
+ * of a new jump until either the source chain is reached (loop) or all
+ * reachable chains have been traversed.
+ *
+ * The loop check is performed whenever a new jump verdict is added to an
+ * expression or verdict map or a verdict map is bound to a new chain.
+ */
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+ const struct nft_chain *chain);
+
+static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ if (elem->flags & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ switch (elem->data.verdict) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ return nf_tables_check_loops(ctx, elem->data.chain);
+ default:
+ return 0;
+ }
+}
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+ const struct nft_chain *chain)
+{
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ const struct nft_set *set;
+ struct nft_set_binding *binding;
+ struct nft_set_iter iter;
+
+ if (ctx->chain == chain)
+ return -ELOOP;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ nft_rule_for_each_expr(expr, last, rule) {
+ const struct nft_data *data = NULL;
+ int err;
+
+ if (!expr->ops->validate)
+ continue;
+
+ err = expr->ops->validate(ctx, expr, &data);
+ if (err < 0)
+ return err;
+
+ if (data == NULL)
+ continue;
+
+ switch (data->verdict) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nf_tables_check_loops(ctx, data->chain);
+ if (err < 0)
+ return err;
+ default:
+ break;
+ }
+ }
+ }
+
+ list_for_each_entry(set, &ctx->table->sets, list) {
+ if (!(set->flags & NFT_SET_MAP) ||
+ set->dtype != NFT_DATA_VERDICT)
+ continue;
+
+ list_for_each_entry(binding, &set->bindings, list) {
+ if (binding->chain != chain)
+ continue;
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_loop_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0)
+ return iter.err;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * nft_validate_input_register - validate an expressions' input register
+ *
+ * @reg: the register number
+ *
+ * Validate that the input register is one of the general purpose
+ * registers.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+ if (reg <= NFT_REG_VERDICT)
+ return -EINVAL;
+ if (reg > NFT_REG_MAX)
+ return -ERANGE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ * nft_validate_output_register - validate an expressions' output register
+ *
+ * @reg: the register number
+ *
+ * Validate that the output register is one of the general purpose
+ * registers or the verdict register.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+ if (reg < NFT_REG_VERDICT)
+ return -EINVAL;
+ if (reg > NFT_REG_MAX)
+ return -ERANGE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+/**
+ * nft_validate_data_load - validate an expressions' data load
+ *
+ * @ctx: context of the expression performing the load
+ * @reg: the destination register number
+ * @data: the data to load
+ * @type: the data type
+ *
+ * Validate that a data load uses the appropriate data type for
+ * the destination register. A value of NULL for the data means
+ * that its runtime gathered data, which is always of type
+ * NFT_DATA_VALUE.
+ */
+int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type)
+{
+ int err;
+
+ switch (reg) {
+ case NFT_REG_VERDICT:
+ if (data == NULL || type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) {
+ err = nf_tables_check_loops(ctx, data->chain);
+ if (err < 0)
+ return err;
+
+ if (ctx->chain->level + 1 > data->chain->level) {
+ if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
+ return -EMLINK;
+ data->chain->level = ctx->chain->level + 1;
+ }
+ }
+
+ return 0;
+ default:
+ if (data != NULL && type != NFT_DATA_VALUE)
+ return -EINVAL;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_validate_data_load);
+
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+ [NFTA_VERDICT_CODE] = { .type = NLA_U32 },
+ [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+};
+
+static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ struct nlattr *tb[NFTA_VERDICT_MAX + 1];
+ struct nft_chain *chain;
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_VERDICT_CODE])
+ return -EINVAL;
+ data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
+
+ switch (data->verdict) {
+ default:
+ switch (data->verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* fall through */
+ case NFT_CONTINUE:
+ case NFT_BREAK:
+ case NFT_RETURN:
+ desc->len = sizeof(data->verdict);
+ break;
+ case NFT_JUMP:
+ case NFT_GOTO:
+ if (!tb[NFTA_VERDICT_CHAIN])
+ return -EINVAL;
+ chain = nf_tables_chain_lookup(ctx->table,
+ tb[NFTA_VERDICT_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_BASE_CHAIN)
+ return -EOPNOTSUPP;
+
+ chain->use++;
+ data->chain = chain;
+ desc->len = sizeof(data);
+ break;
+ }
+
+ desc->type = NFT_DATA_VERDICT;
+ return 0;
+}
+
+static void nft_verdict_uninit(const struct nft_data *data)
+{
+ switch (data->verdict) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ data->chain->use--;
+ break;
+ }
+}
+
+static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict)))
+ goto nla_put_failure;
+
+ switch (data->verdict) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ unsigned int len;
+
+ len = nla_len(nla);
+ if (len == 0)
+ return -EINVAL;
+ if (len > sizeof(data->data))
+ return -EOVERFLOW;
+
+ nla_memcpy(data->data, nla, sizeof(data->data));
+ desc->type = NFT_DATA_VALUE;
+ desc->len = len;
+ return 0;
+}
+
+static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
+ unsigned int len)
+{
+ return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
+}
+
+static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
+ [NFTA_DATA_VALUE] = { .type = NLA_BINARY,
+ .len = FIELD_SIZEOF(struct nft_data, data) },
+ [NFTA_DATA_VERDICT] = { .type = NLA_NESTED },
+};
+
+/**
+ * nft_data_init - parse nf_tables data netlink attributes
+ *
+ * @ctx: context of the expression using the data
+ * @data: destination struct nft_data
+ * @desc: data description
+ * @nla: netlink attribute containing data
+ *
+ * Parse the netlink data attributes and initialize a struct nft_data.
+ * The type and length of data are returned in the data description.
+ *
+ * The caller can indicate that it only wants to accept data of type
+ * NFT_DATA_VALUE by passing NULL for the ctx argument.
+ */
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ struct nlattr *tb[NFTA_DATA_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DATA_VALUE])
+ return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
+ return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(nft_data_init);
+
+/**
+ * nft_data_uninit - release a nft_data item
+ *
+ * @data: struct nft_data to release
+ * @type: type of data
+ *
+ * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ * all others need to be released by calling this function.
+ */
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+{
+ switch (type) {
+ case NFT_DATA_VALUE:
+ return;
+ case NFT_DATA_VERDICT:
+ return nft_verdict_uninit(data);
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL_GPL(nft_data_uninit);
+
+int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start(skb, attr);
+ if (nest == NULL)
+ return -1;
+
+ switch (type) {
+ case NFT_DATA_VALUE:
+ err = nft_value_dump(skb, data, len);
+ break;
+ case NFT_DATA_VERDICT:
+ err = nft_verdict_dump(skb, data);
+ break;
+ default:
+ err = -EINVAL;
+ WARN_ON(1);
+ }
+
+ nla_nest_end(skb, nest);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_data_dump);
+
+static int nf_tables_init_net(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nft.af_info);
+ INIT_LIST_HEAD(&net->nft.commit_list);
+ net->nft.base_seq = 1;
+ return 0;
+}
+
+static struct pernet_operations nf_tables_net_ops = {
+ .init = nf_tables_init_net,
+};
+
+static int __init nf_tables_module_init(void)
+{
+ int err;
+
+ info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS,
+ GFP_KERNEL);
+ if (info == NULL) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = nf_tables_core_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nfnetlink_subsys_register(&nf_tables_subsys);
+ if (err < 0)
+ goto err3;
+
+ pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
+ return register_pernet_subsys(&nf_tables_net_ops);
+err3:
+ nf_tables_core_module_exit();
+err2:
+ kfree(info);
+err1:
+ return err;
+}
+
+static void __exit nf_tables_module_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_net_ops);
+ nfnetlink_subsys_unregister(&nf_tables_subsys);
+ nf_tables_core_module_exit();
+ kfree(info);
+}
+
+module_init(nf_tables_module_init);
+module_exit(nf_tables_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
new file mode 100644
index 00000000000..3b90eb2b2c5
--- /dev/null
+++ b/net/netfilter/nf_tables_core.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+
+static void nft_cmp_fast_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1])
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ u32 mask = nft_cmp_fast_mask(priv->len);
+
+ if ((data[priv->sreg].data[0] & mask) == priv->data)
+ return;
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static bool nft_payload_fast_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ struct nft_data *dest = &data[priv->dreg];
+ unsigned char *ptr;
+
+ if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
+ ptr = skb_network_header(skb);
+ else
+ ptr = skb_network_header(skb) + pkt->xt.thoff;
+
+ ptr += priv->offset;
+
+ if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+ return false;
+
+ if (priv->len == 2)
+ *(u16 *)dest->data = *(u16 *)ptr;
+ else if (priv->len == 4)
+ *(u32 *)dest->data = *(u32 *)ptr;
+ else
+ *(u8 *)dest->data = *(u8 *)ptr;
+ return true;
+}
+
+struct nft_jumpstack {
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ int rulenum;
+};
+
+enum nft_trace {
+ NFT_TRACE_RULE,
+ NFT_TRACE_RETURN,
+ NFT_TRACE_POLICY,
+};
+
+static const char *const comments[] = {
+ [NFT_TRACE_RULE] = "rule",
+ [NFT_TRACE_RETURN] = "return",
+ [NFT_TRACE_POLICY] = "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 4,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+static void nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_chain *chain,
+ int rulenum, enum nft_trace type)
+{
+ struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+ nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+ pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
+ chain->table->name, chain->name, comments[type],
+ rulenum);
+}
+
+unsigned int
+nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+{
+ const struct nft_chain *chain = ops->priv, *basechain = chain;
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ struct nft_data data[NFT_REG_MAX + 1];
+ unsigned int stackptr = 0;
+ struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
+ struct nft_stats *stats;
+ int rulenum;
+ /*
+ * Cache cursor to avoid problems in case that the cursor is updated
+ * while traversing the ruleset.
+ */
+ unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor);
+
+do_chain:
+ rulenum = 0;
+ rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+
+ /* This rule is not active, skip. */
+ if (unlikely(rule->genmask & (1 << gencursor)))
+ continue;
+
+ rulenum++;
+
+ nft_rule_for_each_expr(expr, last, rule) {
+ if (expr->ops == &nft_cmp_fast_ops)
+ nft_cmp_fast_eval(expr, data);
+ else if (expr->ops != &nft_payload_fast_ops ||
+ !nft_payload_fast_eval(expr, data, pkt))
+ expr->ops->eval(expr, data, pkt);
+
+ if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
+ break;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_BREAK:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ continue;
+ case NFT_CONTINUE:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ continue;
+ }
+ break;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ return data[NFT_REG_VERDICT].verdict;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_JUMP:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+ jumpstack[stackptr].chain = chain;
+ jumpstack[stackptr].rule = rule;
+ jumpstack[stackptr].rulenum = rulenum;
+ stackptr++;
+ chain = data[NFT_REG_VERDICT].chain;
+ goto do_chain;
+ case NFT_GOTO:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ chain = data[NFT_REG_VERDICT].chain;
+ goto do_chain;
+ case NFT_RETURN:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
+ break;
+ case NFT_CONTINUE:
+ if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN)))
+ nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN);
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ if (stackptr > 0) {
+ stackptr--;
+ chain = jumpstack[stackptr].chain;
+ rule = jumpstack[stackptr].rule;
+ rulenum = jumpstack[stackptr].rulenum;
+ goto next_rule;
+ }
+
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY);
+
+ rcu_read_lock_bh();
+ stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats));
+ u64_stats_update_begin(&stats->syncp);
+ stats->pkts++;
+ stats->bytes += pkt->skb->len;
+ u64_stats_update_end(&stats->syncp);
+ rcu_read_unlock_bh();
+
+ return nft_base_chain(basechain)->policy;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+int __init nf_tables_core_module_init(void)
+{
+ int err;
+
+ err = nft_immediate_module_init();
+ if (err < 0)
+ goto err1;
+
+ err = nft_cmp_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nft_lookup_module_init();
+ if (err < 0)
+ goto err3;
+
+ err = nft_bitwise_module_init();
+ if (err < 0)
+ goto err4;
+
+ err = nft_byteorder_module_init();
+ if (err < 0)
+ goto err5;
+
+ err = nft_payload_module_init();
+ if (err < 0)
+ goto err6;
+
+ return 0;
+
+err6:
+ nft_byteorder_module_exit();
+err5:
+ nft_bitwise_module_exit();
+err4:
+ nft_lookup_module_exit();
+err3:
+ nft_cmp_module_exit();
+err2:
+ nft_immediate_module_exit();
+err1:
+ return err;
+}
+
+void nf_tables_core_module_exit(void)
+{
+ nft_payload_module_exit();
+ nft_byteorder_module_exit();
+ nft_bitwise_module_exit();
+ nft_lookup_module_exit();
+ nft_cmp_module_exit();
+ nft_immediate_module_exit();
+}
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
new file mode 100644
index 00000000000..9dd2d216cfc
--- /dev/null
+++ b/net/netfilter/nf_tables_inet.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/ip.h>
+
+static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n)
+{
+ struct nft_af_info *afi;
+
+ if (n == 1)
+ afi = &nft_af_ipv4;
+ else
+ afi = &nft_af_ipv6;
+
+ ops->pf = afi->family;
+ if (afi->hooks[ops->hooknum])
+ ops->hook = afi->hooks[ops->hooknum];
+}
+
+static struct nft_af_info nft_af_inet __read_mostly = {
+ .family = NFPROTO_INET,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 2,
+ .hook_ops_init = nft_inet_hook_ops_init,
+};
+
+static int __net_init nf_tables_inet_init_net(struct net *net)
+{
+ net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.inet == NULL)
+ return -ENOMEM;
+ memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet));
+
+ if (nft_register_afinfo(net, net->nft.inet) < 0)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(net->nft.inet);
+ return -ENOMEM;
+}
+
+static void __net_exit nf_tables_inet_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.inet);
+ kfree(net->nft.inet);
+}
+
+static struct pernet_operations nf_tables_inet_net_ops = {
+ .init = nf_tables_inet_init_net,
+ .exit = nf_tables_inet_exit_net,
+};
+
+static const struct nf_chain_type filter_inet = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_INET,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+static int __init nf_tables_inet_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_inet);
+ ret = register_pernet_subsys(&nf_tables_inet_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_inet);
+
+ return ret;
+}
+
+static void __exit nf_tables_inet_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_inet_net_ops);
+ nft_unregister_chain_type(&filter_inet);
+}
+
+module_init(nf_tables_inet_init);
+module_exit(nf_tables_inet_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(1);
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
deleted file mode 100644
index 4d87befb04c..00000000000
--- a/net/netfilter/nf_tproxy_core.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Transparent proxy support for Linux/iptables
- *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
- * Author: Balazs Scheidler, Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-
-#include <linux/net.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <net/udp.h>
-#include <net/netfilter/nf_tproxy_core.h>
-
-
-static void
-nf_tproxy_destructor(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
-
- skb->sk = NULL;
- skb->destructor = NULL;
-
- if (sk)
- nf_tproxy_put_sock(sk);
-}
-
-/* consumes sk */
-int
-nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
-{
- bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_transparent :
- inet_sk(sk)->transparent;
-
- if (transparent) {
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = nf_tproxy_destructor;
- return 1;
- } else
- nf_tproxy_put_sock(sk);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
-
-static int __init nf_tproxy_init(void)
-{
- pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n");
- pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n");
- return 0;
-}
-
-module_init(nf_tproxy_init);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
-MODULE_DESCRIPTION("Transparent proxy support core routines");
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index b4a4532823e..c138b8fbe28 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -23,12 +23,10 @@
#include <linux/net.h>
#include <linux/skbuff.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <net/sock.h>
-#include <net/netlink.h>
#include <linux/init.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
@@ -37,30 +35,49 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
static char __initdata nfversion[] = "0.30";
-static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
-static DEFINE_MUTEX(nfnl_mutex);
+static struct {
+ struct mutex mutex;
+ const struct nfnetlink_subsystem __rcu *subsys;
+} table[NFNL_SUBSYS_COUNT];
+
+static const int nfnl_group2type[NFNLGRP_MAX+1] = {
+ [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+};
-void nfnl_lock(void)
+void nfnl_lock(__u8 subsys_id)
{
- mutex_lock(&nfnl_mutex);
+ mutex_lock(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(nfnl_lock);
-void nfnl_unlock(void)
+void nfnl_unlock(__u8 subsys_id)
{
- mutex_unlock(&nfnl_mutex);
+ mutex_unlock(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(nfnl_unlock);
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(u8 subsys_id)
+{
+ return lockdep_is_held(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
+#endif
+
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
{
- nfnl_lock();
- if (subsys_table[n->subsys_id]) {
- nfnl_unlock();
+ nfnl_lock(n->subsys_id);
+ if (table[n->subsys_id].subsys) {
+ nfnl_unlock(n->subsys_id);
return -EBUSY;
}
- subsys_table[n->subsys_id] = n;
- nfnl_unlock();
+ rcu_assign_pointer(table[n->subsys_id].subsys, n);
+ nfnl_unlock(n->subsys_id);
return 0;
}
@@ -68,10 +85,10 @@ EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
{
- nfnl_lock();
- subsys_table[n->subsys_id] = NULL;
- nfnl_unlock();
-
+ nfnl_lock(n->subsys_id);
+ table[n->subsys_id].subsys = NULL;
+ nfnl_unlock(n->subsys_id);
+ synchronize_rcu();
return 0;
}
EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
@@ -83,7 +100,7 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
if (subsys_id >= NFNL_SUBSYS_COUNT)
return NULL;
- return subsys_table[subsys_id];
+ return rcu_dereference(table[subsys_id].subsys);
}
static inline const struct nfnl_callback *
@@ -103,22 +120,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
-int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
- unsigned group, int echo, gfp_t flags)
+struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+ u32 dst_portid, gfp_t gfp_mask)
+{
+ return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
+
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
+ unsigned int group, int echo, gfp_t flags)
{
- return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags);
+ return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
-int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
+int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
- return netlink_set_err(net->nfnl, pid, group, error);
+ return netlink_set_err(net->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
+ int flags)
{
- return netlink_unicast(net->nfnl, skb, pid, flags);
+ return netlink_unicast(net->nfnl, skb, portid, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
@@ -130,63 +155,276 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
const struct nfnetlink_subsystem *ss;
int type, err;
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
- return -EPERM;
-
/* All the messages must at least contain nfgenmsg */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg)))
+ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
return 0;
type = nlh->nlmsg_type;
replay:
+ rcu_read_lock();
ss = nfnetlink_get_subsys(type);
if (!ss) {
#ifdef CONFIG_MODULES
- nfnl_unlock();
+ rcu_read_unlock();
request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
- nfnl_lock();
+ rcu_read_lock();
ss = nfnetlink_get_subsys(type);
if (!ss)
#endif
+ {
+ rcu_read_unlock();
return -EINVAL;
+ }
}
nc = nfnetlink_find_client(type, ss);
- if (!nc)
+ if (!nc) {
+ rcu_read_unlock();
return -EINVAL;
+ }
{
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
+ __u8 subsys_id = NFNL_SUBSYS_ID(type);
err = nla_parse(cda, ss->cb[cb_id].attr_count,
attr, attrlen, ss->cb[cb_id].policy);
- if (err < 0)
+ if (err < 0) {
+ rcu_read_unlock();
return err;
-
- err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda);
+ }
+
+ if (nc->call_rcu) {
+ err = nc->call_rcu(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ nfnl_lock(subsys_id);
+ if (rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex)) != ss ||
+ nfnetlink_find_client(type, ss) != nc)
+ err = -EAGAIN;
+ else if (nc->call)
+ err = nc->call(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ else
+ err = -EINVAL;
+ nfnl_unlock(subsys_id);
+ }
if (err == -EAGAIN)
goto replay;
return err;
}
}
+static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
+ u_int16_t subsys_id)
+{
+ struct sk_buff *nskb, *oskb = skb;
+ struct net *net = sock_net(skb->sk);
+ const struct nfnetlink_subsystem *ss;
+ const struct nfnl_callback *nc;
+ bool success = true, done = false;
+ int err;
+
+ if (subsys_id >= NFNL_SUBSYS_COUNT)
+ return netlink_ack(skb, nlh, -EINVAL);
+replay:
+ nskb = netlink_skb_clone(oskb, GFP_KERNEL);
+ if (!nskb)
+ return netlink_ack(oskb, nlh, -ENOMEM);
+
+ nskb->sk = oskb->sk;
+ skb = nskb;
+
+ nfnl_lock(subsys_id);
+ ss = rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex));
+ if (!ss) {
+#ifdef CONFIG_MODULES
+ nfnl_unlock(subsys_id);
+ request_module("nfnetlink-subsys-%d", subsys_id);
+ nfnl_lock(subsys_id);
+ ss = rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex));
+ if (!ss)
+#endif
+ {
+ nfnl_unlock(subsys_id);
+ netlink_ack(skb, nlh, -EOPNOTSUPP);
+ return kfree_skb(nskb);
+ }
+ }
+
+ if (!ss->commit || !ss->abort) {
+ nfnl_unlock(subsys_id);
+ netlink_ack(skb, nlh, -EOPNOTSUPP);
+ return kfree_skb(skb);
+ }
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ int msglen, type;
+
+ nlh = nlmsg_hdr(skb);
+ err = 0;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ /* Only requests are handled by the kernel */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ type = nlh->nlmsg_type;
+ if (type == NFNL_MSG_BATCH_BEGIN) {
+ /* Malformed: Batch begin twice */
+ success = false;
+ goto done;
+ } else if (type == NFNL_MSG_BATCH_END) {
+ done = true;
+ goto done;
+ } else if (type < NLMSG_MIN_TYPE) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ /* We only accept a batch with messages for the same
+ * subsystem.
+ */
+ if (NFNL_SUBSYS_ID(type) != subsys_id) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ nc = nfnetlink_find_client(type, ss);
+ if (!nc) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ {
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ int attrlen = nlh->nlmsg_len - min_len;
+
+ err = nla_parse(cda, ss->cb[cb_id].attr_count,
+ attr, attrlen, ss->cb[cb_id].policy);
+ if (err < 0)
+ goto ack;
+
+ if (nc->call_batch) {
+ err = nc->call_batch(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ }
+
+ /* The lock was released to autoload some module, we
+ * have to abort and start from scratch using the
+ * original skb.
+ */
+ if (err == -EAGAIN) {
+ ss->abort(skb);
+ nfnl_unlock(subsys_id);
+ kfree_skb(nskb);
+ goto replay;
+ }
+ }
+ack:
+ if (nlh->nlmsg_flags & NLM_F_ACK || err) {
+ /* We don't stop processing the batch on errors, thus,
+ * userspace gets all the errors that the batch
+ * triggers.
+ */
+ netlink_ack(skb, nlh, err);
+ if (err)
+ success = false;
+ }
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+done:
+ if (success && done)
+ ss->commit(skb);
+ else
+ ss->abort(skb);
+
+ nfnl_unlock(subsys_id);
+ kfree_skb(nskb);
+}
+
static void nfnetlink_rcv(struct sk_buff *skb)
{
- nfnl_lock();
- netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
- nfnl_unlock();
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ int msglen;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < nlh->nlmsg_len)
+ return;
+
+ if (!netlink_net_capable(skb, CAP_NET_ADMIN)) {
+ netlink_ack(skb, nlh, -EPERM);
+ return;
+ }
+
+ if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
+ struct nfgenmsg *nfgenmsg;
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
+ return;
+
+ nfgenmsg = nlmsg_data(nlh);
+ skb_pull(skb, msglen);
+ nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+ } else {
+ netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
+ }
}
+#ifdef CONFIG_MODULES
+static int nfnetlink_bind(int group)
+{
+ const struct nfnetlink_subsystem *ss;
+ int type = nfnl_group2type[group];
+
+ rcu_read_lock();
+ ss = nfnetlink_get_subsys(type);
+ rcu_read_unlock();
+ if (!ss)
+ request_module("nfnetlink-subsys-%d", type);
+ return 0;
+}
+#endif
+
static int __net_init nfnetlink_net_init(struct net *net)
{
struct sock *nfnl;
+ struct netlink_kernel_cfg cfg = {
+ .groups = NFNLGRP_MAX,
+ .input = nfnetlink_rcv,
+#ifdef CONFIG_MODULES
+ .bind = nfnetlink_bind,
+#endif
+ };
- nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX,
- nfnetlink_rcv, NULL, THIS_MODULE);
+ nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
if (!nfnl)
return -ENOMEM;
net->nfnl_stash = nfnl;
@@ -199,7 +437,7 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
struct net *net;
list_for_each_entry(net, net_exit_list, exit_list)
- rcu_assign_pointer(net->nfnl, NULL);
+ RCU_INIT_POINTER(net->nfnl, NULL);
synchronize_net();
list_for_each_entry(net, net_exit_list, exit_list)
netlink_kernel_release(net->nfnl_stash);
@@ -212,6 +450,11 @@ static struct pernet_operations nfnetlink_net_ops = {
static int __init nfnetlink_init(void)
{
+ int i;
+
+ for (i=0; i<NFNL_SUBSYS_COUNT; i++)
+ mutex_init(&table[i].mutex);
+
pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);
return register_pernet_subsys(&nfnetlink_net_ops);
}
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
new file mode 100644
index 00000000000..2baa125c2e8
--- /dev/null
+++ b/net/netfilter/nfnetlink_acct.c
@@ -0,0 +1,454 @@
+/*
+ * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/atomic.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_acct.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
+
+static LIST_HEAD(nfnl_acct_list);
+
+struct nf_acct {
+ atomic64_t pkts;
+ atomic64_t bytes;
+ unsigned long flags;
+ struct list_head head;
+ atomic_t refcnt;
+ char name[NFACCT_NAME_MAX];
+ struct rcu_head rcu_head;
+ char data[0];
+};
+
+#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
+
+static int
+nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ struct nf_acct *nfacct, *matching = NULL;
+ char *acct_name;
+ unsigned int size = 0;
+ u32 flags = 0;
+
+ if (!tb[NFACCT_NAME])
+ return -EINVAL;
+
+ acct_name = nla_data(tb[NFACCT_NAME]);
+ if (strlen(acct_name) == 0)
+ return -EINVAL;
+
+ list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+ if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ matching = nfacct;
+ break;
+ }
+
+ if (matching) {
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ /* reset counters if you request a replacement. */
+ atomic64_set(&matching->pkts, 0);
+ atomic64_set(&matching->bytes, 0);
+ smp_mb__before_atomic();
+ /* reset overquota flag if quota is enabled. */
+ if ((matching->flags & NFACCT_F_QUOTA))
+ clear_bit(NFACCT_F_OVERQUOTA, &matching->flags);
+ return 0;
+ }
+ return -EBUSY;
+ }
+
+ if (tb[NFACCT_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS]));
+ if (flags & ~NFACCT_F_QUOTA)
+ return -EOPNOTSUPP;
+ if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA)
+ return -EINVAL;
+ if (flags & NFACCT_F_OVERQUOTA)
+ return -EINVAL;
+
+ size += sizeof(u64);
+ }
+
+ nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL);
+ if (nfacct == NULL)
+ return -ENOMEM;
+
+ if (flags & NFACCT_F_QUOTA) {
+ u64 *quota = (u64 *)nfacct->data;
+
+ *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA]));
+ nfacct->flags = flags;
+ }
+
+ strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+
+ if (tb[NFACCT_BYTES]) {
+ atomic64_set(&nfacct->bytes,
+ be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES])));
+ }
+ if (tb[NFACCT_PKTS]) {
+ atomic64_set(&nfacct->pkts,
+ be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
+ }
+ atomic_set(&nfacct->refcnt, 1);
+ list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+ return 0;
+}
+
+static int
+nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct nf_acct *acct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ u64 pkts, bytes;
+
+ event |= NFNL_SUBSYS_ACCT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFACCT_NAME, acct->name))
+ goto nla_put_failure;
+
+ if (type == NFNL_MSG_ACCT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&acct->pkts, 0);
+ bytes = atomic64_xchg(&acct->bytes, 0);
+ smp_mb__before_atomic();
+ if (acct->flags & NFACCT_F_QUOTA)
+ clear_bit(NFACCT_F_OVERQUOTA, &acct->flags);
+ } else {
+ pkts = atomic64_read(&acct->pkts);
+ bytes = atomic64_read(&acct->bytes);
+ }
+ if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) ||
+ nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) ||
+ nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))))
+ goto nla_put_failure;
+ if (acct->flags & NFACCT_F_QUOTA) {
+ u64 *quota = (u64 *)acct->data;
+
+ if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) ||
+ nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota)))
+ goto nla_put_failure;
+ }
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_acct *cur, *last;
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct nf_acct *)cb->args[1];
+ if (cb->args[1])
+ cb->args[1] = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ if (last) {
+ if (cur != last)
+ continue;
+
+ last = NULL;
+ }
+ if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur) < 0) {
+ cb->args[1] = (unsigned long)cur;
+ break;
+ }
+ }
+ if (!cb->args[1])
+ cb->args[2] = 1;
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int
+nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ int ret = -ENOENT;
+ struct nf_acct *cur;
+ char *acct_name;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nfnl_acct_dump,
+ };
+ return netlink_dump_start(nfnl, skb, nlh, &c);
+ }
+
+ if (!tb[NFACCT_NAME])
+ return -EINVAL;
+ acct_name = nla_data(tb[NFACCT_NAME]);
+
+ list_for_each_entry(cur, &nfnl_acct_list, head) {
+ struct sk_buff *skb2;
+
+ if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
+ continue;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+ }
+ return ret;
+}
+
+/* try to delete object, fail if it is still in use. */
+static int nfnl_acct_try_del(struct nf_acct *cur)
+{
+ int ret = 0;
+
+ /* we want to avoid races with nfnl_acct_find_get. */
+ if (atomic_dec_and_test(&cur->refcnt)) {
+ /* We are protected by nfnl mutex. */
+ list_del_rcu(&cur->head);
+ kfree_rcu(cur, rcu_head);
+ } else {
+ /* still in use, restore reference counter. */
+ atomic_inc(&cur->refcnt);
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+static int
+nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ char *acct_name;
+ struct nf_acct *cur;
+ int ret = -ENOENT;
+
+ if (!tb[NFACCT_NAME]) {
+ list_for_each_entry(cur, &nfnl_acct_list, head)
+ nfnl_acct_try_del(cur);
+
+ return 0;
+ }
+ acct_name = nla_data(tb[NFACCT_NAME]);
+
+ list_for_each_entry(cur, &nfnl_acct_list, head) {
+ if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
+ continue;
+
+ ret = nfnl_acct_try_del(cur);
+ if (ret < 0)
+ return ret;
+
+ break;
+ }
+ return ret;
+}
+
+static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
+ [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 },
+ [NFACCT_BYTES] = { .type = NLA_U64 },
+ [NFACCT_PKTS] = { .type = NLA_U64 },
+ [NFACCT_FLAGS] = { .type = NLA_U32 },
+ [NFACCT_QUOTA] = { .type = NLA_U64 },
+};
+
+static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
+ [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_acct_subsys = {
+ .name = "acct",
+ .subsys_id = NFNL_SUBSYS_ACCT,
+ .cb_count = NFNL_MSG_ACCT_MAX,
+ .cb = nfnl_acct_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
+
+struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+{
+ struct nf_acct *cur, *acct = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
+ continue;
+
+ if (!try_module_get(THIS_MODULE))
+ goto err;
+
+ if (!atomic_inc_not_zero(&cur->refcnt)) {
+ module_put(THIS_MODULE);
+ goto err;
+ }
+
+ acct = cur;
+ break;
+ }
+err:
+ rcu_read_unlock();
+ return acct;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
+
+void nfnl_acct_put(struct nf_acct *acct)
+{
+ atomic_dec(&acct->refcnt);
+ module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_put);
+
+void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+ atomic64_inc(&nfacct->pkts);
+ atomic64_add(skb->len, &nfacct->bytes);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_update);
+
+static void nfnl_overquota_report(struct nf_acct *nfacct)
+{
+ int ret;
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0,
+ nfacct);
+ if (ret <= 0) {
+ kfree_skb(skb);
+ return;
+ }
+ netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
+ GFP_ATOMIC);
+}
+
+int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+ u64 now;
+ u64 *quota;
+ int ret = NFACCT_UNDERQUOTA;
+
+ /* no place here if we don't have a quota */
+ if (!(nfacct->flags & NFACCT_F_QUOTA))
+ return NFACCT_NO_QUOTA;
+
+ quota = (u64 *)nfacct->data;
+ now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ?
+ atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes);
+
+ ret = now > *quota;
+
+ if (now >= *quota &&
+ !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) {
+ nfnl_overquota_report(nfacct);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
+
+static int __init nfnl_acct_init(void)
+{
+ int ret;
+
+ pr_info("nfnl_acct: registering with nfnetlink.\n");
+ ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
+ if (ret < 0) {
+ pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+ return 0;
+err_out:
+ return ret;
+}
+
+static void __exit nfnl_acct_exit(void)
+{
+ struct nf_acct *cur, *tmp;
+
+ pr_info("nfnl_acct: unregistering from nfnetlink.\n");
+ nfnetlink_subsys_unregister(&nfnl_acct_subsys);
+
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
+ list_del_rcu(&cur->head);
+ /* We are sure that our objects have no clients at this point,
+ * it's safe to release them all without checking refcnt. */
+ kfree_rcu(cur, rcu_head);
+ }
+}
+
+module_init(nfnl_acct_init);
+module_exit(nfnl_acct_exit);
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
new file mode 100644
index 00000000000..9e287cb56a0
--- /dev/null
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -0,0 +1,680 @@
+/*
+ * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ *
+ * This software has been sponsored by Vyatta Inc. <http://www.vyatta.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/netfilter/nfnetlink_cthelper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
+
+static int
+nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ const struct nf_conn_help *help;
+ struct nf_conntrack_helper *helper;
+
+ help = nfct_help(ct);
+ if (help == NULL)
+ return NF_DROP;
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+ if (helper == NULL)
+ return NF_DROP;
+
+ /* This is an user-space helper not yet configured, skip. */
+ if ((helper->flags &
+ (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
+ NF_CT_HELPER_F_USERSPACE)
+ return NF_ACCEPT;
+
+ /* If the user-space helper is not available, don't block traffic. */
+ return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS;
+}
+
+static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = {
+ [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, },
+ [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, },
+};
+
+static int
+nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nlattr *attr)
+{
+ int err;
+ struct nlattr *tb[NFCTH_TUPLE_MAX+1];
+
+ err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM])
+ return -EINVAL;
+
+ tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM]));
+ tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]);
+
+ return 0;
+}
+
+static int
+nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
+{
+ const struct nf_conn_help *help = nfct_help(ct);
+
+ if (attr == NULL)
+ return -EINVAL;
+
+ if (help->helper->data_len == 0)
+ return -EINVAL;
+
+ memcpy(&help->data, nla_data(attr), help->helper->data_len);
+ return 0;
+}
+
+static int
+nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ const struct nf_conn_help *help = nfct_help(ct);
+
+ if (help->helper->data_len &&
+ nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = {
+ [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN-1 },
+ [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, },
+ [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, },
+};
+
+static int
+nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
+ const struct nlattr *attr)
+{
+ int err;
+ struct nlattr *tb[NFCTH_POLICY_MAX+1];
+
+ err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFCTH_POLICY_NAME] ||
+ !tb[NFCTH_POLICY_EXPECT_MAX] ||
+ !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
+ return -EINVAL;
+
+ strncpy(expect_policy->name,
+ nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+ expect_policy->max_expected =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+ expect_policy->timeout =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
+
+ return 0;
+}
+
+static const struct nla_policy
+nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = {
+ [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, },
+};
+
+static int
+nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
+ const struct nlattr *attr)
+{
+ int i, ret;
+ struct nf_conntrack_expect_policy *expect_policy;
+ struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+
+ ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+ nfnl_cthelper_expect_policy_set);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[NFCTH_POLICY_SET_NUM])
+ return -EINVAL;
+
+ helper->expect_class_max =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+
+ if (helper->expect_class_max != 0 &&
+ helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
+ return -EOVERFLOW;
+
+ expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
+ helper->expect_class_max, GFP_KERNEL);
+ if (expect_policy == NULL)
+ return -ENOMEM;
+
+ for (i=0; i<helper->expect_class_max; i++) {
+ if (!tb[NFCTH_POLICY_SET+i])
+ goto err;
+
+ ret = nfnl_cthelper_expect_policy(&expect_policy[i],
+ tb[NFCTH_POLICY_SET+i]);
+ if (ret < 0)
+ goto err;
+ }
+ helper->expect_policy = expect_policy;
+ return 0;
+err:
+ kfree(expect_policy);
+ return -EINVAL;
+}
+
+static int
+nfnl_cthelper_create(const struct nlattr * const tb[],
+ struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_helper *helper;
+ int ret;
+
+ if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
+ return -EINVAL;
+
+ helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL);
+ if (helper == NULL)
+ return -ENOMEM;
+
+ ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
+ if (ret < 0)
+ goto err;
+
+ strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+ helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+ helper->flags |= NF_CT_HELPER_F_USERSPACE;
+ memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple));
+
+ helper->me = THIS_MODULE;
+ helper->help = nfnl_userspace_cthelper;
+ helper->from_nlattr = nfnl_cthelper_from_nlattr;
+ helper->to_nlattr = nfnl_cthelper_to_nlattr;
+
+ /* Default to queue number zero, this can be updated at any time. */
+ if (tb[NFCTH_QUEUE_NUM])
+ helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+ if (tb[NFCTH_STATUS]) {
+ int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+ switch(status) {
+ case NFCT_HELPER_STATUS_ENABLED:
+ helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+ break;
+ case NFCT_HELPER_STATUS_DISABLED:
+ helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+ break;
+ }
+ }
+
+ ret = nf_conntrack_helper_register(helper);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(helper);
+ return ret;
+}
+
+static int
+nfnl_cthelper_update(const struct nlattr * const tb[],
+ struct nf_conntrack_helper *helper)
+{
+ int ret;
+
+ if (tb[NFCTH_PRIV_DATA_LEN])
+ return -EBUSY;
+
+ if (tb[NFCTH_POLICY]) {
+ ret = nfnl_cthelper_parse_expect_policy(helper,
+ tb[NFCTH_POLICY]);
+ if (ret < 0)
+ return ret;
+ }
+ if (tb[NFCTH_QUEUE_NUM])
+ helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+ if (tb[NFCTH_STATUS]) {
+ int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+ switch(status) {
+ case NFCT_HELPER_STATUS_ENABLED:
+ helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+ break;
+ case NFCT_HELPER_STATUS_DISABLED:
+ helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+ break;
+ }
+ }
+ return 0;
+}
+
+static int
+nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ const char *helper_name;
+ struct nf_conntrack_helper *cur, *helper = NULL;
+ struct nf_conntrack_tuple tuple;
+ int ret = 0, i;
+
+ if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
+ return -EINVAL;
+
+ helper_name = nla_data(tb[NFCTH_NAME]);
+
+ ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+ if (ret < 0)
+ return ret;
+
+ rcu_read_lock();
+ for (i = 0; i < nf_ct_helper_hsize && !helper; i++) {
+ hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ if (strncmp(cur->name, helper_name,
+ NF_CT_HELPER_NAME_LEN) != 0)
+ continue;
+
+ if ((tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ ret = -EEXIST;
+ goto err;
+ }
+ helper = cur;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (helper == NULL)
+ ret = nfnl_cthelper_create(tb, &tuple);
+ else
+ ret = nfnl_cthelper_update(tb, helper);
+
+ return ret;
+err:
+ rcu_read_unlock();
+ return ret;
+}
+
+static int
+nfnl_cthelper_dump_tuple(struct sk_buff *skb,
+ struct nf_conntrack_helper *helper)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED);
+ if (nest_parms == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM,
+ htons(helper->tuple.src.l3num)))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+nfnl_cthelper_dump_policy(struct sk_buff *skb,
+ struct nf_conntrack_helper *helper)
+{
+ int i;
+ struct nlattr *nest_parms1, *nest_parms2;
+
+ nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED);
+ if (nest_parms1 == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
+ htonl(helper->expect_class_max)))
+ goto nla_put_failure;
+
+ for (i=0; i<helper->expect_class_max; i++) {
+ nest_parms2 = nla_nest_start(skb,
+ (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
+ if (nest_parms2 == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, NFCTH_POLICY_NAME,
+ helper->expect_policy[i].name))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX,
+ htonl(helper->expect_policy[i].max_expected)))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT,
+ htonl(helper->expect_policy[i].timeout)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms2);
+ }
+ nla_nest_end(skb, nest_parms1);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct nf_conntrack_helper *helper)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ int status;
+
+ event |= NFNL_SUBSYS_CTHELPER << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFCTH_NAME, helper->name))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num)))
+ goto nla_put_failure;
+
+ if (nfnl_cthelper_dump_tuple(skb, helper) < 0)
+ goto nla_put_failure;
+
+ if (nfnl_cthelper_dump_policy(skb, helper) < 0)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len)))
+ goto nla_put_failure;
+
+ if (helper->flags & NF_CT_HELPER_F_CONFIGURED)
+ status = NFCT_HELPER_STATUS_ENABLED;
+ else
+ status = NFCT_HELPER_STATUS_DISABLED;
+
+ if (nla_put_be32(skb, NFCTH_STATUS, htonl(status)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_helper *cur, *last;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_helper *)cb->args[1];
+ for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) {
+restart:
+ hlist_for_each_entry_rcu(cur,
+ &nf_ct_helper_hash[cb->args[0]], hnode) {
+
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ if (cb->args[1]) {
+ if (cur != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (nfnl_cthelper_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ NFNL_MSG_CTHELPER_NEW, cur) < 0) {
+ cb->args[1] = (unsigned long)cur;
+ goto out;
+ }
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+out:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int
+nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ int ret = -ENOENT, i;
+ struct nf_conntrack_helper *cur;
+ struct sk_buff *skb2;
+ char *helper_name = NULL;
+ struct nf_conntrack_tuple tuple;
+ bool tuple_set = false;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nfnl_cthelper_dump_table,
+ };
+ return netlink_dump_start(nfnl, skb, nlh, &c);
+ }
+
+ if (tb[NFCTH_NAME])
+ helper_name = nla_data(tb[NFCTH_NAME]);
+
+ if (tb[NFCTH_TUPLE]) {
+ ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+ if (ret < 0)
+ return ret;
+
+ tuple_set = true;
+ }
+
+ for (i = 0; i < nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ if (helper_name && strncmp(cur->name, helper_name,
+ NF_CT_HELPER_NAME_LEN) != 0) {
+ continue;
+ }
+ if (tuple_set &&
+ (tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_CTHELPER_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
+
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+ }
+ }
+ return ret;
+}
+
+static int
+nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ char *helper_name = NULL;
+ struct nf_conntrack_helper *cur;
+ struct hlist_node *tmp;
+ struct nf_conntrack_tuple tuple;
+ bool tuple_set = false, found = false;
+ int i, j = 0, ret;
+
+ if (tb[NFCTH_NAME])
+ helper_name = nla_data(tb[NFCTH_NAME]);
+
+ if (tb[NFCTH_TUPLE]) {
+ ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+ if (ret < 0)
+ return ret;
+
+ tuple_set = true;
+ }
+
+ for (i = 0; i < nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
+ hnode) {
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ j++;
+
+ if (helper_name && strncmp(cur->name, helper_name,
+ NF_CT_HELPER_NAME_LEN) != 0) {
+ continue;
+ }
+ if (tuple_set &&
+ (tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
+
+ found = true;
+ nf_conntrack_helper_unregister(cur);
+ }
+ }
+ /* Make sure we return success if we flush and there is no helpers */
+ return (found || j == 0) ? 0 : -ENOENT;
+}
+
+static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
+ [NFCTH_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN-1 },
+ [NFCTH_QUEUE_NUM] = { .type = NLA_U32, },
+};
+
+static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
+ [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
+ .name = "cthelper",
+ .subsys_id = NFNL_SUBSYS_CTHELPER,
+ .cb_count = NFNL_MSG_CTHELPER_MAX,
+ .cb = nfnl_cthelper_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER);
+
+static int __init nfnl_cthelper_init(void)
+{
+ int ret;
+
+ ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys);
+ if (ret < 0) {
+ pr_err("nfnl_cthelper: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+ return 0;
+err_out:
+ return ret;
+}
+
+static void __exit nfnl_cthelper_exit(void)
+{
+ struct nf_conntrack_helper *cur;
+ struct hlist_node *tmp;
+ int i;
+
+ nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
+
+ for (i=0; i<nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
+ hnode) {
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ nf_conntrack_helper_unregister(cur);
+ }
+ }
+}
+
+module_init(nfnl_cthelper_init);
+module_exit(nfnl_cthelper_exit);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
new file mode 100644
index 00000000000..476accd1714
--- /dev/null
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -0,0 +1,585 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
+
+static LIST_HEAD(cttimeout_list);
+
+static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
+ [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING,
+ .len = CTNL_TIMEOUT_NAME_MAX - 1},
+ [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 },
+ [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 },
+ [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED },
+};
+
+static int
+ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto,
+ struct net *net, const struct nlattr *attr)
+{
+ int ret = 0;
+
+ if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) {
+ struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1];
+
+ ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
+ attr, l4proto->ctnl_timeout.nla_policy);
+ if (ret < 0)
+ return ret;
+
+ ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+ }
+ return ret;
+}
+
+static int
+cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct ctnl_timeout *timeout, *matching = NULL;
+ struct net *net = sock_net(skb->sk);
+ char *name;
+ int ret;
+
+ if (!cda[CTA_TIMEOUT_NAME] ||
+ !cda[CTA_TIMEOUT_L3PROTO] ||
+ !cda[CTA_TIMEOUT_L4PROTO] ||
+ !cda[CTA_TIMEOUT_DATA])
+ return -EINVAL;
+
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+
+ list_for_each_entry(timeout, &cttimeout_list, head) {
+ if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ matching = timeout;
+ break;
+ }
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supportted, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
+ goto err_proto_put;
+ }
+
+ if (matching) {
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ /* You cannot replace one timeout policy by another of
+ * different kind, sorry.
+ */
+ if (matching->l3num != l3num ||
+ matching->l4proto->l4proto != l4num) {
+ ret = -EINVAL;
+ goto err_proto_put;
+ }
+
+ ret = ctnl_timeout_parse_policy(&matching->data,
+ l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
+ return ret;
+ }
+ ret = -EBUSY;
+ goto err_proto_put;
+ }
+
+ timeout = kzalloc(sizeof(struct ctnl_timeout) +
+ l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
+ if (timeout == NULL) {
+ ret = -ENOMEM;
+ goto err_proto_put;
+ }
+
+ ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
+ if (ret < 0)
+ goto err;
+
+ strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME]));
+ timeout->l3num = l3num;
+ timeout->l4proto = l4proto;
+ atomic_set(&timeout->refcnt, 1);
+ list_add_tail_rcu(&timeout->head, &cttimeout_list);
+
+ return 0;
+err:
+ kfree(timeout);
+err_proto_put:
+ nf_ct_l4proto_put(l4proto);
+ return ret;
+}
+
+static int
+ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct ctnl_timeout *timeout)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ struct nf_conntrack_l4proto *l4proto = timeout->l4proto;
+
+ event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
+ nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) ||
+ nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) ||
+ nla_put_be32(skb, CTA_TIMEOUT_USE,
+ htonl(atomic_read(&timeout->refcnt))))
+ goto nla_put_failure;
+
+ if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+ struct nlattr *nest_parms;
+ int ret;
+
+ nest_parms = nla_nest_start(skb,
+ CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data);
+ if (ret < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+ }
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ctnl_timeout *cur, *last;
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct ctnl_timeout *)cb->args[1];
+ if (cb->args[1])
+ cb->args[1] = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &cttimeout_list, head) {
+ if (last) {
+ if (cur != last)
+ continue;
+
+ last = NULL;
+ }
+ if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) {
+ cb->args[1] = (unsigned long)cur;
+ break;
+ }
+ }
+ if (!cb->args[1])
+ cb->args[2] = 1;
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int
+cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ int ret = -ENOENT;
+ char *name;
+ struct ctnl_timeout *cur;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnl_timeout_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ if (!cda[CTA_TIMEOUT_NAME])
+ return -EINVAL;
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+
+ list_for_each_entry(cur, &cttimeout_list, head) {
+ struct sk_buff *skb2;
+
+ if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ IPCTNL_MSG_TIMEOUT_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+ }
+ return ret;
+}
+
+/* try to delete object, fail if it is still in use. */
+static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
+{
+ int ret = 0;
+
+ /* we want to avoid races with nf_ct_timeout_find_get. */
+ if (atomic_dec_and_test(&timeout->refcnt)) {
+ /* We are protected by nfnl mutex. */
+ list_del_rcu(&timeout->head);
+ nf_ct_l4proto_put(timeout->l4proto);
+ kfree_rcu(timeout, rcu_head);
+ } else {
+ /* still in use, restore reference counter. */
+ atomic_inc(&timeout->refcnt);
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+static int
+cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ char *name;
+ struct ctnl_timeout *cur;
+ int ret = -ENOENT;
+
+ if (!cda[CTA_TIMEOUT_NAME]) {
+ list_for_each_entry(cur, &cttimeout_list, head)
+ ctnl_timeout_try_del(cur);
+
+ return 0;
+ }
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+
+ list_for_each_entry(cur, &cttimeout_list, head) {
+ if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ ret = ctnl_timeout_try_del(cur);
+ if (ret < 0)
+ return ret;
+
+ break;
+ }
+ return ret;
+}
+
+static int
+cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct net *net = sock_net(skb->sk);
+ unsigned int *timeouts;
+ int ret;
+
+ if (!cda[CTA_TIMEOUT_L3PROTO] ||
+ !cda[CTA_TIMEOUT_L4PROTO] ||
+ !cda[CTA_TIMEOUT_DATA])
+ return -EINVAL;
+
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supported, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
+ goto err;
+ }
+
+ timeouts = l4proto->get_timeouts(net);
+
+ ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
+ if (ret < 0)
+ goto err;
+
+ nf_ct_l4proto_put(l4proto);
+ return 0;
+err:
+ nf_ct_l4proto_put(l4proto);
+ return ret;
+}
+
+static int
+cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
+ u32 seq, u32 type, int event,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+ event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) ||
+ nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
+ goto nla_put_failure;
+
+ if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+ struct nlattr *nest_parms;
+ unsigned int *timeouts = l4proto->get_timeouts(net);
+ int ret;
+
+ nest_parms = nla_nest_start(skb,
+ CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
+ if (ret < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+ }
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct net *net = sock_net(skb->sk);
+ struct sk_buff *skb2;
+ int ret, err;
+
+ if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
+ return -EINVAL;
+
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supported, skip. */
+ if (l4proto->l4proto != l4num) {
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+ l4proto);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ err = -ENOMEM;
+ goto err;
+ }
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+err:
+ nf_ct_l4proto_put(l4proto);
+ return err;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static struct ctnl_timeout *ctnl_timeout_find_get(const char *name)
+{
+ struct ctnl_timeout *timeout, *matching = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(timeout, &cttimeout_list, head) {
+ if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ if (!try_module_get(THIS_MODULE))
+ goto err;
+
+ if (!atomic_inc_not_zero(&timeout->refcnt)) {
+ module_put(THIS_MODULE);
+ goto err;
+ }
+ matching = timeout;
+ break;
+ }
+err:
+ rcu_read_unlock();
+ return matching;
+}
+
+static void ctnl_timeout_put(struct ctnl_timeout *timeout)
+{
+ atomic_dec(&timeout->refcnt);
+ module_put(THIS_MODULE);
+}
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+
+static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
+ [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+};
+
+static const struct nfnetlink_subsystem cttimeout_subsys = {
+ .name = "conntrack_timeout",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT,
+ .cb_count = IPCTNL_MSG_TIMEOUT_MAX,
+ .cb = cttimeout_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
+
+static int __init cttimeout_init(void)
+{
+ int ret;
+
+ ret = nfnetlink_subsys_register(&cttimeout_subsys);
+ if (ret < 0) {
+ pr_err("cttimeout_init: cannot register cttimeout with "
+ "nfnetlink.\n");
+ goto err_out;
+ }
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
+ RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+ return 0;
+
+err_out:
+ return ret;
+}
+
+static void __exit cttimeout_exit(void)
+{
+ struct ctnl_timeout *cur, *tmp;
+
+ pr_info("cttimeout: unregistering from nfnetlink.\n");
+
+ nfnetlink_subsys_unregister(&cttimeout_subsys);
+ list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) {
+ list_del_rcu(&cur->head);
+ /* We are sure that our objects have no clients at this point,
+ * it's safe to release them all without checking refcnt.
+ */
+ nf_ct_l4proto_put(cur->l4proto);
+ kfree_rcu(cur, rcu_head);
+ }
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
+ RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+}
+
+module_init(cttimeout_init);
+module_exit(cttimeout_exit);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6a1572b0ab4..d292c8d286e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -3,6 +3,7 @@
* nfetlink.
*
* (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the old ipv4-only ipt_ULOG.c:
* (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
@@ -13,12 +14,13 @@
*/
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
#include <linux/spinlock.h>
@@ -26,14 +28,13 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/list.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/netfilter/nfnetlink_log.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#ifdef CONFIG_BRIDGE_NETFILTER
#include "../bridge/br_private.h"
@@ -55,7 +56,9 @@ struct nfulnl_instance {
unsigned int qlen; /* number of nlmsgs in skb */
struct sk_buff *skb; /* pre-allocatd skb */
struct timer_list timer;
- int peer_pid; /* PID of the peer process */
+ struct net *net;
+ struct user_namespace *peer_user_ns; /* User namespace of the peer process */
+ int peer_portid; /* PORTID of the peer process */
/* configurable parameters */
unsigned int flushtimeout; /* timeout until queue flush */
@@ -69,12 +72,20 @@ struct nfulnl_instance {
struct rcu_head rcu;
};
-static DEFINE_SPINLOCK(instances_lock);
-static atomic_t global_seq;
-
#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS];
-static unsigned int hash_init;
+
+static int nfnl_log_net_id __read_mostly;
+
+struct nfnl_log_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+ atomic_t global_seq;
+};
+
+static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_log_net_id);
+}
static inline u_int8_t instance_hashfn(u_int16_t group_num)
{
@@ -82,14 +93,13 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num)
}
static struct nfulnl_instance *
-__instance_lookup(u_int16_t group_num)
+__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
{
struct hlist_head *head;
- struct hlist_node *pos;
struct nfulnl_instance *inst;
- head = &instance_table[instance_hashfn(group_num)];
- hlist_for_each_entry_rcu(inst, pos, head, hlist) {
+ head = &log->instance_table[instance_hashfn(group_num)];
+ hlist_for_each_entry_rcu(inst, head, hlist) {
if (inst->group_num == group_num)
return inst;
}
@@ -103,12 +113,12 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(u_int16_t group_num)
+instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
{
struct nfulnl_instance *inst;
rcu_read_lock_bh();
- inst = __instance_lookup(group_num);
+ inst = __instance_lookup(log, group_num);
if (inst && !atomic_inc_not_zero(&inst->use))
inst = NULL;
rcu_read_unlock_bh();
@@ -118,7 +128,11 @@ instance_lookup_get(u_int16_t group_num)
static void nfulnl_instance_free_rcu(struct rcu_head *head)
{
- kfree(container_of(head, struct nfulnl_instance, rcu));
+ struct nfulnl_instance *inst =
+ container_of(head, struct nfulnl_instance, rcu);
+
+ put_net(inst->net);
+ kfree(inst);
module_put(THIS_MODULE);
}
@@ -132,13 +146,15 @@ instance_put(struct nfulnl_instance *inst)
static void nfulnl_timer(unsigned long data);
static struct nfulnl_instance *
-instance_create(u_int16_t group_num, int pid)
+instance_create(struct net *net, u_int16_t group_num,
+ int portid, struct user_namespace *user_ns)
{
struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int err;
- spin_lock_bh(&instances_lock);
- if (__instance_lookup(group_num)) {
+ spin_lock_bh(&log->instances_lock);
+ if (__instance_lookup(log, group_num)) {
err = -EEXIST;
goto out_unlock;
}
@@ -162,7 +178,9 @@ instance_create(u_int16_t group_num, int pid)
setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
- inst->peer_pid = pid;
+ inst->net = get_net(net);
+ inst->peer_user_ns = user_ns;
+ inst->peer_portid = portid;
inst->group_num = group_num;
inst->qthreshold = NFULNL_QTHRESH_DEFAULT;
@@ -172,14 +190,15 @@ instance_create(u_int16_t group_num, int pid)
inst->copy_range = NFULNL_COPY_RANGE_MAX;
hlist_add_head_rcu(&inst->hlist,
- &instance_table[instance_hashfn(group_num)]);
+ &log->instance_table[instance_hashfn(group_num)]);
+
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
return inst;
out_unlock:
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
return ERR_PTR(err);
}
@@ -208,11 +227,12 @@ __instance_destroy(struct nfulnl_instance *inst)
}
static inline void
-instance_destroy(struct nfulnl_instance *inst)
+instance_destroy(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst)
{
- spin_lock_bh(&instances_lock);
+ spin_lock_bh(&log->instances_lock);
__instance_destroy(inst);
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
static int
@@ -296,7 +316,8 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
}
static struct sk_buff *
-nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
+nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
+ unsigned int pkt_size)
{
struct sk_buff *skb;
unsigned int n;
@@ -305,19 +326,17 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
* message. WARNING: has to be <= 128k due to slab restrictions */
n = max(inst_size, pkt_size);
- skb = alloc_skb(n, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);
if (!skb) {
- pr_notice("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
- inst_size);
-
if (n > pkt_size) {
/* try to allocate only as much as we need for current
* packet */
- skb = alloc_skb(pkt_size, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(net, pkt_size,
+ peer_portid, GFP_ATOMIC);
if (!skb)
- pr_err("nfnetlink_log: can't even alloc %u "
- "bytes\n", pkt_size);
+ pr_err("nfnetlink_log: can't even alloc %u bytes\n",
+ pkt_size);
}
}
@@ -329,18 +348,20 @@ __nfulnl_send(struct nfulnl_instance *inst)
{
int status = -1;
- if (inst->qlen > 1)
- NLMSG_PUT(inst->skb, 0, 0,
- NLMSG_DONE,
- sizeof(struct nfgenmsg));
-
- status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid,
+ if (inst->qlen > 1) {
+ struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0,
+ NLMSG_DONE,
+ sizeof(struct nfgenmsg),
+ 0);
+ if (!nlh)
+ goto out;
+ }
+ status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
MSG_DONTWAIT);
inst->qlen = 0;
inst->skb = NULL;
-
-nlmsg_failure:
+out:
return status;
}
@@ -369,111 +390,139 @@ nfulnl_timer(unsigned long data)
/* This is an inline function, we don't really care about a long
* list of arguments */
static inline int
-__build_packet_message(struct nfulnl_instance *inst,
+__build_packet_message(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst,
const struct sk_buff *skb,
unsigned int data_len,
u_int8_t pf,
unsigned int hooknum,
const struct net_device *indev,
const struct net_device *outdev,
- const struct nf_loginfo *li,
const char *prefix, unsigned int plen)
{
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- __be32 tmp_uint;
sk_buff_data_t old_tail = inst->skb->tail;
+ struct sock *sk;
+ const unsigned char *hwhdrp;
- nlh = NLMSG_PUT(inst->skb, 0, 0,
+ nlh = nlmsg_put(inst->skb, 0, 0,
NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
- sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
+ sizeof(struct nfgenmsg), 0);
+ if (!nlh)
+ return -1;
+ nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = pf;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(inst->group_num);
+ memset(&pmsg, 0, sizeof(pmsg));
pmsg.hw_protocol = skb->protocol;
pmsg.hook = hooknum;
- NLA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
+ if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg))
+ goto nla_put_failure;
- if (prefix)
- NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix);
+ if (prefix &&
+ nla_put(inst->skb, NFULA_PREFIX, plen, prefix))
+ goto nla_put_failure;
if (indev) {
#ifndef CONFIG_BRIDGE_NETFILTER
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
- htonl(indev->ifindex));
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
#else
if (pf == PF_BRIDGE) {
/* Case 1: outdev is physical input device, we need to
* look for bridge group (when called from
* netfilter_bridge) */
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- htonl(indev->ifindex));
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ htonl(indev->ifindex)) ||
/* this is the bridge group "brX" */
/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
- htonl(br_port_get_rcu(indev)->br->dev->ifindex));
+ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
+ goto nla_put_failure;
} else {
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
- htonl(indev->ifindex));
- if (skb->nf_bridge && skb->nf_bridge->physindev)
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- htonl(skb->nf_bridge->physindev->ifindex));
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+ if (skb->nf_bridge && skb->nf_bridge->physindev &&
+ nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ htonl(skb->nf_bridge->physindev->ifindex)))
+ goto nla_put_failure;
}
#endif
}
if (outdev) {
- tmp_uint = htonl(outdev->ifindex);
#ifndef CONFIG_BRIDGE_NETFILTER
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
- htonl(outdev->ifindex));
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
#else
if (pf == PF_BRIDGE) {
/* Case 1: outdev is physical output device, we need to
* look for bridge group (when called from
* netfilter_bridge) */
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
- htonl(outdev->ifindex));
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ htonl(outdev->ifindex)) ||
/* this is the bridge group "brX" */
/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
- htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
+ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
+ goto nla_put_failure;
} else {
/* Case 2: indev is a bridge group, we need to look
* for physical device (when called from ipv4) */
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
- htonl(outdev->ifindex));
- if (skb->nf_bridge && skb->nf_bridge->physoutdev)
- NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
- htonl(skb->nf_bridge->physoutdev->ifindex));
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+ if (skb->nf_bridge && skb->nf_bridge->physoutdev &&
+ nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ htonl(skb->nf_bridge->physoutdev->ifindex)))
+ goto nla_put_failure;
}
#endif
}
- if (skb->mark)
- NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark));
+ if (skb->mark &&
+ nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark)))
+ goto nla_put_failure;
- if (indev && skb->dev) {
+ if (indev && skb->dev &&
+ skb->mac_header != skb->network_header) {
struct nfulnl_msg_packet_hw phw;
- int len = dev_parse_header(skb, phw.hw_addr);
+ int len;
+
+ memset(&phw, 0, sizeof(phw));
+ len = dev_parse_header(skb, phw.hw_addr);
if (len > 0) {
phw.hw_addrlen = htons(len);
- NLA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
+ if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw))
+ goto nla_put_failure;
}
}
if (indev && skb_mac_header_was_set(skb)) {
- NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type));
- NLA_PUT_BE16(inst->skb, NFULA_HWLEN,
- htons(skb->dev->hard_header_len));
- NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len,
- skb_mac_header(skb));
+ if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) ||
+ nla_put_be16(inst->skb, NFULA_HWLEN,
+ htons(skb->dev->hard_header_len)))
+ goto nla_put_failure;
+
+ hwhdrp = skb_mac_header(skb);
+
+ if (skb->dev->type == ARPHRD_SIT)
+ hwhdrp -= ETH_HLEN;
+
+ if (hwhdrp >= skb->head &&
+ nla_put(inst->skb, NFULA_HWHEADER,
+ skb->dev->hard_header_len, hwhdrp))
+ goto nla_put_failure;
}
if (skb->tstamp.tv64) {
@@ -482,32 +531,38 @@ __build_packet_message(struct nfulnl_instance *inst,
ts.sec = cpu_to_be64(tv.tv_sec);
ts.usec = cpu_to_be64(tv.tv_usec);
- NLA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
+ if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts))
+ goto nla_put_failure;
}
/* UID */
- if (skb->sk) {
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
- struct file *file = skb->sk->sk_socket->file;
- __be32 uid = htonl(file->f_cred->fsuid);
- __be32 gid = htonl(file->f_cred->fsgid);
- /* need to unlock here since NLA_PUT may goto */
- read_unlock_bh(&skb->sk->sk_callback_lock);
- NLA_PUT_BE32(inst->skb, NFULA_UID, uid);
- NLA_PUT_BE32(inst->skb, NFULA_GID, gid);
+ sk = skb->sk;
+ if (sk && sk->sk_state != TCP_TIME_WAIT) {
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ struct file *file = sk->sk_socket->file;
+ const struct cred *cred = file->f_cred;
+ struct user_namespace *user_ns = inst->peer_user_ns;
+ __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid));
+ __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid));
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (nla_put_be32(inst->skb, NFULA_UID, uid) ||
+ nla_put_be32(inst->skb, NFULA_GID, gid))
+ goto nla_put_failure;
} else
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
/* local sequence number */
- if (inst->flags & NFULNL_CFG_F_SEQ)
- NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++));
+ if ((inst->flags & NFULNL_CFG_F_SEQ) &&
+ nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++)))
+ goto nla_put_failure;
/* global sequence number */
- if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
- NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL,
- htonl(atomic_inc_return(&global_seq)));
+ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
+ nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
+ htonl(atomic_inc_return(&log->global_seq))))
+ goto nla_put_failure;
if (data_len) {
struct nlattr *nla;
@@ -515,7 +570,7 @@ __build_packet_message(struct nfulnl_instance *inst,
if (skb_tailroom(inst->skb) < nla_total_size(data_len)) {
printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
- goto nlmsg_failure;
+ return -1;
}
nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len));
@@ -529,7 +584,6 @@ __build_packet_message(struct nfulnl_instance *inst,
nlh->nlmsg_len = inst->skb->tail - old_tail;
return 0;
-nlmsg_failure:
nla_put_failure:
PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
return -1;
@@ -550,7 +604,8 @@ static struct nf_loginfo default_loginfo = {
/* log handler for internal netfilter logging api */
void
-nfulnl_log_packet(u_int8_t pf,
+nfulnl_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -563,13 +618,14 @@ nfulnl_log_packet(u_int8_t pf,
const struct nf_loginfo *li;
unsigned int qthreshold;
unsigned int plen;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(li->u.ulog.group);
+ inst = instance_lookup_get(log, li->u.ulog.group);
if (!inst)
return;
@@ -580,7 +636,7 @@ nfulnl_log_packet(u_int8_t pf,
/* FIXME: do we want to make the size calculation conditional based on
* what is actually present? way more branches and checks, but more
* memory efficient... */
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -644,15 +700,16 @@ nfulnl_log_packet(u_int8_t pf,
}
if (!inst->skb) {
- inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size);
+ inst->skb = nfulnl_alloc_skb(net, inst->peer_portid,
+ inst->nlbufsiz, size);
if (!inst->skb)
goto alloc_failure;
}
inst->qlen++;
- __build_packet_message(inst, skb, data_len, pf,
- hooknum, in, out, li, prefix, plen);
+ __build_packet_message(log, inst, skb, data_len, pf,
+ hooknum, in, out, prefix, plen);
if (inst->qlen >= qthreshold)
__nfulnl_flush(inst);
@@ -680,24 +737,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct netlink_notify *n = ptr;
+ struct nfnl_log_net *log = nfnl_log_pernet(n->net);
if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
int i;
- /* destroy all instances for this pid */
- spin_lock_bh(&instances_lock);
+ /* destroy all instances for this portid */
+ spin_lock_bh(&log->instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
- struct hlist_node *tmp, *t2;
+ struct hlist_node *t2;
struct nfulnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &log->instance_table[i];
- hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
- if ((net_eq(n->net, &init_net)) &&
- (n->pid == inst->peer_pid))
+ hlist_for_each_entry_safe(inst, t2, head, hlist) {
+ if (n->portid == inst->peer_portid)
__instance_destroy(inst);
}
}
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
return NOTIFY_DONE;
}
@@ -734,10 +791,12 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nfula[])
{
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int16_t group_num = ntohs(nfmsg->res_id);
struct nfulnl_instance *inst;
struct nfulnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
if (nfula[NFULA_CFG_CMD]) {
@@ -747,15 +806,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
/* Commands without queue context */
switch (cmd->command) {
case NFULNL_CFG_CMD_PF_BIND:
- return nf_log_bind_pf(pf, &nfulnl_logger);
+ return nf_log_bind_pf(net, pf, &nfulnl_logger);
case NFULNL_CFG_CMD_PF_UNBIND:
- nf_log_unbind_pf(pf);
+ nf_log_unbind_pf(net, pf);
return 0;
}
}
- inst = instance_lookup_get(group_num);
- if (inst && inst->peer_pid != NETLINK_CB(skb).pid) {
+ inst = instance_lookup_get(log, group_num);
+ if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
ret = -EPERM;
goto out_put;
}
@@ -768,8 +827,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
}
- inst = instance_create(group_num,
- NETLINK_CB(skb).pid);
+ inst = instance_create(net, group_num,
+ NETLINK_CB(skb).portid,
+ sk_user_ns(NETLINK_CB(skb).sk));
if (IS_ERR(inst)) {
ret = PTR_ERR(inst);
goto out;
@@ -781,7 +841,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out;
}
- instance_destroy(inst);
+ instance_destroy(log, inst);
goto out_put;
default:
ret = -ENOTSUPP;
@@ -864,55 +924,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = {
#ifdef CONFIG_PROC_FS
struct iter_state {
+ struct seq_net_private p;
unsigned int bucket;
};
-static struct hlist_node *get_first(struct iter_state *st)
+static struct hlist_node *get_first(struct net *net, struct iter_state *st)
{
+ struct nfnl_log_net *log;
if (!st)
return NULL;
+ log = nfnl_log_pernet(net);
+
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return rcu_dereference_bh(instance_table[st->bucket].first);
+ struct hlist_head *head = &log->instance_table[st->bucket];
+
+ if (!hlist_empty(head))
+ return rcu_dereference_bh(hlist_first_rcu(head));
}
return NULL;
}
-static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
+static struct hlist_node *get_next(struct net *net, struct iter_state *st,
+ struct hlist_node *h)
{
- h = rcu_dereference_bh(h->next);
+ h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
+ struct nfnl_log_net *log;
+ struct hlist_head *head;
+
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = rcu_dereference_bh(instance_table[st->bucket].first);
+ log = nfnl_log_pernet(net);
+ head = &log->instance_table[st->bucket];
+ h = rcu_dereference_bh(hlist_first_rcu(head));
}
return h;
}
-static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
+static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
+ loff_t pos)
{
struct hlist_node *head;
- head = get_first(st);
+ head = get_first(net, st);
if (head)
- while (pos && (head = get_next(st, head)))
+ while (pos && (head = get_next(net, st, head)))
pos--;
return pos ? NULL : head;
}
-static void *seq_start(struct seq_file *seq, loff_t *pos)
+static void *seq_start(struct seq_file *s, loff_t *pos)
__acquires(rcu_bh)
{
rcu_read_lock_bh();
- return get_idx(seq->private, *pos);
+ return get_idx(seq_file_net(s), s->private, *pos);
}
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
(*pos)++;
- return get_next(s->private, v);
+ return get_next(seq_file_net(s), s->private, v);
}
static void seq_stop(struct seq_file *s, void *v)
@@ -927,7 +1000,7 @@ static int seq_show(struct seq_file *s, void *v)
return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
inst->group_num,
- inst->peer_pid, inst->qlen,
+ inst->peer_portid, inst->qlen,
inst->copy_mode, inst->copy_range,
inst->flushtimeout, atomic_read(&inst->use));
}
@@ -941,8 +1014,8 @@ static const struct seq_operations nful_seq_ops = {
static int nful_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &nful_seq_ops,
- sizeof(struct iter_state));
+ return seq_open_net(inode, file, &nful_seq_ops,
+ sizeof(struct iter_state));
}
static const struct file_operations nful_file_ops = {
@@ -950,47 +1023,69 @@ static const struct file_operations nful_file_ops = {
.open = nful_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
-static int __init nfnetlink_log_init(void)
+static int __net_init nfnl_log_net_init(struct net *net)
{
- int i, status = -ENOMEM;
+ unsigned int i;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
+ INIT_HLIST_HEAD(&log->instance_table[i]);
+ spin_lock_init(&log->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_log", 0440,
+ net->nf.proc_netfilter, &nful_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
- /* it's not really all that important to have a random value, so
- * we can do this from the init function, even if there hasn't
- * been that much entropy yet */
- get_random_bytes(&hash_init, sizeof(hash_init));
+static void __net_exit nfnl_log_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
+#endif
+ nf_log_unset(net, &nfulnl_logger);
+}
+
+static struct pernet_operations nfnl_log_net_ops = {
+ .init = nfnl_log_net_init,
+ .exit = nfnl_log_net_exit,
+ .id = &nfnl_log_net_id,
+ .size = sizeof(struct nfnl_log_net),
+};
+
+static int __init nfnetlink_log_init(void)
+{
+ int status = -ENOMEM;
netlink_register_notifier(&nfulnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfulnl_subsys);
if (status < 0) {
- printk(KERN_ERR "log: failed to create netlink socket\n");
+ pr_err("log: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
if (status < 0) {
- printk(KERN_ERR "log: failed to register logger\n");
+ pr_err("log: failed to register logger\n");
goto cleanup_subsys;
}
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_log", 0440,
- proc_net_netfilter, &nful_file_ops))
+ status = register_pernet_subsys(&nfnl_log_net_ops);
+ if (status < 0) {
+ pr_err("log: failed to register pernet ops\n");
goto cleanup_logger;
-#endif
+ }
return status;
-#ifdef CONFIG_PROC_FS
cleanup_logger:
nf_log_unregister(&nfulnl_logger);
-#endif
cleanup_subsys:
nfnetlink_subsys_unregister(&nfulnl_subsys);
cleanup_netlink_notifier:
@@ -1000,10 +1095,8 @@ cleanup_netlink_notifier:
static void __exit nfnetlink_log_fini(void)
{
+ unregister_pernet_subsys(&nfnl_log_net_ops);
nf_log_unregister(&nfulnl_logger);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_log", proc_net_netfilter);
-#endif
nfnetlink_subsys_unregister(&nfulnl_subsys);
netlink_unregister_notifier(&nfulnl_rtnl_notifier);
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
deleted file mode 100644
index 68e67d19724..00000000000
--- a/net/netfilter/nfnetlink_queue.c
+++ /dev/null
@@ -1,943 +0,0 @@
-/*
- * This is a module which is used for queueing packets and communicating with
- * userspace via nfnetlink.
- *
- * (C) 2005 by Harald Welte <laforge@netfilter.org>
- * (C) 2007 by Patrick McHardy <kaber@trash.net>
- *
- * Based on the old ipv4-only ip_queue.c:
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/proc_fs.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_queue.h>
-#include <linux/list.h>
-#include <net/sock.h>
-#include <net/netfilter/nf_queue.h>
-
-#include <asm/atomic.h>
-
-#ifdef CONFIG_BRIDGE_NETFILTER
-#include "../bridge/br_private.h"
-#endif
-
-#define NFQNL_QMAX_DEFAULT 1024
-
-struct nfqnl_instance {
- struct hlist_node hlist; /* global list of queues */
- struct rcu_head rcu;
-
- int peer_pid;
- unsigned int queue_maxlen;
- unsigned int copy_range;
- unsigned int queue_dropped;
- unsigned int queue_user_dropped;
-
-
- u_int16_t queue_num; /* number of this queue */
- u_int8_t copy_mode;
-/*
- * Following fields are dirtied for each queued packet,
- * keep them in same cache line if possible.
- */
- spinlock_t lock;
- unsigned int queue_total;
- atomic_t id_sequence; /* 'sequence' of pkt ids */
- struct list_head queue_list; /* packets in queue */
-};
-
-typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
-
-static DEFINE_SPINLOCK(instances_lock);
-
-#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly;
-
-static inline u_int8_t instance_hashfn(u_int16_t queue_num)
-{
- return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
-}
-
-static struct nfqnl_instance *
-instance_lookup(u_int16_t queue_num)
-{
- struct hlist_head *head;
- struct hlist_node *pos;
- struct nfqnl_instance *inst;
-
- head = &instance_table[instance_hashfn(queue_num)];
- hlist_for_each_entry_rcu(inst, pos, head, hlist) {
- if (inst->queue_num == queue_num)
- return inst;
- }
- return NULL;
-}
-
-static struct nfqnl_instance *
-instance_create(u_int16_t queue_num, int pid)
-{
- struct nfqnl_instance *inst;
- unsigned int h;
- int err;
-
- spin_lock(&instances_lock);
- if (instance_lookup(queue_num)) {
- err = -EEXIST;
- goto out_unlock;
- }
-
- inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
- if (!inst) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- inst->queue_num = queue_num;
- inst->peer_pid = pid;
- inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
- inst->copy_range = 0xfffff;
- inst->copy_mode = NFQNL_COPY_NONE;
- spin_lock_init(&inst->lock);
- INIT_LIST_HEAD(&inst->queue_list);
-
- if (!try_module_get(THIS_MODULE)) {
- err = -EAGAIN;
- goto out_free;
- }
-
- h = instance_hashfn(queue_num);
- hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
-
- spin_unlock(&instances_lock);
-
- return inst;
-
-out_free:
- kfree(inst);
-out_unlock:
- spin_unlock(&instances_lock);
- return ERR_PTR(err);
-}
-
-static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
- unsigned long data);
-
-static void
-instance_destroy_rcu(struct rcu_head *head)
-{
- struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
- rcu);
-
- nfqnl_flush(inst, NULL, 0);
- kfree(inst);
- module_put(THIS_MODULE);
-}
-
-static void
-__instance_destroy(struct nfqnl_instance *inst)
-{
- hlist_del_rcu(&inst->hlist);
- call_rcu(&inst->rcu, instance_destroy_rcu);
-}
-
-static void
-instance_destroy(struct nfqnl_instance *inst)
-{
- spin_lock(&instances_lock);
- __instance_destroy(inst);
- spin_unlock(&instances_lock);
-}
-
-static inline void
-__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
-{
- list_add_tail(&entry->list, &queue->queue_list);
- queue->queue_total++;
-}
-
-static struct nf_queue_entry *
-find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
-{
- struct nf_queue_entry *entry = NULL, *i;
-
- spin_lock_bh(&queue->lock);
-
- list_for_each_entry(i, &queue->queue_list, list) {
- if (i->id == id) {
- entry = i;
- break;
- }
- }
-
- if (entry) {
- list_del(&entry->list);
- queue->queue_total--;
- }
-
- spin_unlock_bh(&queue->lock);
-
- return entry;
-}
-
-static void
-nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
-{
- struct nf_queue_entry *entry, *next;
-
- spin_lock_bh(&queue->lock);
- list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
- if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue->queue_total--;
- nf_reinject(entry, NF_DROP);
- }
- }
- spin_unlock_bh(&queue->lock);
-}
-
-static struct sk_buff *
-nfqnl_build_packet_message(struct nfqnl_instance *queue,
- struct nf_queue_entry *entry)
-{
- sk_buff_data_t old_tail;
- size_t size;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct nfqnl_msg_packet_hdr pmsg;
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- struct sk_buff *entskb = entry->skb;
- struct net_device *indev;
- struct net_device *outdev;
-
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
- + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
- + nla_total_size(sizeof(u_int32_t)) /* ifindex */
- + nla_total_size(sizeof(u_int32_t)) /* ifindex */
-#ifdef CONFIG_BRIDGE_NETFILTER
- + nla_total_size(sizeof(u_int32_t)) /* ifindex */
- + nla_total_size(sizeof(u_int32_t)) /* ifindex */
-#endif
- + nla_total_size(sizeof(u_int32_t)) /* mark */
- + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
- + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
-
- outdev = entry->outdev;
-
- switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
- case NFQNL_COPY_META:
- case NFQNL_COPY_NONE:
- break;
-
- case NFQNL_COPY_PACKET:
- if (entskb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(entskb))
- return NULL;
-
- data_len = ACCESS_ONCE(queue->copy_range);
- if (data_len == 0 || data_len > entskb->len)
- data_len = entskb->len;
-
- size += nla_total_size(data_len);
- break;
- }
-
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0,
- NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
- sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
- nfmsg->nfgen_family = entry->pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(queue->queue_num);
-
- entry->id = atomic_inc_return(&queue->id_sequence);
- pmsg.packet_id = htonl(entry->id);
- pmsg.hw_protocol = entskb->protocol;
- pmsg.hook = entry->hook;
-
- NLA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
-
- indev = entry->indev;
- if (indev) {
-#ifndef CONFIG_BRIDGE_NETFILTER
- NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex));
-#else
- if (entry->pf == PF_BRIDGE) {
- /* Case 1: indev is physical input device, we need to
- * look for bridge group (when called from
- * netfilter_bridge) */
- NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
- htonl(indev->ifindex));
- /* this is the bridge group "brX" */
- /* rcu_read_lock()ed by __nf_queue */
- NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
- htonl(br_port_get_rcu(indev)->br->dev->ifindex));
- } else {
- /* Case 2: indev is bridge group, we need to look for
- * physical device (when called from ipv4) */
- NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
- htonl(indev->ifindex));
- if (entskb->nf_bridge && entskb->nf_bridge->physindev)
- NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
- htonl(entskb->nf_bridge->physindev->ifindex));
- }
-#endif
- }
-
- if (outdev) {
-#ifndef CONFIG_BRIDGE_NETFILTER
- NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex));
-#else
- if (entry->pf == PF_BRIDGE) {
- /* Case 1: outdev is physical output device, we need to
- * look for bridge group (when called from
- * netfilter_bridge) */
- NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
- htonl(outdev->ifindex));
- /* this is the bridge group "brX" */
- /* rcu_read_lock()ed by __nf_queue */
- NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
- htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
- } else {
- /* Case 2: outdev is bridge group, we need to look for
- * physical output device (when called from ipv4) */
- NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
- htonl(outdev->ifindex));
- if (entskb->nf_bridge && entskb->nf_bridge->physoutdev)
- NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
- htonl(entskb->nf_bridge->physoutdev->ifindex));
- }
-#endif
- }
-
- if (entskb->mark)
- NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark));
-
- if (indev && entskb->dev) {
- struct nfqnl_msg_packet_hw phw;
- int len = dev_parse_header(entskb, phw.hw_addr);
- if (len) {
- phw.hw_addrlen = htons(len);
- NLA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
- }
- }
-
- if (entskb->tstamp.tv64) {
- struct nfqnl_msg_packet_timestamp ts;
- struct timeval tv = ktime_to_timeval(entskb->tstamp);
- ts.sec = cpu_to_be64(tv.tv_sec);
- ts.usec = cpu_to_be64(tv.tv_usec);
-
- NLA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
- }
-
- if (data_len) {
- struct nlattr *nla;
- int sz = nla_attr_size(data_len);
-
- if (skb_tailroom(skb) < nla_total_size(data_len)) {
- printk(KERN_WARNING "nf_queue: no tailroom!\n");
- goto nlmsg_failure;
- }
-
- nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len));
- nla->nla_type = NFQA_PAYLOAD;
- nla->nla_len = sz;
-
- if (skb_copy_bits(entskb, 0, nla_data(nla), data_len))
- BUG();
- }
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
-nla_put_failure:
- if (skb)
- kfree_skb(skb);
- if (net_ratelimit())
- printk(KERN_ERR "nf_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
-{
- struct sk_buff *nskb;
- struct nfqnl_instance *queue;
- int err;
-
- /* rcu_read_lock()ed by nf_hook_slow() */
- queue = instance_lookup(queuenum);
- if (!queue)
- goto err_out;
-
- if (queue->copy_mode == NFQNL_COPY_NONE)
- goto err_out;
-
- nskb = nfqnl_build_packet_message(queue, entry);
- if (nskb == NULL)
- goto err_out;
-
- spin_lock_bh(&queue->lock);
-
- if (!queue->peer_pid)
- goto err_out_free_nskb;
-
- if (queue->queue_total >= queue->queue_maxlen) {
- queue->queue_dropped++;
- if (net_ratelimit())
- printk(KERN_WARNING "nf_queue: full at %d entries, "
- "dropping packets(s).\n",
- queue->queue_total);
- goto err_out_free_nskb;
- }
-
- /* nfnetlink_unicast will either free the nskb or add it to a socket */
- err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT);
- if (err < 0) {
- queue->queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __enqueue_entry(queue, entry);
-
- spin_unlock_bh(&queue->lock);
- return 0;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-err_out_unlock:
- spin_unlock_bh(&queue->lock);
-err_out:
- return -1;
-}
-
-static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e)
-{
- struct sk_buff *nskb;
- int diff;
-
- diff = data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
- diff, GFP_ATOMIC);
- if (!nskb) {
- printk(KERN_WARNING "nf_queue: OOM "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- kfree_skb(e->skb);
- e->skb = nskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(e->skb, data_len))
- return -ENOMEM;
- skb_copy_to_linear_data(e->skb, data, data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
- return 0;
-}
-
-static int
-nfqnl_set_mode(struct nfqnl_instance *queue,
- unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- spin_lock_bh(&queue->lock);
- switch (mode) {
- case NFQNL_COPY_NONE:
- case NFQNL_COPY_META:
- queue->copy_mode = mode;
- queue->copy_range = 0;
- break;
-
- case NFQNL_COPY_PACKET:
- queue->copy_mode = mode;
- /* we're using struct nlattr which has 16bit nla_len */
- if (range > 0xffff)
- queue->copy_range = 0xffff;
- else
- queue->copy_range = range;
- break;
-
- default:
- status = -EINVAL;
-
- }
- spin_unlock_bh(&queue->lock);
-
- return status;
-}
-
-static int
-dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->indev)
- if (entry->indev->ifindex == ifindex)
- return 1;
- if (entry->outdev)
- if (entry->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-/* drop all packets with either indev or outdev == ifindex from all queue
- * instances */
-static void
-nfqnl_dev_drop(int ifindex)
-{
- int i;
-
- rcu_read_lock();
-
- for (i = 0; i < INSTANCE_BUCKETS; i++) {
- struct hlist_node *tmp;
- struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
-
- hlist_for_each_entry_rcu(inst, tmp, head, hlist)
- nfqnl_flush(inst, dev_cmp, ifindex);
- }
-
- rcu_read_unlock();
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static int
-nfqnl_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- nfqnl_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block nfqnl_dev_notifier = {
- .notifier_call = nfqnl_rcv_dev_event,
-};
-
-static int
-nfqnl_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
- int i;
-
- /* destroy all instances for this pid */
- spin_lock(&instances_lock);
- for (i = 0; i < INSTANCE_BUCKETS; i++) {
- struct hlist_node *tmp, *t2;
- struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
-
- hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
- if ((n->net == &init_net) &&
- (n->pid == inst->peer_pid))
- __instance_destroy(inst);
- }
- }
- spin_unlock(&instances_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block nfqnl_rtnl_notifier = {
- .notifier_call = nfqnl_rcv_nl_event,
-};
-
-static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
- [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
- [NFQA_MARK] = { .type = NLA_U32 },
- [NFQA_PAYLOAD] = { .type = NLA_UNSPEC },
-};
-
-static int
-nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
-{
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
-
- struct nfqnl_msg_verdict_hdr *vhdr;
- struct nfqnl_instance *queue;
- unsigned int verdict;
- struct nf_queue_entry *entry;
- int err;
-
- rcu_read_lock();
- queue = instance_lookup(queue_num);
- if (!queue) {
- err = -ENODEV;
- goto err_out_unlock;
- }
-
- if (queue->peer_pid != NETLINK_CB(skb).pid) {
- err = -EPERM;
- goto err_out_unlock;
- }
-
- if (!nfqa[NFQA_VERDICT_HDR]) {
- err = -EINVAL;
- goto err_out_unlock;
- }
-
- vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
- verdict = ntohl(vhdr->verdict);
-
- if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
- err = -EINVAL;
- goto err_out_unlock;
- }
-
- entry = find_dequeue_entry(queue, ntohl(vhdr->id));
- if (entry == NULL) {
- err = -ENOENT;
- goto err_out_unlock;
- }
- rcu_read_unlock();
-
- if (nfqa[NFQA_PAYLOAD]) {
- if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
- nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0)
- verdict = NF_DROP;
- }
-
- if (nfqa[NFQA_MARK])
- entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
-
- nf_reinject(entry, verdict);
- return 0;
-
-err_out_unlock:
- rcu_read_unlock();
- return err;
-}
-
-static int
-nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
-{
- return -ENOTSUPP;
-}
-
-static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
- [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) },
- [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) },
-};
-
-static const struct nf_queue_handler nfqh = {
- .name = "nf_queue",
- .outfn = &nfqnl_enqueue_packet,
-};
-
-static int
-nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
-{
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
- struct nfqnl_instance *queue;
- struct nfqnl_msg_config_cmd *cmd = NULL;
- int ret = 0;
-
- if (nfqa[NFQA_CFG_CMD]) {
- cmd = nla_data(nfqa[NFQA_CFG_CMD]);
-
- /* Commands without queue context - might sleep */
- switch (cmd->command) {
- case NFQNL_CFG_CMD_PF_BIND:
- return nf_register_queue_handler(ntohs(cmd->pf),
- &nfqh);
- case NFQNL_CFG_CMD_PF_UNBIND:
- return nf_unregister_queue_handler(ntohs(cmd->pf),
- &nfqh);
- }
- }
-
- rcu_read_lock();
- queue = instance_lookup(queue_num);
- if (queue && queue->peer_pid != NETLINK_CB(skb).pid) {
- ret = -EPERM;
- goto err_out_unlock;
- }
-
- if (cmd != NULL) {
- switch (cmd->command) {
- case NFQNL_CFG_CMD_BIND:
- if (queue) {
- ret = -EBUSY;
- goto err_out_unlock;
- }
- queue = instance_create(queue_num, NETLINK_CB(skb).pid);
- if (IS_ERR(queue)) {
- ret = PTR_ERR(queue);
- goto err_out_unlock;
- }
- break;
- case NFQNL_CFG_CMD_UNBIND:
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
- instance_destroy(queue);
- break;
- case NFQNL_CFG_CMD_PF_BIND:
- case NFQNL_CFG_CMD_PF_UNBIND:
- break;
- default:
- ret = -ENOTSUPP;
- break;
- }
- }
-
- if (nfqa[NFQA_CFG_PARAMS]) {
- struct nfqnl_msg_config_params *params;
-
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
- params = nla_data(nfqa[NFQA_CFG_PARAMS]);
- nfqnl_set_mode(queue, params->copy_mode,
- ntohl(params->copy_range));
- }
-
- if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
- __be32 *queue_maxlen;
-
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
- queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
- spin_lock_bh(&queue->lock);
- queue->queue_maxlen = ntohl(*queue_maxlen);
- spin_unlock_bh(&queue->lock);
- }
-
-err_out_unlock:
- rcu_read_unlock();
- return ret;
-}
-
-static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
- [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX, },
- [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_policy },
- [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX,
- .policy = nfqa_cfg_policy },
-};
-
-static const struct nfnetlink_subsystem nfqnl_subsys = {
- .name = "nf_queue",
- .subsys_id = NFNL_SUBSYS_QUEUE,
- .cb_count = NFQNL_MSG_MAX,
- .cb = nfqnl_cb,
-};
-
-#ifdef CONFIG_PROC_FS
-struct iter_state {
- unsigned int bucket;
-};
-
-static struct hlist_node *get_first(struct seq_file *seq)
-{
- struct iter_state *st = seq->private;
-
- if (!st)
- return NULL;
-
- for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return instance_table[st->bucket].first;
- }
- return NULL;
-}
-
-static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
-{
- struct iter_state *st = seq->private;
-
- h = h->next;
- while (!h) {
- if (++st->bucket >= INSTANCE_BUCKETS)
- return NULL;
-
- h = instance_table[st->bucket].first;
- }
- return h;
-}
-
-static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_node *head;
- head = get_first(seq);
-
- if (head)
- while (pos && (head = get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
-}
-
-static void *seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(instances_lock)
-{
- spin_lock(&instances_lock);
- return get_idx(seq, *pos);
-}
-
-static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
- return get_next(s, v);
-}
-
-static void seq_stop(struct seq_file *s, void *v)
- __releases(instances_lock)
-{
- spin_unlock(&instances_lock);
-}
-
-static int seq_show(struct seq_file *s, void *v)
-{
- const struct nfqnl_instance *inst = v;
-
- return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
- inst->queue_num,
- inst->peer_pid, inst->queue_total,
- inst->copy_mode, inst->copy_range,
- inst->queue_dropped, inst->queue_user_dropped,
- atomic_read(&inst->id_sequence), 1);
-}
-
-static const struct seq_operations nfqnl_seq_ops = {
- .start = seq_start,
- .next = seq_next,
- .stop = seq_stop,
- .show = seq_show,
-};
-
-static int nfqnl_open(struct inode *inode, struct file *file)
-{
- return seq_open_private(file, &nfqnl_seq_ops,
- sizeof(struct iter_state));
-}
-
-static const struct file_operations nfqnl_file_ops = {
- .owner = THIS_MODULE,
- .open = nfqnl_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-#endif /* PROC_FS */
-
-static int __init nfnetlink_queue_init(void)
-{
- int i, status = -ENOMEM;
-
- for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
-
- netlink_register_notifier(&nfqnl_rtnl_notifier);
- status = nfnetlink_subsys_register(&nfqnl_subsys);
- if (status < 0) {
- printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_queue", 0440,
- proc_net_netfilter, &nfqnl_file_ops))
- goto cleanup_subsys;
-#endif
-
- register_netdevice_notifier(&nfqnl_dev_notifier);
- return status;
-
-#ifdef CONFIG_PROC_FS
-cleanup_subsys:
- nfnetlink_subsys_unregister(&nfqnl_subsys);
-#endif
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&nfqnl_rtnl_notifier);
- return status;
-}
-
-static void __exit nfnetlink_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
- unregister_netdevice_notifier(&nfqnl_dev_notifier);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
-#endif
- nfnetlink_subsys_unregister(&nfqnl_subsys);
- netlink_unregister_notifier(&nfqnl_rtnl_notifier);
-
- rcu_barrier(); /* Wait for completion of call_rcu()'s */
-}
-
-MODULE_DESCRIPTION("netfilter packet queue handler");
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
-
-module_init(nfnetlink_queue_init);
-module_exit(nfnetlink_queue_fini);
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
new file mode 100644
index 00000000000..108120f216b
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -0,0 +1,1352 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2007 by Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nfnetlink_queue.h>
+
+#include <linux/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFQNL_QMAX_DEFAULT 1024
+
+/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
+ * includes the header length. Thus, the maximum packet length that we
+ * support is 65531 bytes. We send truncated packets if the specified length
+ * is larger than that. Userspace can check for presence of NFQA_CAP_LEN
+ * attribute to detect truncation.
+ */
+#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
+
+struct nfqnl_instance {
+ struct hlist_node hlist; /* global list of queues */
+ struct rcu_head rcu;
+
+ int peer_portid;
+ unsigned int queue_maxlen;
+ unsigned int copy_range;
+ unsigned int queue_dropped;
+ unsigned int queue_user_dropped;
+
+
+ u_int16_t queue_num; /* number of this queue */
+ u_int8_t copy_mode;
+ u_int32_t flags; /* Set using NFQA_CFG_FLAGS */
+/*
+ * Following fields are dirtied for each queued packet,
+ * keep them in same cache line if possible.
+ */
+ spinlock_t lock;
+ unsigned int queue_total;
+ unsigned int id_sequence; /* 'sequence' of pkt ids */
+ struct list_head queue_list; /* packets in queue */
+};
+
+typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
+
+static int nfnl_queue_net_id __read_mostly;
+
+#define INSTANCE_BUCKETS 16
+struct nfnl_queue_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+};
+
+static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_queue_net_id);
+}
+
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+ return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
+}
+
+static struct nfqnl_instance *
+instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
+{
+ struct hlist_head *head;
+ struct nfqnl_instance *inst;
+
+ head = &q->instance_table[instance_hashfn(queue_num)];
+ hlist_for_each_entry_rcu(inst, head, hlist) {
+ if (inst->queue_num == queue_num)
+ return inst;
+ }
+ return NULL;
+}
+
+static struct nfqnl_instance *
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num,
+ int portid)
+{
+ struct nfqnl_instance *inst;
+ unsigned int h;
+ int err;
+
+ spin_lock(&q->instances_lock);
+ if (instance_lookup(q, queue_num)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
+ if (!inst) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ inst->queue_num = queue_num;
+ inst->peer_portid = portid;
+ inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+ inst->copy_range = NFQNL_MAX_COPY_RANGE;
+ inst->copy_mode = NFQNL_COPY_NONE;
+ spin_lock_init(&inst->lock);
+ INIT_LIST_HEAD(&inst->queue_list);
+
+ if (!try_module_get(THIS_MODULE)) {
+ err = -EAGAIN;
+ goto out_free;
+ }
+
+ h = instance_hashfn(queue_num);
+ hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);
+
+ spin_unlock(&q->instances_lock);
+
+ return inst;
+
+out_free:
+ kfree(inst);
+out_unlock:
+ spin_unlock(&q->instances_lock);
+ return ERR_PTR(err);
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
+ unsigned long data);
+
+static void
+instance_destroy_rcu(struct rcu_head *head)
+{
+ struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
+ rcu);
+
+ nfqnl_flush(inst, NULL, 0);
+ kfree(inst);
+ module_put(THIS_MODULE);
+}
+
+static void
+__instance_destroy(struct nfqnl_instance *inst)
+{
+ hlist_del_rcu(&inst->hlist);
+ call_rcu(&inst->rcu, instance_destroy_rcu);
+}
+
+static void
+instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
+{
+ spin_lock(&q->instances_lock);
+ __instance_destroy(inst);
+ spin_unlock(&q->instances_lock);
+}
+
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+ list_add_tail(&entry->list, &queue->queue_list);
+ queue->queue_total++;
+}
+
+static void
+__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+ list_del(&entry->list);
+ queue->queue_total--;
+}
+
+static struct nf_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+{
+ struct nf_queue_entry *entry = NULL, *i;
+
+ spin_lock_bh(&queue->lock);
+
+ list_for_each_entry(i, &queue->queue_list, list) {
+ if (i->id == id) {
+ entry = i;
+ break;
+ }
+ }
+
+ if (entry)
+ __dequeue_entry(queue, entry);
+
+ spin_unlock_bh(&queue->lock);
+
+ return entry;
+}
+
+static void
+nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
+{
+ struct nf_queue_entry *entry, *next;
+
+ spin_lock_bh(&queue->lock);
+ list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
+ if (!cmpfn || cmpfn(entry, data)) {
+ list_del(&entry->list);
+ queue->queue_total--;
+ nf_reinject(entry, NF_DROP);
+ }
+ }
+ spin_unlock_bh(&queue->lock);
+}
+
+static int
+nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
+ bool csum_verify)
+{
+ __u32 flags = 0;
+
+ if (packet->ip_summed == CHECKSUM_PARTIAL)
+ flags = NFQA_SKB_CSUMNOTREADY;
+ else if (csum_verify)
+ flags = NFQA_SKB_CSUM_NOTVERIFIED;
+
+ if (skb_is_gso(packet))
+ flags |= NFQA_SKB_GSO;
+
+ return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0;
+}
+
+static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
+{
+ const struct cred *cred;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return 0;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ cred = sk->sk_socket->file->f_cred;
+ if (nla_put_be32(skb, NFQA_UID,
+ htonl(from_kuid_munged(&init_user_ns, cred->fsuid))))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFQA_GID,
+ htonl(from_kgid_munged(&init_user_ns, cred->fsgid))))
+ goto nla_put_failure;
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+ return 0;
+
+nla_put_failure:
+ read_unlock_bh(&sk->sk_callback_lock);
+ return -1;
+}
+
+static struct sk_buff *
+nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ struct nf_queue_entry *entry,
+ __be32 **packet_id_ptr)
+{
+ size_t size;
+ size_t data_len = 0, cap_len = 0;
+ unsigned int hlen = 0;
+ struct sk_buff *skb;
+ struct nlattr *nla;
+ struct nfqnl_msg_packet_hdr *pmsg;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct sk_buff *entskb = entry->skb;
+ struct net_device *indev;
+ struct net_device *outdev;
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
+ bool csum_verify;
+
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#endif
+ + nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ + nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ + nla_total_size(sizeof(u_int32_t)); /* cap_len */
+
+ if (entskb->tstamp.tv64)
+ size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
+
+ if (entry->hook <= NF_INET_FORWARD ||
+ (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
+ csum_verify = !skb_csum_unnecessary(entskb);
+ else
+ csum_verify = false;
+
+ outdev = entry->outdev;
+
+ switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
+ case NFQNL_COPY_META:
+ case NFQNL_COPY_NONE:
+ break;
+
+ case NFQNL_COPY_PACKET:
+ if (!(queue->flags & NFQA_CFG_F_GSO) &&
+ entskb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(entskb))
+ return NULL;
+
+ data_len = ACCESS_ONCE(queue->copy_range);
+ if (data_len > entskb->len)
+ data_len = entskb->len;
+
+ hlen = skb_zerocopy_headlen(entskb);
+ hlen = min_t(unsigned int, hlen, data_len);
+ size += sizeof(struct nlattr) + hlen;
+ cap_len = entskb->len;
+ break;
+ }
+
+ if (queue->flags & NFQA_CFG_F_CONNTRACK)
+ ct = nfqnl_ct_get(entskb, &size, &ctinfo);
+
+ if (queue->flags & NFQA_CFG_F_UID_GID) {
+ size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ + nla_total_size(sizeof(u_int32_t))); /* gid */
+ }
+
+ skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+ GFP_ATOMIC);
+ if (!skb) {
+ skb_tx_error(entskb);
+ return NULL;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0,
+ NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+ sizeof(struct nfgenmsg), 0);
+ if (!nlh) {
+ skb_tx_error(entskb);
+ kfree_skb(skb);
+ return NULL;
+ }
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = entry->pf;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(queue->queue_num);
+
+ nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
+ pmsg = nla_data(nla);
+ pmsg->hw_protocol = entskb->protocol;
+ pmsg->hook = entry->hook;
+ *packet_id_ptr = &pmsg->packet_id;
+
+ indev = entry->indev;
+ if (indev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (entry->pf == PF_BRIDGE) {
+ /* Case 1: indev is physical input device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
+ htonl(indev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
+ nla_put_be32(skb, NFQA_IFINDEX_INDEV,
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ /* Case 2: indev is bridge group, we need to look for
+ * physical device (when called from ipv4) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+ if (entskb->nf_bridge && entskb->nf_bridge->physindev &&
+ nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
+ htonl(entskb->nf_bridge->physindev->ifindex)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (outdev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (entry->pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical output device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ htonl(outdev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
+ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ /* Case 2: outdev is bridge group, we need to look for
+ * physical output device (when called from ipv4) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+ if (entskb->nf_bridge && entskb->nf_bridge->physoutdev &&
+ nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ htonl(entskb->nf_bridge->physoutdev->ifindex)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (entskb->mark &&
+ nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
+ goto nla_put_failure;
+
+ if (indev && entskb->dev &&
+ entskb->mac_header != entskb->network_header) {
+ struct nfqnl_msg_packet_hw phw;
+ int len;
+
+ memset(&phw, 0, sizeof(phw));
+ len = dev_parse_header(entskb, phw.hw_addr);
+ if (len) {
+ phw.hw_addrlen = htons(len);
+ if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw))
+ goto nla_put_failure;
+ }
+ }
+
+ if (entskb->tstamp.tv64) {
+ struct nfqnl_msg_packet_timestamp ts;
+ struct timeval tv = ktime_to_timeval(entskb->tstamp);
+ ts.sec = cpu_to_be64(tv.tv_sec);
+ ts.usec = cpu_to_be64(tv.tv_usec);
+
+ if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
+ goto nla_put_failure;
+ }
+
+ if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk &&
+ nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
+ goto nla_put_failure;
+
+ if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ goto nla_put_failure;
+
+ if (cap_len > data_len &&
+ nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
+ goto nla_put_failure;
+
+ if (nfqnl_put_packet_info(skb, entskb, csum_verify))
+ goto nla_put_failure;
+
+ if (data_len) {
+ struct nlattr *nla;
+
+ if (skb_tailroom(skb) < sizeof(*nla) + hlen)
+ goto nla_put_failure;
+
+ nla = (struct nlattr *)skb_put(skb, sizeof(*nla));
+ nla->nla_type = NFQA_PAYLOAD;
+ nla->nla_len = nla_attr_size(data_len);
+
+ if (skb_zerocopy(skb, entskb, data_len, hlen))
+ goto nla_put_failure;
+ }
+
+ nlh->nlmsg_len = skb->len;
+ return skb;
+
+nla_put_failure:
+ skb_tx_error(entskb);
+ kfree_skb(skb);
+ net_err_ratelimited("nf_queue: error creating packet message\n");
+ return NULL;
+}
+
+static int
+__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
+ struct nf_queue_entry *entry)
+{
+ struct sk_buff *nskb;
+ int err = -ENOBUFS;
+ __be32 *packet_id_ptr;
+ int failopen = 0;
+
+ nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);
+ if (nskb == NULL) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ spin_lock_bh(&queue->lock);
+
+ if (queue->queue_total >= queue->queue_maxlen) {
+ if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+ failopen = 1;
+ err = 0;
+ } else {
+ queue->queue_dropped++;
+ net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
+ queue->queue_total);
+ }
+ goto err_out_free_nskb;
+ }
+ entry->id = ++queue->id_sequence;
+ *packet_id_ptr = htonl(entry->id);
+
+ /* nfnetlink_unicast will either free the nskb or add it to a socket */
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+ if (err < 0) {
+ queue->queue_user_dropped++;
+ goto err_out_unlock;
+ }
+
+ __enqueue_entry(queue, entry);
+
+ spin_unlock_bh(&queue->lock);
+ return 0;
+
+err_out_free_nskb:
+ kfree_skb(nskb);
+err_out_unlock:
+ spin_unlock_bh(&queue->lock);
+ if (failopen)
+ nf_reinject(entry, NF_ACCEPT);
+err_out:
+ return err;
+}
+
+static struct nf_queue_entry *
+nf_queue_entry_dup(struct nf_queue_entry *e)
+{
+ struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
+ if (entry) {
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+ kfree(entry);
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+/* When called from bridge netfilter, skb->data must point to MAC header
+ * before calling skb_gso_segment(). Else, original MAC header is lost
+ * and segmented skbs will be sent to wrong destination.
+ */
+static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
+{
+ if (skb->nf_bridge)
+ __skb_push(skb, skb->network_header - skb->mac_header);
+}
+
+static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
+{
+ if (skb->nf_bridge)
+ __skb_pull(skb, skb->network_header - skb->mac_header);
+}
+#else
+#define nf_bridge_adjust_skb_data(s) do {} while (0)
+#define nf_bridge_adjust_segmented_data(s) do {} while (0)
+#endif
+
+static void free_entry(struct nf_queue_entry *entry)
+{
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
+}
+
+static int
+__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
+ struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+ int ret = -ENOMEM;
+ struct nf_queue_entry *entry_seg;
+
+ nf_bridge_adjust_segmented_data(skb);
+
+ if (skb->next == NULL) { /* last packet, no need to copy entry */
+ struct sk_buff *gso_skb = entry->skb;
+ entry->skb = skb;
+ ret = __nfqnl_enqueue_packet(net, queue, entry);
+ if (ret)
+ entry->skb = gso_skb;
+ return ret;
+ }
+
+ skb->next = NULL;
+
+ entry_seg = nf_queue_entry_dup(entry);
+ if (entry_seg) {
+ entry_seg->skb = skb;
+ ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
+ if (ret)
+ free_entry(entry_seg);
+ }
+ return ret;
+}
+
+static int
+nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+ unsigned int queued;
+ struct nfqnl_instance *queue;
+ struct sk_buff *skb, *segs;
+ int err = -ENOBUFS;
+ struct net *net = dev_net(entry->indev ?
+ entry->indev : entry->outdev);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ /* rcu_read_lock()ed by nf_hook_slow() */
+ queue = instance_lookup(q, queuenum);
+ if (!queue)
+ return -ESRCH;
+
+ if (queue->copy_mode == NFQNL_COPY_NONE)
+ return -EINVAL;
+
+ skb = entry->skb;
+
+ switch (entry->pf) {
+ case NFPROTO_IPV4:
+ skb->protocol = htons(ETH_P_IP);
+ break;
+ case NFPROTO_IPV6:
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ }
+
+ if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
+ return __nfqnl_enqueue_packet(net, queue, entry);
+
+ nf_bridge_adjust_skb_data(skb);
+ segs = skb_gso_segment(skb, 0);
+ /* Does not use PTR_ERR to limit the number of error codes that can be
+ * returned by nf_queue. For instance, callers rely on -ECANCELED to
+ * mean 'ignore this hook'.
+ */
+ if (IS_ERR(segs))
+ goto out_err;
+ queued = 0;
+ err = 0;
+ do {
+ struct sk_buff *nskb = segs->next;
+ if (err == 0)
+ err = __nfqnl_enqueue_packet_gso(net, queue,
+ segs, entry);
+ if (err == 0)
+ queued++;
+ else
+ kfree_skb(segs);
+ segs = nskb;
+ } while (segs);
+
+ if (queued) {
+ if (err) /* some segments are already queued */
+ free_entry(entry);
+ kfree_skb(skb);
+ return 0;
+ }
+ out_err:
+ nf_bridge_adjust_segmented_data(skb);
+ return err;
+}
+
+static int
+nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+{
+ struct sk_buff *nskb;
+
+ if (diff < 0) {
+ if (pskb_trim(e->skb, data_len))
+ return -ENOMEM;
+ } else if (diff > 0) {
+ if (data_len > 0xFFFF)
+ return -EINVAL;
+ if (diff > skb_tailroom(e->skb)) {
+ nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
+ diff, GFP_ATOMIC);
+ if (!nskb) {
+ printk(KERN_WARNING "nf_queue: OOM "
+ "in mangle, dropping packet\n");
+ return -ENOMEM;
+ }
+ kfree_skb(e->skb);
+ e->skb = nskb;
+ }
+ skb_put(e->skb, diff);
+ }
+ if (!skb_make_writable(e->skb, data_len))
+ return -ENOMEM;
+ skb_copy_to_linear_data(e->skb, data, data_len);
+ e->skb->ip_summed = CHECKSUM_NONE;
+ return 0;
+}
+
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+ unsigned char mode, unsigned int range)
+{
+ int status = 0;
+
+ spin_lock_bh(&queue->lock);
+ switch (mode) {
+ case NFQNL_COPY_NONE:
+ case NFQNL_COPY_META:
+ queue->copy_mode = mode;
+ queue->copy_range = 0;
+ break;
+
+ case NFQNL_COPY_PACKET:
+ queue->copy_mode = mode;
+ if (range == 0 || range > NFQNL_MAX_COPY_RANGE)
+ queue->copy_range = NFQNL_MAX_COPY_RANGE;
+ else
+ queue->copy_range = range;
+ break;
+
+ default:
+ status = -EINVAL;
+
+ }
+ spin_unlock_bh(&queue->lock);
+
+ return status;
+}
+
+static int
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
+{
+ if (entry->indev)
+ if (entry->indev->ifindex == ifindex)
+ return 1;
+ if (entry->outdev)
+ if (entry->outdev->ifindex == ifindex)
+ return 1;
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (entry->skb->nf_bridge) {
+ if (entry->skb->nf_bridge->physindev &&
+ entry->skb->nf_bridge->physindev->ifindex == ifindex)
+ return 1;
+ if (entry->skb->nf_bridge->physoutdev &&
+ entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/* drop all packets with either indev or outdev == ifindex from all queue
+ * instances */
+static void
+nfqnl_dev_drop(struct net *net, int ifindex)
+{
+ int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ rcu_read_lock();
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &q->instance_table[i];
+
+ hlist_for_each_entry_rcu(inst, head, hlist)
+ nfqnl_flush(inst, dev_cmp, ifindex);
+ }
+
+ rcu_read_unlock();
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Drop any packets associated with the downed device */
+ if (event == NETDEV_DOWN)
+ nfqnl_dev_drop(dev_net(dev), dev->ifindex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_dev_notifier = {
+ .notifier_call = nfqnl_rcv_dev_event,
+};
+
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
+
+ if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
+ int i;
+
+ /* destroy all instances for this portid */
+ spin_lock(&q->instances_lock);
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct hlist_node *t2;
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &q->instance_table[i];
+
+ hlist_for_each_entry_safe(inst, t2, head, hlist) {
+ if (n->portid == inst->peer_portid)
+ __instance_destroy(inst);
+ }
+ }
+ spin_unlock(&q->instances_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_rtnl_notifier = {
+ .notifier_call = nfqnl_rcv_nl_event,
+};
+
+static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
+ [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+ [NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PAYLOAD] = { .type = NLA_UNSPEC },
+ [NFQA_CT] = { .type = NLA_UNSPEC },
+ [NFQA_EXP] = { .type = NLA_UNSPEC },
+};
+
+static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
+ [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+ [NFQA_MARK] = { .type = NLA_U32 },
+};
+
+static struct nfqnl_instance *
+verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid)
+{
+ struct nfqnl_instance *queue;
+
+ queue = instance_lookup(q, queue_num);
+ if (!queue)
+ return ERR_PTR(-ENODEV);
+
+ if (queue->peer_portid != nlportid)
+ return ERR_PTR(-EPERM);
+
+ return queue;
+}
+
+static struct nfqnl_msg_verdict_hdr*
+verdicthdr_get(const struct nlattr * const nfqa[])
+{
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ unsigned int verdict;
+
+ if (!nfqa[NFQA_VERDICT_HDR])
+ return NULL;
+
+ vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
+ verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK;
+ if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN)
+ return NULL;
+ return vhdr;
+}
+
+static int nfq_id_after(unsigned int id, unsigned int max)
+{
+ return (int)(id - max) > 0;
+}
+
+static int
+nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nf_queue_entry *entry, *tmp;
+ unsigned int verdict, maxid;
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ struct nfqnl_instance *queue;
+ LIST_HEAD(batch_list);
+ u16 queue_num = ntohs(nfmsg->res_id);
+
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
+ if (IS_ERR(queue))
+ return PTR_ERR(queue);
+
+ vhdr = verdicthdr_get(nfqa);
+ if (!vhdr)
+ return -EINVAL;
+
+ verdict = ntohl(vhdr->verdict);
+ maxid = ntohl(vhdr->id);
+
+ spin_lock_bh(&queue->lock);
+
+ list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) {
+ if (nfq_id_after(entry->id, maxid))
+ break;
+ __dequeue_entry(queue, entry);
+ list_add_tail(&entry->list, &batch_list);
+ }
+
+ spin_unlock_bh(&queue->lock);
+
+ if (list_empty(&batch_list))
+ return -ENOENT;
+
+ list_for_each_entry_safe(entry, tmp, &batch_list, list) {
+ if (nfqa[NFQA_MARK])
+ entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ nf_reinject(entry, verdict);
+ }
+ return 0;
+}
+
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ struct nfqnl_instance *queue;
+ unsigned int verdict;
+ struct nf_queue_entry *entry;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
+ struct nf_conn *ct = NULL;
+
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ queue = instance_lookup(q, queue_num);
+ if (!queue)
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
+ if (IS_ERR(queue))
+ return PTR_ERR(queue);
+
+ vhdr = verdicthdr_get(nfqa);
+ if (!vhdr)
+ return -EINVAL;
+
+ verdict = ntohl(vhdr->verdict);
+
+ entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+ if (entry == NULL)
+ return -ENOENT;
+
+ if (nfqa[NFQA_CT]) {
+ ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
+ if (ct && nfqa[NFQA_EXP]) {
+ nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
+ }
+
+ if (nfqa[NFQA_PAYLOAD]) {
+ u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
+ int diff = payload_len - entry->skb->len;
+
+ if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
+ payload_len, entry, diff) < 0)
+ verdict = NF_DROP;
+
+ if (ct)
+ nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff);
+ }
+
+ if (nfqa[NFQA_MARK])
+ entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+
+ nf_reinject(entry, verdict);
+ return 0;
+}
+
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ return -ENOTSUPP;
+}
+
+static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
+ [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) },
+ [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) },
+};
+
+static const struct nf_queue_handler nfqh = {
+ .outfn = &nfqnl_enqueue_packet,
+};
+
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int16_t queue_num = ntohs(nfmsg->res_id);
+ struct nfqnl_instance *queue;
+ struct nfqnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ int ret = 0;
+
+ if (nfqa[NFQA_CFG_CMD]) {
+ cmd = nla_data(nfqa[NFQA_CFG_CMD]);
+
+ /* Obsolete commands without queue context */
+ switch (cmd->command) {
+ case NFQNL_CFG_CMD_PF_BIND: return 0;
+ case NFQNL_CFG_CMD_PF_UNBIND: return 0;
+ }
+ }
+
+ rcu_read_lock();
+ queue = instance_lookup(q, queue_num);
+ if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
+ ret = -EPERM;
+ goto err_out_unlock;
+ }
+
+ if (cmd != NULL) {
+ switch (cmd->command) {
+ case NFQNL_CFG_CMD_BIND:
+ if (queue) {
+ ret = -EBUSY;
+ goto err_out_unlock;
+ }
+ queue = instance_create(q, queue_num,
+ NETLINK_CB(skb).portid);
+ if (IS_ERR(queue)) {
+ ret = PTR_ERR(queue);
+ goto err_out_unlock;
+ }
+ break;
+ case NFQNL_CFG_CMD_UNBIND:
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+ instance_destroy(q, queue);
+ break;
+ case NFQNL_CFG_CMD_PF_BIND:
+ case NFQNL_CFG_CMD_PF_UNBIND:
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ }
+
+ if (nfqa[NFQA_CFG_PARAMS]) {
+ struct nfqnl_msg_config_params *params;
+
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+ params = nla_data(nfqa[NFQA_CFG_PARAMS]);
+ nfqnl_set_mode(queue, params->copy_mode,
+ ntohl(params->copy_range));
+ }
+
+ if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
+ __be32 *queue_maxlen;
+
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+ queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
+ spin_lock_bh(&queue->lock);
+ queue->queue_maxlen = ntohl(*queue_maxlen);
+ spin_unlock_bh(&queue->lock);
+ }
+
+ if (nfqa[NFQA_CFG_FLAGS]) {
+ __u32 flags, mask;
+
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+
+ if (!nfqa[NFQA_CFG_MASK]) {
+ /* A mask is needed to specify which flags are being
+ * changed.
+ */
+ ret = -EINVAL;
+ goto err_out_unlock;
+ }
+
+ flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
+ mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));
+
+ if (flags >= NFQA_CFG_F_MAX) {
+ ret = -EOPNOTSUPP;
+ goto err_out_unlock;
+ }
+
+ spin_lock_bh(&queue->lock);
+ queue->flags &= ~mask;
+ queue->flags |= flags & mask;
+ spin_unlock_bh(&queue->lock);
+ }
+
+err_out_unlock:
+ rcu_read_unlock();
+ return ret;
+}
+
+static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+ [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp,
+ .attr_count = NFQA_MAX, },
+ [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_policy },
+ [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
+ .attr_count = NFQA_CFG_MAX,
+ .policy = nfqa_cfg_policy },
+ [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_batch_policy },
+};
+
+static const struct nfnetlink_subsystem nfqnl_subsys = {
+ .name = "nf_queue",
+ .subsys_id = NFNL_SUBSYS_QUEUE,
+ .cb_count = NFQNL_MSG_MAX,
+ .cb = nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+ struct iter_state *st = seq->private;
+ struct net *net;
+ struct nfnl_queue_net *q;
+
+ if (!st)
+ return NULL;
+
+ net = seq_file_net(seq);
+ q = nfnl_queue_pernet(net);
+ for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+ if (!hlist_empty(&q->instance_table[st->bucket]))
+ return q->instance_table[st->bucket].first;
+ }
+ return NULL;
+}
+
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+ struct iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ h = h->next;
+ while (!h) {
+ struct nfnl_queue_net *q;
+
+ if (++st->bucket >= INSTANCE_BUCKETS)
+ return NULL;
+
+ q = nfnl_queue_pernet(net);
+ h = q->instance_table[st->bucket].first;
+ }
+ return h;
+}
+
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head;
+ head = get_first(seq);
+
+ if (head)
+ while (pos && (head = get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
+{
+ spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+ return get_idx(s, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return get_next(s, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+ __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
+{
+ spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ const struct nfqnl_instance *inst = v;
+
+ return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
+ inst->queue_num,
+ inst->peer_portid, inst->queue_total,
+ inst->copy_mode, inst->copy_range,
+ inst->queue_dropped, inst->queue_user_dropped,
+ inst->id_sequence, 1);
+}
+
+static const struct seq_operations nfqnl_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &nfqnl_seq_ops,
+ sizeof(struct iter_state));
+}
+
+static const struct file_operations nfqnl_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nfqnl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif /* PROC_FS */
+
+static int __net_init nfnl_queue_net_init(struct net *net)
+{
+ unsigned int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ INIT_HLIST_HEAD(&q->instance_table[i]);
+
+ spin_lock_init(&q->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_queue", 0440,
+ net->nf.proc_netfilter, &nfqnl_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_queue_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
+#endif
+}
+
+static struct pernet_operations nfnl_queue_net_ops = {
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
+};
+
+static int __init nfnetlink_queue_init(void)
+{
+ int status = -ENOMEM;
+
+ netlink_register_notifier(&nfqnl_rtnl_notifier);
+ status = nfnetlink_subsys_register(&nfqnl_subsys);
+ if (status < 0) {
+ pr_err("nf_queue: failed to create netlink socket\n");
+ goto cleanup_netlink_notifier;
+ }
+
+ status = register_pernet_subsys(&nfnl_queue_net_ops);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register pernet ops\n");
+ goto cleanup_subsys;
+ }
+ register_netdevice_notifier(&nfqnl_dev_notifier);
+ nf_register_queue_handler(&nfqh);
+ return status;
+
+cleanup_subsys:
+ nfnetlink_subsys_unregister(&nfqnl_subsys);
+cleanup_netlink_notifier:
+ netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ return status;
+}
+
+static void __exit nfnetlink_queue_fini(void)
+{
+ nf_unregister_queue_handler();
+ unregister_netdevice_notifier(&nfqnl_dev_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
+ nfnetlink_subsys_unregister(&nfqnl_subsys);
+ netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(nfnetlink_queue_init);
+module_exit(nfnetlink_queue_fini);
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
new file mode 100644
index 00000000000..96cac50e0d1
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -0,0 +1,113 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nfnetlink_queue.h>
+
+struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nfq_ct_hook *nfq_ct;
+ struct nf_conn *ct;
+
+ /* rcu_read_lock()ed by __nf_queue already. */
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return NULL;
+
+ ct = nf_ct_get(entskb, ctinfo);
+ if (ct) {
+ if (!nf_ct_is_untracked(ct))
+ *size += nfq_ct->build_size(ct);
+ else
+ ct = NULL;
+ }
+ return ct;
+}
+
+struct nf_conn *
+nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nfq_ct_hook *nfq_ct;
+ struct nf_conn *ct;
+
+ /* rcu_read_lock()ed by __nf_queue already. */
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return NULL;
+
+ ct = nf_ct_get(skb, ctinfo);
+ if (ct && !nf_ct_is_untracked(ct))
+ nfq_ct->parse(attr, ct);
+
+ return ct;
+}
+
+int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct nfq_ct_hook *nfq_ct;
+ struct nlattr *nest_parms;
+ u_int32_t tmp;
+
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return 0;
+
+ nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nfq_ct->build(skb, ct) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ tmp = ctinfo;
+ if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int diff)
+{
+ struct nfq_ct_hook *nfq_ct;
+
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return;
+
+ if ((ct->status & IPS_NAT_MASK) && diff)
+ nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
+}
+
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+ u32 portid, u32 report)
+{
+ struct nfq_ct_hook *nfq_ct;
+
+ if (nf_ct_is_untracked(ct))
+ return 0;
+
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return -EOPNOTSUPP;
+
+ return nfq_ct->attach_expect(attr, ct, portid, report);
+}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
new file mode 100644
index 00000000000..4fb6ee2c110
--- /dev/null
+++ b/net/netfilter/nft_bitwise.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_bitwise {
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ u8 len;
+ struct nft_data mask;
+ struct nft_data xor;
+};
+
+static void nft_bitwise_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ const struct nft_data *src = &data[priv->sreg];
+ struct nft_data *dst = &data[priv->dreg];
+ unsigned int i;
+
+ for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) {
+ dst->data[i] = (src->data[i] & priv->mask.data[i]) ^
+ priv->xor.data[i];
+ }
+}
+
+static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
+ [NFTA_BITWISE_SREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_DREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_LEN] = { .type = NLA_U32 },
+ [NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
+ [NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
+};
+
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_bitwise *priv = nft_expr_priv(expr);
+ struct nft_data_desc d1, d2;
+ int err;
+
+ if (tb[NFTA_BITWISE_SREG] == NULL ||
+ tb[NFTA_BITWISE_DREG] == NULL ||
+ tb[NFTA_BITWISE_LEN] == NULL ||
+ tb[NFTA_BITWISE_MASK] == NULL ||
+ tb[NFTA_BITWISE_XOR] == NULL)
+ return -EINVAL;
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG]));
+ err = nft_validate_input_register(priv->sreg);
+ if (err < 0)
+ return err;
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+ err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+ if (err < 0)
+ return err;
+
+ priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+
+ err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+ if (err < 0)
+ return err;
+ if (d1.len != priv->len)
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+ if (err < 0)
+ return err;
+ if (d2.len != priv->len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_bitwise_type;
+static const struct nft_expr_ops nft_bitwise_ops = {
+ .type = &nft_bitwise_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
+ .eval = nft_bitwise_eval,
+ .init = nft_bitwise_init,
+ .dump = nft_bitwise_dump,
+};
+
+static struct nft_expr_type nft_bitwise_type __read_mostly = {
+ .name = "bitwise",
+ .ops = &nft_bitwise_ops,
+ .policy = nft_bitwise_policy,
+ .maxattr = NFTA_BITWISE_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_bitwise_module_init(void)
+{
+ return nft_register_expr(&nft_bitwise_type);
+}
+
+void nft_bitwise_module_exit(void)
+{
+ nft_unregister_expr(&nft_bitwise_type);
+}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
new file mode 100644
index 00000000000..c39ed8d29df
--- /dev/null
+++ b/net/netfilter/nft_byteorder.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_byteorder {
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ enum nft_byteorder_ops op:8;
+ u8 len;
+ u8 size;
+};
+
+static void nft_byteorder_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_byteorder *priv = nft_expr_priv(expr);
+ struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg];
+ union { u32 u32; u16 u16; } *s, *d;
+ unsigned int i;
+
+ s = (void *)src->data;
+ d = (void *)dst->data;
+
+ switch (priv->size) {
+ case 4:
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ for (i = 0; i < priv->len / 4; i++)
+ d[i].u32 = ntohl((__force __be32)s[i].u32);
+ break;
+ case NFT_BYTEORDER_HTON:
+ for (i = 0; i < priv->len / 4; i++)
+ d[i].u32 = (__force __u32)htonl(s[i].u32);
+ break;
+ }
+ break;
+ case 2:
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ for (i = 0; i < priv->len / 2; i++)
+ d[i].u16 = ntohs((__force __be16)s[i].u16);
+ break;
+ case NFT_BYTEORDER_HTON:
+ for (i = 0; i < priv->len / 2; i++)
+ d[i].u16 = (__force __u16)htons(s[i].u16);
+ break;
+ }
+ break;
+ }
+}
+
+static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
+ [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_OP] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 },
+};
+
+static int nft_byteorder_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_byteorder *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_BYTEORDER_SREG] == NULL ||
+ tb[NFTA_BYTEORDER_DREG] == NULL ||
+ tb[NFTA_BYTEORDER_LEN] == NULL ||
+ tb[NFTA_BYTEORDER_SIZE] == NULL ||
+ tb[NFTA_BYTEORDER_OP] == NULL)
+ return -EINVAL;
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG]));
+ err = nft_validate_input_register(priv->sreg);
+ if (err < 0)
+ return err;
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+ err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+ if (err < 0)
+ return err;
+
+ priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP]));
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ case NFT_BYTEORDER_HTON:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+ if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data))
+ return -EINVAL;
+
+ priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+ switch (priv->size) {
+ case 2:
+ case 4:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_byteorder *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_byteorder_type;
+static const struct nft_expr_ops nft_byteorder_ops = {
+ .type = &nft_byteorder_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
+ .eval = nft_byteorder_eval,
+ .init = nft_byteorder_init,
+ .dump = nft_byteorder_dump,
+};
+
+static struct nft_expr_type nft_byteorder_type __read_mostly = {
+ .name = "byteorder",
+ .ops = &nft_byteorder_ops,
+ .policy = nft_byteorder_policy,
+ .maxattr = NFTA_BYTEORDER_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_byteorder_module_init(void)
+{
+ return nft_register_expr(&nft_byteorder_type);
+}
+
+void nft_byteorder_module_exit(void)
+{
+ nft_unregister_expr(&nft_byteorder_type);
+}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
new file mode 100644
index 00000000000..e2b3f51c81f
--- /dev/null
+++ b/net/netfilter/nft_cmp.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_cmp_expr {
+ struct nft_data data;
+ enum nft_registers sreg:8;
+ u8 len;
+ enum nft_cmp_ops op:8;
+};
+
+static void nft_cmp_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+ int d;
+
+ d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len);
+ switch (priv->op) {
+ case NFT_CMP_EQ:
+ if (d != 0)
+ goto mismatch;
+ break;
+ case NFT_CMP_NEQ:
+ if (d == 0)
+ goto mismatch;
+ break;
+ case NFT_CMP_LT:
+ if (d == 0)
+ goto mismatch;
+ case NFT_CMP_LTE:
+ if (d > 0)
+ goto mismatch;
+ break;
+ case NFT_CMP_GT:
+ if (d == 0)
+ goto mismatch;
+ case NFT_CMP_GTE:
+ if (d < 0)
+ goto mismatch;
+ break;
+ }
+ return;
+
+mismatch:
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = {
+ [NFTA_CMP_SREG] = { .type = NLA_U32 },
+ [NFTA_CMP_OP] = { .type = NLA_U32 },
+ [NFTA_CMP_DATA] = { .type = NLA_NESTED },
+};
+
+static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ int err;
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+ BUG_ON(err < 0);
+
+ priv->len = desc.len;
+ return 0;
+}
+
+static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_cmp_type;
+static const struct nft_expr_ops nft_cmp_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
+ .eval = nft_cmp_eval,
+ .init = nft_cmp_init,
+ .dump = nft_cmp_dump,
+};
+
+static int nft_cmp_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ struct nft_data data;
+ u32 mask;
+ int err;
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
+ BUG_ON(err < 0);
+ desc.len *= BITS_PER_BYTE;
+
+ mask = nft_cmp_fast_mask(desc.len);
+ priv->data = data.data[0] & mask;
+ priv->len = desc.len;
+ return 0;
+}
+
+static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data data;
+
+ if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+ goto nla_put_failure;
+
+ data.data[0] = priv->data;
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &data,
+ NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+const struct nft_expr_ops nft_cmp_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp_fast_init,
+ .dump = nft_cmp_fast_dump,
+};
+
+static const struct nft_expr_ops *
+nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
+{
+ struct nft_data_desc desc;
+ struct nft_data data;
+ enum nft_registers sreg;
+ enum nft_cmp_ops op;
+ int err;
+
+ if (tb[NFTA_CMP_SREG] == NULL ||
+ tb[NFTA_CMP_OP] == NULL ||
+ tb[NFTA_CMP_DATA] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ err = nft_validate_input_register(sreg);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+ switch (op) {
+ case NFT_CMP_EQ:
+ case NFT_CMP_NEQ:
+ case NFT_CMP_LT:
+ case NFT_CMP_LTE:
+ case NFT_CMP_GT:
+ case NFT_CMP_GTE:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
+ return &nft_cmp_fast_ops;
+ else
+ return &nft_cmp_ops;
+}
+
+static struct nft_expr_type nft_cmp_type __read_mostly = {
+ .name = "cmp",
+ .select_ops = nft_cmp_select_ops,
+ .policy = nft_cmp_policy,
+ .maxattr = NFTA_CMP_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_cmp_module_init(void)
+{
+ return nft_register_expr(&nft_cmp_type);
+}
+
+void nft_cmp_module_exit(void)
+{
+ nft_unregister_expr(&nft_cmp_type);
+}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
new file mode 100644
index 00000000000..1840989092e
--- /dev/null
+++ b/net/netfilter/nft_compat.c
@@ -0,0 +1,793 @@
+/*
+ * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This software has been sponsored by Sophos Astaro <http://www.sophos.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_tables_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <asm/uaccess.h> /* for set_fs */
+#include <net/netfilter/nf_tables.h>
+
+union nft_entry {
+ struct ipt_entry e4;
+ struct ip6t_entry e6;
+};
+
+static inline void
+nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+{
+ par->target = xt;
+ par->targinfo = xt_info;
+ par->hotdrop = false;
+}
+
+static void nft_target_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_target *target = expr->ops->data;
+ struct sk_buff *skb = pkt->skb;
+ int ret;
+
+ nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+ ret = target->target(skb, &pkt->xt);
+
+ if (pkt->xt.hotdrop)
+ ret = NF_DROP;
+
+ switch(ret) {
+ case XT_CONTINUE:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ break;
+ default:
+ data[NFT_REG_VERDICT].verdict = ret;
+ break;
+ }
+ return;
+}
+
+static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
+ [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_TARGET_REV] = { .type = NLA_U32 },
+ [NFTA_TARGET_INFO] = { .type = NLA_BINARY },
+};
+
+static void
+nft_target_set_tgchk_param(struct xt_tgchk_param *par,
+ const struct nft_ctx *ctx,
+ struct xt_target *target, void *info,
+ union nft_entry *entry, u8 proto, bool inv)
+{
+ par->net = &init_net;
+ par->table = ctx->table->name;
+ switch (ctx->afi->family) {
+ case AF_INET:
+ entry->e4.ip.proto = proto;
+ entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+ break;
+ case AF_INET6:
+ entry->e6.ipv6.proto = proto;
+ entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+ break;
+ }
+ par->entryinfo = entry;
+ par->target = target;
+ par->targinfo = info;
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ par->hook_mask = 1 << ops->hooknum;
+ }
+ par->family = ctx->afi->family;
+}
+
+static void target_compat_from_user(struct xt_target *t, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+ if (t->compat_from_user) {
+ int pad;
+
+ t->compat_from_user(out, in);
+ pad = XT_ALIGN(t->targetsize) - t->targetsize;
+ if (pad > 0)
+ memset(out + t->targetsize, 0, pad);
+ } else
+#endif
+ memcpy(out, in, XT_ALIGN(t->targetsize));
+}
+
+static inline int nft_compat_target_offset(struct xt_target *target)
+{
+#ifdef CONFIG_COMPAT
+ return xt_compat_target_offset(target);
+#else
+ return 0;
+#endif
+}
+
+static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = {
+ [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 },
+ [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv)
+{
+ struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+ u32 flags;
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
+ nft_rule_compat_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS])
+ return -EINVAL;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
+ if (flags & ~NFT_RULE_COMPAT_F_MASK)
+ return -EINVAL;
+ if (flags & NFT_RULE_COMPAT_F_INV)
+ *inv = true;
+
+ *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ return 0;
+}
+
+static int
+nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_target *target = expr->ops->data;
+ struct xt_tgchk_param par;
+ size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+ u8 proto = 0;
+ bool inv = false;
+ union nft_entry e = {};
+ int ret;
+
+ target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
+
+ if (ctx->nla[NFTA_RULE_COMPAT]) {
+ ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
+ if (ret < 0)
+ goto err;
+ }
+
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
+ ret = xt_check_target(&par, size, proto, inv);
+ if (ret < 0)
+ goto err;
+
+ /* The standard target cannot be used */
+ if (target->target == NULL) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ return 0;
+err:
+ module_put(target->me);
+ return ret;
+}
+
+static void
+nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_tgdtor_param par;
+
+ par.net = ctx->net;
+ par.target = target;
+ par.targinfo = info;
+ par.family = ctx->afi->family;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+
+ module_put(target->me);
+}
+
+static int
+target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in)
+{
+ int ret;
+
+#ifdef CONFIG_COMPAT
+ if (t->compat_to_user) {
+ mm_segment_t old_fs;
+ void *out;
+
+ out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC);
+ if (out == NULL)
+ return -ENOMEM;
+
+ /* We want to reuse existing compat_to_user */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ t->compat_to_user(out, in);
+ set_fs(old_fs);
+ ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out);
+ kfree(out);
+ } else
+#endif
+ ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in);
+
+ return ret;
+}
+
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+
+ if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) ||
+ nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) ||
+ target_dump_info(skb, target, info))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_target_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ struct xt_target *target = expr->ops->data;
+ unsigned int hook_mask = 0;
+
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ hook_mask = 1 << ops->hooknum;
+ if (hook_mask & target->hooks)
+ return 0;
+
+ /* This target is being called from an invalid chain */
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void nft_match_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_match *match = expr->ops->data;
+ struct sk_buff *skb = pkt->skb;
+ bool ret;
+
+ nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+
+ ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+
+ if (pkt->xt.hotdrop) {
+ data[NFT_REG_VERDICT].verdict = NF_DROP;
+ return;
+ }
+
+ switch(ret) {
+ case true:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ break;
+ case false:
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+ break;
+ }
+}
+
+static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
+ [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_MATCH_REV] = { .type = NLA_U32 },
+ [NFTA_MATCH_INFO] = { .type = NLA_BINARY },
+};
+
+/* struct xt_mtchk_param and xt_tgchk_param look very similar */
+static void
+nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
+ struct xt_match *match, void *info,
+ union nft_entry *entry, u8 proto, bool inv)
+{
+ par->net = &init_net;
+ par->table = ctx->table->name;
+ switch (ctx->afi->family) {
+ case AF_INET:
+ entry->e4.ip.proto = proto;
+ entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+ break;
+ case AF_INET6:
+ entry->e6.ipv6.proto = proto;
+ entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+ break;
+ }
+ par->entryinfo = entry;
+ par->match = match;
+ par->matchinfo = info;
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ par->hook_mask = 1 << ops->hooknum;
+ }
+ par->family = ctx->afi->family;
+}
+
+static void match_compat_from_user(struct xt_match *m, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+ if (m->compat_from_user) {
+ int pad;
+
+ m->compat_from_user(out, in);
+ pad = XT_ALIGN(m->matchsize) - m->matchsize;
+ if (pad > 0)
+ memset(out + m->matchsize, 0, pad);
+ } else
+#endif
+ memcpy(out, in, XT_ALIGN(m->matchsize));
+}
+
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_match *match = expr->ops->data;
+ struct xt_mtchk_param par;
+ size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+ u8 proto = 0;
+ bool inv = false;
+ union nft_entry e = {};
+ int ret;
+
+ match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
+
+ if (ctx->nla[NFTA_RULE_COMPAT]) {
+ ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
+ if (ret < 0)
+ goto err;
+ }
+
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
+ ret = xt_check_match(&par, size, proto, inv);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ module_put(match->me);
+ return ret;
+}
+
+static void
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ struct xt_match *match = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_mtdtor_param par;
+
+ par.net = ctx->net;
+ par.match = match;
+ par.matchinfo = info;
+ par.family = ctx->afi->family;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+
+ module_put(match->me);
+}
+
+static int
+match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in)
+{
+ int ret;
+
+#ifdef CONFIG_COMPAT
+ if (m->compat_to_user) {
+ mm_segment_t old_fs;
+ void *out;
+
+ out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC);
+ if (out == NULL)
+ return -ENOMEM;
+
+ /* We want to reuse existing compat_to_user */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ m->compat_to_user(out, in);
+ set_fs(old_fs);
+ ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out);
+ kfree(out);
+ } else
+#endif
+ ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in);
+
+ return ret;
+}
+
+static inline int nft_compat_match_offset(struct xt_match *match)
+{
+#ifdef CONFIG_COMPAT
+ return xt_compat_match_offset(match);
+#else
+ return 0;
+#endif
+}
+
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_match *match = expr->ops->data;
+
+ if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
+ nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) ||
+ match_dump_info(skb, match, info))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_match_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ struct xt_match *match = expr->ops->data;
+ unsigned int hook_mask = 0;
+
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ hook_mask = 1 << ops->hooknum;
+ if (hook_mask & match->hooks)
+ return 0;
+
+ /* This match is being called from an invalid chain */
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, u16 family, const char *name,
+ int rev, int target)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+ event |= NFNL_SUBSYS_NFT_COMPAT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
+ nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
+ nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ int ret = 0, target;
+ struct nfgenmsg *nfmsg;
+ const char *fmt;
+ const char *name;
+ u32 rev;
+ struct sk_buff *skb2;
+
+ if (tb[NFTA_COMPAT_NAME] == NULL ||
+ tb[NFTA_COMPAT_REV] == NULL ||
+ tb[NFTA_COMPAT_TYPE] == NULL)
+ return -EINVAL;
+
+ name = nla_data(tb[NFTA_COMPAT_NAME]);
+ rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
+ target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+
+ nfmsg = nlmsg_data(nlh);
+
+ switch(nfmsg->nfgen_family) {
+ case AF_INET:
+ fmt = "ipt_%s";
+ break;
+ case AF_INET6:
+ fmt = "ip6t_%s";
+ break;
+ default:
+ pr_err("nft_compat: unsupported protocol %d\n",
+ nfmsg->nfgen_family);
+ return -EINVAL;
+ }
+
+ try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
+ rev, target, &ret),
+ fmt, name);
+
+ if (ret < 0)
+ return ret;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ /* include the best revision for this extension in the message */
+ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_COMPAT_GET,
+ nfmsg->nfgen_family,
+ name, ret, target) <= 0) {
+ kfree_skb(skb2);
+ return -ENOSPC;
+ }
+
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+}
+
+static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
+ [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING,
+ .len = NFT_COMPAT_NAME_MAX-1 },
+ [NFTA_COMPAT_REV] = { .type = NLA_U32 },
+ [NFTA_COMPAT_TYPE] = { .type = NLA_U32 },
+};
+
+static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
+ [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get,
+ .attr_count = NFTA_COMPAT_MAX,
+ .policy = nfnl_compat_policy_get },
+};
+
+static const struct nfnetlink_subsystem nfnl_compat_subsys = {
+ .name = "nft-compat",
+ .subsys_id = NFNL_SUBSYS_NFT_COMPAT,
+ .cb_count = NFNL_MSG_COMPAT_MAX,
+ .cb = nfnl_nft_compat_cb,
+};
+
+static LIST_HEAD(nft_match_list);
+
+struct nft_xt {
+ struct list_head head;
+ struct nft_expr_ops ops;
+};
+
+static struct nft_expr_type nft_match_type;
+
+static const struct nft_expr_ops *
+nft_match_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ struct nft_xt *nft_match;
+ struct xt_match *match;
+ char *mt_name;
+ __u32 rev, family;
+
+ if (tb[NFTA_MATCH_NAME] == NULL ||
+ tb[NFTA_MATCH_REV] == NULL ||
+ tb[NFTA_MATCH_INFO] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ mt_name = nla_data(tb[NFTA_MATCH_NAME]);
+ rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
+ family = ctx->afi->family;
+
+ /* Re-use the existing match if it's already loaded. */
+ list_for_each_entry(nft_match, &nft_match_list, head) {
+ struct xt_match *match = nft_match->ops.data;
+
+ if (strcmp(match->name, mt_name) == 0 &&
+ match->revision == rev && match->family == family)
+ return &nft_match->ops;
+ }
+
+ match = xt_request_find_match(family, mt_name, rev);
+ if (IS_ERR(match))
+ return ERR_PTR(-ENOENT);
+
+ /* This is the first time we use this match, allocate operations */
+ nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+ if (nft_match == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ nft_match->ops.type = &nft_match_type;
+ nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) +
+ nft_compat_match_offset(match));
+ nft_match->ops.eval = nft_match_eval;
+ nft_match->ops.init = nft_match_init;
+ nft_match->ops.destroy = nft_match_destroy;
+ nft_match->ops.dump = nft_match_dump;
+ nft_match->ops.validate = nft_match_validate;
+ nft_match->ops.data = match;
+
+ list_add(&nft_match->head, &nft_match_list);
+
+ return &nft_match->ops;
+}
+
+static void nft_match_release(void)
+{
+ struct nft_xt *nft_match, *tmp;
+
+ list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head)
+ kfree(nft_match);
+}
+
+static struct nft_expr_type nft_match_type __read_mostly = {
+ .name = "match",
+ .select_ops = nft_match_select_ops,
+ .policy = nft_match_policy,
+ .maxattr = NFTA_MATCH_MAX,
+ .owner = THIS_MODULE,
+};
+
+static LIST_HEAD(nft_target_list);
+
+static struct nft_expr_type nft_target_type;
+
+static const struct nft_expr_ops *
+nft_target_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ struct nft_xt *nft_target;
+ struct xt_target *target;
+ char *tg_name;
+ __u32 rev, family;
+
+ if (tb[NFTA_TARGET_NAME] == NULL ||
+ tb[NFTA_TARGET_REV] == NULL ||
+ tb[NFTA_TARGET_INFO] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ tg_name = nla_data(tb[NFTA_TARGET_NAME]);
+ rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
+ family = ctx->afi->family;
+
+ /* Re-use the existing target if it's already loaded. */
+ list_for_each_entry(nft_target, &nft_match_list, head) {
+ struct xt_target *target = nft_target->ops.data;
+
+ if (strcmp(target->name, tg_name) == 0 &&
+ target->revision == rev && target->family == family)
+ return &nft_target->ops;
+ }
+
+ target = xt_request_find_target(family, tg_name, rev);
+ if (IS_ERR(target))
+ return ERR_PTR(-ENOENT);
+
+ /* This is the first time we use this target, allocate operations */
+ nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+ if (nft_target == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ nft_target->ops.type = &nft_target_type;
+ nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+ nft_compat_target_offset(target));
+ nft_target->ops.eval = nft_target_eval;
+ nft_target->ops.init = nft_target_init;
+ nft_target->ops.destroy = nft_target_destroy;
+ nft_target->ops.dump = nft_target_dump;
+ nft_target->ops.validate = nft_target_validate;
+ nft_target->ops.data = target;
+
+ list_add(&nft_target->head, &nft_target_list);
+
+ return &nft_target->ops;
+}
+
+static void nft_target_release(void)
+{
+ struct nft_xt *nft_target, *tmp;
+
+ list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
+ kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = {
+ .name = "target",
+ .select_ops = nft_target_select_ops,
+ .policy = nft_target_policy,
+ .maxattr = NFTA_TARGET_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_compat_module_init(void)
+{
+ int ret;
+
+ ret = nft_register_expr(&nft_match_type);
+ if (ret < 0)
+ return ret;
+
+ ret = nft_register_expr(&nft_target_type);
+ if (ret < 0)
+ goto err_match;
+
+ ret = nfnetlink_subsys_register(&nfnl_compat_subsys);
+ if (ret < 0) {
+ pr_err("nft_compat: cannot register with nfnetlink.\n");
+ goto err_target;
+ }
+
+ pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+ return ret;
+
+err_target:
+ nft_unregister_expr(&nft_target_type);
+err_match:
+ nft_unregister_expr(&nft_match_type);
+ return ret;
+}
+
+static void __exit nft_compat_module_exit(void)
+{
+ nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+ nft_unregister_expr(&nft_target_type);
+ nft_unregister_expr(&nft_match_type);
+ nft_match_release();
+ nft_target_release();
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 00000000000..c89ee486ce5
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter {
+ seqlock_t lock;
+ u64 bytes;
+ u64 packets;
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter *priv = nft_expr_priv(expr);
+
+ write_seqlock_bh(&priv->lock);
+ priv->bytes += pkt->skb->len;
+ priv->packets++;
+ write_sequnlock_bh(&priv->lock);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_counter *priv = nft_expr_priv(expr);
+ unsigned int seq;
+ u64 bytes;
+ u64 packets;
+
+ do {
+ seq = read_seqbegin(&priv->lock);
+ bytes = priv->bytes;
+ packets = priv->packets;
+ } while (read_seqretry(&priv->lock, seq));
+
+ if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+ [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
+ [NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
+};
+
+static int nft_counter_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_counter *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_COUNTER_PACKETS])
+ priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ if (tb[NFTA_COUNTER_BYTES])
+ priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+
+ seqlock_init(&priv->lock);
+ return 0;
+}
+
+static struct nft_expr_type nft_counter_type;
+static const struct nft_expr_ops nft_counter_ops = {
+ .type = &nft_counter_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+ .eval = nft_counter_eval,
+ .init = nft_counter_init,
+ .dump = nft_counter_dump,
+};
+
+static struct nft_expr_type nft_counter_type __read_mostly = {
+ .name = "counter",
+ .ops = &nft_counter_ops,
+ .policy = nft_counter_policy,
+ .maxattr = NFTA_COUNTER_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_counter_module_init(void)
+{
+ return nft_register_expr(&nft_counter_type);
+}
+
+static void __exit nft_counter_module_exit(void)
+{
+ nft_unregister_expr(&nft_counter_type);
+}
+
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 00000000000..cc560301624
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+struct nft_ct {
+ enum nft_ct_keys key:8;
+ enum ip_conntrack_dir dir:8;
+ union {
+ enum nft_registers dreg:8;
+ enum nft_registers sreg:8;
+ };
+};
+
+static void nft_ct_get_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ struct nft_data *dest = &data[priv->dreg];
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_tuple *tuple;
+ const struct nf_conntrack_helper *helper;
+ long diff;
+ unsigned int state;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+
+ switch (priv->key) {
+ case NFT_CT_STATE:
+ if (ct == NULL)
+ state = NF_CT_STATE_INVALID_BIT;
+ else if (nf_ct_is_untracked(ct))
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ else
+ state = NF_CT_STATE_BIT(ctinfo);
+ dest->data[0] = state;
+ return;
+ }
+
+ if (ct == NULL)
+ goto err;
+
+ switch (priv->key) {
+ case NFT_CT_DIRECTION:
+ dest->data[0] = CTINFO2DIR(ctinfo);
+ return;
+ case NFT_CT_STATUS:
+ dest->data[0] = ct->status;
+ return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ dest->data[0] = ct->mark;
+ return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ dest->data[0] = ct->secmark;
+ return;
+#endif
+ case NFT_CT_EXPIRATION:
+ diff = (long)jiffies - (long)ct->timeout.expires;
+ if (diff < 0)
+ diff = 0;
+ dest->data[0] = jiffies_to_msecs(diff);
+ return;
+ case NFT_CT_HELPER:
+ if (ct->master == NULL)
+ goto err;
+ help = nfct_help(ct->master);
+ if (help == NULL)
+ goto err;
+ helper = rcu_dereference(help->helper);
+ if (helper == NULL)
+ goto err;
+ if (strlen(helper->name) >= sizeof(dest->data))
+ goto err;
+ strncpy((char *)dest->data, helper->name, sizeof(dest->data));
+ return;
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS: {
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+ unsigned int size;
+
+ if (!labels) {
+ memset(dest->data, 0, sizeof(dest->data));
+ return;
+ }
+
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data));
+ size = labels->words * sizeof(long);
+
+ memcpy(dest->data, labels->bits, size);
+ if (size < sizeof(dest->data))
+ memset(((char *) dest->data) + size, 0,
+ sizeof(dest->data) - size);
+ return;
+ }
+#endif
+ }
+
+ tuple = &ct->tuplehash[priv->dir].tuple;
+ switch (priv->key) {
+ case NFT_CT_L3PROTOCOL:
+ dest->data[0] = nf_ct_l3num(ct);
+ return;
+ case NFT_CT_SRC:
+ memcpy(dest->data, tuple->src.u3.all,
+ nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+ return;
+ case NFT_CT_DST:
+ memcpy(dest->data, tuple->dst.u3.all,
+ nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+ return;
+ case NFT_CT_PROTOCOL:
+ dest->data[0] = nf_ct_protonum(ct);
+ return;
+ case NFT_CT_PROTO_SRC:
+ dest->data[0] = (__force __u16)tuple->src.u.all;
+ return;
+ case NFT_CT_PROTO_DST:
+ dest->data[0] = (__force __u16)tuple->dst.u.all;
+ return;
+ }
+ return;
+err:
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static void nft_ct_set_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ u32 value = data[priv->sreg].data[0];
+#endif
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return;
+
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ if (ct->mark != value) {
+ ct->mark = value;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+ break;
+#endif
+ }
+}
+
+static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
+ [NFTA_CT_DREG] = { .type = NLA_U32 },
+ [NFTA_CT_KEY] = { .type = NLA_U32 },
+ [NFTA_CT_DIRECTION] = { .type = NLA_U8 },
+ [NFTA_CT_SREG] = { .type = NLA_U32 },
+};
+
+static int nft_ct_l3proto_try_module_get(uint8_t family)
+{
+ int err;
+
+ if (family == NFPROTO_INET) {
+ err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ } else {
+ err = nf_ct_l3proto_try_module_get(family);
+ if (err < 0)
+ goto err1;
+ }
+ return 0;
+
+err2:
+ nf_ct_l3proto_module_put(NFPROTO_IPV4);
+err1:
+ return err;
+}
+
+static void nft_ct_l3proto_module_put(uint8_t family)
+{
+ if (family == NFPROTO_INET) {
+ nf_ct_l3proto_module_put(NFPROTO_IPV4);
+ nf_ct_l3proto_module_put(NFPROTO_IPV6);
+ } else
+ nf_ct_l3proto_module_put(family);
+}
+
+static int nft_ct_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ct *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+ switch (priv->key) {
+ case NFT_CT_STATE:
+ case NFT_CT_DIRECTION:
+ case NFT_CT_STATUS:
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+#endif
+ case NFT_CT_EXPIRATION:
+ case NFT_CT_HELPER:
+ if (tb[NFTA_CT_DIRECTION] != NULL)
+ return -EINVAL;
+ break;
+ case NFT_CT_L3PROTOCOL:
+ case NFT_CT_PROTOCOL:
+ case NFT_CT_SRC:
+ case NFT_CT_DST:
+ case NFT_CT_PROTO_SRC:
+ case NFT_CT_PROTO_DST:
+ if (tb[NFTA_CT_DIRECTION] == NULL)
+ return -EINVAL;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[NFTA_CT_DIRECTION] != NULL) {
+ priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+ switch (priv->dir) {
+ case IP_CT_DIR_ORIGINAL:
+ case IP_CT_DIR_REPLY:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+ if (err < 0)
+ return err;
+
+ err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_ct_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ct *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ break;
+#endif
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG]));
+ err = nft_validate_input_register(priv->sreg);
+ if (err < 0)
+ return err;
+
+ err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void nft_ct_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nft_ct_l3proto_module_put(ctx->afi->family);
+}
+
+static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+
+ switch (priv->key) {
+ case NFT_CT_PROTOCOL:
+ case NFT_CT_SRC:
+ case NFT_CT_DST:
+ case NFT_CT_PROTO_SRC:
+ case NFT_CT_PROTO_DST:
+ if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+ goto nla_put_failure;
+ default:
+ break;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_ct_type;
+static const struct nft_expr_ops nft_ct_get_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_get_eval,
+ .init = nft_ct_get_init,
+ .destroy = nft_ct_destroy,
+ .dump = nft_ct_get_dump,
+};
+
+static const struct nft_expr_ops nft_ct_set_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_set_eval,
+ .init = nft_ct_set_init,
+ .destroy = nft_ct_destroy,
+ .dump = nft_ct_set_dump,
+};
+
+static const struct nft_expr_ops *
+nft_ct_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_CT_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_CT_DREG])
+ return &nft_ct_get_ops;
+
+ if (tb[NFTA_CT_SREG])
+ return &nft_ct_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_ct_type __read_mostly = {
+ .name = "ct",
+ .select_ops = &nft_ct_select_ops,
+ .policy = nft_ct_policy,
+ .maxattr = NFTA_CT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_ct_module_init(void)
+{
+ return nft_register_expr(&nft_ct_type);
+}
+
+static void __exit nft_ct_module_exit(void)
+{
+ nft_unregister_expr(&nft_ct_type);
+}
+
+module_init(nft_ct_module_init);
+module_exit(nft_ct_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("ct");
diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c
new file mode 100644
index 00000000000..b6eed4d5a09
--- /dev/null
+++ b/net/netfilter/nft_expr_template.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_template {
+
+};
+
+static void nft_template_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
+ [NFTA_TEMPLATE_ATTR] = { .type = NLA_U32 },
+};
+
+static int nft_template_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_template *priv = nft_expr_priv(expr);
+
+ return 0;
+}
+
+static void nft_template_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_template *priv = nft_expr_priv(expr);
+
+ NLA_PUT_BE32(skb, NFTA_TEMPLATE_ATTR, priv->field);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_template_type;
+static const struct nft_expr_ops nft_template_ops = {
+ .type = &nft_template_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_template)),
+ .eval = nft_template_eval,
+ .init = nft_template_init,
+ .destroy = nft_template_destroy,
+ .dump = nft_template_dump,
+};
+
+static struct nft_expr_type nft_template_type __read_mostly = {
+ .name = "template",
+ .ops = &nft_template_ops,
+ .policy = nft_template_policy,
+ .maxattr = NFTA_TEMPLATE_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_template_module_init(void)
+{
+ return nft_register_expr(&nft_template_type);
+}
+
+static void __exit nft_template_module_exit(void)
+{
+ nft_unregister_expr(&nft_template_type);
+}
+
+module_init(nft_template_module_init);
+module_exit(nft_template_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("template");
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644
index 00000000000..55c939f5371
--- /dev/null
+++ b/net/netfilter/nft_exthdr.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+// FIXME:
+#include <net/ipv6.h>
+
+struct nft_exthdr {
+ u8 type;
+ u8 offset;
+ u8 len;
+ enum nft_registers dreg:8;
+};
+
+static void nft_exthdr_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ struct nft_data *dest = &data[priv->dreg];
+ unsigned int offset = 0;
+ int err;
+
+ err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
+ if (err < 0)
+ goto err;
+ offset += priv->offset;
+
+ if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0)
+ goto err;
+ return;
+err:
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
+ [NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
+ [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
+};
+
+static int nft_exthdr_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_EXTHDR_DREG] == NULL ||
+ tb[NFTA_EXTHDR_TYPE] == NULL ||
+ tb[NFTA_EXTHDR_OFFSET] == NULL ||
+ tb[NFTA_EXTHDR_LEN] == NULL)
+ return -EINVAL;
+
+ priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+ if (priv->len == 0 ||
+ priv->len > FIELD_SIZEOF(struct nft_data, data))
+ return -EINVAL;
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+ return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_exthdr_type;
+static const struct nft_expr_ops nft_exthdr_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_eval,
+ .init = nft_exthdr_init,
+ .dump = nft_exthdr_dump,
+};
+
+static struct nft_expr_type nft_exthdr_type __read_mostly = {
+ .name = "exthdr",
+ .ops = &nft_exthdr_ops,
+ .policy = nft_exthdr_policy,
+ .maxattr = NFTA_EXTHDR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_exthdr_module_init(void)
+{
+ return nft_register_expr(&nft_exthdr_type);
+}
+
+static void __exit nft_exthdr_module_exit(void)
+{
+ nft_unregister_expr(&nft_exthdr_type);
+}
+
+module_init(nft_exthdr_module_init);
+module_exit(nft_exthdr_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
new file mode 100644
index 00000000000..4080ed6a072
--- /dev/null
+++ b/net/netfilter/nft_hash.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/vmalloc.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+#define NFT_HASH_MIN_SIZE 4UL
+
+struct nft_hash {
+ struct nft_hash_table __rcu *tbl;
+};
+
+struct nft_hash_table {
+ unsigned int size;
+ struct nft_hash_elem __rcu *buckets[];
+};
+
+struct nft_hash_elem {
+ struct nft_hash_elem __rcu *next;
+ struct nft_data key;
+ struct nft_data data[];
+};
+
+#define nft_hash_for_each_entry(i, head) \
+ for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next))
+#define nft_hash_for_each_entry_rcu(i, head) \
+ for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next))
+
+static u32 nft_hash_rnd __read_mostly;
+static bool nft_hash_rnd_initted __read_mostly;
+
+static unsigned int nft_hash_data(const struct nft_data *data,
+ unsigned int hsize, unsigned int len)
+{
+ unsigned int h;
+
+ h = jhash(data->data, len, nft_hash_rnd);
+ return h & (hsize - 1);
+}
+
+static bool nft_hash_lookup(const struct nft_set *set,
+ const struct nft_data *key,
+ struct nft_data *data)
+{
+ const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = rcu_dereference(priv->tbl);
+ const struct nft_hash_elem *he;
+ unsigned int h;
+
+ h = nft_hash_data(key, tbl->size, set->klen);
+ nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) {
+ if (nft_data_cmp(&he->key, key, set->klen))
+ continue;
+ if (set->flags & NFT_SET_MAP)
+ nft_data_copy(data, he->data);
+ return true;
+ }
+ return false;
+}
+
+static void nft_hash_tbl_free(const struct nft_hash_table *tbl)
+{
+ kvfree(tbl);
+}
+
+static unsigned int nft_hash_tbl_size(unsigned int nelem)
+{
+ return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE);
+}
+
+static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
+{
+ struct nft_hash_table *tbl;
+ size_t size;
+
+ size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
+ tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN);
+ if (tbl == NULL)
+ tbl = vzalloc(size);
+ if (tbl == NULL)
+ return NULL;
+ tbl->size = nbuckets;
+
+ return tbl;
+}
+
+static void nft_hash_chain_unzip(const struct nft_set *set,
+ const struct nft_hash_table *ntbl,
+ struct nft_hash_table *tbl, unsigned int n)
+{
+ struct nft_hash_elem *he, *last, *next;
+ unsigned int h;
+
+ he = nft_dereference(tbl->buckets[n]);
+ if (he == NULL)
+ return;
+ h = nft_hash_data(&he->key, ntbl->size, set->klen);
+
+ /* Find last element of first chain hashing to bucket h */
+ last = he;
+ nft_hash_for_each_entry(he, he->next) {
+ if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+ break;
+ last = he;
+ }
+
+ /* Unlink first chain from the old table */
+ RCU_INIT_POINTER(tbl->buckets[n], last->next);
+
+ /* If end of chain reached, done */
+ if (he == NULL)
+ return;
+
+ /* Find first element of second chain hashing to bucket h */
+ next = NULL;
+ nft_hash_for_each_entry(he, he->next) {
+ if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+ continue;
+ next = he;
+ break;
+ }
+
+ /* Link the two chains */
+ RCU_INIT_POINTER(last->next, next);
+}
+
+static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv)
+{
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+ struct nft_hash_elem *he;
+ unsigned int i, h;
+ bool complete;
+
+ ntbl = nft_hash_tbl_alloc(tbl->size * 2);
+ if (ntbl == NULL)
+ return -ENOMEM;
+
+ /* Link new table's buckets to first element in the old table
+ * hashing to the new bucket.
+ */
+ for (i = 0; i < ntbl->size; i++) {
+ h = i < tbl->size ? i : i - tbl->size;
+ nft_hash_for_each_entry(he, tbl->buckets[h]) {
+ if (nft_hash_data(&he->key, ntbl->size, set->klen) != i)
+ continue;
+ RCU_INIT_POINTER(ntbl->buckets[i], he);
+ break;
+ }
+ }
+
+ /* Publish new table */
+ rcu_assign_pointer(priv->tbl, ntbl);
+
+ /* Unzip interleaved hash chains */
+ do {
+ /* Wait for readers to use new table/unzipped chains */
+ synchronize_rcu();
+
+ complete = true;
+ for (i = 0; i < tbl->size; i++) {
+ nft_hash_chain_unzip(set, ntbl, tbl, i);
+ if (tbl->buckets[i] != NULL)
+ complete = false;
+ }
+ } while (!complete);
+
+ nft_hash_tbl_free(tbl);
+ return 0;
+}
+
+static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv)
+{
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+ struct nft_hash_elem __rcu **pprev;
+ unsigned int i;
+
+ ntbl = nft_hash_tbl_alloc(tbl->size / 2);
+ if (ntbl == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ntbl->size; i++) {
+ ntbl->buckets[i] = tbl->buckets[i];
+
+ for (pprev = &ntbl->buckets[i]; *pprev != NULL;
+ pprev = &nft_dereference(*pprev)->next)
+ ;
+ RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
+ }
+
+ /* Publish new table */
+ rcu_assign_pointer(priv->tbl, ntbl);
+ synchronize_rcu();
+
+ nft_hash_tbl_free(tbl);
+ return 0;
+}
+
+static int nft_hash_insert(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem *he;
+ unsigned int size, h;
+
+ if (elem->flags != 0)
+ return -EINVAL;
+
+ size = sizeof(*he);
+ if (set->flags & NFT_SET_MAP)
+ size += sizeof(he->data[0]);
+
+ he = kzalloc(size, GFP_KERNEL);
+ if (he == NULL)
+ return -ENOMEM;
+
+ nft_data_copy(&he->key, &elem->key);
+ if (set->flags & NFT_SET_MAP)
+ nft_data_copy(he->data, &elem->data);
+
+ h = nft_hash_data(&he->key, tbl->size, set->klen);
+ RCU_INIT_POINTER(he->next, tbl->buckets[h]);
+ rcu_assign_pointer(tbl->buckets[h], he);
+
+ /* Expand table when exceeding 75% load */
+ if (set->nelems + 1 > tbl->size / 4 * 3)
+ nft_hash_tbl_expand(set, priv);
+
+ return 0;
+}
+
+static void nft_hash_elem_destroy(const struct nft_set *set,
+ struct nft_hash_elem *he)
+{
+ nft_data_uninit(&he->key, NFT_DATA_VALUE);
+ if (set->flags & NFT_SET_MAP)
+ nft_data_uninit(he->data, set->dtype);
+ kfree(he);
+}
+
+static void nft_hash_remove(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem *he, __rcu **pprev;
+
+ pprev = elem->cookie;
+ he = nft_dereference((*pprev));
+
+ RCU_INIT_POINTER(*pprev, he->next);
+ synchronize_rcu();
+ kfree(he);
+
+ /* Shrink table beneath 30% load */
+ if (set->nelems - 1 < tbl->size * 3 / 10 &&
+ tbl->size > NFT_HASH_MIN_SIZE)
+ nft_hash_tbl_shrink(set, priv);
+}
+
+static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
+{
+ const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem __rcu * const *pprev;
+ struct nft_hash_elem *he;
+ unsigned int h;
+
+ h = nft_hash_data(&elem->key, tbl->size, set->klen);
+ pprev = &tbl->buckets[h];
+ nft_hash_for_each_entry(he, tbl->buckets[h]) {
+ if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
+ pprev = &he->next;
+ continue;
+ }
+
+ elem->cookie = (void *)pprev;
+ elem->flags = 0;
+ if (set->flags & NFT_SET_MAP)
+ nft_data_copy(&elem->data, he->data);
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ const struct nft_hash_elem *he;
+ struct nft_set_elem elem;
+ unsigned int i;
+
+ for (i = 0; i < tbl->size; i++) {
+ nft_hash_for_each_entry(he, tbl->buckets[i]) {
+ if (iter->count < iter->skip)
+ goto cont;
+
+ memcpy(&elem.key, &he->key, sizeof(elem.key));
+ if (set->flags & NFT_SET_MAP)
+ memcpy(&elem.data, he->data, sizeof(elem.data));
+ elem.flags = 0;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+ if (iter->err < 0)
+ return;
+cont:
+ iter->count++;
+ }
+ }
+}
+
+static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
+{
+ return sizeof(struct nft_hash);
+}
+
+static int nft_hash_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const tb[])
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_table *tbl;
+ unsigned int size;
+
+ if (unlikely(!nft_hash_rnd_initted)) {
+ get_random_bytes(&nft_hash_rnd, 4);
+ nft_hash_rnd_initted = true;
+ }
+
+ size = NFT_HASH_MIN_SIZE;
+ if (desc->size)
+ size = nft_hash_tbl_size(desc->size);
+
+ tbl = nft_hash_tbl_alloc(size);
+ if (tbl == NULL)
+ return -ENOMEM;
+ RCU_INIT_POINTER(priv->tbl, tbl);
+ return 0;
+}
+
+static void nft_hash_destroy(const struct nft_set *set)
+{
+ const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem *he, *next;
+ unsigned int i;
+
+ for (i = 0; i < tbl->size; i++) {
+ for (he = nft_dereference(tbl->buckets[i]); he != NULL;
+ he = next) {
+ next = nft_dereference(he->next);
+ nft_hash_elem_destroy(set, he);
+ }
+ }
+ kfree(tbl);
+}
+
+static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int esize;
+
+ esize = sizeof(struct nft_hash_elem);
+ if (features & NFT_SET_MAP)
+ esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]);
+
+ if (desc->size) {
+ est->size = sizeof(struct nft_hash) +
+ nft_hash_tbl_size(desc->size) *
+ sizeof(struct nft_hash_elem *) +
+ desc->size * esize;
+ } else {
+ /* Resizing happens when the load drops below 30% or goes
+ * above 75%. The average of 52.5% load (approximated by 50%)
+ * is used for the size estimation of the hash buckets,
+ * meaning we calculate two buckets per element.
+ */
+ est->size = esize + 2 * sizeof(struct nft_hash_elem *);
+ }
+
+ est->class = NFT_SET_CLASS_O_1;
+
+ return true;
+}
+
+static struct nft_set_ops nft_hash_ops __read_mostly = {
+ .privsize = nft_hash_privsize,
+ .estimate = nft_hash_estimate,
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .get = nft_hash_get,
+ .insert = nft_hash_insert,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup,
+ .walk = nft_hash_walk,
+ .features = NFT_SET_MAP,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_hash_module_init(void)
+{
+ return nft_register_set(&nft_hash_ops);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+ nft_unregister_set(&nft_hash_ops);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
new file mode 100644
index 00000000000..810385eb724
--- /dev/null
+++ b/net/netfilter/nft_immediate.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_immediate_expr {
+ struct nft_data data;
+ enum nft_registers dreg:8;
+ u8 dlen;
+};
+
+static void nft_immediate_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ nft_data_copy(&data[priv->dreg], &priv->data);
+}
+
+static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
+ [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 },
+ [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED },
+};
+
+static int nft_immediate_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ int err;
+
+ if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
+ tb[NFTA_IMMEDIATE_DATA] == NULL)
+ return -EINVAL;
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+
+ err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
+ if (err < 0)
+ return err;
+ priv->dlen = desc.len;
+
+ err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+
+err1:
+ nft_data_uninit(&priv->data, desc.type);
+ return err;
+}
+
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+}
+
+static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+
+ return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data,
+ nft_dreg_to_type(priv->dreg), priv->dlen);
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_immediate_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ *data = &priv->data;
+
+ return 0;
+}
+
+static struct nft_expr_type nft_imm_type;
+static const struct nft_expr_ops nft_imm_ops = {
+ .type = &nft_imm_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
+ .eval = nft_immediate_eval,
+ .init = nft_immediate_init,
+ .destroy = nft_immediate_destroy,
+ .dump = nft_immediate_dump,
+ .validate = nft_immediate_validate,
+};
+
+static struct nft_expr_type nft_imm_type __read_mostly = {
+ .name = "immediate",
+ .ops = &nft_imm_ops,
+ .policy = nft_immediate_policy,
+ .maxattr = NFTA_IMMEDIATE_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_immediate_module_init(void)
+{
+ return nft_register_expr(&nft_imm_type);
+}
+
+void nft_immediate_module_exit(void)
+{
+ nft_unregister_expr(&nft_imm_type);
+}
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
new file mode 100644
index 00000000000..85da5bd02f6
--- /dev/null
+++ b/net/netfilter/nft_limit.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static DEFINE_SPINLOCK(limit_lock);
+
+struct nft_limit {
+ u64 tokens;
+ u64 rate;
+ u64 unit;
+ unsigned long stamp;
+};
+
+static void nft_limit_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+
+ spin_lock_bh(&limit_lock);
+ if (time_after_eq(jiffies, priv->stamp)) {
+ priv->tokens = priv->rate;
+ priv->stamp = jiffies + priv->unit * HZ;
+ }
+
+ if (priv->tokens >= 1) {
+ priv->tokens--;
+ spin_unlock_bh(&limit_lock);
+ return;
+ }
+ spin_unlock_bh(&limit_lock);
+
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
+ [NFTA_LIMIT_RATE] = { .type = NLA_U64 },
+ [NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
+};
+
+static int nft_limit_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_LIMIT_RATE] == NULL ||
+ tb[NFTA_LIMIT_UNIT] == NULL)
+ return -EINVAL;
+
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+ priv->stamp = jiffies + priv->unit * HZ;
+ priv->tokens = priv->rate;
+ return 0;
+}
+
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_limit *priv = nft_expr_priv(expr);
+
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_limit_type;
+static const struct nft_expr_ops nft_limit_ops = {
+ .type = &nft_limit_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+ .eval = nft_limit_eval,
+ .init = nft_limit_init,
+ .dump = nft_limit_dump,
+};
+
+static struct nft_expr_type nft_limit_type __read_mostly = {
+ .name = "limit",
+ .ops = &nft_limit_ops,
+ .policy = nft_limit_policy,
+ .maxattr = NFTA_LIMIT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_limit_module_init(void)
+{
+ return nft_register_expr(&nft_limit_type);
+}
+
+static void __exit nft_limit_module_exit(void)
+{
+ nft_unregister_expr(&nft_limit_type);
+}
+
+module_init(nft_limit_module_init);
+module_exit(nft_limit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("limit");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
new file mode 100644
index 00000000000..10cfb156cdf
--- /dev/null
+++ b/net/netfilter/nft_log.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netdevice.h>
+
+static const char *nft_log_null_prefix = "";
+
+struct nft_log {
+ struct nf_loginfo loginfo;
+ char *prefix;
+};
+
+static void nft_log_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_log *priv = nft_expr_priv(expr);
+ struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+ nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in,
+ pkt->out, &priv->loginfo, "%s", priv->prefix);
+}
+
+static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
+ [NFTA_LOG_GROUP] = { .type = NLA_U16 },
+ [NFTA_LOG_PREFIX] = { .type = NLA_STRING },
+ [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 },
+ [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 },
+};
+
+static int nft_log_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_log *priv = nft_expr_priv(expr);
+ struct nf_loginfo *li = &priv->loginfo;
+ const struct nlattr *nla;
+
+ nla = tb[NFTA_LOG_PREFIX];
+ if (nla != NULL) {
+ priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+ if (priv->prefix == NULL)
+ return -ENOMEM;
+ nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+ } else
+ priv->prefix = (char *)nft_log_null_prefix;
+
+ li->type = NF_LOG_TYPE_ULOG;
+ if (tb[NFTA_LOG_GROUP] != NULL)
+ li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP]));
+
+ if (tb[NFTA_LOG_SNAPLEN] != NULL)
+ li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN]));
+ if (tb[NFTA_LOG_QTHRESHOLD] != NULL) {
+ li->u.ulog.qthreshold =
+ ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD]));
+ }
+
+ return 0;
+}
+
+static void nft_log_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_log *priv = nft_expr_priv(expr);
+
+ if (priv->prefix != nft_log_null_prefix)
+ kfree(priv->prefix);
+}
+
+static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_log *priv = nft_expr_priv(expr);
+ const struct nf_loginfo *li = &priv->loginfo;
+
+ if (priv->prefix != nft_log_null_prefix)
+ if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix))
+ goto nla_put_failure;
+ if (li->u.ulog.group)
+ if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group)))
+ goto nla_put_failure;
+ if (li->u.ulog.copy_len)
+ if (nla_put_be32(skb, NFTA_LOG_SNAPLEN,
+ htonl(li->u.ulog.copy_len)))
+ goto nla_put_failure;
+ if (li->u.ulog.qthreshold)
+ if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD,
+ htons(li->u.ulog.qthreshold)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_log_type;
+static const struct nft_expr_ops nft_log_ops = {
+ .type = &nft_log_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_log)),
+ .eval = nft_log_eval,
+ .init = nft_log_init,
+ .destroy = nft_log_destroy,
+ .dump = nft_log_dump,
+};
+
+static struct nft_expr_type nft_log_type __read_mostly = {
+ .name = "log",
+ .ops = &nft_log_ops,
+ .policy = nft_log_policy,
+ .maxattr = NFTA_LOG_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_log_module_init(void)
+{
+ return nft_register_expr(&nft_log_type);
+}
+
+static void __exit nft_log_module_exit(void)
+{
+ nft_unregister_expr(&nft_log_type);
+}
+
+module_init(nft_log_module_init);
+module_exit(nft_log_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("log");
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
new file mode 100644
index 00000000000..6404a726d17
--- /dev/null
+++ b/net/netfilter/nft_lookup.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+struct nft_lookup {
+ struct nft_set *set;
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ struct nft_set_binding binding;
+};
+
+static void nft_lookup_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+ const struct nft_set *set = priv->set;
+
+ if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg]))
+ return;
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
+ [NFTA_LOOKUP_SET] = { .type = NLA_STRING },
+ [NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
+};
+
+static int nft_lookup_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_lookup *priv = nft_expr_priv(expr);
+ struct nft_set *set;
+ int err;
+
+ if (tb[NFTA_LOOKUP_SET] == NULL ||
+ tb[NFTA_LOOKUP_SREG] == NULL)
+ return -EINVAL;
+
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]);
+ if (IS_ERR(set)) {
+ if (tb[NFTA_LOOKUP_SET_ID]) {
+ set = nf_tables_set_lookup_byid(ctx->net,
+ tb[NFTA_LOOKUP_SET_ID]);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG]));
+ err = nft_validate_input_register(priv->sreg);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_LOOKUP_DREG] != NULL) {
+ if (!(set->flags & NFT_SET_MAP))
+ return -EINVAL;
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ if (set->dtype != NFT_DATA_VERDICT)
+ return -EINVAL;
+ } else if (set->dtype == NFT_DATA_VERDICT)
+ return -EINVAL;
+ } else if (set->flags & NFT_SET_MAP)
+ return -EINVAL;
+
+ err = nf_tables_bind_set(ctx, set, &priv->binding);
+ if (err < 0)
+ return err;
+
+ priv->set = set;
+ return 0;
+}
+
+static void nft_lookup_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_lookup *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+
+ if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+ if (priv->set->flags & NFT_SET_MAP)
+ if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_lookup_type;
+static const struct nft_expr_ops nft_lookup_ops = {
+ .type = &nft_lookup_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
+ .eval = nft_lookup_eval,
+ .init = nft_lookup_init,
+ .destroy = nft_lookup_destroy,
+ .dump = nft_lookup_dump,
+};
+
+static struct nft_expr_type nft_lookup_type __read_mostly = {
+ .name = "lookup",
+ .ops = &nft_lookup_ops,
+ .policy = nft_lookup_policy,
+ .maxattr = NFTA_LOOKUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_lookup_module_init(void)
+{
+ return nft_register_expr(&nft_lookup_type);
+}
+
+void nft_lookup_module_exit(void)
+{
+ nft_unregister_expr(&nft_lookup_type);
+}
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
new file mode 100644
index 00000000000..852b178c6ae
--- /dev/null
+++ b/net/netfilter/nft_meta.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+
+void nft_meta_get_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ const struct net_device *in = pkt->in, *out = pkt->out;
+ struct nft_data *dest = &data[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_LEN:
+ dest->data[0] = skb->len;
+ break;
+ case NFT_META_PROTOCOL:
+ *(__be16 *)dest->data = skb->protocol;
+ break;
+ case NFT_META_NFPROTO:
+ dest->data[0] = pkt->ops->pf;
+ break;
+ case NFT_META_L4PROTO:
+ dest->data[0] = pkt->tprot;
+ break;
+ case NFT_META_PRIORITY:
+ dest->data[0] = skb->priority;
+ break;
+ case NFT_META_MARK:
+ dest->data[0] = skb->mark;
+ break;
+ case NFT_META_IIF:
+ if (in == NULL)
+ goto err;
+ dest->data[0] = in->ifindex;
+ break;
+ case NFT_META_OIF:
+ if (out == NULL)
+ goto err;
+ dest->data[0] = out->ifindex;
+ break;
+ case NFT_META_IIFNAME:
+ if (in == NULL)
+ goto err;
+ strncpy((char *)dest->data, in->name, sizeof(dest->data));
+ break;
+ case NFT_META_OIFNAME:
+ if (out == NULL)
+ goto err;
+ strncpy((char *)dest->data, out->name, sizeof(dest->data));
+ break;
+ case NFT_META_IIFTYPE:
+ if (in == NULL)
+ goto err;
+ *(u16 *)dest->data = in->type;
+ break;
+ case NFT_META_OIFTYPE:
+ if (out == NULL)
+ goto err;
+ *(u16 *)dest->data = out->type;
+ break;
+ case NFT_META_SKUID:
+ if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+ goto err;
+
+ read_lock_bh(&skb->sk->sk_callback_lock);
+ if (skb->sk->sk_socket == NULL ||
+ skb->sk->sk_socket->file == NULL) {
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ goto err;
+ }
+
+ dest->data[0] =
+ from_kuid_munged(&init_user_ns,
+ skb->sk->sk_socket->file->f_cred->fsuid);
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ break;
+ case NFT_META_SKGID:
+ if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+ goto err;
+
+ read_lock_bh(&skb->sk->sk_callback_lock);
+ if (skb->sk->sk_socket == NULL ||
+ skb->sk->sk_socket->file == NULL) {
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ goto err;
+ }
+ dest->data[0] =
+ from_kgid_munged(&init_user_ns,
+ skb->sk->sk_socket->file->f_cred->fsgid);
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ break;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_META_RTCLASSID: {
+ const struct dst_entry *dst = skb_dst(skb);
+
+ if (dst == NULL)
+ goto err;
+ dest->data[0] = dst->tclassid;
+ break;
+ }
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+ case NFT_META_SECMARK:
+ dest->data[0] = skb->secmark;
+ break;
+#endif
+ default:
+ WARN_ON(1);
+ goto err;
+ }
+ return;
+
+err:
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_eval);
+
+void nft_meta_set_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *meta = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ u32 value = data[meta->sreg].data[0];
+
+ switch (meta->key) {
+ case NFT_META_MARK:
+ skb->mark = value;
+ break;
+ case NFT_META_PRIORITY:
+ skb->priority = value;
+ break;
+ case NFT_META_NFTRACE:
+ skb->nf_trace = 1;
+ break;
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_eval);
+
+const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+ [NFTA_META_DREG] = { .type = NLA_U32 },
+ [NFTA_META_KEY] = { .type = NLA_U32 },
+ [NFTA_META_SREG] = { .type = NLA_U32 },
+};
+EXPORT_SYMBOL_GPL(nft_meta_policy);
+
+int nft_meta_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_LEN:
+ case NFT_META_PROTOCOL:
+ case NFT_META_NFPROTO:
+ case NFT_META_L4PROTO:
+ case NFT_META_PRIORITY:
+ case NFT_META_MARK:
+ case NFT_META_IIF:
+ case NFT_META_OIF:
+ case NFT_META_IIFNAME:
+ case NFT_META_OIFNAME:
+ case NFT_META_IIFTYPE:
+ case NFT_META_OIFTYPE:
+ case NFT_META_SKUID:
+ case NFT_META_SKGID:
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_META_RTCLASSID:
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+ case NFT_META_SECMARK:
+#endif
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_init);
+
+int nft_meta_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_MARK:
+ case NFT_META_PRIORITY:
+ case NFT_META_NFTRACE:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG]));
+ err = nft_validate_input_register(priv->sreg);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_init);
+
+int nft_meta_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_dump);
+
+int nft_meta_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_dump);
+
+static struct nft_expr_type nft_meta_type;
+static const struct nft_expr_ops nft_meta_get_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_get_eval,
+ .init = nft_meta_get_init,
+ .dump = nft_meta_get_dump,
+};
+
+static const struct nft_expr_ops nft_meta_set_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_set_eval,
+ .init = nft_meta_set_init,
+ .dump = nft_meta_set_dump,
+};
+
+static const struct nft_expr_ops *
+nft_meta_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_META_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG])
+ return &nft_meta_get_ops;
+
+ if (tb[NFTA_META_SREG])
+ return &nft_meta_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_type __read_mostly = {
+ .name = "meta",
+ .select_ops = &nft_meta_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_meta_module_init(void)
+{
+ return nft_register_expr(&nft_meta_type);
+}
+
+static void __exit nft_meta_module_exit(void)
+{
+ nft_unregister_expr(&nft_meta_type);
+}
+
+module_init(nft_meta_module_init);
+module_exit(nft_meta_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
new file mode 100644
index 00000000000..79ff58cd36d
--- /dev/null
+++ b/net/netfilter/nft_nat.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/string.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+ enum nft_registers sreg_addr_min:8;
+ enum nft_registers sreg_addr_max:8;
+ enum nft_registers sreg_proto_min:8;
+ enum nft_registers sreg_proto_max:8;
+ enum nf_nat_manip_type type:8;
+ u8 family;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+ struct nf_nat_range range;
+
+ memset(&range, 0, sizeof(range));
+ if (priv->sreg_addr_min) {
+ if (priv->family == AF_INET) {
+ range.min_addr.ip = (__force __be32)
+ data[priv->sreg_addr_min].data[0];
+ range.max_addr.ip = (__force __be32)
+ data[priv->sreg_addr_max].data[0];
+
+ } else {
+ memcpy(range.min_addr.ip6,
+ data[priv->sreg_addr_min].data,
+ sizeof(struct nft_data));
+ memcpy(range.max_addr.ip6,
+ data[priv->sreg_addr_max].data,
+ sizeof(struct nft_data));
+ }
+ range.flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (priv->sreg_proto_min) {
+ range.min_proto.all = (__force __be16)
+ data[priv->sreg_proto_min].data[0];
+ range.max_proto.all = (__force __be16)
+ data[priv->sreg_proto_max].data[0];
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+
+ data[NFT_REG_VERDICT].verdict =
+ nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+ [NFTA_NAT_TYPE] = { .type = NLA_U32 },
+ [NFTA_NAT_FAMILY] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_nat *priv = nft_expr_priv(expr);
+ u32 family;
+ int err;
+
+ if (tb[NFTA_NAT_TYPE] == NULL)
+ return -EINVAL;
+
+ switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+ case NFT_NAT_SNAT:
+ priv->type = NF_NAT_MANIP_SRC;
+ break;
+ case NFT_NAT_DNAT:
+ priv->type = NF_NAT_MANIP_DST;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_NAT_FAMILY] == NULL)
+ return -EINVAL;
+
+ family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+ if (family != ctx->afi->family)
+ return -EOPNOTSUPP;
+ priv->family = family;
+
+ if (tb[NFTA_NAT_REG_ADDR_MIN]) {
+ priv->sreg_addr_min = ntohl(nla_get_be32(
+ tb[NFTA_NAT_REG_ADDR_MIN]));
+ err = nft_validate_input_register(priv->sreg_addr_min);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[NFTA_NAT_REG_ADDR_MAX]) {
+ priv->sreg_addr_max = ntohl(nla_get_be32(
+ tb[NFTA_NAT_REG_ADDR_MAX]));
+ err = nft_validate_input_register(priv->sreg_addr_max);
+ if (err < 0)
+ return err;
+ } else
+ priv->sreg_addr_max = priv->sreg_addr_min;
+
+ if (tb[NFTA_NAT_REG_PROTO_MIN]) {
+ priv->sreg_proto_min = ntohl(nla_get_be32(
+ tb[NFTA_NAT_REG_PROTO_MIN]));
+ err = nft_validate_input_register(priv->sreg_proto_min);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[NFTA_NAT_REG_PROTO_MAX]) {
+ priv->sreg_proto_max = ntohl(nla_get_be32(
+ tb[NFTA_NAT_REG_PROTO_MAX]));
+ err = nft_validate_input_register(priv->sreg_proto_max);
+ if (err < 0)
+ return err;
+ } else
+ priv->sreg_proto_max = priv->sreg_proto_min;
+
+ return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NF_NAT_MANIP_SRC:
+ if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+ goto nla_put_failure;
+ break;
+ case NF_NAT_MANIP_DST:
+ if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb,
+ NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb,
+ NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max)))
+ goto nla_put_failure;
+ if (priv->sreg_proto_min) {
+ if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN,
+ htonl(priv->sreg_proto_min)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX,
+ htonl(priv->sreg_proto_max)))
+ goto nla_put_failure;
+ }
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+ .type = &nft_nat_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+ .eval = nft_nat_eval,
+ .init = nft_nat_init,
+ .dump = nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+ .name = "nat",
+ .ops = &nft_nat_ops,
+ .policy = nft_nat_policy,
+ .maxattr = NFTA_NAT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_nat_module_init(void)
+{
+ return nft_register_expr(&nft_nat_type);
+}
+
+static void __exit nft_nat_module_exit(void)
+{
+ nft_unregister_expr(&nft_nat_type);
+}
+
+module_init(nft_nat_module_init);
+module_exit(nft_nat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
new file mode 100644
index 00000000000..85daa84bfdf
--- /dev/null
+++ b/net/netfilter/nft_payload.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+static void nft_payload_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ struct nft_data *dest = &data[priv->dreg];
+ int offset;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!skb_mac_header_was_set(skb))
+ goto err;
+ offset = skb_mac_header(skb) - skb->data;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ offset = skb_network_offset(skb);
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ offset = pkt->xt.thoff;
+ break;
+ default:
+ BUG();
+ }
+ offset += priv->offset;
+
+ if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0)
+ goto err;
+ return;
+err:
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
+ [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+};
+
+static int nft_payload_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+
+ priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG]));
+ err = nft_validate_output_register(priv->dreg);
+ if (err < 0)
+ return err;
+ return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_payload_type;
+static const struct nft_expr_ops nft_payload_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .eval = nft_payload_eval,
+ .init = nft_payload_init,
+ .dump = nft_payload_dump,
+};
+
+const struct nft_expr_ops nft_payload_fast_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .eval = nft_payload_eval,
+ .init = nft_payload_init,
+ .dump = nft_payload_dump,
+};
+
+static const struct nft_expr_ops *
+nft_payload_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_payload_bases base;
+ unsigned int offset, len;
+
+ if (tb[NFTA_PAYLOAD_DREG] == NULL ||
+ tb[NFTA_PAYLOAD_BASE] == NULL ||
+ tb[NFTA_PAYLOAD_OFFSET] == NULL ||
+ tb[NFTA_PAYLOAD_LEN] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ switch (base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ break;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data))
+ return ERR_PTR(-EINVAL);
+
+ if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) &&
+ base != NFT_PAYLOAD_LL_HEADER)
+ return &nft_payload_fast_ops;
+ else
+ return &nft_payload_ops;
+}
+
+static struct nft_expr_type nft_payload_type __read_mostly = {
+ .name = "payload",
+ .select_ops = nft_payload_select_ops,
+ .policy = nft_payload_policy,
+ .maxattr = NFTA_PAYLOAD_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_payload_module_init(void)
+{
+ return nft_register_expr(&nft_payload_type);
+}
+
+void nft_payload_module_exit(void)
+{
+ nft_unregister_expr(&nft_payload_type);
+}
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
new file mode 100644
index 00000000000..e8ae2f6bf23
--- /dev/null
+++ b/net/netfilter/nft_queue.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code partly funded by OISF
+ * (http://www.openinfosecfoundation.org/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/jhash.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_queue.h>
+
+static u32 jhash_initval __read_mostly;
+
+struct nft_queue {
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
+};
+
+static void nft_queue_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+ u32 queue = priv->queuenum;
+ u32 ret;
+
+ if (priv->queues_total > 1) {
+ if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) {
+ int cpu = smp_processor_id();
+
+ queue = priv->queuenum + cpu % priv->queues_total;
+ } else {
+ queue = nfqueue_hash(pkt->skb, queue,
+ priv->queues_total, pkt->ops->pf,
+ jhash_initval);
+ }
+ }
+
+ ret = NF_QUEUE_NR(queue);
+ if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ data[NFT_REG_VERDICT].verdict = ret;
+}
+
+static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
+ [NFTA_QUEUE_NUM] = { .type = NLA_U16 },
+ [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
+ [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 },
+};
+
+static int nft_queue_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_QUEUE_NUM] == NULL)
+ return -EINVAL;
+
+ init_hashrandom(&jhash_initval);
+ priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
+
+ if (tb[NFTA_QUEUE_TOTAL] != NULL)
+ priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
+ if (tb[NFTA_QUEUE_FLAGS] != NULL) {
+ priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
+ if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_queue *priv = nft_expr_priv(expr);
+
+ if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) ||
+ nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) ||
+ nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_queue_type;
+static const struct nft_expr_ops nft_queue_ops = {
+ .type = &nft_queue_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)),
+ .eval = nft_queue_eval,
+ .init = nft_queue_init,
+ .dump = nft_queue_dump,
+};
+
+static struct nft_expr_type nft_queue_type __read_mostly = {
+ .name = "queue",
+ .ops = &nft_queue_ops,
+ .policy = nft_queue_policy,
+ .maxattr = NFTA_QUEUE_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_queue_module_init(void)
+{
+ return nft_register_expr(&nft_queue_type);
+}
+
+static void __exit nft_queue_module_exit(void)
+{
+ nft_unregister_expr(&nft_queue_type);
+}
+
+module_init(nft_queue_module_init);
+module_exit(nft_queue_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Leblond <eric@regit.org>");
+MODULE_ALIAS_NFT_EXPR("queue");
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
new file mode 100644
index 00000000000..e1836ff8819
--- /dev/null
+++ b/net/netfilter/nft_rbtree.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static DEFINE_SPINLOCK(nft_rbtree_lock);
+
+struct nft_rbtree {
+ struct rb_root root;
+};
+
+struct nft_rbtree_elem {
+ struct rb_node node;
+ u16 flags;
+ struct nft_data key;
+ struct nft_data data[];
+};
+
+static bool nft_rbtree_lookup(const struct nft_set *set,
+ const struct nft_data *key,
+ struct nft_data *data)
+{
+ const struct nft_rbtree *priv = nft_set_priv(set);
+ const struct nft_rbtree_elem *rbe, *interval = NULL;
+ const struct rb_node *parent = priv->root.rb_node;
+ int d;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ while (parent != NULL) {
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+ d = nft_data_cmp(&rbe->key, key, set->klen);
+ if (d < 0) {
+ parent = parent->rb_left;
+ interval = rbe;
+ } else if (d > 0)
+ parent = parent->rb_right;
+ else {
+found:
+ if (rbe->flags & NFT_SET_ELEM_INTERVAL_END)
+ goto out;
+ if (set->flags & NFT_SET_MAP)
+ nft_data_copy(data, rbe->data);
+
+ spin_unlock_bh(&nft_rbtree_lock);
+ return true;
+ }
+ }
+
+ if (set->flags & NFT_SET_INTERVAL && interval != NULL) {
+ rbe = interval;
+ goto found;
+ }
+out:
+ spin_unlock_bh(&nft_rbtree_lock);
+ return false;
+}
+
+static void nft_rbtree_elem_destroy(const struct nft_set *set,
+ struct nft_rbtree_elem *rbe)
+{
+ nft_data_uninit(&rbe->key, NFT_DATA_VALUE);
+ if (set->flags & NFT_SET_MAP &&
+ !(rbe->flags & NFT_SET_ELEM_INTERVAL_END))
+ nft_data_uninit(rbe->data, set->dtype);
+
+ kfree(rbe);
+}
+
+static int __nft_rbtree_insert(const struct nft_set *set,
+ struct nft_rbtree_elem *new)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *parent, **p;
+ int d;
+
+ parent = NULL;
+ p = &priv->root.rb_node;
+ while (*p != NULL) {
+ parent = *p;
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+ d = nft_data_cmp(&rbe->key, &new->key, set->klen);
+ if (d < 0)
+ p = &parent->rb_left;
+ else if (d > 0)
+ p = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, &priv->root);
+ return 0;
+}
+
+static int nft_rbtree_insert(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_rbtree_elem *rbe;
+ unsigned int size;
+ int err;
+
+ size = sizeof(*rbe);
+ if (set->flags & NFT_SET_MAP &&
+ !(elem->flags & NFT_SET_ELEM_INTERVAL_END))
+ size += sizeof(rbe->data[0]);
+
+ rbe = kzalloc(size, GFP_KERNEL);
+ if (rbe == NULL)
+ return -ENOMEM;
+
+ rbe->flags = elem->flags;
+ nft_data_copy(&rbe->key, &elem->key);
+ if (set->flags & NFT_SET_MAP &&
+ !(rbe->flags & NFT_SET_ELEM_INTERVAL_END))
+ nft_data_copy(rbe->data, &elem->data);
+
+ spin_lock_bh(&nft_rbtree_lock);
+ err = __nft_rbtree_insert(set, rbe);
+ if (err < 0)
+ kfree(rbe);
+
+ spin_unlock_bh(&nft_rbtree_lock);
+ return err;
+}
+
+static void nft_rbtree_remove(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe = elem->cookie;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ rb_erase(&rbe->node, &priv->root);
+ spin_unlock_bh(&nft_rbtree_lock);
+ kfree(rbe);
+}
+
+static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem)
+{
+ const struct nft_rbtree *priv = nft_set_priv(set);
+ const struct rb_node *parent = priv->root.rb_node;
+ struct nft_rbtree_elem *rbe;
+ int d;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ while (parent != NULL) {
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+ d = nft_data_cmp(&rbe->key, &elem->key, set->klen);
+ if (d < 0)
+ parent = parent->rb_left;
+ else if (d > 0)
+ parent = parent->rb_right;
+ else {
+ elem->cookie = rbe;
+ if (set->flags & NFT_SET_MAP &&
+ !(rbe->flags & NFT_SET_ELEM_INTERVAL_END))
+ nft_data_copy(&elem->data, rbe->data);
+ elem->flags = rbe->flags;
+ spin_unlock_bh(&nft_rbtree_lock);
+ return 0;
+ }
+ }
+ spin_unlock_bh(&nft_rbtree_lock);
+ return -ENOENT;
+}
+
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ const struct nft_rbtree *priv = nft_set_priv(set);
+ const struct nft_rbtree_elem *rbe;
+ struct nft_set_elem elem;
+ struct rb_node *node;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+ if (iter->count < iter->skip)
+ goto cont;
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ nft_data_copy(&elem.key, &rbe->key);
+ if (set->flags & NFT_SET_MAP &&
+ !(rbe->flags & NFT_SET_ELEM_INTERVAL_END))
+ nft_data_copy(&elem.data, rbe->data);
+ elem.flags = rbe->flags;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+ if (iter->err < 0) {
+ spin_unlock_bh(&nft_rbtree_lock);
+ return;
+ }
+cont:
+ iter->count++;
+ }
+ spin_unlock_bh(&nft_rbtree_lock);
+}
+
+static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[])
+{
+ return sizeof(struct nft_rbtree);
+}
+
+static int nft_rbtree_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ priv->root = RB_ROOT;
+ return 0;
+}
+
+static void nft_rbtree_destroy(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *node;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ while ((node = priv->root.rb_node) != NULL) {
+ rb_erase(node, &priv->root);
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ nft_rbtree_elem_destroy(set, rbe);
+ }
+ spin_unlock_bh(&nft_rbtree_lock);
+}
+
+static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int nsize;
+
+ nsize = sizeof(struct nft_rbtree_elem);
+ if (features & NFT_SET_MAP)
+ nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]);
+
+ if (desc->size)
+ est->size = sizeof(struct nft_rbtree) + desc->size * nsize;
+ else
+ est->size = nsize;
+
+ est->class = NFT_SET_CLASS_O_LOG_N;
+
+ return true;
+}
+
+static struct nft_set_ops nft_rbtree_ops __read_mostly = {
+ .privsize = nft_rbtree_privsize,
+ .estimate = nft_rbtree_estimate,
+ .init = nft_rbtree_init,
+ .destroy = nft_rbtree_destroy,
+ .insert = nft_rbtree_insert,
+ .remove = nft_rbtree_remove,
+ .get = nft_rbtree_get,
+ .lookup = nft_rbtree_lookup,
+ .walk = nft_rbtree_walk,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_rbtree_module_init(void)
+{
+ return nft_register_set(&nft_rbtree_ops);
+}
+
+static void __exit nft_rbtree_module_exit(void)
+{
+ nft_unregister_set(&nft_rbtree_ops);
+}
+
+module_init(nft_rbtree_module_init);
+module_exit(nft_rbtree_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
new file mode 100644
index 00000000000..f3448c29644
--- /dev/null
+++ b/net/netfilter/nft_reject.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+
+const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
+ [NFTA_REJECT_TYPE] = { .type = NLA_U32 },
+ [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 },
+};
+EXPORT_SYMBOL_GPL(nft_reject_policy);
+
+int nft_reject_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_REJECT_TYPE] == NULL)
+ return -EINVAL;
+
+ priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
+ return -EINVAL;
+ priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ case NFT_REJECT_TCP_RST:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_reject_init);
+
+int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_reject *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
+ goto nla_put_failure;
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
+ goto nla_put_failure;
+ break;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_reject_dump);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
new file mode 100644
index 00000000000..b718a52a465
--- /dev/null
+++ b/net/netfilter/nft_reject_inet.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+
+static void nft_reject_inet_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ switch (pkt->ops->pf) {
+ case NFPROTO_IPV4:
+ return nft_reject_ipv4_eval(expr, data, pkt);
+ case NFPROTO_IPV6:
+ return nft_reject_ipv6_eval(expr, data, pkt);
+ }
+}
+
+static struct nft_expr_type nft_reject_inet_type;
+static const struct nft_expr_ops nft_reject_inet_ops = {
+ .type = &nft_reject_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_inet_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+};
+
+static struct nft_expr_type nft_reject_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "reject",
+ .ops = &nft_reject_inet_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_inet_module_init(void)
+{
+ return nft_register_expr(&nft_reject_inet_type);
+}
+
+static void __exit nft_reject_inet_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_inet_type);
+}
+
+module_init(nft_reject_inet_module_init);
+module_exit(nft_reject_inet_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(1, "reject");
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 80463507420..227aa11e840 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -2,6 +2,7 @@
* x_tables core - Backend for {ip,ip6,arp}_tables
*
* Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org>
+ * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on existing ip_tables code which is
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
@@ -14,6 +15,7 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/proc_fs.h>
@@ -23,6 +25,7 @@
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/audit.h>
#include <net/net_namespace.h>
#include <linux/netfilter/x_tables.h>
@@ -38,9 +41,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
struct compat_delta {
- struct compat_delta *next;
- unsigned int offset;
- int delta;
+ unsigned int offset; /* offset in kernel */
+ int delta; /* delta in 32bit user land */
};
struct xt_af {
@@ -49,7 +51,9 @@ struct xt_af {
struct list_head target;
#ifdef CONFIG_COMPAT
struct mutex compat_mutex;
- struct compat_delta *compat_offsets;
+ struct compat_delta *compat_tab;
+ unsigned int number; /* number of slots in compat_tab[] */
+ unsigned int cur; /* number of used slots in compat_tab[] */
#endif
};
@@ -181,14 +185,14 @@ EXPORT_SYMBOL(xt_unregister_matches);
/*
* These are weird, but module loading must not be done with mutex
* held (since they will register), and we have to have a single
- * function to use try_then_request_module().
+ * function to use.
*/
/* Find match, grabs ref. Returns ERR_PTR() on error. */
struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
{
struct xt_match *m;
- int err = 0;
+ int err = -ENOENT;
if (mutex_lock_interruptible(&xt[af].mutex) != 0)
return ERR_PTR(-EINTR);
@@ -219,9 +223,13 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision)
{
struct xt_match *match;
- match = try_then_request_module(xt_find_match(nfproto, name, revision),
- "%st_%s", xt_prefix[nfproto], name);
- return (match != NULL) ? match : ERR_PTR(-ENOENT);
+ match = xt_find_match(nfproto, name, revision);
+ if (IS_ERR(match)) {
+ request_module("%st_%s", xt_prefix[nfproto], name);
+ match = xt_find_match(nfproto, name, revision);
+ }
+
+ return match;
}
EXPORT_SYMBOL_GPL(xt_request_find_match);
@@ -229,7 +237,7 @@ EXPORT_SYMBOL_GPL(xt_request_find_match);
struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
{
struct xt_target *t;
- int err = 0;
+ int err = -ENOENT;
if (mutex_lock_interruptible(&xt[af].mutex) != 0)
return ERR_PTR(-EINTR);
@@ -259,9 +267,13 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
{
struct xt_target *target;
- target = try_then_request_module(xt_find_target(af, name, revision),
- "%st_%s", xt_prefix[af], name);
- return (target != NULL) ? target : ERR_PTR(-ENOENT);
+ target = xt_find_target(af, name, revision);
+ if (IS_ERR(target)) {
+ request_module("%st_%s", xt_prefix[af], name);
+ target = xt_find_target(af, name, revision);
+ }
+
+ return target;
}
EXPORT_SYMBOL_GPL(xt_request_find_target);
@@ -334,19 +346,27 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
}
EXPORT_SYMBOL_GPL(xt_find_revision);
-static char *textify_hooks(char *buf, size_t size, unsigned int mask)
+static char *
+textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto)
{
- static const char *const names[] = {
+ static const char *const inetbr_names[] = {
"PREROUTING", "INPUT", "FORWARD",
"OUTPUT", "POSTROUTING", "BROUTING",
};
- unsigned int i;
+ static const char *const arp_names[] = {
+ "INPUT", "FORWARD", "OUTPUT",
+ };
+ const char *const *names;
+ unsigned int i, max;
char *p = buf;
bool np = false;
int res;
+ names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names;
+ max = (nfproto == NFPROTO_ARP) ? ARRAY_SIZE(arp_names) :
+ ARRAY_SIZE(inetbr_names);
*p = '\0';
- for (i = 0; i < ARRAY_SIZE(names); ++i) {
+ for (i = 0; i < max; ++i) {
if (!(mask & (1 << i)))
continue;
res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]);
@@ -391,8 +411,10 @@ int xt_check_match(struct xt_mtchk_param *par,
pr_err("%s_tables: %s match: used from hooks %s, but only "
"valid from %s\n",
xt_prefix[par->family], par->match->name,
- textify_hooks(used, sizeof(used), par->hook_mask),
- textify_hooks(allow, sizeof(allow), par->match->hooks));
+ textify_hooks(used, sizeof(used), par->hook_mask,
+ par->family),
+ textify_hooks(allow, sizeof(allow), par->match->hooks,
+ par->family));
return -EINVAL;
}
if (par->match->proto && (par->match->proto != proto || inv_proto)) {
@@ -414,54 +436,67 @@ int xt_check_match(struct xt_mtchk_param *par,
EXPORT_SYMBOL_GPL(xt_check_match);
#ifdef CONFIG_COMPAT
-int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta)
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
- struct compat_delta *tmp;
+ struct xt_af *xp = &xt[af];
- tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!xp->compat_tab) {
+ if (!xp->number)
+ return -EINVAL;
+ xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
+ if (!xp->compat_tab)
+ return -ENOMEM;
+ xp->cur = 0;
+ }
- tmp->offset = offset;
- tmp->delta = delta;
+ if (xp->cur >= xp->number)
+ return -EINVAL;
- if (xt[af].compat_offsets) {
- tmp->next = xt[af].compat_offsets->next;
- xt[af].compat_offsets->next = tmp;
- } else {
- xt[af].compat_offsets = tmp;
- tmp->next = NULL;
- }
+ if (xp->cur)
+ delta += xp->compat_tab[xp->cur - 1].delta;
+ xp->compat_tab[xp->cur].offset = offset;
+ xp->compat_tab[xp->cur].delta = delta;
+ xp->cur++;
return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_add_offset);
void xt_compat_flush_offsets(u_int8_t af)
{
- struct compat_delta *tmp, *next;
-
- if (xt[af].compat_offsets) {
- for (tmp = xt[af].compat_offsets; tmp; tmp = next) {
- next = tmp->next;
- kfree(tmp);
- }
- xt[af].compat_offsets = NULL;
+ if (xt[af].compat_tab) {
+ vfree(xt[af].compat_tab);
+ xt[af].compat_tab = NULL;
+ xt[af].number = 0;
+ xt[af].cur = 0;
}
}
EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
{
- struct compat_delta *tmp;
- int delta;
-
- for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next)
- if (tmp->offset < offset)
- delta += tmp->delta;
- return delta;
+ struct compat_delta *tmp = xt[af].compat_tab;
+ int mid, left = 0, right = xt[af].cur - 1;
+
+ while (left <= right) {
+ mid = (left + right) >> 1;
+ if (offset > tmp[mid].offset)
+ left = mid + 1;
+ else if (offset < tmp[mid].offset)
+ right = mid - 1;
+ else
+ return mid ? tmp[mid - 1].delta : 0;
+ }
+ return left ? tmp[left - 1].delta : 0;
}
EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
+void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+{
+ xt[af].number = number;
+ xt[af].cur = 0;
+}
+EXPORT_SYMBOL(xt_compat_init_offsets);
+
int xt_compat_match_offset(const struct xt_match *match)
{
u_int16_t csize = match->compatsize ? : match->matchsize;
@@ -551,8 +586,10 @@ int xt_check_target(struct xt_tgchk_param *par,
pr_err("%s_tables: %s target: used from hooks %s, but only "
"usable from %s\n",
xt_prefix[par->family], par->target->name,
- textify_hooks(used, sizeof(used), par->hook_mask),
- textify_hooks(allow, sizeof(allow), par->target->hooks));
+ textify_hooks(used, sizeof(used), par->hook_mask,
+ par->family),
+ textify_hooks(allow, sizeof(allow), par->target->hooks,
+ par->family));
return -EINVAL;
}
if (par->target->proto && (par->target->proto != proto || inv_proto)) {
@@ -739,8 +776,8 @@ void xt_compat_unlock(u_int8_t af)
EXPORT_SYMBOL_GPL(xt_compat_unlock);
#endif
-DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
-EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
@@ -753,12 +790,11 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
size = sizeof(void **) * nr_cpu_ids;
if (size > PAGE_SIZE)
- i->jumpstack = vmalloc(size);
+ i->jumpstack = vzalloc(size);
else
- i->jumpstack = kmalloc(size, GFP_KERNEL);
+ i->jumpstack = kzalloc(size, GFP_KERNEL);
if (i->jumpstack == NULL)
return -ENOMEM;
- memset(i->jumpstack, 0, size);
i->stacksize *= xt_jumpstack_multiplier;
size = sizeof(void *) * i->stacksize;
@@ -809,8 +845,13 @@ xt_replace_table(struct xt_table *table,
return NULL;
}
- table->private = newinfo;
newinfo->initial_entries = private->initial_entries;
+ /*
+ * Ensure contents of newinfo are visible before assigning to
+ * private.
+ */
+ smp_wmb();
+ table->private = newinfo;
/*
* Even though table entries have now been swapped, other CPU's
@@ -820,6 +861,21 @@ xt_replace_table(struct xt_table *table,
*/
local_bh_enable();
+#ifdef CONFIG_AUDIT
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_NETFILTER_CFG);
+ if (ab) {
+ audit_log_format(ab, "table=%s family=%u entries=%u",
+ table->name, table->af,
+ private->number);
+ audit_log_end(ab);
+ }
+ }
+#endif
+
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -949,7 +1005,7 @@ static int xt_table_open(struct inode *inode, struct file *file)
sizeof(struct xt_names_priv));
if (!ret) {
priv = ((struct seq_file *)file->private_data)->private;
- priv->af = (unsigned long)PDE(inode)->data;
+ priv->af = (unsigned long)PDE_DATA(inode);
}
return ret;
}
@@ -1097,7 +1153,7 @@ static int xt_match_open(struct inode *inode, struct file *file)
seq = file->private_data;
seq->private = trav;
- trav->nfproto = (unsigned long)PDE(inode)->data;
+ trav->nfproto = (unsigned long)PDE_DATA(inode);
return 0;
}
@@ -1161,7 +1217,7 @@ static int xt_target_open(struct inode *inode, struct file *file)
seq = file->private_data;
seq->private = trav;
- trav->nfproto = (unsigned long)PDE(inode)->data;
+ trav->nfproto = (unsigned long)PDE_DATA(inode);
return 0;
}
@@ -1273,12 +1329,12 @@ int xt_proto_init(struct net *net, u_int8_t af)
out_remove_matches:
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
- proc_net_remove(net, buf);
+ remove_proc_entry(buf, net->proc_net);
out_remove_tables:
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
- proc_net_remove(net, buf);
+ remove_proc_entry(buf, net->proc_net);
out:
return -1;
#endif
@@ -1292,15 +1348,15 @@ void xt_proto_fini(struct net *net, u_int8_t af)
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
- proc_net_remove(net, buf);
+ remove_proc_entry(buf, net->proc_net);
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
- proc_net_remove(net, buf);
+ remove_proc_entry(buf, net->proc_net);
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
- proc_net_remove(net, buf);
+ remove_proc_entry(buf, net->proc_net);
#endif /*CONFIG_PROC_FS*/
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
@@ -1324,9 +1380,7 @@ static int __init xt_init(void)
int rv;
for_each_possible_cpu(i) {
- struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
- spin_lock_init(&lock->lock);
- lock->readers = 0;
+ seqcount_init(&per_cpu(xt_recseq, i));
}
xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
@@ -1337,7 +1391,7 @@ static int __init xt_init(void)
mutex_init(&xt[i].mutex);
#ifdef CONFIG_COMPAT
mutex_init(&xt[i].compat_mutex);
- xt[i].compat_offsets = NULL;
+ xt[i].compat_tab = NULL;
#endif
INIT_LIST_HEAD(&xt[i].target);
INIT_LIST_HEAD(&xt[i].match);
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
new file mode 100644
index 00000000000..4973cbddc44
--- /dev/null
+++ b/net/netfilter/xt_AUDIT.c
@@ -0,0 +1,231 @@
+/*
+ * Creates audit record for dropped/accepted packets
+ *
+ * (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
+ * (C) 2010-2011 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/audit.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_AUDIT.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>");
+MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets");
+MODULE_ALIAS("ipt_AUDIT");
+MODULE_ALIAS("ip6t_AUDIT");
+MODULE_ALIAS("ebt_AUDIT");
+MODULE_ALIAS("arpt_AUDIT");
+
+static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb,
+ unsigned int proto, unsigned int offset)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ const __be16 *pptr;
+ __be16 _ports[2];
+
+ pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " sport=%hu dport=%hu",
+ ntohs(pptr[0]), ntohs(pptr[1]));
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6: {
+ const u8 *iptr;
+ u8 _ih[2];
+
+ iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih);
+ if (iptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu",
+ iptr[0], iptr[1]);
+
+ }
+ break;
+ }
+}
+
+static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu",
+ &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol);
+
+ if (ntohs(ih->frag_off) & IP_OFFSET) {
+ audit_log_format(ab, " frag=1");
+ return;
+ }
+
+ audit_proto(ab, skb, ih->protocol, ih->ihl * 4);
+}
+
+static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ u8 nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ nexthdr = ih->nexthdr;
+ offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
+ &nexthdr, &frag_off);
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+ &ih->saddr, &ih->daddr, nexthdr);
+
+ if (offset)
+ audit_proto(ab, skb, nexthdr, offset);
+}
+
+static unsigned int
+audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+ struct audit_buffer *ab;
+
+ if (audit_enabled == 0)
+ goto errout;
+
+ ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+ if (ab == NULL)
+ goto errout;
+
+ audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
+ info->type, par->hooknum, skb->len,
+ par->in ? par->in->name : "?",
+ par->out ? par->out->name : "?");
+
+ if (skb->mark)
+ audit_log_format(ab, " mark=%#x", skb->mark);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+
+ if (par->family == NFPROTO_BRIDGE) {
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ audit_ip4(ab, skb);
+ break;
+
+ case htons(ETH_P_IPV6):
+ audit_ip6(ab, skb);
+ break;
+ }
+ }
+ }
+
+ switch (par->family) {
+ case NFPROTO_IPV4:
+ audit_ip4(ab, skb);
+ break;
+
+ case NFPROTO_IPV6:
+ audit_ip6(ab, skb);
+ break;
+ }
+
+#ifdef CONFIG_NETWORK_SECMARK
+ if (skb->secmark)
+ audit_log_secctx(ab, skb->secmark);
+#endif
+
+ audit_log_end(ab);
+
+errout:
+ return XT_CONTINUE;
+}
+
+static unsigned int
+audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ audit_tg(skb, par);
+ return EBT_CONTINUE;
+}
+
+static int audit_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+
+ if (info->type > XT_AUDIT_TYPE_MAX) {
+ pr_info("Audit type out of range (valid range: 0..%hhu)\n",
+ XT_AUDIT_TYPE_MAX);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static struct xt_target audit_tg_reg[] __read_mostly = {
+ {
+ .name = "AUDIT",
+ .family = NFPROTO_UNSPEC,
+ .target = audit_tg,
+ .targetsize = sizeof(struct xt_audit_info),
+ .checkentry = audit_tg_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "AUDIT",
+ .family = NFPROTO_BRIDGE,
+ .target = audit_tg_ebt,
+ .targetsize = sizeof(struct xt_audit_info),
+ .checkentry = audit_tg_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init audit_tg_init(void)
+{
+ return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg));
+}
+
+static void __exit audit_tg_exit(void)
+{
+ xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg));
+}
+
+module_init(audit_tg_init);
+module_exit(audit_tg_exit);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index c2c0e4abeb9..af9c4dadf81 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -19,12 +19,14 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_CLASSIFY.h>
+#include <linux/netfilter_arp.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Xtables: Qdisc classification");
MODULE_ALIAS("ipt_CLASSIFY");
MODULE_ALIAS("ip6t_CLASSIFY");
+MODULE_ALIAS("arpt_CLASSIFY");
static unsigned int
classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
@@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static struct xt_target classify_tg_reg __read_mostly = {
- .name = "CLASSIFY",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "mangle",
- .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
- (1 << NF_INET_POST_ROUTING),
- .target = classify_tg,
- .targetsize = sizeof(struct xt_classify_target_info),
- .me = THIS_MODULE,
+static struct xt_target classify_tg_reg[] __read_mostly = {
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_ARP,
+ .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
};
static int __init classify_tg_init(void)
{
- return xt_register_target(&classify_tg_reg);
+ return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
}
static void __exit classify_tg_exit(void)
{
- xt_unregister_target(&classify_tg_reg);
+ xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
}
module_init(classify_tg_init);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 782e51986a6..75747aecdeb 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -5,7 +5,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/skbuff.h>
@@ -14,20 +14,21 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_CT.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
-static unsigned int xt_ct_target(struct sk_buff *skb,
- const struct xt_action_param *par)
+static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
{
- const struct xt_ct_target_info *info = par->targinfo;
- struct nf_conn *ct = info->ct;
-
/* Previously seen (loopback)? Ignore. */
if (skb->nfct != NULL)
return XT_CONTINUE;
+ /* special case the untracked ct : we want the percpu object */
+ if (!ct)
+ ct = nf_ct_untracked_get();
atomic_inc(&ct->ct_general.use);
skb->nfct = &ct->ct_general;
skb->nfctinfo = IP_CT_NEW;
@@ -35,6 +36,24 @@ static unsigned int xt_ct_target(struct sk_buff *skb,
return XT_CONTINUE;
}
+static unsigned int xt_ct_target_v0(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct xt_ct_target_info *info = par->targinfo;
+ struct nf_conn *ct = info->ct;
+
+ return xt_ct_target(skb, ct);
+}
+
+static unsigned int xt_ct_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct xt_ct_target_info_v1 *info = par->targinfo;
+ struct nf_conn *ct = info->ct;
+
+ return xt_ct_target(skb, ct);
+}
+
static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)
{
if (par->family == NFPROTO_IPV4) {
@@ -53,21 +72,124 @@ static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)
return 0;
}
-static int xt_ct_tg_check(const struct xt_tgchk_param *par)
+static int
+xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
+ const struct xt_tgchk_param *par)
{
- struct xt_ct_target_info *info = par->targinfo;
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_helper *helper;
struct nf_conn_help *help;
- struct nf_conn *ct;
+ u8 proto;
+
+ proto = xt_ct_find_proto(par);
+ if (!proto) {
+ pr_info("You must specify a L4 protocol, and not use "
+ "inversions on it.\n");
+ return -ENOENT;
+ }
+
+ helper = nf_conntrack_helper_try_module_get(helper_name, par->family,
+ proto);
+ if (helper == NULL) {
+ pr_info("No such helper \"%s\"\n", helper_name);
+ return -ENOENT;
+ }
+
+ help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+ if (help == NULL) {
+ module_put(helper->me);
+ return -ENOMEM;
+ }
+
+ help->helper = helper;
+ return 0;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout)
+{
+ typeof(nf_ct_timeout_put_hook) timeout_put;
+
+ timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ if (timeout_put)
+ timeout_put(timeout);
+}
+#endif
+
+static int
+xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
+ const char *timeout_name)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ struct ctnl_timeout *timeout;
+ struct nf_conn_timeout *timeout_ext;
+ struct nf_conntrack_l4proto *l4proto;
int ret = 0;
u8 proto;
- if (info->flags & ~XT_CT_NOTRACK)
- return -EINVAL;
+ rcu_read_lock();
+ timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
+ if (timeout_find_get == NULL) {
+ ret = -ENOENT;
+ pr_info("Timeout policy base is empty\n");
+ goto out;
+ }
+
+ proto = xt_ct_find_proto(par);
+ if (!proto) {
+ ret = -EINVAL;
+ pr_info("You must specify a L4 protocol, and not use "
+ "inversions on it.\n");
+ goto out;
+ }
+
+ timeout = timeout_find_get(timeout_name);
+ if (timeout == NULL) {
+ ret = -ENOENT;
+ pr_info("No such timeout policy \"%s\"\n", timeout_name);
+ goto out;
+ }
+
+ if (timeout->l3num != par->family) {
+ ret = -EINVAL;
+ pr_info("Timeout policy `%s' can only be used by L3 protocol "
+ "number %d\n", timeout_name, timeout->l3num);
+ goto err_put_timeout;
+ }
+ /* Make sure the timeout policy matches any existing protocol tracker,
+ * otherwise default to generic.
+ */
+ l4proto = __nf_ct_l4proto_find(par->family, proto);
+ if (timeout->l4proto->l4proto != l4proto->l4proto) {
+ ret = -EINVAL;
+ pr_info("Timeout policy `%s' can only be used by L4 protocol "
+ "number %d\n",
+ timeout_name, timeout->l4proto->l4proto);
+ goto err_put_timeout;
+ }
+ timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
+ if (timeout_ext == NULL)
+ ret = -ENOMEM;
+
+err_put_timeout:
+ __xt_ct_tg_timeout_put(timeout);
+out:
+ rcu_read_unlock();
+ return ret;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int xt_ct_tg_check(const struct xt_tgchk_param *par,
+ struct xt_ct_target_info_v1 *info)
+{
+ struct nf_conntrack_tuple t;
+ struct nf_conn *ct;
+ int ret = -EOPNOTSUPP;
if (info->flags & XT_CT_NOTRACK) {
- ct = nf_ct_untracked_get();
- atomic_inc(&ct->ct_general.use);
+ ct = NULL;
goto out;
}
@@ -89,30 +211,24 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par)
ret = 0;
if ((info->ct_events || info->exp_events) &&
!nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
- GFP_KERNEL))
+ GFP_KERNEL)) {
+ ret = -EINVAL;
goto err3;
+ }
if (info->helper[0]) {
- ret = -ENOENT;
- proto = xt_ct_find_proto(par);
- if (!proto)
- goto err3;
-
- ret = -ENOMEM;
- help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
- if (help == NULL)
+ ret = xt_ct_set_helper(ct, info->helper, par);
+ if (ret < 0)
goto err3;
+ }
- ret = -ENOENT;
- help->helper = nf_conntrack_helper_try_module_get(info->helper,
- par->family,
- proto);
- if (help->helper == NULL)
+ if (info->timeout[0]) {
+ ret = xt_ct_set_timeout(ct, par, info->timeout);
+ if (ret < 0)
goto err3;
}
- __set_bit(IPS_TEMPLATE_BIT, &ct->status);
- __set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ nf_conntrack_tmpl_insert(par->net, ct);
out:
info->ct = ct;
return 0;
@@ -125,41 +241,196 @@ err1:
return ret;
}
-static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par)
+static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)
{
struct xt_ct_target_info *info = par->targinfo;
+ struct xt_ct_target_info_v1 info_v1 = {
+ .flags = info->flags,
+ .zone = info->zone,
+ .ct_events = info->ct_events,
+ .exp_events = info->exp_events,
+ };
+ int ret;
+
+ if (info->flags & ~XT_CT_NOTRACK)
+ return -EINVAL;
+
+ memcpy(info_v1.helper, info->helper, sizeof(info->helper));
+
+ ret = xt_ct_tg_check(par, &info_v1);
+ if (ret < 0)
+ return ret;
+
+ info->ct = info_v1.ct;
+
+ return ret;
+}
+
+static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ struct xt_ct_target_info_v1 *info = par->targinfo;
+
+ if (info->flags & ~XT_CT_NOTRACK)
+ return -EINVAL;
+
+ return xt_ct_tg_check(par, par->targinfo);
+}
+
+static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par)
+{
+ struct xt_ct_target_info_v1 *info = par->targinfo;
+
+ if (info->flags & ~XT_CT_MASK)
+ return -EINVAL;
+
+ return xt_ct_tg_check(par, par->targinfo);
+}
+
+static void xt_ct_destroy_timeout(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ struct nf_conn_timeout *timeout_ext;
+ typeof(nf_ct_timeout_put_hook) timeout_put;
+
+ rcu_read_lock();
+ timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+
+ if (timeout_put) {
+ timeout_ext = nf_ct_timeout_find(ct);
+ if (timeout_ext)
+ timeout_put(timeout_ext->timeout);
+ }
+ rcu_read_unlock();
+#endif
+}
+
+static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
+ struct xt_ct_target_info_v1 *info)
+{
struct nf_conn *ct = info->ct;
struct nf_conn_help *help;
- if (!nf_ct_is_untracked(ct)) {
+ if (ct && !nf_ct_is_untracked(ct)) {
help = nfct_help(ct);
if (help)
module_put(help->helper->me);
nf_ct_l3proto_module_put(par->family);
+
+ xt_ct_destroy_timeout(ct);
+ nf_ct_put(info->ct);
}
- nf_ct_put(info->ct);
}
-static struct xt_target xt_ct_tg __read_mostly = {
- .name = "CT",
+static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par)
+{
+ struct xt_ct_target_info *info = par->targinfo;
+ struct xt_ct_target_info_v1 info_v1 = {
+ .flags = info->flags,
+ .zone = info->zone,
+ .ct_events = info->ct_events,
+ .exp_events = info->exp_events,
+ .ct = info->ct,
+ };
+ memcpy(info_v1.helper, info->helper, sizeof(info->helper));
+
+ xt_ct_tg_destroy(par, &info_v1);
+}
+
+static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
+{
+ xt_ct_tg_destroy(par, par->targinfo);
+}
+
+static struct xt_target xt_ct_tg_reg[] __read_mostly = {
+ {
+ .name = "CT",
+ .family = NFPROTO_UNSPEC,
+ .targetsize = sizeof(struct xt_ct_target_info),
+ .checkentry = xt_ct_tg_check_v0,
+ .destroy = xt_ct_tg_destroy_v0,
+ .target = xt_ct_target_v0,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .checkentry = xt_ct_tg_check_v1,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_UNSPEC,
+ .revision = 2,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .checkentry = xt_ct_tg_check_v2,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+};
+
+static unsigned int
+notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->nfct != NULL)
+ return XT_CONTINUE;
+
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+
+ return XT_CONTINUE;
+}
+
+static int notrack_chk(const struct xt_tgchk_param *par)
+{
+ if (!par->net->xt.notrack_deprecated_warning) {
+ pr_info("netfilter: NOTRACK target is deprecated, "
+ "use CT instead or upgrade iptables\n");
+ par->net->xt.notrack_deprecated_warning = true;
+ }
+ return 0;
+}
+
+static struct xt_target notrack_tg_reg __read_mostly = {
+ .name = "NOTRACK",
+ .revision = 0,
.family = NFPROTO_UNSPEC,
- .targetsize = sizeof(struct xt_ct_target_info),
- .checkentry = xt_ct_tg_check,
- .destroy = xt_ct_tg_destroy,
- .target = xt_ct_target,
+ .checkentry = notrack_chk,
+ .target = notrack_tg,
.table = "raw",
.me = THIS_MODULE,
};
static int __init xt_ct_tg_init(void)
{
- return xt_register_target(&xt_ct_tg);
+ int ret;
+
+ ret = xt_register_target(&notrack_tg_reg);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
+ if (ret < 0) {
+ xt_unregister_target(&notrack_tg_reg);
+ return ret;
+ }
+ return 0;
}
static void __exit xt_ct_tg_exit(void)
{
- xt_unregister_target(&xt_ct_tg);
+ xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
+ xt_unregister_target(&notrack_tg_reg);
}
module_init(xt_ct_tg_init);
@@ -169,3 +440,5 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Xtables: connection tracking target");
MODULE_ALIAS("ipt_CT");
MODULE_ALIAS("ip6t_CT");
+MODULE_ALIAS("ipt_NOTRACK");
+MODULE_ALIAS("ip6t_NOTRACK");
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 0a229191e55..ae8271652ef 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t orig, nv;
orig = ipv6_get_dsfield(iph);
- nv = (orig & info->tos_mask) ^ info->tos_value;
+ nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
if (!skb_make_writable(skb, sizeof(struct iphdr)))
diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c
index 95b084800fc..1535e87ed9b 100644
--- a/net/netfilter/xt_HL.c
+++ b/net/netfilter/xt_HL.c
@@ -38,22 +38,22 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par)
iph = ip_hdr(skb);
switch (info->mode) {
- case IPT_TTL_SET:
- new_ttl = info->ttl;
- break;
- case IPT_TTL_INC:
- new_ttl = iph->ttl + info->ttl;
- if (new_ttl > 255)
- new_ttl = 255;
- break;
- case IPT_TTL_DEC:
- new_ttl = iph->ttl - info->ttl;
- if (new_ttl < 0)
- new_ttl = 0;
- break;
- default:
- new_ttl = iph->ttl;
- break;
+ case IPT_TTL_SET:
+ new_ttl = info->ttl;
+ break;
+ case IPT_TTL_INC:
+ new_ttl = iph->ttl + info->ttl;
+ if (new_ttl > 255)
+ new_ttl = 255;
+ break;
+ case IPT_TTL_DEC:
+ new_ttl = iph->ttl - info->ttl;
+ if (new_ttl < 0)
+ new_ttl = 0;
+ break;
+ default:
+ new_ttl = iph->ttl;
+ break;
}
if (new_ttl != iph->ttl) {
@@ -78,22 +78,22 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par)
ip6h = ipv6_hdr(skb);
switch (info->mode) {
- case IP6T_HL_SET:
- new_hl = info->hop_limit;
- break;
- case IP6T_HL_INC:
- new_hl = ip6h->hop_limit + info->hop_limit;
- if (new_hl > 255)
- new_hl = 255;
- break;
- case IP6T_HL_DEC:
- new_hl = ip6h->hop_limit - info->hop_limit;
- if (new_hl < 0)
- new_hl = 0;
- break;
- default:
- new_hl = ip6h->hop_limit;
- break;
+ case IP6T_HL_SET:
+ new_hl = info->hop_limit;
+ break;
+ case IP6T_HL_INC:
+ new_hl = ip6h->hop_limit + info->hop_limit;
+ if (new_hl > 255)
+ new_hl = 255;
+ break;
+ case IP6T_HL_DEC:
+ new_hl = ip6h->hop_limit - info->hop_limit;
+ if (new_hl < 0)
+ new_hl = 0;
+ break;
+ default:
+ new_hl = ip6h->hop_limit;
+ break;
}
ip6h->hop_limit = new_hl;
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
new file mode 100644
index 00000000000..73b73f687c5
--- /dev/null
+++ b/net/netfilter/xt_HMARK.c
@@ -0,0 +1,372 @@
+/*
+ * xt_HMARK - Netfilter module to set mark by means of hashing
+ *
+ * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_HMARK.h>
+
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>");
+MODULE_DESCRIPTION("Xtables: packet marking using hash calculation");
+MODULE_ALIAS("ipt_HMARK");
+MODULE_ALIAS("ip6t_HMARK");
+
+struct hmark_tuple {
+ __be32 src;
+ __be32 dst;
+ union hmark_ports uports;
+ u8 proto;
+};
+
+static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask)
+{
+ return (addr32[0] & mask[0]) ^
+ (addr32[1] & mask[1]) ^
+ (addr32[2] & mask[2]) ^
+ (addr32[3] & mask[3]);
+}
+
+static inline __be32
+hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask)
+{
+ switch (l3num) {
+ case AF_INET:
+ return *addr32 & *mask;
+ case AF_INET6:
+ return hmark_addr6_mask(addr32, mask);
+ }
+ return 0;
+}
+
+static inline void hmark_swap_ports(union hmark_ports *uports,
+ const struct xt_hmark_info *info)
+{
+ union hmark_ports hp;
+ u16 src, dst;
+
+ hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32;
+ src = ntohs(hp.b16.src);
+ dst = ntohs(hp.b16.dst);
+
+ if (dst > src)
+ uports->v32 = (dst << 16) | src;
+ else
+ uports->v32 = (src << 16) | dst;
+}
+
+static int
+hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t,
+ const struct xt_hmark_info *info)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple *otuple;
+ struct nf_conntrack_tuple *rtuple;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return -1;
+
+ otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6,
+ info->src_mask.ip6);
+ t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6,
+ info->dst_mask.ip6);
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
+ return 0;
+
+ t->proto = nf_ct_protonum(ct);
+ if (t->proto != IPPROTO_ICMP) {
+ t->uports.b16.src = otuple->src.u.all;
+ t->uports.b16.dst = rtuple->src.u.all;
+ hmark_swap_ports(&t->uports, info);
+ }
+
+ return 0;
+#else
+ return -1;
+#endif
+}
+
+/* This hash function is endian independent, to ensure consistent hashing if
+ * the cluster is composed of big and little endian systems. */
+static inline u32
+hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info)
+{
+ u32 hash;
+ u32 src = ntohl(t->src);
+ u32 dst = ntohl(t->dst);
+
+ if (dst < src)
+ swap(src, dst);
+
+ hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd);
+ hash = hash ^ (t->proto & info->proto_mask);
+
+ return (((u64)hash * info->hmodulus) >> 32) + info->hoffset;
+}
+
+static void
+hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff,
+ struct hmark_tuple *t, const struct xt_hmark_info *info)
+{
+ int protoff;
+
+ protoff = proto_ports_offset(t->proto);
+ if (protoff < 0)
+ return;
+
+ nhoff += protoff;
+ if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0)
+ return;
+
+ hmark_swap_ports(&t->uports, info);
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static int get_inner6_hdr(const struct sk_buff *skb, int *offset)
+{
+ struct icmp6hdr *icmp6h, _ih6;
+
+ icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6);
+ if (icmp6h == NULL)
+ return 0;
+
+ if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) {
+ *offset += sizeof(struct icmp6hdr);
+ return 1;
+ }
+ return 0;
+}
+
+static int
+hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t,
+ const struct xt_hmark_info *info)
+{
+ struct ipv6hdr *ip6, _ip6;
+ int flag = IP6_FH_F_AUTH;
+ unsigned int nhoff = 0;
+ u16 fragoff = 0;
+ int nexthdr;
+
+ ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb));
+ nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag);
+ if (nexthdr < 0)
+ return 0;
+ /* No need to check for icmp errors on fragments */
+ if ((flag & IP6_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6))
+ goto noicmp;
+ /* Use inner header in case of ICMP errors */
+ if (get_inner6_hdr(skb, &nhoff)) {
+ ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6);
+ if (ip6 == NULL)
+ return -1;
+ /* If AH present, use SPI like in ESP. */
+ flag = IP6_FH_F_AUTH;
+ nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag);
+ if (nexthdr < 0)
+ return -1;
+ }
+noicmp:
+ t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6);
+ t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6);
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
+ return 0;
+
+ t->proto = nexthdr;
+ if (t->proto == IPPROTO_ICMPV6)
+ return 0;
+
+ if (flag & IP6_FH_F_FRAG)
+ return 0;
+
+ hmark_set_tuple_ports(skb, nhoff, t, info);
+ return 0;
+}
+
+static unsigned int
+hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_hmark_info *info = par->targinfo;
+ struct hmark_tuple t;
+
+ memset(&t, 0, sizeof(struct hmark_tuple));
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) {
+ if (hmark_ct_set_htuple(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ } else {
+ if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ }
+
+ skb->mark = hmark_hash(&t, info);
+ return XT_CONTINUE;
+}
+#endif
+
+static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff)
+{
+ const struct icmphdr *icmph;
+ struct icmphdr _ih;
+
+ /* Not enough header? */
+ icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih);
+ if (icmph == NULL || icmph->type > NR_ICMP_TYPES)
+ return 0;
+
+ /* Error message? */
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_SOURCE_QUENCH &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB &&
+ icmph->type != ICMP_REDIRECT)
+ return 0;
+
+ *nhoff += iphsz + sizeof(_ih);
+ return 1;
+}
+
+static int
+hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
+ const struct xt_hmark_info *info)
+{
+ struct iphdr *ip, _ip;
+ int nhoff = skb_network_offset(skb);
+
+ ip = (struct iphdr *) (skb->data + nhoff);
+ if (ip->protocol == IPPROTO_ICMP) {
+ /* Use inner header in case of ICMP errors */
+ if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) {
+ ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip);
+ if (ip == NULL)
+ return -1;
+ }
+ }
+
+ t->src = ip->saddr & info->src_mask.ip;
+ t->dst = ip->daddr & info->dst_mask.ip;
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
+ return 0;
+
+ t->proto = ip->protocol;
+
+ /* ICMP has no ports, skip */
+ if (t->proto == IPPROTO_ICMP)
+ return 0;
+
+ /* follow-up fragments don't contain ports, skip all fragments */
+ if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ return 0;
+
+ hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
+
+ return 0;
+}
+
+static unsigned int
+hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_hmark_info *info = par->targinfo;
+ struct hmark_tuple t;
+
+ memset(&t, 0, sizeof(struct hmark_tuple));
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) {
+ if (hmark_ct_set_htuple(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ } else {
+ if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ }
+
+ skb->mark = hmark_hash(&t, info);
+ return XT_CONTINUE;
+}
+
+static int hmark_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_hmark_info *info = par->targinfo;
+
+ if (!info->hmodulus) {
+ pr_info("xt_HMARK: hash modulus can't be zero\n");
+ return -EINVAL;
+ }
+ if (info->proto_mask &&
+ (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) {
+ pr_info("xt_HMARK: proto mask must be zero with L3 mode\n");
+ return -EINVAL;
+ }
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) &&
+ (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) |
+ XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) {
+ pr_info("xt_HMARK: spi-mask and port-mask can't be combined\n");
+ return -EINVAL;
+ }
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) &&
+ (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) |
+ XT_HMARK_FLAG(XT_HMARK_DPORT)))) {
+ pr_info("xt_HMARK: spi-set and port-set can't be combined\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target hmark_tg_reg[] __read_mostly = {
+ {
+ .name = "HMARK",
+ .family = NFPROTO_IPV4,
+ .target = hmark_tg_v4,
+ .targetsize = sizeof(struct xt_hmark_info),
+ .checkentry = hmark_tg_check,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "HMARK",
+ .family = NFPROTO_IPV6,
+ .target = hmark_tg_v6,
+ .targetsize = sizeof(struct xt_hmark_info),
+ .checkentry = hmark_tg_check,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init hmark_tg_init(void)
+{
+ return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg));
+}
+
+static void __exit hmark_tg_exit(void)
+{
+ xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg));
+}
+
+module_init(hmark_tg_init);
+module_exit(hmark_tg_exit);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index be1f22e1354..f407ebc1348 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -122,14 +122,12 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
if (!info->timer) {
- pr_debug("couldn't alloc timer\n");
ret = -ENOMEM;
goto out;
}
info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
if (!info->timer->attr.attr.name) {
- pr_debug("couldn't alloc attribute name\n");
ret = -ENOMEM;
goto out_free_timer;
}
@@ -313,3 +311,5 @@ MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
MODULE_DESCRIPTION("Xtables: idle time monitor");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ipt_IDLETIMER");
+MODULE_ALIAS("ip6t_IDLETIMER");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index a4140509eea..993de2ba89d 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -31,6 +31,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");
MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match");
+MODULE_ALIAS("ipt_LED");
+MODULE_ALIAS("ip6t_LED");
static LIST_HEAD(xt_led_triggers);
static DEFINE_MUTEX(xt_led_mutex);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
new file mode 100644
index 00000000000..5ab24843370
--- /dev/null
+++ b/net/netfilter/xt_LOG.c
@@ -0,0 +1,975 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 5,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb,
+ u8 proto, int fragment, unsigned int offset)
+{
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ if (proto == IPPROTO_UDP)
+ /* Max length: 10 "PROTO=UDP " */
+ sb_add(m, "PROTO=UDP ");
+ else /* Max length: 14 "PROTO=UDPLITE " */
+ sb_add(m, "PROTO=UDPLITE ");
+
+ if (fragment)
+ goto out;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest),
+ ntohs(uh->len));
+
+out:
+ return 0;
+}
+
+static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb,
+ u8 proto, int fragment, unsigned int offset,
+ unsigned int logflags)
+{
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ /* Max length: 10 "PROTO=TCP " */
+ sb_add(m, "PROTO=TCP ");
+
+ if (fragment)
+ return 0;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest));
+ /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+ if (logflags & XT_LOG_TCPSEQ)
+ sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq));
+
+ /* Max length: 13 "WINDOW=65535 " */
+ sb_add(m, "WINDOW=%u ", ntohs(th->window));
+ /* Max length: 9 "RES=0x3C " */
+ sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
+ TCP_RESERVED_BITS) >> 22));
+ /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+ if (th->cwr)
+ sb_add(m, "CWR ");
+ if (th->ece)
+ sb_add(m, "ECE ");
+ if (th->urg)
+ sb_add(m, "URG ");
+ if (th->ack)
+ sb_add(m, "ACK ");
+ if (th->psh)
+ sb_add(m, "PSH ");
+ if (th->rst)
+ sb_add(m, "RST ");
+ if (th->syn)
+ sb_add(m, "SYN ");
+ if (th->fin)
+ sb_add(m, "FIN ");
+ /* Max length: 11 "URGP=65535 " */
+ sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+ if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+ u_int8_t _opt[60 - sizeof(struct tcphdr)];
+ const u_int8_t *op;
+ unsigned int i;
+ unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
+
+ op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
+ optsize, _opt);
+ if (op == NULL) {
+ sb_add(m, "OPT (TRUNCATED)");
+ return 1;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ sb_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ sb_add(m, "%02X", op[i]);
+
+ sb_add(m, ") ");
+ }
+
+ return 0;
+}
+
+static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk)
+{
+ if (!sk || sk->sk_state == TCP_TIME_WAIT)
+ return;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ const struct cred *cred = sk->sk_socket->file->f_cred;
+ sb_add(m, "UID=%u GID=%u ",
+ from_kuid_munged(&init_user_ns, cred->fsuid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/* One level of recursion won't kill us */
+static void dump_ipv4_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb,
+ unsigned int iphoff)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ sb_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+ sb_add(m, "SRC=%pI4 DST=%pI4 ",
+ &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ sb_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ sb_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ sb_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & XT_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ const unsigned char *op;
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ optsize, _opt);
+ if (op == NULL) {
+ sb_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ sb_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ struct icmphdr _icmph;
+ const struct icmphdr *ich;
+ static const size_t required_len[NR_ICMP_TYPES+1]
+ = { [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+
+ /* Max length: 11 "PROTO=ICMP " */
+ sb_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (ich == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ sb_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ sb_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ sb_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ /* Fall through */
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ sb_add(m, "[");
+ dump_ipv4_packet(m, info, skb,
+ iphoff + ih->ihl*4+sizeof(_icmph));
+ sb_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED)
+ sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ sb_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 10 "PROTO=ESP " */
+ sb_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_esph), &_esph);
+ if (eh == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ sb_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && !iphoff)
+ dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ sb_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 252 = 803 */
+}
+
+static void dump_ipv4_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ sb_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ sb_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ sb_add(m, ":%02x", *p);
+ }
+ sb_add(m, " ");
+}
+
+static void
+log_packet_common(struct sbuff *m,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+ '0' + loginfo->u.log.level, prefix,
+ in ? in->name : "",
+ out ? out->name : "");
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (skb->nf_bridge) {
+ const struct net_device *physindev;
+ const struct net_device *physoutdev;
+
+ physindev = skb->nf_bridge->physindev;
+ if (physindev && in != physindev)
+ sb_add(m, "PHYSIN=%s ", physindev->name);
+ physoutdev = skb->nf_bridge->physoutdev;
+ if (physoutdev && out != physoutdev)
+ sb_add(m, "PHYSOUT=%s ", physoutdev->name);
+ }
+#endif
+}
+
+
+static void
+ipt_log_packet(struct net *net,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct sbuff *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = sb_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv4_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(m, loginfo, skb, 0);
+
+ sb_close(m);
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+/* One level of recursion won't kill us */
+static void dump_ipv6_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int ip6hoff,
+ int recurse)
+{
+ u_int8_t currenthdr;
+ int fragment;
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ unsigned int ptr;
+ unsigned int hdrlen = 0;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+ if (ih == NULL) {
+ sb_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+ sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+ sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+ (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+ ih->hop_limit,
+ (ntohl(*(__be32 *)ih) & 0x000fffff));
+
+ fragment = 0;
+ ptr = ip6hoff + sizeof(struct ipv6hdr);
+ currenthdr = ih->nexthdr;
+ while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
+ struct ipv6_opt_hdr _hdr;
+ const struct ipv6_opt_hdr *hp;
+
+ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ sb_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 48 "OPT (...) " */
+ if (logflags & XT_LOG_IPOPT)
+ sb_add(m, "OPT ( ");
+
+ switch (currenthdr) {
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr _fhdr;
+ const struct frag_hdr *fh;
+
+ sb_add(m, "FRAG:");
+ fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+ &_fhdr);
+ if (fh == NULL) {
+ sb_add(m, "TRUNCATED ");
+ return;
+ }
+
+ /* Max length: 6 "65535 " */
+ sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+ /* Max length: 11 "INCOMPLETE " */
+ if (fh->frag_off & htons(0x0001))
+ sb_add(m, "INCOMPLETE ");
+
+ sb_add(m, "ID:%08x ", ntohl(fh->identification));
+
+ if (ntohs(fh->frag_off) & 0xFFF8)
+ fragment = 1;
+
+ hdrlen = 8;
+
+ break;
+ }
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_HOPOPTS:
+ if (fragment) {
+ if (logflags & XT_LOG_IPOPT)
+ sb_add(m, ")");
+ return;
+ }
+ hdrlen = ipv6_optlen(hp);
+ break;
+ /* Max Length */
+ case IPPROTO_AH:
+ if (logflags & XT_LOG_IPOPT) {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ /* Max length: 3 "AH " */
+ sb_add(m, "AH ");
+
+ if (fragment) {
+ sb_add(m, ")");
+ return;
+ }
+
+ ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+ &_ahdr);
+ if (ah == NULL) {
+ /*
+ * Max length: 26 "INCOMPLETE [65535
+ * bytes] )"
+ */
+ sb_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 */
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
+
+ }
+
+ hdrlen = (hp->hdrlen+2)<<2;
+ break;
+ case IPPROTO_ESP:
+ if (logflags & XT_LOG_IPOPT) {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 4 "ESP " */
+ sb_add(m, "ESP ");
+
+ if (fragment) {
+ sb_add(m, ")");
+ return;
+ }
+
+ /*
+ * Max length: 26 "INCOMPLETE [65535 bytes] )"
+ */
+ eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+ &_esph);
+ if (eh == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 16 "SPI=0xF1234567 )" */
+ sb_add(m, "SPI=0x%x )", ntohl(eh->spi));
+
+ }
+ return;
+ default:
+ /* Max length: 20 "Unknown Ext Hdr 255" */
+ sb_add(m, "Unknown Ext Hdr %u", currenthdr);
+ return;
+ }
+ if (logflags & XT_LOG_IPOPT)
+ sb_add(m, ") ");
+
+ currenthdr = hp->nexthdr;
+ ptr += hdrlen;
+ }
+
+ switch (currenthdr) {
+ case IPPROTO_TCP:
+ if (dump_tcp_header(m, skb, currenthdr, fragment, ptr,
+ logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (dump_udp_header(m, skb, currenthdr, fragment, ptr))
+ return;
+ break;
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _icmp6h;
+ const struct icmp6hdr *ic;
+
+ /* Max length: 13 "PROTO=ICMPv6 " */
+ sb_add(m, "PROTO=ICMPv6 ");
+
+ if (fragment)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+ if (ic == NULL) {
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
+ return;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+
+ switch (ic->icmp6_type) {
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ sb_add(m, "ID=%u SEQ=%u ",
+ ntohs(ic->icmp6_identifier),
+ ntohs(ic->icmp6_sequence));
+ break;
+ case ICMPV6_MGM_QUERY:
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MGM_REDUCTION:
+ break;
+
+ case ICMPV6_PARAMPROB:
+ /* Max length: 17 "POINTER=ffffffff " */
+ sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
+ /* Fall through */
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_PKT_TOOBIG:
+ case ICMPV6_TIME_EXCEED:
+ /* Max length: 3+maxlen */
+ if (recurse) {
+ sb_add(m, "[");
+ dump_ipv6_packet(m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ sb_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
+ sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
+ }
+ break;
+ }
+ /* Max length: 10 "PROTO=255 " */
+ default:
+ sb_add(m, "PROTO=%u ", currenthdr);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && recurse)
+ dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (recurse && skb->mark)
+ sb_add(m, "MARK=0x%x ", skb->mark);
+}
+
+static void dump_ipv6_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ sb_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int len = dev->hard_header_len;
+ unsigned int i;
+
+ if (dev->type == ARPHRD_SIT) {
+ p -= ETH_HLEN;
+
+ if (p < skb->head)
+ p = NULL;
+ }
+
+ if (p != NULL) {
+ sb_add(m, "%02x", *p++);
+ for (i = 1; i < len; i++)
+ sb_add(m, ":%02x", *p++);
+ }
+ sb_add(m, " ");
+
+ if (dev->type == ARPHRD_SIT) {
+ const struct iphdr *iph =
+ (struct iphdr *)skb_mac_header(skb);
+ sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr,
+ &iph->daddr);
+ }
+ } else
+ sb_add(m, " ");
+}
+
+static void
+ip6t_log_packet(struct net *net,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct sbuff *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = sb_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv6_mac_header(m, loginfo, skb);
+
+ dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
+
+ sb_close(m);
+}
+#endif
+
+static unsigned int
+log_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_log_info *loginfo = par->targinfo;
+ struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
+
+ if (par->family == NFPROTO_IPV4)
+ ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in,
+ par->out, &li, loginfo->prefix);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ else if (par->family == NFPROTO_IPV6)
+ ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in,
+ par->out, &li, loginfo->prefix);
+#endif
+ else
+ WARN_ON_ONCE(1);
+
+ return XT_CONTINUE;
+}
+
+static int log_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_log_info *loginfo = par->targinfo;
+
+ if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6)
+ return -EINVAL;
+
+ if (loginfo->level >= 8) {
+ pr_debug("level %u >= 8\n", loginfo->level);
+ return -EINVAL;
+ }
+
+ if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
+ pr_debug("prefix is not null-terminated\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_target log_tg_regs[] __read_mostly = {
+ {
+ .name = "LOG",
+ .family = NFPROTO_IPV4,
+ .target = log_tg,
+ .targetsize = sizeof(struct xt_log_info),
+ .checkentry = log_tg_check,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "LOG",
+ .family = NFPROTO_IPV6,
+ .target = log_tg,
+ .targetsize = sizeof(struct xt_log_info),
+ .checkentry = log_tg_check,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static struct nf_logger ipt_log_logger __read_mostly = {
+ .name = "ipt_LOG",
+ .logfn = &ipt_log_packet,
+ .me = THIS_MODULE,
+};
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static struct nf_logger ip6t_log_logger __read_mostly = {
+ .name = "ip6t_LOG",
+ .logfn = &ip6t_log_packet,
+ .me = THIS_MODULE,
+};
+#endif
+
+static int __net_init log_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger);
+#endif
+ return 0;
+}
+
+static void __net_exit log_net_exit(struct net *net)
+{
+ nf_log_unset(net, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_unset(net, &ip6t_log_logger);
+#endif
+}
+
+static struct pernet_operations log_net_ops = {
+ .init = log_net_init,
+ .exit = log_net_exit,
+};
+
+static int __init log_tg_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&log_net_ops);
+ if (ret < 0)
+ goto err_pernet;
+
+ ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
+ if (ret < 0)
+ goto err_target;
+
+ nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_register(NFPROTO_IPV6, &ip6t_log_logger);
+#endif
+ return 0;
+
+err_target:
+ unregister_pernet_subsys(&log_net_ops);
+err_pernet:
+ return ret;
+}
+
+static void __exit log_tg_exit(void)
+{
+ unregister_pernet_subsys(&log_net_ops);
+ nf_log_unregister(&ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_unregister(&ip6t_log_logger);
+#endif
+ xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
+}
+
+module_init(log_tg_init);
+module_exit(log_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
+MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging");
+MODULE_ALIAS("ipt_LOG");
+MODULE_ALIAS("ip6t_LOG");
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
new file mode 100644
index 00000000000..b253e07cb1c
--- /dev/null
+++ b/net/netfilter/xt_NETMAP.c
@@ -0,0 +1,165 @@
+/*
+ * (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+
+static unsigned int
+netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ struct nf_nat_range newrange;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ union nf_inet_addr new_addr, netmask;
+ unsigned int i;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++)
+ netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
+ range->max_addr.ip6[i]);
+
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT)
+ new_addr.in6 = ipv6_hdr(skb)->daddr;
+ else
+ new_addr.in6 = ipv6_hdr(skb)->saddr;
+
+ for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) {
+ new_addr.ip6[i] &= ~netmask.ip6[i];
+ new_addr.ip6[i] |= range->min_addr.ip6[i] &
+ netmask.ip6[i];
+ }
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr = new_addr;
+ newrange.max_addr = new_addr;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+}
+
+static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned int
+netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ __be32 new_ip, netmask;
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range newrange;
+
+ NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_POST_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT ||
+ par->hooknum == NF_INET_LOCAL_IN);
+ ct = nf_ct_get(skb, &ctinfo);
+
+ netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
+
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT)
+ new_ip = ip_hdr(skb)->daddr & ~netmask;
+ else
+ new_ip = ip_hdr(skb)->saddr & ~netmask;
+ new_ip |= mr->range[0].min_ip & netmask;
+
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = new_ip;
+ newrange.max_addr.ip = new_ip;
+ newrange.min_proto = mr->range[0].min;
+ newrange.max_proto = mr->range[0].max;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+}
+
+static int netmap_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u.\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target netmap_tg_reg[] __read_mostly = {
+ {
+ .name = "NETMAP",
+ .family = NFPROTO_IPV6,
+ .revision = 0,
+ .target = netmap_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .checkentry = netmap_tg6_checkentry,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "NETMAP",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .target = netmap_tg4,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .checkentry = netmap_tg4_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init netmap_tg_init(void)
+{
+ return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg));
+}
+
+static void netmap_tg_exit(void)
+{
+ xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg));
+}
+
+module_init(netmap_tg_init);
+module_exit(netmap_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS("ip6t_NETMAP");
+MODULE_ALIAS("ipt_NETMAP");
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a17dd0f589b..fb7497c928a 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
- nfulnl_log_packet(par->family, par->hooknum, skb, par->in,
+ nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
par->out, &li, info->prefix);
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 039cce1bde3..8f1779ff7e3 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -11,15 +11,13 @@
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/jhash.h>
-
#include <linux/netfilter.h>
#include <linux/netfilter_arp.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_NFQUEUE.h>
+#include <net/netfilter/nf_queue.h>
+
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("Xtables: packet forwarding to netlink");
MODULE_LICENSE("GPL");
@@ -28,7 +26,6 @@ MODULE_ALIAS("ip6t_NFQUEUE");
MODULE_ALIAS("arpt_NFQUEUE");
static u32 jhash_initval __read_mostly;
-static bool rnd_inited __read_mostly;
static unsigned int
nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par)
@@ -38,32 +35,6 @@ nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par)
return NF_QUEUE_NR(tinfo->queuenum);
}
-static u32 hash_v4(const struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __be32 ipaddr;
-
- /* packets in either direction go into same queue */
- ipaddr = iph->saddr ^ iph->daddr;
-
- return jhash_2words((__force u32)ipaddr, iph->protocol, jhash_initval);
-}
-
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-static u32 hash_v6(const struct sk_buff *skb)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- __be32 addr[4];
-
- addr[0] = ip6h->saddr.s6_addr32[0] ^ ip6h->daddr.s6_addr32[0];
- addr[1] = ip6h->saddr.s6_addr32[1] ^ ip6h->daddr.s6_addr32[1];
- addr[2] = ip6h->saddr.s6_addr32[2] ^ ip6h->daddr.s6_addr32[2];
- addr[3] = ip6h->saddr.s6_addr32[3] ^ ip6h->daddr.s6_addr32[3];
-
- return jhash2((__force u32 *)addr, ARRAY_SIZE(addr), jhash_initval);
-}
-#endif
-
static unsigned int
nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -71,25 +42,30 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
u32 queue = info->queuenum;
if (info->queues_total > 1) {
- if (par->family == NFPROTO_IPV4)
- queue = hash_v4(skb) % info->queues_total + queue;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
- else if (par->family == NFPROTO_IPV6)
- queue = hash_v6(skb) % info->queues_total + queue;
-#endif
+ queue = nfqueue_hash(skb, queue, info->queues_total,
+ par->family, jhash_initval);
}
return NF_QUEUE_NR(queue);
}
-static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
+static unsigned int
+nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct xt_NFQ_info_v1 *info = par->targinfo;
+ const struct xt_NFQ_info_v2 *info = par->targinfo;
+ unsigned int ret = nfqueue_tg_v1(skb, par);
+
+ if (info->bypass)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+ return ret;
+}
+
+static int nfqueue_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
u32 maxid;
- if (unlikely(!rnd_inited)) {
- get_random_bytes(&jhash_initval, sizeof(jhash_initval));
- rnd_inited = true;
- }
+ init_hashrandom(&jhash_initval);
+
if (info->queues_total == 0) {
pr_err("NFQUEUE: number of total queues is 0\n");
return -EINVAL;
@@ -100,9 +76,39 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
info->queues_total, maxid);
return -ERANGE;
}
+ if (par->target->revision == 2 && info->flags > 1)
+ return -EINVAL;
+ if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK)
+ return -EINVAL;
+
return 0;
}
+static unsigned int
+nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
+ u32 queue = info->queuenum;
+ int ret;
+
+ if (info->queues_total > 1) {
+ if (info->flags & NFQ_FLAG_CPU_FANOUT) {
+ int cpu = smp_processor_id();
+
+ queue = info->queuenum + cpu % info->queues_total;
+ } else {
+ queue = nfqueue_hash(skb, queue, info->queues_total,
+ par->family, jhash_initval);
+ }
+ }
+
+ ret = NF_QUEUE_NR(queue);
+ if (info->flags & NFQ_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ return ret;
+}
+
static struct xt_target nfqueue_tg_reg[] __read_mostly = {
{
.name = "NFQUEUE",
@@ -115,11 +121,29 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
.name = "NFQUEUE",
.revision = 1,
.family = NFPROTO_UNSPEC,
- .checkentry = nfqueue_tg_v1_check,
+ .checkentry = nfqueue_tg_check,
.target = nfqueue_tg_v1,
.targetsize = sizeof(struct xt_NFQ_info_v1),
.me = THIS_MODULE,
},
+ {
+ .name = "NFQUEUE",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v2,
+ .targetsize = sizeof(struct xt_NFQ_info_v2),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "NFQUEUE",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v3,
+ .targetsize = sizeof(struct xt_NFQ_info_v3),
+ .me = THIS_MODULE,
+ },
};
static int __init nfqueue_tg_init(void)
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
deleted file mode 100644
index 9d782181b6c..00000000000
--- a/net/netfilter/xt_NOTRACK.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/* This is a module which is used for setting up fake conntracks
- * on packets so that they are not seen by the conntrack/NAT code.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-
-MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ipt_NOTRACK");
-MODULE_ALIAS("ip6t_NOTRACK");
-
-static unsigned int
-notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- /* Previously seen (loopback)? Ignore. */
- if (skb->nfct != NULL)
- return XT_CONTINUE;
-
- /* Attach fake conntrack entry.
- If there is a real ct entry correspondig to this packet,
- it'll hang aroun till timing out. We don't deal with it
- for performance reasons. JK */
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
-
- return XT_CONTINUE;
-}
-
-static struct xt_target notrack_tg_reg __read_mostly = {
- .name = "NOTRACK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = notrack_tg,
- .table = "raw",
- .me = THIS_MODULE,
-};
-
-static int __init notrack_tg_init(void)
-{
- return xt_register_target(&notrack_tg_reg);
-}
-
-static void __exit notrack_tg_exit(void)
-{
- xt_unregister_target(&notrack_tg_reg);
-}
-
-module_init(notrack_tg_init);
-module_exit(notrack_tg_exit);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index de079abd5bc..370adf622ce 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -43,12 +43,11 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
struct xt_rateest *xt_rateest_lookup(const char *name)
{
struct xt_rateest *est;
- struct hlist_node *n;
unsigned int h;
h = xt_rateest_hash(name);
mutex_lock(&xt_rateest_mutex);
- hlist_for_each_entry(est, n, &rateest_hash[h], list) {
+ hlist_for_each_entry(est, &rateest_hash[h], list) {
if (strcmp(est->name, name) == 0) {
est->refcnt++;
mutex_unlock(&xt_rateest_mutex);
@@ -60,11 +59,6 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
}
EXPORT_SYMBOL_GPL(xt_rateest_lookup);
-static void xt_rateest_free_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct xt_rateest, rcu));
-}
-
void xt_rateest_put(struct xt_rateest *est)
{
mutex_lock(&xt_rateest_mutex);
@@ -75,7 +69,7 @@ void xt_rateest_put(struct xt_rateest *est)
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
*/
- call_rcu(&est->rcu, xt_rateest_free_rcu);
+ kfree_rcu(est, rcu);
}
mutex_unlock(&xt_rateest_mutex);
}
@@ -188,7 +182,6 @@ static int __init xt_rateest_tg_init(void)
static void __exit xt_rateest_tg_fini(void)
{
xt_unregister_target(&xt_rateest_tg_reg);
- rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */
}
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
new file mode 100644
index 00000000000..22a10309297
--- /dev/null
+++ b/net/netfilter/xt_REDIRECT.c
@@ -0,0 +1,190 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/addrconf.h>
+#include <net/checksum.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_nat.h>
+
+static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
+
+static unsigned int
+redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ struct nf_nat_range newrange;
+ struct in6_addr newdst;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (par->hooknum == NF_INET_LOCAL_OUT)
+ newdst = loopback_addr;
+ else {
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ bool addr = false;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+ if (idev != NULL) {
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ newdst = ifa->addr;
+ addr = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!addr)
+ return NF_DROP;
+ }
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.in6 = newdst;
+ newrange.max_addr.in6 = newdst;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
+static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS)
+ return -EINVAL;
+ return 0;
+}
+
+/* FIXME: Take multiple ranges --RR */
+static int redirect_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u.\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static unsigned int
+redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ __be32 newdst;
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range newrange;
+
+ NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ /* Local packets: make them go to loopback */
+ if (par->hooknum == NF_INET_LOCAL_OUT)
+ newdst = htonl(0x7F000001);
+ else {
+ struct in_device *indev;
+ struct in_ifaddr *ifa;
+
+ newdst = 0;
+
+ rcu_read_lock();
+ indev = __in_dev_get_rcu(skb->dev);
+ if (indev && (ifa = indev->ifa_list))
+ newdst = ifa->ifa_local;
+ rcu_read_unlock();
+
+ if (!newdst)
+ return NF_DROP;
+ }
+
+ /* Transfer from original range. */
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newdst;
+ newrange.max_addr.ip = newdst;
+ newrange.min_proto = mr->range[0].min;
+ newrange.max_proto = mr->range[0].max;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
+static struct xt_target redirect_tg_reg[] __read_mostly = {
+ {
+ .name = "REDIRECT",
+ .family = NFPROTO_IPV6,
+ .revision = 0,
+ .table = "nat",
+ .checkentry = redirect_tg6_checkentry,
+ .target = redirect_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "REDIRECT",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .table = "nat",
+ .target = redirect_tg4,
+ .checkentry = redirect_tg4_check,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init redirect_tg_init(void)
+{
+ return xt_register_targets(redirect_tg_reg,
+ ARRAY_SIZE(redirect_tg_reg));
+}
+
+static void __exit redirect_tg_exit(void)
+{
+ xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg));
+}
+
+module_init(redirect_tg_init);
+module_exit(redirect_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
+MODULE_ALIAS("ip6t_REDIRECT");
+MODULE_ALIAS("ipt_REDIRECT");
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index eb81c380da1..e762de5ee89 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -2,6 +2,7 @@
* This is a module which is used for setting the MSS option in TCP packets.
*
* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ * Copyright (C) 2007 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -42,40 +43,82 @@ optlen(const u_int8_t *opt, unsigned int offset)
return opt[offset+1];
}
+static u_int32_t tcpmss_reverse_mtu(struct net *net,
+ const struct sk_buff *skb,
+ unsigned int family)
+{
+ struct flowi fl;
+ const struct nf_afinfo *ai;
+ struct rtable *rt = NULL;
+ u_int32_t mtu = ~0U;
+
+ if (family == PF_INET) {
+ struct flowi4 *fl4 = &fl.u.ip4;
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = ip_hdr(skb)->saddr;
+ } else {
+ struct flowi6 *fl6 = &fl.u.ip6;
+
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->daddr = ipv6_hdr(skb)->saddr;
+ }
+ rcu_read_lock();
+ ai = nf_get_afinfo(family);
+ if (ai != NULL)
+ ai->route(net, (struct dst_entry **)&rt, &fl, false);
+ rcu_read_unlock();
+
+ if (rt != NULL) {
+ mtu = dst_mtu(&rt->dst);
+ dst_release(&rt->dst);
+ }
+ return mtu;
+}
+
static int
tcpmss_mangle_packet(struct sk_buff *skb,
- const struct xt_tcpmss_info *info,
- unsigned int in_mtu,
+ const struct xt_action_param *par,
+ unsigned int family,
unsigned int tcphoff,
unsigned int minlen)
{
+ const struct xt_tcpmss_info *info = par->targinfo;
struct tcphdr *tcph;
- unsigned int tcplen, i;
+ int len, tcp_hdrlen;
+ unsigned int i;
__be16 oldval;
u16 newmss;
u8 *opt;
+ /* This is a fragment, no TCP header is available */
+ if (par->fragoff != 0)
+ return 0;
+
if (!skb_make_writable(skb, skb->len))
return -1;
- tcplen = skb->len - tcphoff;
+ len = skb->len - tcphoff;
+ if (len < (int)sizeof(struct tcphdr))
+ return -1;
+
tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ tcp_hdrlen = tcph->doff * 4;
- /* Header cannot be larger than the packet */
- if (tcplen < tcph->doff*4)
+ if (len < tcp_hdrlen)
return -1;
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
+
if (dst_mtu(skb_dst(skb)) <= minlen) {
- if (net_ratelimit())
- pr_err("unknown or invalid path-MTU (%u)\n",
- dst_mtu(skb_dst(skb)));
+ net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
+ dst_mtu(skb_dst(skb)));
return -1;
}
if (in_mtu <= minlen) {
- if (net_ratelimit())
- pr_err("unknown or invalid path-MTU (%u)\n",
- in_mtu);
+ net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
+ in_mtu);
return -1;
}
newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
@@ -83,9 +126,8 @@ tcpmss_mangle_packet(struct sk_buff *skb,
newmss = info->mss;
opt = (u_int8_t *)tcph;
- for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) {
- if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS &&
- opt[i+1] == TCPOLEN_MSS) {
+ for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) {
+ if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) {
u_int16_t oldmss;
oldmss = (opt[i+2] << 8) | opt[i+3];
@@ -108,9 +150,10 @@ tcpmss_mangle_packet(struct sk_buff *skb,
}
/* There is data after the header so the option can't be added
- without moving it, and doing so may make the SYN packet
- itself too large. Accept the packet unmodified instead. */
- if (tcplen > tcph->doff*4)
+ * without moving it, and doing so may make the SYN packet
+ * itself too large. Accept the packet unmodified instead.
+ */
+ if (len > tcp_hdrlen)
return 0;
/*
@@ -126,11 +169,23 @@ tcpmss_mangle_packet(struct sk_buff *skb,
skb_put(skb, TCPOLEN_MSS);
+ /*
+ * IPv4: RFC 1122 states "If an MSS option is not received at
+ * connection setup, TCP MUST assume a default send MSS of 536".
+ * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum
+ * length IPv6 header of 60, ergo the default MSS value is 1220
+ * Since no MSS was provided, we must use the default values
+ */
+ if (par->family == NFPROTO_IPV4)
+ newmss = min(newmss, (u16)536);
+ else
+ newmss = min(newmss, (u16)1220);
+
opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
- memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
+ memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr));
inet_proto_csum_replace2(&tcph->check, skb,
- htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1);
+ htons(len), htons(len + TCPOLEN_MSS), 1);
opt[0] = TCPOPT_MSS;
opt[1] = TCPOLEN_MSS;
opt[2] = (newmss & 0xff00) >> 8;
@@ -145,32 +200,6 @@ tcpmss_mangle_packet(struct sk_buff *skb,
return TCPOLEN_MSS;
}
-static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
- unsigned int family)
-{
- struct flowi fl = {};
- const struct nf_afinfo *ai;
- struct rtable *rt = NULL;
- u_int32_t mtu = ~0U;
-
- if (family == PF_INET)
- fl.fl4_dst = ip_hdr(skb)->saddr;
- else
- fl.fl6_dst = ipv6_hdr(skb)->saddr;
-
- rcu_read_lock();
- ai = nf_get_afinfo(family);
- if (ai != NULL)
- ai->route((struct dst_entry **)&rt, &fl);
- rcu_read_unlock();
-
- if (rt != NULL) {
- mtu = dst_mtu(&rt->dst);
- dst_release(&rt->dst);
- }
- return mtu;
-}
-
static unsigned int
tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -178,8 +207,8 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
__be16 newlen;
int ret;
- ret = tcpmss_mangle_packet(skb, par->targinfo,
- tcpmss_reverse_mtu(skb, PF_INET),
+ ret = tcpmss_mangle_packet(skb, par,
+ PF_INET,
iph->ihl * 4,
sizeof(*iph) + sizeof(struct tcphdr));
if (ret < 0)
@@ -193,21 +222,22 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u8 nexthdr;
+ __be16 frag_off;
int tcphoff;
int ret;
nexthdr = ipv6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
+ tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
if (tcphoff < 0)
return NF_DROP;
- ret = tcpmss_mangle_packet(skb, par->targinfo,
- tcpmss_reverse_mtu(skb, PF_INET6),
+ ret = tcpmss_mangle_packet(skb, par,
+ PF_INET6,
tcphoff,
sizeof(*ipv6h) + sizeof(struct tcphdr));
if (ret < 0)
@@ -254,7 +284,7 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
{
const struct xt_tcpmss_info *info = par->targinfo;
@@ -287,7 +317,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
.proto = IPPROTO_TCP,
.me = THIS_MODULE,
},
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.family = NFPROTO_IPV6,
.name = "TCPMSS",
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 9dc9ecfdd54..625fa1d636a 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -30,28 +30,43 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
static unsigned int
tcpoptstrip_mangle_packet(struct sk_buff *skb,
- const struct xt_tcpoptstrip_target_info *info,
+ const struct xt_action_param *par,
unsigned int tcphoff, unsigned int minlen)
{
+ const struct xt_tcpoptstrip_target_info *info = par->targinfo;
unsigned int optl, i, j;
struct tcphdr *tcph;
u_int16_t n, o;
u_int8_t *opt;
+ int len, tcp_hdrlen;
+
+ /* This is a fragment, no TCP header is available */
+ if (par->fragoff != 0)
+ return XT_CONTINUE;
if (!skb_make_writable(skb, skb->len))
return NF_DROP;
+ len = skb->len - tcphoff;
+ if (len < (int)sizeof(struct tcphdr))
+ return NF_DROP;
+
tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ tcp_hdrlen = tcph->doff * 4;
+
+ if (len < tcp_hdrlen)
+ return NF_DROP;
+
opt = (u_int8_t *)tcph;
/*
* Walk through all TCP options - if we find some option to remove,
* set all octets to %TCPOPT_NOP and adjust checksum.
*/
- for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) {
+ for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) {
optl = optlen(opt, i);
- if (i + optl > tcp_hdrlen(skb))
+ if (i + optl > tcp_hdrlen)
break;
if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i]))
@@ -76,24 +91,25 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
static unsigned int
tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb),
+ return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
sizeof(struct iphdr) + sizeof(struct tcphdr));
}
-#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
static unsigned int
tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
int tcphoff;
u_int8_t nexthdr;
+ __be16 frag_off;
nexthdr = ipv6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
+ tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
if (tcphoff < 0)
return NF_DROP;
- return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff,
+ return tcpoptstrip_mangle_packet(skb, par, tcphoff,
sizeof(*ipv6h) + sizeof(struct tcphdr));
}
#endif
@@ -108,7 +124,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
.me = THIS_MODULE,
},
-#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
{
.name = "TCPOPTSTRIP",
.family = NFPROTO_IPV6,
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 5128a6c4cb2..292934d2348 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -25,13 +25,10 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_TEE.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
# define WITH_CONNTRACK 1
# include <net/netfilter/nf_conntrack.h>
#endif
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-# define WITH_IPV6 1
-#endif
struct xt_tee_priv {
struct notifier_block notifier;
@@ -62,18 +59,20 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
const struct iphdr *iph = ip_hdr(skb);
struct net *net = pick_net(skb);
struct rtable *rt;
- struct flowi fl;
+ struct flowi4 fl4;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl4, 0, sizeof(fl4));
if (info->priv) {
if (info->priv->oif == -1)
return false;
- fl.oif = info->priv->oif;
+ fl4.flowi4_oif = info->priv->oif;
}
- fl.fl4_dst = info->gw.ip;
- fl.fl4_tos = RT_TOS(iph->tos);
- fl.fl4_scope = RT_SCOPE_UNIVERSE;
- if (ip_route_output_key(net, &rt, &fl) != 0)
+ fl4.daddr = info->gw.ip;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
return false;
skb_dst_drop(skb);
@@ -89,7 +88,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tee_tginfo *info = par->targinfo;
struct iphdr *iph;
- if (percpu_read(tee_active))
+ if (__this_cpu_read(tee_active))
return XT_CONTINUE;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -126,37 +125,38 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
ip_send_check(iph);
if (tee_tg_route4(skb, info)) {
- percpu_write(tee_active, true);
+ __this_cpu_write(tee_active, true);
ip_local_out(skb);
- percpu_write(tee_active, false);
+ __this_cpu_write(tee_active, false);
} else {
kfree_skb(skb);
}
return XT_CONTINUE;
}
-#ifdef WITH_IPV6
+#if IS_ENABLED(CONFIG_IPV6)
static bool
tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = pick_net(skb);
struct dst_entry *dst;
- struct flowi fl;
+ struct flowi6 fl6;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (info->priv) {
if (info->priv->oif == -1)
return false;
- fl.oif = info->priv->oif;
+ fl6.flowi6_oif = info->priv->oif;
}
- fl.fl6_dst = info->gw.in6;
- fl.fl6_flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+ fl6.daddr = info->gw.in6;
+ fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
(iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
- dst = ip6_route_output(net, NULL, &fl);
- if (dst == NULL)
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
return false;
-
+ }
skb_dst_drop(skb);
skb_dst_set(skb, dst);
skb->dev = dst->dev;
@@ -169,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
- if (percpu_read(tee_active))
+ if (__this_cpu_read(tee_active))
return XT_CONTINUE;
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -187,20 +187,20 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
--iph->hop_limit;
}
if (tee_tg_route6(skb, info)) {
- percpu_write(tee_active, true);
+ __this_cpu_write(tee_active, true);
ip6_local_out(skb);
- percpu_write(tee_active, false);
+ __this_cpu_write(tee_active, false);
} else {
kfree_skb(skb);
}
return XT_CONTINUE;
}
-#endif /* WITH_IPV6 */
+#endif
static int tee_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct xt_tee_priv *priv;
priv = container_of(this, struct xt_tee_priv, notifier);
@@ -275,7 +275,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
},
-#ifdef WITH_IPV6
+#if IS_ENABLED(CONFIG_IPV6)
{
.name = "TEE",
.revision = 1,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 640678f47a2..ef8a926752a 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -15,24 +15,45 @@
#include <linux/ip.h>
#include <net/checksum.h>
#include <net/udp.h>
+#include <net/tcp.h>
#include <net/inet_sock.h>
+#include <net/inet_hashtables.h>
#include <linux/inetdevice.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
#define XT_TPROXY_HAVE_IPV6 1
#include <net/if_inet6.h>
#include <net/addrconf.h>
+#include <net/inet6_hashtables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
-#include <net/netfilter/nf_tproxy_core.h>
#include <linux/netfilter/xt_TPROXY.h>
+enum nf_tproxy_lookup_t {
+ NFT_LOOKUP_LISTENER,
+ NFT_LOOKUP_ESTABLISHED,
+};
+
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (inet_sk(sk)->transparent)
+ return true;
+ sock_put(sk);
+ } else {
+ if (inet_twsk(sk)->tw_transparent)
+ return true;
+ inet_twsk_put(inet_twsk(sk));
+ }
+ return false;
+}
+
static inline __be32
tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
@@ -54,8 +75,159 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
return laddr ? laddr : daddr;
}
+/*
+ * This is used when the user wants to intercept a connection matching
+ * an explicit iptables rule. In this case the sockets are assumed
+ * matching in preference order:
+ *
+ * - match: if there's a fully established connection matching the
+ * _packet_ tuple, it is returned, assuming the redirection
+ * already took place and we process a packet belonging to an
+ * established connection
+ *
+ * - match: if there's a listening socket matching the redirection
+ * (e.g. on-port & on-ip of the connection), it is returned,
+ * regardless if it was bound to 0.0.0.0 or an explicit
+ * address. The reasoning is that if there's an explicit rule, it
+ * does not really matter if the listener is bound to an interface
+ * or to 0. The user already stated that he wants redirection
+ * (since he added the rule).
+ *
+ * Please note that there's an overlap between what a TPROXY target
+ * and a socket match will match. Normally if you have both rules the
+ * "socket" match will be the first one, effectively all packets
+ * belonging to established connections going through that one.
+ */
+static inline struct sock *
+nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NFT_LOOKUP_LISTENER:
+ sk = inet_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
+ daddr, dport,
+ in->ifindex);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NFT_LOOKUP_ESTABLISHED:
+ sk = inet_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+ (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+ protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+
+#ifdef XT_TPROXY_HAVE_IPV6
+static inline struct sock *
+nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NFT_LOOKUP_LISTENER:
+ sk = inet6_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
+ daddr, ntohs(dport),
+ in->ifindex);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NFT_LOOKUP_ESTABLISHED:
+ sk = __inet6_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, ntohs(dport),
+ in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+ (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+ protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+#endif
+
/**
- * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
* @skb: The skb being processed.
* @laddr: IPv4 address to redirect to or zero.
* @lport: TCP port to redirect to or zero.
@@ -103,6 +275,15 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
return sk;
}
+/* assign a socket to the skb -- consumes sk */
+static void
+nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+}
+
static unsigned int
tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
@@ -141,7 +322,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
skb->dev, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ if (sk && tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -149,6 +330,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
iph->protocol, &iph->daddr, ntohs(hp->dest),
&laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
@@ -204,7 +387,7 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
}
/**
- * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
* @skb: The skb being processed.
* @tproto: Transport protocol.
* @thoff: Transport protocol header offset.
@@ -266,10 +449,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
struct sock *sk;
const struct in6_addr *laddr;
__be16 lport;
- int thoff;
+ int thoff = 0;
int tproto;
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
if (tproto < 0) {
pr_debug("unable to find transport header in IPv6 packet, dropping\n");
return NF_DROP;
@@ -306,7 +489,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ if (sk && tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
@@ -314,6 +497,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
tproto, &iph->saddr, ntohs(hp->source),
laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
new file mode 100644
index 00000000000..fab6eea1bf3
--- /dev/null
+++ b/net/netfilter/xt_addrtype.c
@@ -0,0 +1,248 @@
+/*
+ * iptables module to match inet_addr_type() of an ip.
+ *
+ * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip6_fib.h>
+#endif
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_addrtype.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: address type match");
+MODULE_ALIAS("ipt_addrtype");
+MODULE_ALIAS("ip6t_addrtype");
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
+ const struct in6_addr *addr, u16 mask)
+{
+ const struct nf_afinfo *afinfo;
+ struct flowi6 flow;
+ struct rt6_info *rt;
+ u32 ret = 0;
+ int route_err;
+
+ memset(&flow, 0, sizeof(flow));
+ flow.daddr = *addr;
+ if (dev)
+ flow.flowi6_oif = dev->ifindex;
+
+ rcu_read_lock();
+
+ afinfo = nf_get_afinfo(NFPROTO_IPV6);
+ if (afinfo != NULL) {
+ const struct nf_ipv6_ops *v6ops;
+
+ if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
+ v6ops = nf_get_ipv6_ops();
+ if (v6ops && v6ops->chk_addr(net, addr, dev, true))
+ ret = XT_ADDRTYPE_LOCAL;
+ }
+ route_err = afinfo->route(net, (struct dst_entry **)&rt,
+ flowi6_to_flowi(&flow), false);
+ } else {
+ route_err = 1;
+ }
+ rcu_read_unlock();
+
+ if (route_err)
+ return XT_ADDRTYPE_UNREACHABLE;
+
+ if (rt->rt6i_flags & RTF_REJECT)
+ ret = XT_ADDRTYPE_UNREACHABLE;
+
+ if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
+ ret |= XT_ADDRTYPE_LOCAL;
+ if (rt->rt6i_flags & RTF_ANYCAST)
+ ret |= XT_ADDRTYPE_ANYCAST;
+
+ dst_release(&rt->dst);
+ return ret;
+}
+
+static bool match_type6(struct net *net, const struct net_device *dev,
+ const struct in6_addr *addr, u16 mask)
+{
+ int addr_type = ipv6_addr_type(addr);
+
+ if ((mask & XT_ADDRTYPE_MULTICAST) &&
+ !(addr_type & IPV6_ADDR_MULTICAST))
+ return false;
+ if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST))
+ return false;
+ if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY)
+ return false;
+
+ if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST |
+ XT_ADDRTYPE_UNREACHABLE) & mask)
+ return !!(mask & match_lookup_rt6(net, dev, addr, mask));
+ return true;
+}
+
+static bool
+addrtype_mt6(struct net *net, const struct net_device *dev,
+ const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ bool ret = true;
+
+ if (info->source)
+ ret &= match_type6(net, dev, &iph->saddr, info->source) ^
+ (info->flags & XT_ADDRTYPE_INVERT_SOURCE);
+ if (ret && info->dest)
+ ret &= match_type6(net, dev, &iph->daddr, info->dest) ^
+ !!(info->flags & XT_ADDRTYPE_INVERT_DEST);
+ return ret;
+}
+#endif
+
+static inline bool match_type(struct net *net, const struct net_device *dev,
+ __be32 addr, u_int16_t mask)
+{
+ return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
+}
+
+static bool
+addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ const struct xt_addrtype_info *info = par->matchinfo;
+ const struct iphdr *iph = ip_hdr(skb);
+ bool ret = true;
+
+ if (info->source)
+ ret &= match_type(net, NULL, iph->saddr, info->source) ^
+ info->invert_source;
+ if (info->dest)
+ ret &= match_type(net, NULL, iph->daddr, info->dest) ^
+ info->invert_dest;
+
+ return ret;
+}
+
+static bool
+addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ const struct xt_addrtype_info_v1 *info = par->matchinfo;
+ const struct iphdr *iph;
+ const struct net_device *dev = NULL;
+ bool ret = true;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
+ dev = par->in;
+ else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
+ dev = par->out;
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ if (par->family == NFPROTO_IPV6)
+ return addrtype_mt6(net, dev, skb, info);
+#endif
+ iph = ip_hdr(skb);
+ if (info->source)
+ ret &= match_type(net, dev, iph->saddr, info->source) ^
+ (info->flags & XT_ADDRTYPE_INVERT_SOURCE);
+ if (ret && info->dest)
+ ret &= match_type(net, dev, iph->daddr, info->dest) ^
+ !!(info->flags & XT_ADDRTYPE_INVERT_DEST);
+ return ret;
+}
+
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_addrtype_info_v1 *info = par->matchinfo;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) {
+ pr_info("both incoming and outgoing "
+ "interface limitation cannot be selected\n");
+ return -EINVAL;
+ }
+
+ if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN)) &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) {
+ pr_info("output interface limitation "
+ "not valid in PREROUTING and INPUT\n");
+ return -EINVAL;
+ }
+
+ if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT)) &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) {
+ pr_info("input interface limitation "
+ "not valid in POSTROUTING and OUTPUT\n");
+ return -EINVAL;
+ }
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ if (par->family == NFPROTO_IPV6) {
+ if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) {
+ pr_err("ipv6 BLACKHOLE matching not supported\n");
+ return -EINVAL;
+ }
+ if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) {
+ pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n");
+ return -EINVAL;
+ }
+ if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) {
+ pr_err("ipv6 does not support BROADCAST matching\n");
+ return -EINVAL;
+ }
+ }
+#endif
+ return 0;
+}
+
+static struct xt_match addrtype_mt_reg[] __read_mostly = {
+ {
+ .name = "addrtype",
+ .family = NFPROTO_IPV4,
+ .match = addrtype_mt_v0,
+ .matchsize = sizeof(struct xt_addrtype_info),
+ .me = THIS_MODULE
+ },
+ {
+ .name = "addrtype",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .match = addrtype_mt_v1,
+ .checkentry = addrtype_mt_checkentry_v1,
+ .matchsize = sizeof(struct xt_addrtype_info_v1),
+ .me = THIS_MODULE
+ }
+};
+
+static int __init addrtype_mt_init(void)
+{
+ return xt_register_matches(addrtype_mt_reg,
+ ARRAY_SIZE(addrtype_mt_reg));
+}
+
+static void __exit addrtype_mt_exit(void)
+{
+ xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
+}
+
+module_init(addrtype_mt_init);
+module_exit(addrtype_mt_exit);
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
new file mode 100644
index 00000000000..bbffdbdaf60
--- /dev/null
+++ b/net/netfilter/xt_bpf.c
@@ -0,0 +1,74 @@
+/* Xtables module to match packets using a BPF filter.
+ * Copyright 2013 Google Inc.
+ * Written by Willem de Bruijn <willemb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+
+#include <linux/netfilter/xt_bpf.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Willem de Bruijn <willemb@google.com>");
+MODULE_DESCRIPTION("Xtables: BPF filter match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_bpf");
+MODULE_ALIAS("ip6t_bpf");
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+ struct sock_fprog_kern program;
+
+ program.len = info->bpf_program_num_elem;
+ program.filter = info->bpf_program;
+
+ if (sk_unattached_filter_create(&info->filter, &program)) {
+ pr_info("bpf: check failed: parse error\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info *info = par->matchinfo;
+
+ return SK_RUN_FILTER(info->filter, skb);
+}
+
+static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info *info = par->matchinfo;
+ sk_unattached_filter_destroy(info->filter);
+}
+
+static struct xt_match bpf_mt_reg __read_mostly = {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .me = THIS_MODULE,
+};
+
+static int __init bpf_mt_init(void)
+{
+ return xt_register_match(&bpf_mt_reg);
+}
+
+static void __exit bpf_mt_exit(void)
+{
+ xt_unregister_match(&bpf_mt_reg);
+}
+
+module_init(bpf_mt_init);
+module_exit(bpf_mt_exit);
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
new file mode 100644
index 00000000000..f4e83300532
--- /dev/null
+++ b/net/netfilter/xt_cgroup.c
@@ -0,0 +1,72 @@
+/*
+ * Xtables module to match the process control group.
+ *
+ * Might be used to implement individual "per-application" firewall
+ * policies in contrast to global policies based on control groups.
+ * Matching is based upon processes tagged to net_cls' classid marker.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_cgroup.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("Xtables: process control group matching");
+MODULE_ALIAS("ipt_cgroup");
+MODULE_ALIAS("ip6t_cgroup");
+
+static int cgroup_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_cgroup_info *info = par->matchinfo;
+
+ if (info->invert & ~1)
+ return -EINVAL;
+
+ return info->id ? 0 : -EINVAL;
+}
+
+static bool
+cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_cgroup_info *info = par->matchinfo;
+
+ if (skb->sk == NULL)
+ return false;
+
+ return (info->id == skb->sk->sk_classid) ^ info->invert;
+}
+
+static struct xt_match cgroup_mt_reg __read_mostly = {
+ .name = "cgroup",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cgroup_mt_check,
+ .match = cgroup_mt,
+ .matchsize = sizeof(struct xt_cgroup_info),
+ .me = THIS_MODULE,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+};
+
+static int __init cgroup_mt_init(void)
+{
+ return xt_register_match(&cgroup_mt_reg);
+}
+
+static void __exit cgroup_mt_exit(void)
+{
+ xt_unregister_match(&cgroup_mt_reg);
+}
+
+module_init(cgroup_mt_init);
+module_exit(cgroup_mt_exit);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 5b138506690..1e634615ab9 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -26,60 +26,62 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
u_int64_t what = 0; /* initialize to make gcc happy */
u_int64_t bytes = 0;
u_int64_t pkts = 0;
+ const struct nf_conn_acct *acct;
const struct nf_conn_counter *counters;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return false;
- counters = nf_conn_acct_find(ct);
- if (!counters)
+ acct = nf_conn_acct_find(ct);
+ if (!acct)
return false;
+ counters = acct->counter;
switch (sinfo->what) {
case XT_CONNBYTES_PKTS:
switch (sinfo->direction) {
case XT_CONNBYTES_DIR_ORIGINAL:
- what = counters[IP_CT_DIR_ORIGINAL].packets;
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);
break;
case XT_CONNBYTES_DIR_REPLY:
- what = counters[IP_CT_DIR_REPLY].packets;
+ what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
break;
case XT_CONNBYTES_DIR_BOTH:
- what = counters[IP_CT_DIR_ORIGINAL].packets;
- what += counters[IP_CT_DIR_REPLY].packets;
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);
+ what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
break;
}
break;
case XT_CONNBYTES_BYTES:
switch (sinfo->direction) {
case XT_CONNBYTES_DIR_ORIGINAL:
- what = counters[IP_CT_DIR_ORIGINAL].bytes;
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);
break;
case XT_CONNBYTES_DIR_REPLY:
- what = counters[IP_CT_DIR_REPLY].bytes;
+ what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
break;
case XT_CONNBYTES_DIR_BOTH:
- what = counters[IP_CT_DIR_ORIGINAL].bytes;
- what += counters[IP_CT_DIR_REPLY].bytes;
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);
+ what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
break;
}
break;
case XT_CONNBYTES_AVGPKT:
switch (sinfo->direction) {
case XT_CONNBYTES_DIR_ORIGINAL:
- bytes = counters[IP_CT_DIR_ORIGINAL].bytes;
- pkts = counters[IP_CT_DIR_ORIGINAL].packets;
+ bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);
+ pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);
break;
case XT_CONNBYTES_DIR_REPLY:
- bytes = counters[IP_CT_DIR_REPLY].bytes;
- pkts = counters[IP_CT_DIR_REPLY].packets;
+ bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
+ pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
break;
case XT_CONNBYTES_DIR_BOTH:
- bytes = counters[IP_CT_DIR_ORIGINAL].bytes +
- counters[IP_CT_DIR_REPLY].bytes;
- pkts = counters[IP_CT_DIR_ORIGINAL].packets +
- counters[IP_CT_DIR_REPLY].packets;
+ bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) +
+ atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
+ pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) +
+ atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
break;
}
if (pkts != 0)
@@ -87,10 +89,10 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
break;
}
- if (sinfo->count.to)
+ if (sinfo->count.to >= sinfo->count.from)
return what <= sinfo->count.to && what >= sinfo->count.from;
- else
- return what >= sinfo->count.from;
+ else /* inverted */
+ return what < sinfo->count.to || what > sinfo->count.from;
}
static int connbytes_mt_check(const struct xt_mtchk_param *par)
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
new file mode 100644
index 00000000000..9f8719df200
--- /dev/null
+++ b/net/netfilter/xt_connlabel.c
@@ -0,0 +1,99 @@
+/*
+ * (C) 2013 Astaro GmbH & Co KG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
+MODULE_ALIAS("ipt_connlabel");
+MODULE_ALIAS("ip6t_connlabel");
+
+static bool
+connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_connlabel_mtinfo *info = par->matchinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ bool invert = info->options & XT_CONNLABEL_OP_INVERT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return invert;
+
+ if (info->options & XT_CONNLABEL_OP_SET)
+ return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
+
+ return nf_connlabel_match(ct, info->bit) ^ invert;
+}
+
+static int connlabel_mt_check(const struct xt_mtchk_param *par)
+{
+ const int options = XT_CONNLABEL_OP_INVERT |
+ XT_CONNLABEL_OP_SET;
+ struct xt_connlabel_mtinfo *info = par->matchinfo;
+ int ret;
+ size_t words;
+
+ if (info->bit > XT_CONNLABEL_MAXBIT)
+ return -ERANGE;
+
+ if (info->options & ~options) {
+ pr_err("Unknown options in mask %x\n", info->options);
+ return -EINVAL;
+ }
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0) {
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+ }
+
+ par->net->ct.labels_used++;
+ words = BITS_TO_LONGS(info->bit+1);
+ if (words > par->net->ct.label_words)
+ par->net->ct.label_words = words;
+
+ return ret;
+}
+
+static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ par->net->ct.labels_used--;
+ if (par->net->ct.labels_used == 0)
+ par->net->ct.label_words = 0;
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match connlabels_mt_reg __read_mostly = {
+ .name = "connlabel",
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlabel_mt_check,
+ .match = connlabel_mt,
+ .matchsize = sizeof(struct xt_connlabel_mtinfo),
+ .destroy = connlabel_mt_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init connlabel_mt_init(void)
+{
+ return xt_register_match(&connlabels_mt_reg);
+}
+
+static void __exit connlabel_mt_exit(void)
+{
+ xt_unregister_match(&connlabels_mt_reg);
+}
+
+module_init(connlabel_mt_init);
+module_exit(connlabel_mt_exit);
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5c5b6b921b8..fbc66bb250d 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -19,6 +19,7 @@
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/skbuff.h>
@@ -31,23 +32,44 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#define CONNLIMIT_SLOTS 256U
+
+#ifdef CONFIG_LOCKDEP
+#define CONNLIMIT_LOCK_SLOTS 8U
+#else
+#define CONNLIMIT_LOCK_SLOTS 256U
+#endif
+
+#define CONNLIMIT_GC_MAX_NODES 8
+
/* we will save the tuples of all connections we care about */
struct xt_connlimit_conn {
- struct list_head list;
- struct nf_conntrack_tuple tuple;
+ struct hlist_node node;
+ struct nf_conntrack_tuple tuple;
+ union nf_inet_addr addr;
};
+struct xt_connlimit_rb {
+ struct rb_node node;
+ struct hlist_head hhead; /* connections/hosts in same subnet */
+ union nf_inet_addr addr; /* search key */
+};
+
+static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
+
struct xt_connlimit_data {
- struct list_head iphash[256];
- spinlock_t lock;
+ struct rb_root climit_root4[CONNLIMIT_SLOTS];
+ struct rb_root climit_root6[CONNLIMIT_SLOTS];
};
static u_int32_t connlimit_rnd __read_mostly;
-static bool connlimit_rnd_inited __read_mostly;
+static struct kmem_cache *connlimit_rb_cachep __read_mostly;
+static struct kmem_cache *connlimit_conn_cachep __read_mostly;
static inline unsigned int connlimit_iphash(__be32 addr)
{
- return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF;
+ return jhash_1word((__force __u32)addr,
+ connlimit_rnd) % CONNLIMIT_SLOTS;
}
static inline unsigned int
@@ -60,7 +82,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,
for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
res.ip6[i] = addr->ip6[i] & mask->ip6[i];
- return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF;
+ return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6),
+ connlimit_rnd) % CONNLIMIT_SLOTS;
}
static inline bool already_closed(const struct nf_conn *conn)
@@ -72,13 +95,14 @@ static inline bool already_closed(const struct nf_conn *conn)
return 0;
}
-static inline unsigned int
+static int
same_source_net(const union nf_inet_addr *addr,
const union nf_inet_addr *mask,
const union nf_inet_addr *u3, u_int8_t family)
{
if (family == NFPROTO_IPV4) {
- return (addr->ip & mask->ip) == (u3->ip & mask->ip);
+ return ntohl(addr->ip & mask->ip) -
+ ntohl(u3->ip & mask->ip);
} else {
union nf_inet_addr lh, rh;
unsigned int i;
@@ -88,88 +112,205 @@ same_source_net(const union nf_inet_addr *addr,
rh.ip6[i] = u3->ip6[i] & mask->ip6[i];
}
- return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0;
+ return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6));
}
}
-static int count_them(struct net *net,
- struct xt_connlimit_data *data,
+static bool add_hlist(struct hlist_head *head,
const struct nf_conntrack_tuple *tuple,
- const union nf_inet_addr *addr,
- const union nf_inet_addr *mask,
- u_int8_t family)
+ const union nf_inet_addr *addr)
+{
+ struct xt_connlimit_conn *conn;
+
+ conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL)
+ return false;
+ conn->tuple = *tuple;
+ conn->addr = *addr;
+ hlist_add_head(&conn->node, head);
+ return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+ struct hlist_head *head,
+ const struct nf_conntrack_tuple *tuple,
+ bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
struct xt_connlimit_conn *conn;
- struct xt_connlimit_conn *tmp;
+ struct hlist_node *n;
struct nf_conn *found_ct;
- struct list_head *hash;
- bool addit = true;
- int matches = 0;
-
- if (family == NFPROTO_IPV6)
- hash = &data->iphash[connlimit_iphash6(addr, mask)];
- else
- hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)];
+ unsigned int length = 0;
+ *addit = true;
rcu_read_lock();
/* check the saved connections */
- list_for_each_entry_safe(conn, tmp, hash, list) {
+ hlist_for_each_entry_safe(conn, n, head, node) {
found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,
&conn->tuple);
- found_ct = NULL;
+ if (found == NULL) {
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
+ continue;
+ }
- if (found != NULL)
- found_ct = nf_ct_tuplehash_to_ctrack(found);
+ found_ct = nf_ct_tuplehash_to_ctrack(found);
- if (found_ct != NULL &&
- nf_ct_tuple_equal(&conn->tuple, tuple) &&
- !already_closed(found_ct))
+ if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
/*
* Just to be sure we have it only once in the list.
* We should not see tuples twice unless someone hooks
* this into a table without "-p tcp --syn".
*/
- addit = false;
-
- if (found == NULL) {
- /* this one is gone */
- list_del(&conn->list);
- kfree(conn);
- continue;
- }
-
- if (already_closed(found_ct)) {
+ *addit = false;
+ } else if (already_closed(found_ct)) {
/*
* we do not care about connections which are
* closed already -> ditch it
*/
nf_ct_put(found_ct);
- list_del(&conn->list);
- kfree(conn);
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
continue;
}
- if (same_source_net(addr, mask, &conn->tuple.src.u3, family))
- /* same source network -> be counted! */
- ++matches;
nf_ct_put(found_ct);
+ length++;
}
rcu_read_unlock();
- if (addit) {
- /* save the new connection in our list */
- conn = kzalloc(sizeof(*conn), GFP_ATOMIC);
- if (conn == NULL)
- return -ENOMEM;
- conn->tuple = *tuple;
- list_add(&conn->list, hash);
- ++matches;
+ return length;
+}
+
+static void tree_nodes_free(struct rb_root *root,
+ struct xt_connlimit_rb *gc_nodes[],
+ unsigned int gc_count)
+{
+ struct xt_connlimit_rb *rbconn;
+
+ while (gc_count) {
+ rbconn = gc_nodes[--gc_count];
+ rb_erase(&rbconn->node, root);
+ kmem_cache_free(connlimit_rb_cachep, rbconn);
+ }
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+ const struct nf_conntrack_tuple *tuple,
+ const union nf_inet_addr *addr, const union nf_inet_addr *mask,
+ u8 family)
+{
+ struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
+ struct rb_node **rbnode, *parent;
+ struct xt_connlimit_rb *rbconn;
+ struct xt_connlimit_conn *conn;
+ unsigned int gc_count;
+ bool no_gc = false;
+
+ restart:
+ gc_count = 0;
+ parent = NULL;
+ rbnode = &(root->rb_node);
+ while (*rbnode) {
+ int diff;
+ bool addit;
+
+ rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+
+ parent = *rbnode;
+ diff = same_source_net(addr, mask, &rbconn->addr, family);
+ if (diff < 0) {
+ rbnode = &((*rbnode)->rb_left);
+ } else if (diff > 0) {
+ rbnode = &((*rbnode)->rb_right);
+ } else {
+ /* same source network -> be counted! */
+ unsigned int count;
+ count = check_hlist(net, &rbconn->hhead, tuple, &addit);
+
+ tree_nodes_free(root, gc_nodes, gc_count);
+ if (!addit)
+ return count;
+
+ if (!add_hlist(&rbconn->hhead, tuple, addr))
+ return 0; /* hotdrop */
+
+ return count + 1;
+ }
+
+ if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+ continue;
+
+ /* only used for GC on hhead, retval and 'addit' ignored */
+ check_hlist(net, &rbconn->hhead, tuple, &addit);
+ if (hlist_empty(&rbconn->hhead))
+ gc_nodes[gc_count++] = rbconn;
+ }
+
+ if (gc_count) {
+ no_gc = true;
+ tree_nodes_free(root, gc_nodes, gc_count);
+ /* tree_node_free before new allocation permits
+ * allocator to re-use newly free'd object.
+ *
+ * This is a rare event; in most cases we will find
+ * existing node to re-use. (or gc_count is 0).
+ */
+ goto restart;
}
- return matches;
+ /* no match, need to insert new node */
+ rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
+ if (rbconn == NULL)
+ return 0;
+
+ conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL) {
+ kmem_cache_free(connlimit_rb_cachep, rbconn);
+ return 0;
+ }
+
+ conn->tuple = *tuple;
+ conn->addr = *addr;
+ rbconn->addr = *addr;
+
+ INIT_HLIST_HEAD(&rbconn->hhead);
+ hlist_add_head(&conn->node, &rbconn->hhead);
+
+ rb_link_node(&rbconn->node, parent, rbnode);
+ rb_insert_color(&rbconn->node, root);
+ return 1;
+}
+
+static int count_them(struct net *net,
+ struct xt_connlimit_data *data,
+ const struct nf_conntrack_tuple *tuple,
+ const union nf_inet_addr *addr,
+ const union nf_inet_addr *mask,
+ u_int8_t family)
+{
+ struct rb_root *root;
+ int count;
+ u32 hash;
+
+ if (family == NFPROTO_IPV6) {
+ hash = connlimit_iphash6(addr, mask);
+ root = &data->climit_root6[hash];
+ } else {
+ hash = connlimit_iphash(addr->ip & mask->ip);
+ root = &data->climit_root4[hash];
+ }
+
+ spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+ count = count_tree(net, root, tuple, addr, mask, family);
+
+ spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+ return count;
}
static bool
@@ -182,35 +323,33 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct nf_conntrack_tuple *tuple_ptr = &tuple;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
- int connections;
+ unsigned int connections;
ct = nf_ct_get(skb, &ctinfo);
if (ct != NULL)
- tuple_ptr = &ct->tuplehash[0].tuple;
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
par->family, &tuple))
goto hotdrop;
if (par->family == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
- memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
+ memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
+ &iph->daddr : &iph->saddr, sizeof(addr.ip6));
} else {
const struct iphdr *iph = ip_hdr(skb);
- addr.ip = iph->saddr;
+ addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+ iph->daddr : iph->saddr;
}
- spin_lock_bh(&info->data->lock);
connections = count_them(net, info->data, tuple_ptr, &addr,
&info->mask, par->family);
- spin_unlock_bh(&info->data->lock);
-
- if (connections < 0) {
+ if (connections == 0)
/* kmalloc failed, drop it entirely */
- par->hotdrop = true;
- return false;
- }
+ goto hotdrop;
- return (connections > info->limit) ^ info->inverse;
+ return (connections > info->limit) ^
+ !!(info->flags & XT_CONNLIMIT_INVERT);
hotdrop:
par->hotdrop = true;
@@ -223,9 +362,13 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
unsigned int i;
int ret;
- if (unlikely(!connlimit_rnd_inited)) {
- get_random_bytes(&connlimit_rnd, sizeof(connlimit_rnd));
- connlimit_rnd_inited = true;
+ if (unlikely(!connlimit_rnd)) {
+ u_int32_t rand;
+
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&connlimit_rnd, 0, rand);
}
ret = nf_ct_l3proto_try_module_get(par->family);
if (ret < 0) {
@@ -241,36 +384,51 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
return -ENOMEM;
}
- spin_lock_init(&info->data->lock);
- for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i)
- INIT_LIST_HEAD(&info->data->iphash[i]);
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+ info->data->climit_root4[i] = RB_ROOT;
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+ info->data->climit_root6[i] = RB_ROOT;
return 0;
}
+static void destroy_tree(struct rb_root *r)
+{
+ struct xt_connlimit_conn *conn;
+ struct xt_connlimit_rb *rbconn;
+ struct hlist_node *n;
+ struct rb_node *node;
+
+ while ((node = rb_first(r)) != NULL) {
+ rbconn = container_of(node, struct xt_connlimit_rb, node);
+
+ rb_erase(node, r);
+
+ hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
+ kmem_cache_free(connlimit_conn_cachep, conn);
+
+ kmem_cache_free(connlimit_rb_cachep, rbconn);
+ }
+}
+
static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_connlimit_info *info = par->matchinfo;
- struct xt_connlimit_conn *conn;
- struct xt_connlimit_conn *tmp;
- struct list_head *hash = info->data->iphash;
unsigned int i;
nf_ct_l3proto_module_put(par->family);
- for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
- list_for_each_entry_safe(conn, tmp, &hash[i], list) {
- list_del(&conn->list);
- kfree(conn);
- }
- }
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+ destroy_tree(&info->data->climit_root4[i]);
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+ destroy_tree(&info->data->climit_root6[i]);
kfree(info->data);
}
static struct xt_match connlimit_mt_reg __read_mostly = {
.name = "connlimit",
- .revision = 0,
+ .revision = 1,
.family = NFPROTO_UNSPEC,
.checkentry = connlimit_mt_check,
.match = connlimit_mt,
@@ -281,12 +439,40 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
static int __init connlimit_mt_init(void)
{
- return xt_register_match(&connlimit_mt_reg);
+ int ret, i;
+
+ BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
+ BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
+
+ for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
+ spin_lock_init(&xt_connlimit_locks[i]);
+
+ connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
+ sizeof(struct xt_connlimit_conn),
+ 0, 0, NULL);
+ if (!connlimit_conn_cachep)
+ return -ENOMEM;
+
+ connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
+ sizeof(struct xt_connlimit_rb),
+ 0, 0, NULL);
+ if (!connlimit_rb_cachep) {
+ kmem_cache_destroy(connlimit_conn_cachep);
+ return -ENOMEM;
+ }
+ ret = xt_register_match(&connlimit_mt_reg);
+ if (ret != 0) {
+ kmem_cache_destroy(connlimit_conn_cachep);
+ kmem_cache_destroy(connlimit_rb_cachep);
+ }
+ return ret;
}
static void __exit connlimit_mt_exit(void)
{
xt_unregister_match(&connlimit_mt_reg);
+ kmem_cache_destroy(connlimit_conn_cachep);
+ kmem_cache_destroy(connlimit_rb_cachep);
}
module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 7278145e6a6..69f78e96fdb 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index e536710ad91..188404b9b00 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -3,6 +3,7 @@
* information. (Superset of Rusty's minimalistic state match.)
*
* (C) 2001 Marc Boucher (marc@mbsi.ca).
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
*
* This program is free software; you can redistribute it and/or modify
@@ -112,6 +113,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
return true;
}
+static inline bool
+port_match(u16 min, u16 max, u16 port, bool invert)
+{
+ return (port >= min && port <= max) ^ invert;
+}
+
+static inline bool
+ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
+ const struct nf_conn *ct)
+{
+ const struct nf_conntrack_tuple *tuple;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if ((info->match_flags & XT_CONNTRACK_PROTO) &&
+ (nf_ct_protonum(ct) == info->l4proto) ^
+ !(info->invert_flags & XT_CONNTRACK_PROTO))
+ return false;
+
+ /* Shortcut to match all recognized protocols by using ->src.all. */
+ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
+ !port_match(info->origsrc_port, info->origsrc_port_high,
+ ntohs(tuple->src.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
+ !port_match(info->origdst_port, info->origdst_port_high,
+ ntohs(tuple->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+ return false;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
+ !port_match(info->replsrc_port, info->replsrc_port_high,
+ ntohs(tuple->src.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
+ !port_match(info->repldst_port, info->repldst_port_high,
+ ntohs(tuple->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+ return false;
+
+ return true;
+}
+
static bool
conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
u16 state_mask, u16 status_mask)
@@ -147,7 +196,7 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
return info->match_flags & XT_CONNTRACK_STATE;
if ((info->match_flags & XT_CONNTRACK_DIRECTION) &&
(CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^
- !!(info->invert_flags & XT_CONNTRACK_DIRECTION))
+ !(info->invert_flags & XT_CONNTRACK_DIRECTION))
return false;
if (info->match_flags & XT_CONNTRACK_ORIGSRC)
@@ -170,8 +219,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
!(info->invert_flags & XT_CONNTRACK_REPLDST))
return false;
- if (!ct_proto_port_check(info, ct))
- return false;
+ if (par->match->revision != 3) {
+ if (!ct_proto_port_check(info, ct))
+ return false;
+ } else {
+ if (!ct_proto_port_check_v3(par->matchinfo, ct))
+ return false;
+ }
if ((info->match_flags & XT_CONNTRACK_STATUS) &&
(!!(status_mask & ct->status) ^
@@ -207,6 +261,14 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
return conntrack_mt(skb, par, info->state_mask, info->status_mask);
}
+static bool
+conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_conntrack_mtinfo3 *info = par->matchinfo;
+
+ return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
static int conntrack_mt_check(const struct xt_mtchk_param *par)
{
int ret;
@@ -244,6 +306,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
.destroy = conntrack_mt_destroy,
.me = THIS_MODULE,
},
+ {
+ .name = "conntrack",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .matchsize = sizeof(struct xt_conntrack_mtinfo3),
+ .match = conntrack_mt_v3,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init conntrack_mt_init(void)
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index b39db8a5cba..c7a2e5466bc 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -22,6 +22,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
MODULE_DESCRIPTION("Xtables: CPU match");
+MODULE_ALIAS("ipt_cpu");
+MODULE_ALIAS("ip6t_cpu");
static int cpu_mt_check(const struct xt_mtchk_param *par)
{
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
new file mode 100644
index 00000000000..d9202cdd25c
--- /dev/null
+++ b/net/netfilter/xt_devgroup.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/xt_devgroup.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Device group match");
+MODULE_ALIAS("ipt_devgroup");
+MODULE_ALIAS("ip6t_devgroup");
+
+static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC &&
+ (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^
+ ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
+ return false;
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST &&
+ (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^
+ ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
+ return false;
+
+ return true;
+}
+
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+ return -EINVAL;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC &&
+ par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD)))
+ return -EINVAL;
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST &&
+ par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match devgroup_mt_reg __read_mostly = {
+ .name = "devgroup",
+ .match = devgroup_mt,
+ .checkentry = devgroup_mt_checkentry,
+ .matchsize = sizeof(struct xt_devgroup_info),
+ .family = NFPROTO_UNSPEC,
+ .me = THIS_MODULE
+};
+
+static int __init devgroup_mt_init(void)
+{
+ return xt_register_match(&devgroup_mt_reg);
+}
+
+static void __exit devgroup_mt_exit(void)
+{
+ xt_unregister_match(&devgroup_mt_reg);
+}
+
+module_init(devgroup_mt_init);
+module_exit(devgroup_mt_exit);
diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c
new file mode 100644
index 00000000000..3c831a8efeb
--- /dev/null
+++ b/net/netfilter/xt_ecn.c
@@ -0,0 +1,179 @@
+/*
+ * Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ecn.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ecn");
+MODULE_ALIAS("ip6t_ecn");
+
+static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ecn_info *einfo = par->matchinfo;
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ /* In practice, TCP match does this, so can't fail. But let's
+ * be good citizens.
+ */
+ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+
+ if (einfo->operation & XT_ECN_OP_MATCH_ECE) {
+ if (einfo->invert & XT_ECN_OP_MATCH_ECE) {
+ if (th->ece == 1)
+ return false;
+ } else {
+ if (th->ece == 0)
+ return false;
+ }
+ }
+
+ if (einfo->operation & XT_ECN_OP_MATCH_CWR) {
+ if (einfo->invert & XT_ECN_OP_MATCH_CWR) {
+ if (th->cwr == 1)
+ return false;
+ } else {
+ if (th->cwr == 0)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static inline bool match_ip(const struct sk_buff *skb,
+ const struct xt_ecn_info *einfo)
+{
+ return ((ip_hdr(skb)->tos & XT_ECN_IP_MASK) == einfo->ip_ect) ^
+ !!(einfo->invert & XT_ECN_OP_MATCH_IP);
+}
+
+static bool ecn_mt4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_IP && !match_ip(skb, info))
+ return false;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ !match_tcp(skb, par))
+ return false;
+
+ return true;
+}
+
+static int ecn_mt_check4(const struct xt_mtchk_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+ const struct ipt_ip *ip = par->entryinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->invert & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) {
+ pr_info("cannot match TCP bits in rule for non-tcp packets\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline bool match_ipv6(const struct sk_buff *skb,
+ const struct xt_ecn_info *einfo)
+{
+ return (((ipv6_hdr(skb)->flow_lbl[0] >> 4) & XT_ECN_IP_MASK) ==
+ einfo->ip_ect) ^
+ !!(einfo->invert & XT_ECN_OP_MATCH_IP);
+}
+
+static bool ecn_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_IP && !match_ipv6(skb, info))
+ return false;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ !match_tcp(skb, par))
+ return false;
+
+ return true;
+}
+
+static int ecn_mt_check6(const struct xt_mtchk_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+ const struct ip6t_ip6 *ip = par->entryinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->invert & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) {
+ pr_info("cannot match TCP bits in rule for non-tcp packets\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match ecn_mt_reg[] __read_mostly = {
+ {
+ .name = "ecn",
+ .family = NFPROTO_IPV4,
+ .match = ecn_mt4,
+ .matchsize = sizeof(struct xt_ecn_info),
+ .checkentry = ecn_mt_check4,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "ecn",
+ .family = NFPROTO_IPV6,
+ .match = ecn_mt6,
+ .matchsize = sizeof(struct xt_ecn_info),
+ .checkentry = ecn_mt_check6,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init ecn_mt_init(void)
+{
+ return xt_register_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg));
+}
+
+static void __exit ecn_mt_exit(void)
+{
+ xt_unregister_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg));
+}
+
+module_init(ecn_mt_init);
+module_exit(ecn_mt_exit);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9228ee0dc11..a3910fc2122 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -3,6 +3,7 @@
* separately for each hashbucket (sourceip/sourceport/dstip/dstport)
*
* (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
*
* Development of this code was funded by Astaro AG, http://www.astaro.com/
@@ -21,7 +22,7 @@
#include <linux/mm.h>
#include <linux/in.h>
#include <linux/ip.h>
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
#include <linux/ipv6.h>
#include <net/ipv6.h>
#endif
@@ -64,7 +65,7 @@ struct dsthash_dst {
__be32 src;
__be32 dst;
} ip;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
struct {
__be32 src[4];
__be32 dst[4];
@@ -107,6 +108,7 @@ struct xt_hashlimit_htable {
/* seq_file stuff */
struct proc_dir_entry *pde;
+ const char *name;
struct net *net;
struct hlist_head hash[0]; /* hashtable itself */
@@ -141,11 +143,10 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
const struct dsthash_dst *dst)
{
struct dsthash_ent *ent;
- struct hlist_node *pos;
u_int32_t hash = hash_dst(ht, dst);
if (!hlist_empty(&ht->hash[hash])) {
- hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+ hlist_for_each_entry_rcu(ent, &ht->hash[hash], node)
if (dst_cmp(ent, dst)) {
spin_lock(&ent->lock);
return ent;
@@ -157,11 +158,22 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
/* allocate dsthash_ent, initialize dst, put in htable and lock it */
static struct dsthash_ent *
dsthash_alloc_init(struct xt_hashlimit_htable *ht,
- const struct dsthash_dst *dst)
+ const struct dsthash_dst *dst, bool *race)
{
struct dsthash_ent *ent;
spin_lock(&ht->lock);
+
+ /* Two or more packets may race to create the same entry in the
+ * hashtable, double check if this packet lost race.
+ */
+ ent = dsthash_find(ht, dst);
+ if (ent != NULL) {
+ spin_unlock(&ht->lock);
+ *race = true;
+ return ent;
+ }
+
/* initialize hash with random val at the time we allocate
* the first hashtable entry */
if (unlikely(!ht->rnd_initialized)) {
@@ -171,15 +183,11 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
if (ht->cfg.max && ht->count >= ht->cfg.max) {
/* FIXME: do something. question is what.. */
- if (net_ratelimit())
- pr_err("max count of %u reached\n", ht->cfg.max);
+ net_err_ratelimited("max count of %u reached\n", ht->cfg.max);
ent = NULL;
} else
ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
- if (!ent) {
- if (net_ratelimit())
- pr_err("cannot allocate dsthash_ent\n");
- } else {
+ if (ent) {
memcpy(&ent->dst, dst, sizeof(ent->dst));
spin_lock_init(&ent->lock);
@@ -247,6 +255,11 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
+ hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+ if (!hinfo->name) {
+ vfree(hinfo);
+ return -ENOMEM;
+ }
spin_lock_init(&hinfo->lock);
hinfo->pde = proc_create_data(minfo->name, 0,
@@ -254,6 +267,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
&dl_file_ops, hinfo);
if (hinfo->pde == NULL) {
+ kfree(hinfo->name);
vfree(hinfo);
return -ENOMEM;
}
@@ -290,8 +304,8 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
spin_lock_bh(&ht->lock);
for (i = 0; i < ht->cfg.size; i++) {
struct dsthash_ent *dh;
- struct hlist_node *pos, *n;
- hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) {
+ struct hlist_node *n;
+ hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
if ((*select)(ht, dh))
dsthash_free(ht, dh);
}
@@ -311,19 +325,26 @@ static void htable_gc(unsigned long htlong)
add_timer(&ht->timer);
}
-static void htable_destroy(struct xt_hashlimit_htable *hinfo)
+static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)
{
struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net);
struct proc_dir_entry *parent;
- del_timer_sync(&hinfo->timer);
-
if (hinfo->family == NFPROTO_IPV4)
parent = hashlimit_net->ipt_hashlimit;
else
parent = hashlimit_net->ip6t_hashlimit;
- remove_proc_entry(hinfo->pde->name, parent);
+
+ if (parent != NULL)
+ remove_proc_entry(hinfo->name, parent);
+}
+
+static void htable_destroy(struct xt_hashlimit_htable *hinfo)
+{
+ del_timer_sync(&hinfo->timer);
+ htable_remove_proc_entry(hinfo);
htable_selective_cleanup(hinfo, select_all);
+ kfree(hinfo->name);
vfree(hinfo);
}
@@ -333,10 +354,9 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
{
struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
struct xt_hashlimit_htable *hinfo;
- struct hlist_node *pos;
- hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) {
- if (!strcmp(name, hinfo->pde->name) &&
+ hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
+ if (!strcmp(name, hinfo->name) &&
hinfo->family == family) {
hinfo->use++;
return hinfo;
@@ -391,9 +411,20 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+/* in byte mode, the lowest possible rate is one packet/second.
+ * credit_cap is used as a counter that tells us how many times we can
+ * refill the "credits available" counter when it becomes empty.
+ */
+#define MAX_CPJ_BYTES (0xFFFFFFFF / HZ)
+#define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES)
+
+static u32 xt_hashlimit_len_to_chunks(u32 len)
+{
+ return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1;
+}
+
/* Precision saver. */
-static inline u_int32_t
-user2credits(u_int32_t user)
+static u32 user2credits(u32 user)
{
/* If multiplying would overflow... */
if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
@@ -403,12 +434,53 @@ user2credits(u_int32_t user)
return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
}
-static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+static u32 user2credits_byte(u32 user)
{
- dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY;
- if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
- dh->rateinfo.credit = dh->rateinfo.credit_cap;
+ u64 us = user;
+ us *= HZ * CREDITS_PER_JIFFY_BYTES;
+ return (u32) (us >> 32);
+}
+
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+{
+ unsigned long delta = now - dh->rateinfo.prev;
+ u32 cap;
+
+ if (delta == 0)
+ return;
+
dh->rateinfo.prev = now;
+
+ if (mode & XT_HASHLIMIT_BYTES) {
+ u32 tmp = dh->rateinfo.credit;
+ dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
+ cap = CREDITS_PER_JIFFY_BYTES * HZ;
+ if (tmp >= dh->rateinfo.credit) {/* overflow */
+ dh->rateinfo.credit = cap;
+ return;
+ }
+ } else {
+ dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+ cap = dh->rateinfo.credit_cap;
+ }
+ if (dh->rateinfo.credit > cap)
+ dh->rateinfo.credit = cap;
+}
+
+static void rateinfo_init(struct dsthash_ent *dh,
+ struct xt_hashlimit_htable *hinfo)
+{
+ dh->rateinfo.prev = jiffies;
+ if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
+ dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
+ dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = hinfo->cfg.burst;
+ } else {
+ dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
+ hinfo->cfg.burst);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = dh->rateinfo.credit;
+ }
}
static inline __be32 maskl(__be32 a, unsigned int l)
@@ -416,7 +488,7 @@ static inline __be32 maskl(__be32 a, unsigned int l)
return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0;
}
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)
{
switch (p) {
@@ -466,8 +538,11 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
nexthdr = ip_hdr(skb)->protocol;
break;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
case NFPROTO_IPV6:
+ {
+ __be16 frag_off;
+
if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) {
memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr,
sizeof(dst->ip6.dst));
@@ -483,10 +558,11 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
(XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
return 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
if ((int)protoff < 0)
return -1;
break;
+ }
#endif
default:
BUG();
@@ -510,6 +586,21 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
}
+static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
+{
+ u64 tmp = xt_hashlimit_len_to_chunks(len);
+ tmp = tmp * dh->rateinfo.cost;
+
+ if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ))
+ tmp = CREDITS_PER_JIFFY_BYTES * HZ;
+
+ if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) {
+ dh->rateinfo.credit_cap--;
+ dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
+ }
+ return (u32) tmp;
+}
+
static bool
hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -518,6 +609,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
unsigned long now = jiffies;
struct dsthash_ent *dh;
struct dsthash_dst dst;
+ bool race = false;
+ u32 cost;
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
@@ -525,27 +618,32 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
rcu_read_lock_bh();
dh = dsthash_find(hinfo, &dst);
if (dh == NULL) {
- dh = dsthash_alloc_init(hinfo, &dst);
+ dh = dsthash_alloc_init(hinfo, &dst, &race);
if (dh == NULL) {
rcu_read_unlock_bh();
goto hotdrop;
+ } else if (race) {
+ /* Already got an entry, update expiration timeout */
+ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ } else {
+ dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
+ rateinfo_init(dh, hinfo);
}
- dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
- dh->rateinfo.prev = jiffies;
- dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode);
}
- if (dh->rateinfo.credit >= dh->rateinfo.cost) {
+ if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+ cost = hashlimit_byte_cost(skb->len, dh);
+ else
+ cost = dh->rateinfo.cost;
+
+ if (dh->rateinfo.credit >= cost) {
/* below the limit */
- dh->rateinfo.credit -= dh->rateinfo.cost;
+ dh->rateinfo.credit -= cost;
spin_unlock(&dh->lock);
rcu_read_unlock_bh();
return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
@@ -567,14 +665,6 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
int ret;
- /* Check for overflow. */
- if (info->cfg.burst == 0 ||
- user2credits(info->cfg.avg * info->cfg.burst) <
- user2credits(info->cfg.avg)) {
- pr_info("overflow, try lower: %u/%u\n",
- info->cfg.avg, info->cfg.burst);
- return -ERANGE;
- }
if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
return -EINVAL;
if (info->name[sizeof(info->name)-1] != '\0')
@@ -587,6 +677,26 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+ pr_info("Unknown mode mask %X, kernel too old?\n",
+ info->cfg.mode);
+ return -EINVAL;
+ }
+
+ /* Check for overflow. */
+ if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
+ if (user2credits_byte(info->cfg.avg) == 0) {
+ pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+ return -EINVAL;
+ }
+ } else if (info->cfg.burst == 0 ||
+ user2credits(info->cfg.avg * info->cfg.burst) <
+ user2credits(info->cfg.avg)) {
+ pr_info("overflow, try lower: %u/%u\n",
+ info->cfg.avg, info->cfg.burst);
+ return -ERANGE;
+ }
+
mutex_lock(&hashlimit_mutex);
info->hinfo = htable_find_get(net, info->name, par->family);
if (info->hinfo == NULL) {
@@ -618,7 +728,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
},
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "hashlimit",
.revision = 1,
@@ -679,10 +789,11 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
int res;
+ const struct xt_hashlimit_htable *ht = s->private;
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
- rateinfo_recalc(ent, jiffies);
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode);
switch (family) {
case NFPROTO_IPV4:
@@ -695,7 +806,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
ent->rateinfo.credit, ent->rateinfo.credit_cap,
ent->rateinfo.cost);
break;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
case NFPROTO_IPV6:
res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
(long)(ent->expires - jiffies)/HZ,
@@ -720,10 +831,9 @@ static int dl_seq_show(struct seq_file *s, void *v)
struct xt_hashlimit_htable *htable = s->private;
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
- struct hlist_node *pos;
if (!hlist_empty(&htable->hash[*bucket])) {
- hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node)
+ hlist_for_each_entry(ent, &htable->hash[*bucket], node)
if (dl_seq_real_show(ent, htable->family, s))
return -1;
}
@@ -743,7 +853,7 @@ static int dl_proc_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- sf->private = PDE(inode)->data;
+ sf->private = PDE_DATA(inode);
}
return ret;
}
@@ -763,10 +873,10 @@ static int __net_init hashlimit_proc_net_init(struct net *net)
hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);
if (!hashlimit_net->ipt_hashlimit)
return -ENOMEM;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);
if (!hashlimit_net->ip6t_hashlimit) {
- proc_net_remove(net, "ipt_hashlimit");
+ remove_proc_entry("ipt_hashlimit", net->proc_net);
return -ENOMEM;
}
#endif
@@ -775,9 +885,23 @@ static int __net_init hashlimit_proc_net_init(struct net *net)
static void __net_exit hashlimit_proc_net_exit(struct net *net)
{
- proc_net_remove(net, "ipt_hashlimit");
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
- proc_net_remove(net, "ip6t_hashlimit");
+ struct xt_hashlimit_htable *hinfo;
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+ /* hashlimit_net_exit() is called before hashlimit_mt_destroy().
+ * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc
+ * entries is empty before trying to remove it.
+ */
+ mutex_lock(&hashlimit_mutex);
+ hlist_for_each_entry(hinfo, &hashlimit_net->htables, node)
+ htable_remove_proc_entry(hinfo);
+ hashlimit_net->ipt_hashlimit = NULL;
+ hashlimit_net->ip6t_hashlimit = NULL;
+ mutex_unlock(&hashlimit_mutex);
+
+ remove_proc_entry("ipt_hashlimit", net->proc_net);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ remove_proc_entry("ip6t_hashlimit", net->proc_net);
#endif
}
@@ -791,9 +915,6 @@ static int __net_init hashlimit_net_init(struct net *net)
static void __net_exit hashlimit_net_exit(struct net *net)
{
- struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
-
- BUG_ON(!hlist_empty(&hashlimit_net->htables));
hashlimit_proc_net_exit(net);
}
diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c
index 7d12221ead8..003951149c9 100644
--- a/net/netfilter/xt_hl.c
+++ b/net/netfilter/xt_hl.c
@@ -31,14 +31,14 @@ static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par)
const u8 ttl = ip_hdr(skb)->ttl;
switch (info->mode) {
- case IPT_TTL_EQ:
- return ttl == info->ttl;
- case IPT_TTL_NE:
- return ttl != info->ttl;
- case IPT_TTL_LT:
- return ttl < info->ttl;
- case IPT_TTL_GT:
- return ttl > info->ttl;
+ case IPT_TTL_EQ:
+ return ttl == info->ttl;
+ case IPT_TTL_NE:
+ return ttl != info->ttl;
+ case IPT_TTL_LT:
+ return ttl < info->ttl;
+ case IPT_TTL_GT:
+ return ttl > info->ttl;
}
return false;
@@ -50,14 +50,14 @@ static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par)
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
switch (info->mode) {
- case IP6T_HL_EQ:
- return ip6h->hop_limit == info->hop_limit;
- case IP6T_HL_NE:
- return ip6h->hop_limit != info->hop_limit;
- case IP6T_HL_LT:
- return ip6h->hop_limit < info->hop_limit;
- case IP6T_HL_GT:
- return ip6h->hop_limit > info->hop_limit;
+ case IP6T_HL_EQ:
+ return ip6h->hop_limit == info->hop_limit;
+ case IP6T_HL_NE:
+ return ip6h->hop_limit != info->hop_limit;
+ case IP6T_HL_LT:
+ return ip6h->hop_limit < info->hop_limit;
+ case IP6T_HL_GT:
+ return ip6h->hop_limit > info->hop_limit;
}
return false;
diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c
new file mode 100644
index 00000000000..89d53104c6b
--- /dev/null
+++ b/net/netfilter/xt_ipcomp.c
@@ -0,0 +1,111 @@
+/* Kernel module to match IPComp parameters for IPv4 and IPv6
+ *
+ * Copyright (C) 2013 WindRiver
+ *
+ * Author:
+ * Fan Du <fan.du@windriver.com>
+ *
+ * Based on:
+ * net/netfilter/xt_esp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter/xt_ipcomp.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fan Du <fan.du@windriver.com>");
+MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match");
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+ bool r;
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r = (spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip_comp_hdr _comphdr;
+ const struct ip_comp_hdr *chdr;
+ const struct xt_ipcomp *compinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr);
+ if (chdr == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ pr_debug("Dropping evil IPComp tinygram.\n");
+ par->hotdrop = true;
+ return 0;
+ }
+
+ return spi_match(compinfo->spis[0], compinfo->spis[1],
+ ntohs(chdr->cpi),
+ !!(compinfo->invflags & XT_IPCOMP_INV_SPI));
+}
+
+static int comp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_ipcomp *compinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) {
+ pr_err("unknown flags %X\n", compinfo->invflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match comp_mt_reg[] __read_mostly = {
+ {
+ .name = "ipcomp",
+ .family = NFPROTO_IPV4,
+ .match = comp_mt,
+ .matchsize = sizeof(struct xt_ipcomp),
+ .proto = IPPROTO_COMP,
+ .checkentry = comp_mt_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "ipcomp",
+ .family = NFPROTO_IPV6,
+ .match = comp_mt,
+ .matchsize = sizeof(struct xt_ipcomp),
+ .proto = IPPROTO_COMP,
+ .checkentry = comp_mt_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init comp_mt_init(void)
+{
+ return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg));
+}
+
+static void __exit comp_mt_exit(void)
+{
+ xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg));
+}
+
+module_init(comp_mt_init);
+module_exit(comp_mt_exit);
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index 88f7c3511c7..b46626cddd9 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -31,7 +31,7 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
&iph->saddr,
(info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
- &info->src_max.ip,
+ &info->src_min.ip,
&info->src_max.ip);
return false;
}
@@ -53,15 +53,13 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
}
static inline int
-iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b)
+iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b)
{
unsigned int i;
- int r;
for (i = 0; i < 4; ++i) {
- r = ntohl(a->s6_addr32[i]) - ntohl(b->s6_addr32[i]);
- if (r != 0)
- return r;
+ if (a->s6_addr32[i] != b->s6_addr32[i])
+ return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]);
}
return 0;
@@ -75,18 +73,30 @@ iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
bool m;
if (info->flags & IPRANGE_SRC) {
- m = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0;
- m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0;
+ m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6);
+ m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr);
m ^= !!(info->flags & IPRANGE_SRC_INV);
- if (m)
+ if (m) {
+ pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->saddr,
+ (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+ &info->src_min.in6,
+ &info->src_max.in6);
return false;
+ }
}
if (info->flags & IPRANGE_DST) {
- m = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0;
- m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0;
+ m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6);
+ m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr);
m ^= !!(info->flags & IPRANGE_DST_INV);
- if (m)
+ if (m) {
+ pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->daddr,
+ (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+ &info->dst_min.in6,
+ &info->dst_max.in6);
return false;
+ }
}
return true;
}
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 9127a3d8aa3..8d47c3780fd 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -67,7 +67,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
goto out;
}
- ip_vs_fill_iphdr(family, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(family, skb, &iph);
if (data->bitmask & XT_IPVS_PROTO)
if ((iph.protocol == data->l4proto) ^
@@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+ cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */);
if (unlikely(cp == NULL)) {
match = false;
goto out;
diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c
new file mode 100644
index 00000000000..8aee572771f
--- /dev/null
+++ b/net/netfilter/xt_l2tp.c
@@ -0,0 +1,354 @@
+/* Kernel module to match L2TP header parameters. */
+
+/* (C) 2013 James Chapman <jchapman@katalix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <linux/l2tp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_l2tp.h>
+
+/* L2TP header masks */
+#define L2TP_HDR_T_BIT 0x8000
+#define L2TP_HDR_L_BIT 0x4000
+#define L2TP_HDR_VER 0x000f
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("Xtables: L2TP header match");
+MODULE_ALIAS("ipt_l2tp");
+MODULE_ALIAS("ip6t_l2tp");
+
+/* The L2TP fields that can be matched */
+struct l2tp_data {
+ u32 tid;
+ u32 sid;
+ u8 type;
+ u8 version;
+};
+
+union l2tp_val {
+ __be16 val16[2];
+ __be32 val32;
+};
+
+static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data)
+{
+ if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type))
+ return false;
+
+ if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version))
+ return false;
+
+ /* Check tid only for L2TPv3 control or any L2TPv2 packets */
+ if ((info->flags & XT_L2TP_TID) &&
+ ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) &&
+ (info->tid != data->tid))
+ return false;
+
+ /* Check sid only for L2TP data packets */
+ if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) &&
+ (info->sid != data->sid))
+ return false;
+
+ return true;
+}
+
+/* Parse L2TP header fields when UDP encapsulation is used. Handles
+ * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a
+ * different format. See
+ * RFC2661, Section 3.1, L2TPv2 Header Format
+ * RFC3931, Section 3.2.1, L2TPv3 Control Message Header
+ * RFC3931, Section 3.2.2, L2TPv3 Data Message Header
+ * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP
+ */
+static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ int uhlen = sizeof(struct udphdr);
+ int offs = thoff + uhlen;
+ union l2tp_val *lh;
+ union l2tp_val lhbuf;
+ u16 flags;
+ struct l2tp_data data = { 0, };
+
+ if (par->fragoff != 0)
+ return false;
+
+ /* Extract L2TP header fields. The flags in the first 16 bits
+ * tell us where the other fields are.
+ */
+ lh = skb_header_pointer(skb, offs, 2, &lhbuf);
+ if (lh == NULL)
+ return false;
+
+ flags = ntohs(lh->val16[0]);
+ if (flags & L2TP_HDR_T_BIT)
+ data.type = XT_L2TP_TYPE_CONTROL;
+ else
+ data.type = XT_L2TP_TYPE_DATA;
+ data.version = (u8) flags & L2TP_HDR_VER;
+
+ /* Now extract the L2TP tid/sid. These are in different places
+ * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we
+ * must also check to see if the length field is present,
+ * since this affects the offsets into the packet of the
+ * tid/sid fields.
+ */
+ if (data.version == 3) {
+ lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf);
+ if (lh == NULL)
+ return false;
+ if (data.type == XT_L2TP_TYPE_CONTROL)
+ data.tid = ntohl(lh->val32);
+ else
+ data.sid = ntohl(lh->val32);
+ } else if (data.version == 2) {
+ if (flags & L2TP_HDR_L_BIT)
+ offs += 2;
+ lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf);
+ if (lh == NULL)
+ return false;
+ data.tid = (u32) ntohs(lh->val16[0]);
+ data.sid = (u32) ntohs(lh->val16[1]);
+ } else
+ return false;
+
+ return l2tp_match(info, &data);
+}
+
+/* Parse L2TP header fields for IP encapsulation (no UDP header).
+ * L2TPv3 data packets have a different form with IP encap. See
+ * RC3931, Section 4.1.1.1, L2TPv3 Session Header over IP.
+ * RC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP.
+ */
+static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ union l2tp_val *lh;
+ union l2tp_val lhbuf;
+ struct l2tp_data data = { 0, };
+
+ /* For IP encap, the L2TP sid is the first 32-bits. */
+ lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf);
+ if (lh == NULL)
+ return false;
+ if (lh->val32 == 0) {
+ /* Must be a control packet. The L2TP tid is further
+ * into the packet.
+ */
+ data.type = XT_L2TP_TYPE_CONTROL;
+ lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf),
+ &lhbuf);
+ if (lh == NULL)
+ return false;
+ data.tid = ntohl(lh->val32);
+ } else {
+ data.sid = ntohl(lh->val32);
+ data.type = XT_L2TP_TYPE_DATA;
+ }
+
+ data.version = 3;
+
+ return l2tp_match(info, &data);
+}
+
+static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ u8 ipproto = iph->protocol;
+
+ /* l2tp_mt_check4 already restricts the transport protocol */
+ switch (ipproto) {
+ case IPPROTO_UDP:
+ return l2tp_udp_mt(skb, par, par->thoff);
+ case IPPROTO_L2TP:
+ return l2tp_ip_mt(skb, par, par->thoff);
+ }
+
+ return false;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ unsigned int thoff = 0;
+ unsigned short fragoff = 0;
+ int ipproto;
+
+ ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (fragoff != 0)
+ return false;
+
+ /* l2tp_mt_check6 already restricts the transport protocol */
+ switch (ipproto) {
+ case IPPROTO_UDP:
+ return l2tp_udp_mt(skb, par, thoff);
+ case IPPROTO_L2TP:
+ return l2tp_ip_mt(skb, par, thoff);
+ }
+
+ return false;
+}
+#endif
+
+static int l2tp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+
+ /* Check for invalid flags */
+ if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION |
+ XT_L2TP_TYPE)) {
+ pr_info("unknown flags: %x\n", info->flags);
+ return -EINVAL;
+ }
+
+ /* At least one of tid, sid or type=control must be specified */
+ if ((!(info->flags & XT_L2TP_TID)) &&
+ (!(info->flags & XT_L2TP_SID)) &&
+ ((!(info->flags & XT_L2TP_TYPE)) ||
+ (info->type != XT_L2TP_TYPE_CONTROL))) {
+ pr_info("invalid flags combination: %x\n", info->flags);
+ return -EINVAL;
+ }
+
+ /* If version 2 is specified, check that incompatible params
+ * are not supplied
+ */
+ if (info->flags & XT_L2TP_VERSION) {
+ if ((info->version < 2) || (info->version > 3)) {
+ pr_info("wrong L2TP version: %u\n", info->version);
+ return -EINVAL;
+ }
+
+ if (info->version == 2) {
+ if ((info->flags & XT_L2TP_TID) &&
+ (info->tid > 0xffff)) {
+ pr_info("v2 tid > 0xffff: %u\n", info->tid);
+ return -EINVAL;
+ }
+ if ((info->flags & XT_L2TP_SID) &&
+ (info->sid > 0xffff)) {
+ pr_info("v2 sid > 0xffff: %u\n", info->sid);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int l2tp_mt_check4(const struct xt_mtchk_param *par)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ const struct ipt_entry *e = par->entryinfo;
+ const struct ipt_ip *ip = &e->ip;
+ int ret;
+
+ ret = l2tp_mt_check(par);
+ if (ret != 0)
+ return ret;
+
+ if ((ip->proto != IPPROTO_UDP) &&
+ (ip->proto != IPPROTO_L2TP)) {
+ pr_info("missing protocol rule (udp|l2tpip)\n");
+ return -EINVAL;
+ }
+
+ if ((ip->proto == IPPROTO_L2TP) &&
+ (info->version == 2)) {
+ pr_info("v2 doesn't support IP mode\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static int l2tp_mt_check6(const struct xt_mtchk_param *par)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ const struct ip6t_entry *e = par->entryinfo;
+ const struct ip6t_ip6 *ip = &e->ipv6;
+ int ret;
+
+ ret = l2tp_mt_check(par);
+ if (ret != 0)
+ return ret;
+
+ if ((ip->proto != IPPROTO_UDP) &&
+ (ip->proto != IPPROTO_L2TP)) {
+ pr_info("missing protocol rule (udp|l2tpip)\n");
+ return -EINVAL;
+ }
+
+ if ((ip->proto == IPPROTO_L2TP) &&
+ (info->version == 2)) {
+ pr_info("v2 doesn't support IP mode\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
+
+static struct xt_match l2tp_mt_reg[] __read_mostly = {
+ {
+ .name = "l2tp",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = l2tp_mt4,
+ .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)),
+ .checkentry = l2tp_mt_check4,
+ .hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD)),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "l2tp",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = l2tp_mt6,
+ .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)),
+ .checkentry = l2tp_mt_check6,
+ .hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD)),
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init l2tp_mt_init(void)
+{
+ return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg));
+}
+
+static void __exit l2tp_mt_exit(void)
+{
+ xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg));
+}
+
+module_init(l2tp_mt_init);
+module_exit(l2tp_mt_exit);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 32b7a579a03..bef85059655 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -1,5 +1,6 @@
/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
* (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -88,8 +89,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
/* Precision saver. */
-static u_int32_t
-user2credits(u_int32_t user)
+static u32 user2credits(u32 user)
{
/* If multiplying would overflow... */
if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
@@ -118,12 +118,12 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
/* For SMP, we only want to use one set of state. */
r->master = priv;
+ /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
+ 128. */
+ priv->prev = jiffies;
+ priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
if (r->cost == 0) {
- /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
- 128. */
- priv->prev = jiffies;
- priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
- r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
+ r->credit_cap = priv->credit; /* Credits full. */
r->cost = user2credits(r->avg);
}
return 0;
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 8160f6b1435..d5b4fd4f91e 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -36,7 +36,7 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (skb_mac_header(skb) + ETH_HLEN > skb->data)
return false;
- ret = compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr) == 0;
+ ret = ether_addr_equal(eth_hdr(skb)->h_source, info->srcaddr);
ret ^= info->invert;
return ret;
}
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
new file mode 100644
index 00000000000..bea7464cc43
--- /dev/null
+++ b/net/netfilter/xt_nat.c
@@ -0,0 +1,170 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat_core.h>
+
+static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->rangesize != 1) {
+ pr_info("%s: multiple ranges no longer supported\n",
+ par->target->name);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void xt_nat_convert_range(struct nf_nat_range *dst,
+ const struct nf_nat_ipv4_range *src)
+{
+ memset(&dst->min_addr, 0, sizeof(dst->min_addr));
+ memset(&dst->max_addr, 0, sizeof(dst->max_addr));
+
+ dst->flags = src->flags;
+ dst->min_addr.ip = src->min_ip;
+ dst->max_addr.ip = src->max_ip;
+ dst->min_proto = src->min;
+ dst->max_proto = src->max;
+}
+
+static unsigned int
+xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range range;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ xt_nat_convert_range(&range, &mr->range[0]);
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range range;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ xt_nat_convert_range(&range, &mr->range[0]);
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+static unsigned int
+xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST);
+}
+
+static struct xt_target xt_nat_target_reg[] __read_mostly = {
+ {
+ .name = "SNAT",
+ .revision = 0,
+ .checkentry = xt_nat_checkentry_v0,
+ .target = xt_snat_target_v0,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .family = NFPROTO_IPV4,
+ .table = "nat",
+ .hooks = (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNAT",
+ .revision = 0,
+ .checkentry = xt_nat_checkentry_v0,
+ .target = xt_dnat_target_v0,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .family = NFPROTO_IPV4,
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SNAT",
+ .revision = 1,
+ .target = xt_snat_target_v1,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNAT",
+ .revision = 1,
+ .target = xt_dnat_target_v1,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init xt_nat_init(void)
+{
+ return xt_register_targets(xt_nat_target_reg,
+ ARRAY_SIZE(xt_nat_target_reg));
+}
+
+static void __exit xt_nat_exit(void)
+{
+ xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg));
+}
+
+module_init(xt_nat_init);
+module_exit(xt_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS("ipt_SNAT");
+MODULE_ALIAS("ipt_DNAT");
+MODULE_ALIAS("ip6t_SNAT");
+MODULE_ALIAS("ip6t_DNAT");
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
new file mode 100644
index 00000000000..8c646ed9c92
--- /dev/null
+++ b/net/netfilter/xt_nfacct.c
@@ -0,0 +1,79 @@
+/*
+ * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 (or any
+ * later at your option) as published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/nfnetlink_acct.h>
+#include <linux/netfilter/xt_nfacct.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: match for the extended accounting infrastructure");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_nfacct");
+MODULE_ALIAS("ip6t_nfacct");
+
+static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ int overquota;
+ const struct xt_nfacct_match_info *info = par->targinfo;
+
+ nfnl_acct_update(skb, info->nfacct);
+
+ overquota = nfnl_acct_overquota(skb, info->nfacct);
+
+ return overquota == NFACCT_UNDERQUOTA ? false : true;
+}
+
+static int
+nfacct_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_nfacct_match_info *info = par->matchinfo;
+ struct nf_acct *nfacct;
+
+ nfacct = nfnl_acct_find_get(info->name);
+ if (nfacct == NULL) {
+ pr_info("xt_nfacct: accounting object with name `%s' "
+ "does not exists\n", info->name);
+ return -ENOENT;
+ }
+ info->nfacct = nfacct;
+ return 0;
+}
+
+static void
+nfacct_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_nfacct_match_info *info = par->matchinfo;
+
+ nfnl_acct_put(info->nfacct);
+}
+
+static struct xt_match nfacct_mt_reg __read_mostly = {
+ .name = "nfacct",
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfacct_mt_checkentry,
+ .match = nfacct_mt,
+ .destroy = nfacct_mt_destroy,
+ .matchsize = sizeof(struct xt_nfacct_match_info),
+ .me = THIS_MODULE,
+};
+
+static int __init nfacct_mt_init(void)
+{
+ return xt_register_match(&nfacct_mt_reg);
+}
+
+static void __exit nfacct_mt_exit(void)
+{
+ xt_unregister_match(&nfacct_mt_reg);
+}
+
+module_init(nfacct_mt_init);
+module_exit(nfacct_mt_exit);
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 4327e101c04..c529161cdbf 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -62,13 +61,6 @@ static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
[OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) },
};
-static void xt_osf_finger_free_rcu(struct rcu_head *rcu_head)
-{
- struct xt_osf_finger *f = container_of(rcu_head, struct xt_osf_finger, rcu_head);
-
- kfree(f);
-}
-
static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const osf_attrs[])
@@ -133,7 +125,7 @@ static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb,
* We are protected by nfnl mutex.
*/
list_del_rcu(&sf->finger_entry);
- call_rcu(&sf->rcu_head, xt_osf_finger_free_rcu);
+ kfree_rcu(sf, rcu_head);
err = 0;
break;
@@ -208,6 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
+ struct net *net = dev_net(p->in ? p->in : p->out);
if (!info)
return false;
@@ -276,7 +269,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
mss <<= 8;
mss |= optp[2];
- mss = ntohs(mss);
+ mss = ntohs((__force __be16)mss);
break;
case OSFOPT_TS:
loop_cont = 1;
@@ -332,7 +325,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
fcount++;
if (info->flags & XT_OSF_LOG)
- nf_log_packet(p->family, p->hooknum, skb,
+ nf_log_packet(net, p->family, p->hooknum, skb,
p->in, p->out, NULL,
"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
f->genre, f->version, f->subtype,
@@ -348,7 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
rcu_read_unlock();
if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL,
+ nf_log_packet(net, p->family, p->hooknum, skb, p->in,
+ p->out, NULL,
"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
&ip->saddr, ntohs(tcp->source),
&ip->daddr, ntohs(tcp->dest));
@@ -414,7 +408,7 @@ static void __exit xt_osf_fini(void)
list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
list_del_rcu(&f->finger_entry);
- call_rcu(&f->rcu_head, xt_osf_finger_free_rcu);
+ kfree_rcu(f, rcu_head);
}
}
rcu_read_unlock();
@@ -428,4 +422,6 @@ module_exit(xt_osf_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
MODULE_DESCRIPTION("Passive OS fingerprint matching.");
+MODULE_ALIAS("ipt_osf");
+MODULE_ALIAS("ip6t_osf");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 772d7389b33..ca2e577ed8a 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -17,6 +17,17 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_owner.h>
+static int owner_check(const struct xt_mtchk_param *par)
+{
+ struct xt_owner_match_info *info = par->matchinfo;
+
+ /* For now only allow adding matches from the initial user namespace */
+ if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
+ (current_user_ns() != &init_user_ns))
+ return -EINVAL;
+ return 0;
+}
+
static bool
owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -37,17 +48,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
- if (info->match & XT_OWNER_UID)
- if ((filp->f_cred->fsuid >= info->uid_min &&
- filp->f_cred->fsuid <= info->uid_max) ^
+ if (info->match & XT_OWNER_UID) {
+ kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
+ kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+ if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
+ uid_lte(filp->f_cred->fsuid, uid_max)) ^
!(info->invert & XT_OWNER_UID))
return false;
+ }
- if (info->match & XT_OWNER_GID)
- if ((filp->f_cred->fsgid >= info->gid_min &&
- filp->f_cred->fsgid <= info->gid_max) ^
+ if (info->match & XT_OWNER_GID) {
+ kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
+ kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+ if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max)) ^
!(info->invert & XT_OWNER_GID))
return false;
+ }
return true;
}
@@ -56,6 +73,7 @@ static struct xt_match owner_mt_reg __read_mostly = {
.name = "owner",
.revision = 1,
.family = NFPROTO_UNSPEC,
+ .checkentry = owner_check,
.match = owner_mt,
.matchsize = sizeof(struct xt_owner_match_info),
.hooks = (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 70eb2b4984d..44c8eb4c9d6 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -9,6 +9,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_quota.h>
+#include <linux/module.h>
struct xt_quota_priv {
spinlock_t lock;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 76a083184d8..7720b036d76 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,7 +18,7 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
- struct gnet_stats_rate_est *r;
+ struct gnet_stats_rate_est64 *r;
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
@@ -78,7 +78,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
{
struct xt_rateest_match_info *info = par->matchinfo;
struct xt_rateest *est1, *est2;
- int ret = false;
+ int ret = -EINVAL;
if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS |
XT_RATEEST_MATCH_REL)) != 1)
@@ -101,13 +101,12 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
if (!est1)
goto err1;
+ est2 = NULL;
if (info->flags & XT_RATEEST_MATCH_REL) {
est2 = xt_rateest_lookup(info->name2);
if (!est2)
goto err2;
- } else
- est2 = NULL;
-
+ }
info->est1 = est1;
info->est2 = est2;
@@ -116,7 +115,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
err2:
xt_rateest_put(est1);
err1:
- return -EINVAL;
+ return ret;
}
static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index d2ff15a2412..a9faae89f95 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -29,6 +29,7 @@
#include <linux/skbuff.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
@@ -75,6 +76,7 @@ struct recent_entry {
struct recent_table {
struct list_head list;
char name[XT_RECENT_NAME_LEN];
+ union nf_inet_addr mask;
unsigned int refcnt;
unsigned int entries;
struct list_head lru_list;
@@ -228,10 +230,10 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = dev_net(par->in ? par->in : par->out);
struct recent_net *recent_net = recent_pernet(net);
- const struct xt_recent_mtinfo *info = par->matchinfo;
+ const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
struct recent_entry *e;
- union nf_inet_addr addr = {};
+ union nf_inet_addr addr = {}, addr_mask;
u_int8_t ttl;
bool ret = info->invert;
@@ -261,12 +263,15 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
spin_lock_bh(&recent_lock);
t = recent_table_lookup(recent_net, info->name);
- e = recent_entry_lookup(t, &addr, par->family,
+
+ nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
+
+ e = recent_entry_lookup(t, &addr_mask, par->family,
(info->check_set & XT_RECENT_TTL) ? ttl : 0);
if (e == NULL) {
if (!(info->check_set & XT_RECENT_SET))
goto out;
- e = recent_entry_init(t, &addr, par->family, ttl);
+ e = recent_entry_init(t, &addr_mask, par->family, ttl);
if (e == NULL)
par->hotdrop = true;
ret = !ret;
@@ -306,16 +311,24 @@ out:
return ret;
}
-static int recent_mt_check(const struct xt_mtchk_param *par)
+static void recent_table_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int recent_mt_check(const struct xt_mtchk_param *par,
+ const struct xt_recent_mtinfo_v1 *info)
{
struct recent_net *recent_net = recent_pernet(par->net);
- const struct xt_recent_mtinfo *info = par->matchinfo;
struct recent_table *t;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde;
+ kuid_t uid;
+ kgid_t gid;
#endif
- unsigned i;
+ unsigned int i;
int ret = -EINVAL;
+ size_t sz;
if (unlikely(!hash_rnd_inited)) {
get_random_bytes(&hash_rnd, sizeof(hash_rnd));
@@ -354,27 +367,38 @@ static int recent_mt_check(const struct xt_mtchk_param *par)
goto out;
}
- t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
- GFP_KERNEL);
+ sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size;
+ if (sz <= PAGE_SIZE)
+ t = kzalloc(sz, GFP_KERNEL);
+ else
+ t = vzalloc(sz);
if (t == NULL) {
ret = -ENOMEM;
goto out;
}
t->refcnt = 1;
+
+ memcpy(&t->mask, &info->mask, sizeof(t->mask));
strcpy(t->name, info->name);
INIT_LIST_HEAD(&t->lru_list);
for (i = 0; i < ip_list_hash_size; i++)
INIT_LIST_HEAD(&t->iphash[i]);
#ifdef CONFIG_PROC_FS
+ uid = make_kuid(&init_user_ns, ip_list_uid);
+ gid = make_kgid(&init_user_ns, ip_list_gid);
+ if (!uid_valid(uid) || !gid_valid(gid)) {
+ recent_table_free(t);
+ ret = -EINVAL;
+ goto out;
+ }
pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
&recent_mt_fops, t);
if (pde == NULL) {
- kfree(t);
+ recent_table_free(t);
ret = -ENOMEM;
goto out;
}
- pde->uid = ip_list_uid;
- pde->gid = ip_list_gid;
+ proc_set_user(pde, uid, gid);
#endif
spin_lock_bh(&recent_lock);
list_add_tail(&t->list, &recent_net->tables);
@@ -385,10 +409,28 @@ out:
return ret;
}
+static int recent_mt_check_v0(const struct xt_mtchk_param *par)
+{
+ const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo;
+ struct xt_recent_mtinfo_v1 info_v1;
+
+ /* Copy revision 0 structure to revision 1 */
+ memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo));
+ /* Set default mask to ensure backward compatible behaviour */
+ memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all));
+
+ return recent_mt_check(par, &info_v1);
+}
+
+static int recent_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ return recent_mt_check(par, par->matchinfo);
+}
+
static void recent_mt_destroy(const struct xt_mtdtor_param *par)
{
struct recent_net *recent_net = recent_pernet(par->net);
- const struct xt_recent_mtinfo *info = par->matchinfo;
+ const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
mutex_lock(&recent_mutex);
@@ -398,10 +440,11 @@ static void recent_mt_destroy(const struct xt_mtdtor_param *par)
list_del(&t->list);
spin_unlock_bh(&recent_lock);
#ifdef CONFIG_PROC_FS
- remove_proc_entry(t->name, recent_net->xt_recent);
+ if (recent_net->xt_recent != NULL)
+ remove_proc_entry(t->name, recent_net->xt_recent);
#endif
recent_table_flush(t);
- kfree(t);
+ recent_table_free(t);
}
mutex_unlock(&recent_mutex);
}
@@ -478,14 +521,13 @@ static const struct seq_operations recent_seq_ops = {
static int recent_seq_open(struct inode *inode, struct file *file)
{
- struct proc_dir_entry *pde = PDE(inode);
struct recent_iter_state *st;
st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
if (st == NULL)
return -ENOMEM;
- st->table = pde->data;
+ st->table = PDE_DATA(inode);
return 0;
}
@@ -493,8 +535,7 @@ static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
- const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- struct recent_table *t = pde->data;
+ struct recent_table *t = PDE_DATA(file_inode(file));
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
@@ -582,7 +623,21 @@ static int __net_init recent_proc_net_init(struct net *net)
static void __net_exit recent_proc_net_exit(struct net *net)
{
- proc_net_remove(net, "xt_recent");
+ struct recent_net *recent_net = recent_pernet(net);
+ struct recent_table *t;
+
+ /* recent_net_exit() is called before recent_mt_destroy(). Make sure
+ * that the parent xt_recent proc entry is is empty before trying to
+ * remove it.
+ */
+ spin_lock_bh(&recent_lock);
+ list_for_each_entry(t, &recent_net->tables, list)
+ remove_proc_entry(t->name, recent_net->xt_recent);
+
+ recent_net->xt_recent = NULL;
+ spin_unlock_bh(&recent_lock);
+
+ remove_proc_entry("xt_recent", net->proc_net);
}
#else
static inline int recent_proc_net_init(struct net *net)
@@ -605,9 +660,6 @@ static int __net_init recent_net_init(struct net *net)
static void __net_exit recent_net_exit(struct net *net)
{
- struct recent_net *recent_net = recent_pernet(net);
-
- BUG_ON(!list_empty(&recent_net->tables));
recent_proc_net_exit(net);
}
@@ -625,7 +677,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = recent_mt,
.matchsize = sizeof(struct xt_recent_mtinfo),
- .checkentry = recent_mt_check,
+ .checkentry = recent_mt_check_v0,
.destroy = recent_mt_destroy,
.me = THIS_MODULE,
},
@@ -635,10 +687,30 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = recent_mt,
.matchsize = sizeof(struct xt_recent_mtinfo),
- .checkentry = recent_mt_check,
+ .checkentry = recent_mt_check_v0,
+ .destroy = recent_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "recent",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .match = recent_mt,
+ .matchsize = sizeof(struct xt_recent_mtinfo_v1),
+ .checkentry = recent_mt_check_v1,
.destroy = recent_mt_destroy,
.me = THIS_MODULE,
},
+ {
+ .name = "recent",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = recent_mt,
+ .matchsize = sizeof(struct xt_recent_mtinfo_v1),
+ .checkentry = recent_mt_check_v1,
+ .destroy = recent_mt_destroy,
+ .me = THIS_MODULE,
+ }
};
static int __init recent_mt_init(void)
diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
index 6efe4e5a81c..8fd324116e6 100644
--- a/net/netfilter/xt_repldata.h
+++ b/net/netfilter/xt_repldata.h
@@ -5,23 +5,35 @@
* they serve as the hanging-off data accessed through repl.data[].
*/
+/* tbl has the following structure equivalent, but is C99 compliant:
+ * struct {
+ * struct type##_replace repl;
+ * struct type##_standard entries[nhooks];
+ * struct type##_error term;
+ * } *tbl;
+ */
+
#define xt_alloc_initial_table(type, typ2) ({ \
unsigned int hook_mask = info->valid_hooks; \
unsigned int nhooks = hweight32(hook_mask); \
unsigned int bytes = 0, hooknum = 0, i = 0; \
struct { \
struct type##_replace repl; \
- struct type##_standard entries[nhooks]; \
- struct type##_error term; \
- } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \
+ struct type##_standard entries[]; \
+ } *tbl; \
+ struct type##_error *term; \
+ size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \
+ __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \
+ tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \
if (tbl == NULL) \
return NULL; \
+ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
- tbl->term = (struct type##_error)typ2##_ERROR_INIT; \
+ *term = (struct type##_error)typ2##_ERROR_INIT; \
tbl->repl.valid_hooks = hook_mask; \
tbl->repl.num_entries = nhooks + 1; \
tbl->repl.size = nhooks * sizeof(struct type##_standard) + \
- sizeof(struct type##_error); \
+ sizeof(struct type##_error); \
for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
if (!(hook_mask & 1)) \
continue; \
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
new file mode 100644
index 00000000000..80c2e2d603e
--- /dev/null
+++ b/net/netfilter/xt_set.c
@@ -0,0 +1,523 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module which implements the set match and SET target
+ * for netfilter/iptables. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("Xtables: IP set match and target module");
+MODULE_ALIAS("xt_SET");
+MODULE_ALIAS("ipt_set");
+MODULE_ALIAS("ip6t_set");
+MODULE_ALIAS("ipt_SET");
+MODULE_ALIAS("ip6t_SET");
+
+static inline int
+match_set(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, int inv)
+{
+ if (ip_set_test(index, skb, par, opt))
+ inv = !inv;
+ return inv;
+}
+
+#define ADT_OPT(n, f, d, fs, cfs, t) \
+struct ip_set_adt_opt n = { \
+ .family = f, \
+ .dim = d, \
+ .flags = fs, \
+ .cmdflags = cfs, \
+ .ext.timeout = t, \
+}
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+static bool
+set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v0 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
+ info->match_set.u.compat.flags, 0, UINT_MAX);
+
+ return match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.u.compat.flags & IPSET_INV_MATCH);
+}
+
+static void
+compat_flags(struct xt_set_info_v0 *info)
+{
+ u_int8_t i;
+
+ /* Fill out compatibility data according to enum ip_set_kopt */
+ info->u.compat.dim = IPSET_DIM_ZERO;
+ if (info->u.flags[0] & IPSET_MATCH_INV)
+ info->u.compat.flags |= IPSET_INV_MATCH;
+ for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+ info->u.compat.dim++;
+ if (info->u.flags[i] & IPSET_SRC)
+ info->u.compat.flags |= (1<<info->u.compat.dim);
+ }
+}
+
+static int
+set_match_v0_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find set identified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warning("Protocol error: set match dimension "
+ "is over the limit!\n");
+ ip_set_nfnl_put(par->net, info->match_set.index);
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->match_set);
+
+ return 0;
+}
+
+static void
+set_match_v0_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+
+ ip_set_nfnl_put(par->net, info->match_set.index);
+}
+
+/* Revision 1 match */
+
+static bool
+set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v1 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.dim,
+ info->match_set.flags, 0, UINT_MAX);
+
+ if (opt.flags & IPSET_RETURN_NOMATCH)
+ opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH;
+
+ return match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.flags & IPSET_INV_MATCH);
+}
+
+static int
+set_match_v1_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match_v1 *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find set identified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.dim > IPSET_DIM_MAX) {
+ pr_warning("Protocol error: set match dimension "
+ "is over the limit!\n");
+ ip_set_nfnl_put(par->net, info->match_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_match_v1_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match_v1 *info = par->matchinfo;
+
+ ip_set_nfnl_put(par->net, info->match_set.index);
+}
+
+/* Revision 3 match */
+
+static bool
+match_counter(u64 counter, const struct ip_set_counter_match *info)
+{
+ switch (info->op) {
+ case IPSET_COUNTER_NONE:
+ return true;
+ case IPSET_COUNTER_EQ:
+ return counter == info->value;
+ case IPSET_COUNTER_NE:
+ return counter != info->value;
+ case IPSET_COUNTER_LT:
+ return counter < info->value;
+ case IPSET_COUNTER_GT:
+ return counter > info->value;
+ }
+ return false;
+}
+
+static bool
+set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v3 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.dim,
+ info->match_set.flags, info->flags, UINT_MAX);
+ int ret;
+
+ if (info->packets.op != IPSET_COUNTER_NONE ||
+ info->bytes.op != IPSET_COUNTER_NONE)
+ opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
+
+ ret = match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.flags & IPSET_INV_MATCH);
+
+ if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
+ return ret;
+
+ if (!match_counter(opt.ext.packets, &info->packets))
+ return 0;
+ return match_counter(opt.ext.bytes, &info->bytes);
+}
+
+#define set_match_v3_checkentry set_match_v1_checkentry
+#define set_match_v3_destroy set_match_v1_destroy
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+static unsigned int
+set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
+ info->add_set.u.compat.flags, 0, UINT_MAX);
+ ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
+ info->del_set.u.compat.flags, 0, UINT_MAX);
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+
+ return XT_CONTINUE;
+}
+
+static int
+set_target_v0_checkentry(const struct xt_tgchk_param *par)
+{
+ struct xt_set_info_target_v0 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
+ info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warning("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->add_set);
+ compat_flags(&info->del_set);
+
+ return 0;
+}
+
+static void
+set_target_v0_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+}
+
+/* Revision 1 target */
+
+static unsigned int
+set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v1 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.dim,
+ info->add_set.flags, 0, UINT_MAX);
+ ADT_OPT(del_opt, par->family, info->del_set.dim,
+ info->del_set.flags, 0, UINT_MAX);
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+
+ return XT_CONTINUE;
+}
+
+static int
+set_target_v1_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target_v1 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.dim > IPSET_DIM_MAX ||
+ info->del_set.dim > IPSET_DIM_MAX) {
+ pr_warning("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_target_v1_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v1 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+}
+
+/* Revision 2 target */
+
+static unsigned int
+set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v2 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.dim,
+ info->add_set.flags, info->flags, info->timeout);
+ ADT_OPT(del_opt, par->family, info->del_set.dim,
+ info->del_set.flags, 0, UINT_MAX);
+
+ /* Normalize to fit into jiffies */
+ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
+ add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+
+ return XT_CONTINUE;
+}
+
+#define set_target_v2_checkentry set_target_v1_checkentry
+#define set_target_v2_destroy set_target_v1_destroy
+
+static struct xt_match set_matches[] __read_mostly = {
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .match = set_match_v0,
+ .matchsize = sizeof(struct xt_set_info_match_v0),
+ .checkentry = set_match_v0_checkentry,
+ .destroy = set_match_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ /* --return-nomatch flag support */
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 2,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 2,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ /* counters support: update, match */
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 3,
+ .match = set_match_v3,
+ .matchsize = sizeof(struct xt_set_info_match_v3),
+ .checkentry = set_match_v3_checkentry,
+ .destroy = set_match_v3_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 3,
+ .match = set_match_v3,
+ .matchsize = sizeof(struct xt_set_info_match_v3),
+ .checkentry = set_match_v3_checkentry,
+ .destroy = set_match_v3_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+static struct xt_target set_targets[] __read_mostly = {
+ {
+ .name = "SET",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v0,
+ .targetsize = sizeof(struct xt_set_info_target_v0),
+ .checkentry = set_target_v0_checkentry,
+ .destroy = set_target_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v1,
+ .targetsize = sizeof(struct xt_set_info_target_v1),
+ .checkentry = set_target_v1_checkentry,
+ .destroy = set_target_v1_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .target = set_target_v1,
+ .targetsize = sizeof(struct xt_set_info_target_v1),
+ .checkentry = set_target_v1_checkentry,
+ .destroy = set_target_v1_destroy,
+ .me = THIS_MODULE
+ },
+ /* --timeout and --exist flags support */
+ {
+ .name = "SET",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v2,
+ .targetsize = sizeof(struct xt_set_info_target_v2),
+ .checkentry = set_target_v2_checkentry,
+ .destroy = set_target_v2_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .target = set_target_v2,
+ .targetsize = sizeof(struct xt_set_info_target_v2),
+ .checkentry = set_target_v2_checkentry,
+ .destroy = set_target_v2_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+static int __init xt_set_init(void)
+{
+ int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches));
+
+ if (!ret) {
+ ret = xt_register_targets(set_targets,
+ ARRAY_SIZE(set_targets));
+ if (ret)
+ xt_unregister_matches(set_matches,
+ ARRAY_SIZE(set_matches));
+ }
+ return ret;
+}
+
+static void __exit xt_set_fini(void)
+{
+ xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+ xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets));
+}
+
+module_init(xt_set_init);
+module_exit(xt_set_fini);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 00d6ae83830..1ba67931eb1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -19,18 +19,18 @@
#include <net/icmp.h>
#include <net/sock.h>
#include <net/inet_sock.h>
-#include <net/netfilter/nf_tproxy_core.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
#define XT_SOCKET_HAVE_IPV6 1
#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/inet6_hashtables.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
#include <linux/netfilter/xt_socket.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#define XT_SOCKET_HAVE_CONNTRACK 1
#include <net/netfilter/nf_conntrack.h>
#endif
@@ -92,16 +92,53 @@ extract_icmp4_fields(const struct sk_buff *skb,
return 0;
}
+/* "socket" match based redirection (no specific rule)
+ * ===================================================
+ *
+ * There are connections with dynamic endpoints (e.g. FTP data
+ * connection) that the user is unable to add explicit rules
+ * for. These are taken care of by a generic "socket" rule. It is
+ * assumed that the proxy application is trusted to open such
+ * connections without explicit iptables rule (except of course the
+ * generic 'socket' rule). In this case the following sockets are
+ * matched in preference order:
+ *
+ * - match: if there's a fully established connection matching the
+ * _packet_ tuple
+ *
+ * - match: if there's a non-zero bound listener (possibly with a
+ * non-local address) We don't accept zero-bound listeners, since
+ * then local services could intercept traffic going through the
+ * box.
+ */
+static struct sock *
+xt_socket_get_sock_v4(struct net *net, const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return __inet_lookup(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+ return NULL;
+}
+
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
{
const struct iphdr *iph = ip_hdr(skb);
struct udphdr _hdr, *hp = NULL;
- struct sock *sk;
- __be32 daddr, saddr;
- __be16 dport, sport;
- u8 protocol;
+ struct sock *sk = skb->sk;
+ __be32 uninitialized_var(daddr), uninitialized_var(saddr);
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ u8 uninitialized_var(protocol);
#ifdef XT_SOCKET_HAVE_CONNTRACK
struct nf_conn const *ct;
enum ip_conntrack_info ctinfo;
@@ -134,9 +171,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct) &&
((iph->protocol != IPPROTO_ICMP &&
- ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
(iph->protocol == IPPROTO_ICMP &&
- ctinfo == IP_CT_IS_REPLY + IP_CT_RELATED)) &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
(ct->status & IPS_SRC_NAT_DONE)) {
daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
@@ -146,25 +183,31 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
}
#endif
- sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
- saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
- if (sk != NULL) {
+ if (!sk)
+ sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol,
+ saddr, daddr, sport, dport,
+ par->in);
+ if (sk) {
bool wildcard;
bool transparent = true;
- /* Ignore sockets listening on INADDR_ANY */
- wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+ /* Ignore sockets listening on INADDR_ANY,
+ * unless XT_SOCKET_NOWILDCARD is set
+ */
+ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
+ sk->sk_state != TCP_TIME_WAIT &&
inet_sk(sk)->inet_rcv_saddr == 0);
/* Ignore non-transparent sockets,
if XT_SOCKET_TRANSPARENT is used */
- if (info && info->flags & XT_SOCKET_TRANSPARENT)
+ if (info->flags & XT_SOCKET_TRANSPARENT)
transparent = ((sk->sk_state != TCP_TIME_WAIT &&
inet_sk(sk)->transparent) ||
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- nf_tproxy_put_sock(sk);
+ if (sk != skb->sk)
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -181,11 +224,15 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
static bool
socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
- return socket_match(skb, par, NULL);
+ static struct xt_socket_mtinfo1 xt_info_v0 = {
+ .flags = 0,
+ };
+
+ return socket_match(skb, par, &xt_info_v0);
}
static bool
-socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, par->matchinfo);
}
@@ -205,6 +252,7 @@ extract_icmp6_fields(const struct sk_buff *skb,
struct icmp6hdr *icmph, _icmph;
__be16 *ports, _ports[2];
u8 inside_nexthdr;
+ __be16 inside_fragoff;
int inside_hdrlen;
icmph = skb_header_pointer(skb, outside_hdrlen,
@@ -220,7 +268,8 @@ extract_icmp6_fields(const struct sk_buff *skb,
return 1;
inside_nexthdr = inside_iph->nexthdr;
- inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph),
+ &inside_nexthdr, &inside_fragoff);
if (inside_hdrlen < 0)
return 1; /* hjm: Packet has no/incomplete transport layer headers. */
@@ -244,18 +293,37 @@ extract_icmp6_fields(const struct sk_buff *skb,
return 0;
}
+static struct sock *
+xt_socket_get_sock_v6(struct net *net, const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet6_lookup(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+
+ return NULL;
+}
+
static bool
-socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
struct udphdr _hdr, *hp = NULL;
- struct sock *sk;
- struct in6_addr *daddr, *saddr;
- __be16 dport, sport;
- int thoff, tproto;
+ struct sock *sk = skb->sk;
+ struct in6_addr *daddr = NULL, *saddr = NULL;
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ int thoff = 0, uninitialized_var(tproto);
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
if (tproto < 0) {
pr_debug("unable to find transport header in IPv6 packet, dropping\n");
return NF_DROP;
@@ -280,25 +348,31 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
- sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
- saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
- if (sk != NULL) {
+ if (!sk)
+ sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto,
+ saddr, daddr, sport, dport,
+ par->in);
+ if (sk) {
bool wildcard;
bool transparent = true;
- /* Ignore sockets listening on INADDR_ANY */
- wildcard = (sk->sk_state != TCP_TIME_WAIT &&
- ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+ /* Ignore sockets listening on INADDR_ANY
+ * unless XT_SOCKET_NOWILDCARD is set
+ */
+ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
+ sk->sk_state != TCP_TIME_WAIT &&
+ ipv6_addr_any(&sk->sk_v6_rcv_saddr));
/* Ignore non-transparent sockets,
if XT_SOCKET_TRANSPARENT is used */
- if (info && info->flags & XT_SOCKET_TRANSPARENT)
+ if (info->flags & XT_SOCKET_TRANSPARENT)
transparent = ((sk->sk_state != TCP_TIME_WAIT &&
inet_sk(sk)->transparent) ||
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- nf_tproxy_put_sock(sk);
+ if (sk != skb->sk)
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -314,6 +388,28 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
}
#endif
+static int socket_mt_v1_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+ if (info->flags & ~XT_SOCKET_FLAGS_V1) {
+ pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int socket_mt_v2_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
+
+ if (info->flags & ~XT_SOCKET_FLAGS_V2) {
+ pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
+ return -EINVAL;
+ }
+ return 0;
+}
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
@@ -328,7 +424,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = socket_mt4_v1,
+ .match = socket_mt4_v1_v2,
+ .checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -339,7 +436,32 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV6,
- .match = socket_mt6_v1,
+ .match = socket_mt6_v1_v2,
+ .checkentry = socket_mt_v1_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
+ {
+ .name = "socket",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .match = socket_mt4_v1_v2,
+ .checkentry = socket_mt_v2_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#ifdef XT_SOCKET_HAVE_IPV6
+ {
+ .name = "socket",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1_v2,
+ .checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 42ecb71d445..11de55e7a86 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -16,6 +16,7 @@
#include <linux/netfilter/xt_statistic.h>
#include <linux/netfilter/x_tables.h>
+#include <linux/module.h>
struct xt_statistic_priv {
atomic_t count;
@@ -36,7 +37,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
- if ((net_random() & 0x7FFFFFFF) < info->u.random.probability)
+ if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH:
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index c48975ff8ea..0ae55a36f49 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -42,6 +42,7 @@ static const u_int16_t days_since_leapyear[] = {
*/
enum {
DSE_FIRST = 2039,
+ SECONDS_PER_DAY = 86400,
};
static const u_int16_t days_since_epoch[] = {
/* 2039 - 2030 */
@@ -78,7 +79,7 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time)
unsigned int v, w;
/* Each day has 86400s, so finding the hour/minute is actually easy. */
- v = time % 86400;
+ v = time % SECONDS_PER_DAY;
r->second = v % 60;
w = v / 60;
r->minute = w % 60;
@@ -199,6 +200,18 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (packet_time < info->daytime_start &&
packet_time > info->daytime_stop)
return false;
+
+ /** if user asked to ignore 'next day', then e.g.
+ * '1 PM Wed, August 1st' should be treated
+ * like 'Tue 1 PM July 31st'.
+ *
+ * This also causes
+ * 'Monday, "23:00 to 01:00", to match for 2 hours, starting
+ * Monday 23:00 to Tuesday 01:00.
+ */
+ if ((info->flags & XT_TIME_CONTIGUOUS) &&
+ packet_time <= info->daytime_stop)
+ stamp -= SECONDS_PER_DAY;
}
localtime_2(&current_time, stamp);
@@ -227,6 +240,15 @@ static int time_mt_check(const struct xt_mtchk_param *par)
return -EDOM;
}
+ if (info->flags & ~XT_TIME_ALL_FLAGS) {
+ pr_info("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS);
+ return -EINVAL;
+ }
+
+ if ((info->flags & XT_TIME_CONTIGUOUS) &&
+ info->daytime_start < info->daytime_stop)
+ return -EINVAL;
+
return 0;
}